diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000000..6b8710a711f3 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.git diff --git a/.github/workflows/Dockerfile b/.github/workflows/Dockerfile new file mode 100644 index 000000000000..e08a623f6dbd --- /dev/null +++ b/.github/workflows/Dockerfile @@ -0,0 +1,31 @@ +FROM ubuntu:20.04 AS builder + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=UTC + +RUN apt-get update && \ + apt-get install -y --no-install-recommends ca-certificates git \ + build-essential cmake ninja-build python3 libjemalloc-dev && \ + rm -rf /var/lib/apt/lists + +WORKDIR /home/bolt + +COPY . llvm-bolt + +WORKDIR build + +RUN cmake -G Ninja ../llvm-bolt/llvm \ + -DLLVM_ENABLE_PROJECTS="bolt" \ + -DLLVM_TARGETS_TO_BUILD="X86;AArch64" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DCMAKE_EXE_LINKER_FLAGS="-Wl,--push-state -Wl,-whole-archive -ljemalloc_pic -Wl,--pop-state -lpthread -lstdc++ -lm -ldl" \ + -DCMAKE_INSTALL_PREFIX=/home/bolt/install + +RUN ninja check-bolt && \ + ninja install-llvm-bolt install-perf2bolt install-merge-fdata \ + install-llvm-boltdiff install-bolt_rt + +FROM ubuntu:20.04 + +COPY --from=builder /home/bolt/install /usr/local diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml new file mode 100644 index 000000000000..21c505d85c0a --- /dev/null +++ b/.github/workflows/docker-image.yml @@ -0,0 +1,16 @@ +name: Docker Image CI + +on: + push: + branches: [ rebased ] + pull_request: + branches: [ rebased ] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Build the Docker image and test + run: docker build . --file .github/workflows/Dockerfile --tag ubuntu-bolt:$(date +%s) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt new file mode 100644 index 000000000000..4f5664d7abc6 --- /dev/null +++ b/bolt/CMakeLists.txt @@ -0,0 +1,29 @@ +include(ExternalProject) + +set(BOLT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(CMAKE_CXX_STANDARD 14) + +ExternalProject_Add(bolt_rt + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/runtime" + STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-stamps + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} + -DCMAKE_INSTALL_PREFIX=${LLVM_BINARY_DIR} + # You might want to set this to True if actively developing bolt_rt, otherwise + # cmake will not rebuild it after source code changes + BUILD_ALWAYS True + ) + +install(CODE "execute_process\(COMMAND \${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=\${CMAKE_INSTALL_PREFIX} -P ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/cmake_install.cmake \)" + COMPONENT bolt_rt) + +add_llvm_install_targets(install-bolt_rt + DEPENDS bolt_rt + COMPONENT bolt_rt) + +add_subdirectory(src) +add_subdirectory(test) diff --git a/bolt/LICENSE.TXT b/bolt/LICENSE.TXT new file mode 100644 index 000000000000..fa6ac5400070 --- /dev/null +++ b/bolt/LICENSE.TXT @@ -0,0 +1,279 @@ +============================================================================== +The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: +============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + +---- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + +============================================================================== +Software from third parties included in the LLVM Project: +============================================================================== +The LLVM Project contains third party software which is under different license +terms. All such code will be identified clearly using at least one of two +mechanisms: +1) It will be in a separate directory tree with its own `LICENSE.txt` or + `LICENSE` file at the top containing the specific license and restrictions + which apply to that software, or +2) It will contain specific license and restriction terms at the top of every + file. + +============================================================================== +Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): +============================================================================== +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2019 University of Illinois at Urbana-Champaign. +All rights reserved. + +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. 
+
diff --git a/bolt/README.md b/bolt/README.md
new file mode 100644
index 000000000000..ffa2119fd501
--- /dev/null
+++ b/bolt/README.md
@@ -0,0 +1,221 @@
+# BOLT
+
+BOLT is a post-link optimizer developed to speed up large applications.
+It achieves improvements by optimizing the application's code layout based on
+an execution profile gathered by a sampling profiler, such as the Linux `perf`
+tool. An overview of the ideas implemented in BOLT along with a discussion of
+its potential and current results is available in the
+[CGO'19 paper](https://research.fb.com/publications/bolt-a-practical-binary-optimizer-for-data-centers-and-beyond/).
+
+## Input Binary Requirements
+
+BOLT operates on X86-64 and AArch64 ELF binaries. At a minimum, the binaries
+should have an unstripped symbol table, and, to get maximum performance gains,
+they should be linked with relocations (`--emit-relocs` or `-q` linker flag).
+
+BOLT disassembles functions and reconstructs the control flow graph (CFG)
+before it runs optimizations. Since this is a nontrivial task,
+especially when indirect branches are present, we rely on certain heuristics
+to accomplish it. These heuristics have been tested on code generated with
+the Clang and GCC compilers. The main requirement for C/C++ code is that it
+does not rely on code layout properties, such as function pointer deltas.
+Assembly code can be processed too. Requirements for it include a clear
+separation of code and data, with data objects being placed into data
+sections/segments. If indirect jumps are used for intra-function control
+transfer (e.g., jump tables), the code patterns should match those
+generated by Clang/GCC.
+
+NOTE: BOLT is currently incompatible with the `-freorder-blocks-and-partition`
+compiler option. Since GCC8 enables this option by default, you have to
+explicitly disable it by adding the `-fno-reorder-blocks-and-partition` flag
+if you are compiling with GCC8.
+
+PIE and .so support has been added recently. Please report bugs if you
+encounter any issues.
+
+## Installation
+
+### Docker Image
+
+You can build and use the Docker image containing BOLT using our [docker file](./utils/docker/Dockerfile).
+Alternatively, you can build BOLT manually using the steps below.
+
+### Manual Build
+
+BOLT heavily uses LLVM libraries, and by design, it is built as one of the
+LLVM tools. The build process is not much different from a regular LLVM build.
+The following instructions assume that you are running under Linux.
+
+Start by cloning the LLVM and BOLT repos:
+
+```
+> git clone https://github.com/llvm-mirror/llvm llvm
+> cd llvm/tools
+> git checkout -b llvm-bolt f137ed238db11440f03083b1c88b7ffc0f4af65e
+> git clone https://github.com/facebookincubator/BOLT llvm-bolt
+> cd ..
+> patch -p 1 < tools/llvm-bolt/llvm.patch
+```
+
+Proceed to a normal LLVM build using a compiler with C++11 support (for GCC
+use version 4.9 or later):
+
+```
+> cd ..
+> mkdir build
+> cd build
+> cmake -G Ninja ../llvm -DLLVM_TARGETS_TO_BUILD="X86;AArch64" -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON
+> ninja
+```
+
+`llvm-bolt` will be available under `bin/`. Add this directory to your path to
+ensure the rest of the commands in this tutorial work.
+
+Note that we use a specific revision of LLVM as we currently rely on a set of
+patches that are not yet upstreamed.
+
+## Optimizing BOLT's Performance
+
+BOLT runs many internal passes in parallel.
+If you foresee heavy usage of
+BOLT, you can improve the processing time by linking against one of the
+memory allocation libraries with good support for concurrency. E.g., to use
+jemalloc:
+
+```
+> sudo yum install jemalloc-devel
+> LD_PRELOAD=/usr/lib64/libjemalloc.so llvm-bolt ....
+```
+Or, if you'd rather use tcmalloc:
+```
+> sudo yum install gperftools-devel
+> LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so llvm-bolt ....
+```
+
+## Usage
+
+For a complete practical guide to using BOLT, see [Optimizing Clang with BOLT](./docs/OptimizingClang.md).
+
+### Step 0
+
+In order to allow BOLT to re-arrange functions (in addition to re-arranging
+code within functions) in your program, it needs a little help from the linker.
+Add `--emit-relocs` to the final link step of your application. You can verify
+the presence of relocations by checking for the `.rela.text` section in the
+binary. BOLT will also report if it detects relocations while processing the
+binary.
+
+### Step 1: Collect Profile
+
+This step is different for different kinds of executables. If you can invoke
+your program to run on a representative input from a command line, then check
+the **For Applications** section below. If your program typically runs as a
+server/service, then skip to the **For Services** section.
+
+The version of the `perf` command used for the following steps has to support
+the `-F brstack` option. We recommend using `perf` version 4.5 or later.
+
+#### For Applications
+
+This assumes you can run your program from a command line with a typical input.
+In this case, simply prepend the command line invocation with `perf`:
+```
+$ perf record -e cycles:u -j any,u -o perf.data -- <executable> <args> ...
+```
+
+#### For Services
+
+Once you get the service deployed and warmed-up, it is time to collect perf
+data with LBR (branch information). The exact perf command to use will depend
+on the service. E.g., to collect the data for all processes running on the
+server for the next 3 minutes use:
+```
+$ perf record -e cycles:u -j any,u -a -o perf.data -- sleep 180
+```
+
+Depending on the application, you may need more samples to be included with
+your profile. It's hard to tell upfront what would be a sweet spot for your
+application. We recommend that the profile cover 1B instructions as reported
+by BOLT's `-dyno-stats` option. If you need to increase the number of samples
+in the profile, you can either run the `sleep` command for longer or use the
+`-F` option with `perf` to increase the sampling frequency.
+
+Note that for profile collection we recommend using cycle events and not
+`BR_INST_RETIRED.*`. Empirically, we found them to produce better results.
+
+If the collection of a profile with branches is not available, e.g., when you
+run on a VM or on hardware that does not support it, then you can use only
+sample events, such as cycles. In this case, the quality of the profile
+information would not be as good, and performance gains with BOLT are
+expected to be lower.
+
+#### With instrumentation
+
+If `perf record` is not available to you, you may collect a profile by first
+instrumenting the binary with BOLT and then running it.
+```
+llvm-bolt <executable> -instrument -o <instrumented-executable>
+```
+
+After you run the instrumented executable with the desired workload, its BOLT
+profile should be ready for you in `/tmp/prof.fdata` and you can skip
+**Step 2**.
+
+Run BOLT with the `-help` option and check the category "BOLT instrumentation
+options" for a quick reference on instrumentation knobs.
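+
+Putting these pieces together, an end-to-end instrumentation-based flow might
+look like the sketch below (`my-app` and its workload are hypothetical
+placeholders; the flags are the ones described in this document):
+
+```
+$ llvm-bolt my-app -instrument -o my-app.instrumented
+$ ./my-app.instrumented <representative workload>
+$ llvm-bolt my-app -o my-app.bolt -data=/tmp/prof.fdata -reorder-blocks=cache+
+```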
+
+### Step 2: Convert Profile to BOLT Format
+
+NOTE: you can skip this step and feed `perf.data` directly to BOLT using the
+experimental `-p perf.data` option.
+
+For this step, you will need the `perf.data` file collected in the previous
+step and a copy of the binary that was running. The binary has to be either
+unstripped or have its symbol table intact (i.e., running `strip -g` is
+okay).
+
+Make sure `perf` is in your `PATH`, and execute `perf2bolt`:
+```
+$ perf2bolt <executable> -p perf.data -o perf.fdata
+```
+
+This command will aggregate branch data from `perf.data` and store it in a
+format that is both more compact and more resilient to binary modifications.
+
+If the profile was collected without LBRs, you will need to add the `-nl` flag
+to the command line above.
+
+### Step 3: Optimize with BOLT
+
+Once you have `perf.fdata` ready, you can use it for optimizations with
+BOLT. Assuming your environment is set up to include the right path, execute
+`llvm-bolt`:
+```
+$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=cache+ -reorder-functions=hfsort -split-functions=2 -split-all-cold -split-eh -dyno-stats
+```
+
+If you need updated debug info, then add the `-update-debug-sections` option
+to the command above. The processing time will be slightly longer.
+
+For a full list of options see `-help`/`-help-hidden` output.
+
+The input binary for this step does not have to 100% match the binary used for
+profile collection in **Step 1**. This could happen when you are doing active
+development, and the source code constantly changes, yet you want to benefit
+from profile-guided optimizations. However, since the binary is not precisely
+the same, the profile information could become invalid or stale, and BOLT will
+report the number of functions with a stale profile. The higher the
+number, the less performance improvement should be expected. Thus, it is
+crucial to update `.fdata` for release branches.
+
+## Multiple Profiles
+
+Suppose your application can run in different modes, and you can generate
+multiple profiles for each one of them. To generate a single binary that can
+benefit all modes (assuming the profiles don't contradict each other) you can
+use the `merge-fdata` tool:
+```
+$ merge-fdata *.fdata > combined.fdata
+```
+Use `combined.fdata` for **Step 3** above to generate a universally optimized
+binary.
+
+## License
+
+BOLT is licensed under the [Apache License v2.0 with LLVM Exceptions](./LICENSE.TXT).
diff --git a/bolt/docs/Heatmap.png b/bolt/docs/Heatmap.png
new file mode 100644
index 000000000000..e3f76fb22932
Binary files /dev/null and b/bolt/docs/Heatmap.png differ
diff --git a/bolt/docs/Heatmaps.md b/bolt/docs/Heatmaps.md
new file mode 100644
index 000000000000..526b3900b2d1
--- /dev/null
+++ b/bolt/docs/Heatmaps.md
@@ -0,0 +1,50 @@
+# Code Heatmaps
+
+BOLT has gained the ability to print code heatmaps based on
+sampling-based LBR profiles generated by `perf`. The output is produced
+in colored ASCII to be displayed in a color-capable terminal. It looks
+something like this:
+
+![](./Heatmap.png)
+
+Heatmaps can be generated for BOLTed and non-BOLTed binaries. You can
+use them to compare the code layout before and after optimizations.
+
+To generate a heatmap, start by running your app under `perf`:
+
+```bash
+$ perf record -e cycles:u -j any,u -- <executable> <args>
+```
+or if you want to monitor the existing process(es):
+```bash
+$ perf record -e cycles:u -j any,u [-p PID|-a] -- sleep <interval>
+```
+
+Note that at the moment running with LBR (`-j any,u` or `-b`) is
+a requirement.
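+
+A quick way to verify that LBR collection works on your machine is a short
+dummy recording (a sketch; on hardware or in virtualized environments without
+branch-stack support, `perf record` will fail with an error instead of
+producing a usable `perf.data`):
+
+```bash
+$ perf record -e cycles:u -j any,u -- sleep 1
+```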
+
+Once the run is complete, and `perf.data` is generated, run BOLT in
+heatmap mode:
+
+```bash
+$ llvm-bolt heatmap -p perf.data <executable>
+```
+
+By default the heatmap will be dumped to *stdout*. You can change it
+with the `-o <heatmapfile>` option. Each character/block in the heatmap
+shows the execution data accumulated for the corresponding 64 bytes of
+code. You can change this granularity with the `-block-size` option.
+E.g., set it to 4096 to see code usage grouped by 4K pages.
+Other useful options are:
+
+```bash
+-line-size=<uint>   - number of entries per line (default 256)
+-max-address=<uint> - maximum address considered valid for heatmap (default 4GB)
+```
+
+If you prefer to look at the data in a browser (or would like to share
+it that way), then you can use an HTML conversion tool. E.g.:
+
+```bash
+$ aha -b -f <heatmap file> > <heatmap file>.html
+```
diff --git a/bolt/docs/OptimizingClang.md b/bolt/docs/OptimizingClang.md
new file mode 100644
index 000000000000..12ac2fe19e23
--- /dev/null
+++ b/bolt/docs/OptimizingClang.md
@@ -0,0 +1,266 @@
+# Optimizing Clang: A Practical Example of Applying BOLT
+
+## Preface
+
+*BOLT* (Binary Optimization and Layout Tool) is designed to improve
+application performance by laying out code in a manner that helps the CPU
+better utilize its caching and branch prediction resources.
+
+The most obvious candidates for BOLT optimizations
+are programs that suffer from many instruction cache and iTLB misses, such as
+large applications measuring hundreds of megabytes in size. However,
+medium-sized programs can benefit too. Clang, one of the most popular
+open-source C/C++ compilers, is a good example of the latter. Its code size
+can easily be on the order of tens of megabytes.
+As we will see, the Clang binary suffers from many instruction cache
+misses and can be significantly improved with BOLT, even on top of
+profile-guided and link-time optimizations.
+
+In this tutorial we will first build Clang with PGO and LTO, and then show how
+to apply BOLT optimizations to make Clang up to 15% faster. We will also
+analyze where the compile-time performance gains are coming from, and verify
+that the speed-ups are sustainable while building other applications.
+
+## Building Clang
+
+The process of getting Clang sources and performing the build is very similar
+to the one described at http://clang.llvm.org/get_started.html. For
+completeness, we provide the detailed steps on how to obtain and build Clang
+in the [Bootstrapping Clang-7 with PGO and LTO](#bootstrapping-clang-7-with-pgo-and-lto)
+section.
+
+The only difference from the standard Clang build is that we require the
+`-Wl,-q` flag to be present during the final link. This option saves
+relocation metadata in the executable file, but does not affect the generated
+code in any way.
+
+## Optimizing Clang with BOLT
+
+We will use the setup described in [Bootstrapping Clang-7 with PGO and LTO](#bootstrapping-clang-7-with-pgo-and-lto).
+Adjust the steps accordingly if you skipped that section. We will also assume
+that `llvm-bolt` is present in your `$PATH`.
+
+Before we can run BOLT optimizations, we need to collect the profile for
+Clang, and we will use Clang/LLVM sources for that.
+Collecting an accurate profile requires running `perf` on hardware that
+implements taken-branch sampling (the `-b/-j` flag). For that reason, it may
+not be possible to collect an accurate profile in a virtualized environment,
+e.g. in the cloud.
+We do support regular sampling profiles, but the performance
+improvements are expected to be more modest.
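+
+Before collecting the profile, it may also be worth double-checking that the
+final binary was linked with `-Wl,-q` and still carries relocations, since
+BOLT depends on them (a sketch; `readelf` is part of GNU binutils, and the
+path assumes the stage2 setup from the appendix):
+
+```bash
+$ readelf -S ${TOPLEV}/stage2-prof-use-lto/install/bin/clang-7 | grep '.rela.text'
+```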
+
+```bash
+$ mkdir ${TOPLEV}/stage3
+$ cd ${TOPLEV}/stage3
+$ CPATH=${TOPLEV}/stage2-prof-use-lto/install/bin/
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+    -DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage3/install
+$ perf record -e cycles:u -j any,u -- ninja clang
+```
+
+Once the last command finishes, it will have created a `perf.data` file larger
+than 10GiB. We will first convert this profile into a more compact aggregated
+form suitable to be consumed by BOLT:
+```bash
+$ perf2bolt $CPATH/clang-7 -p perf.data -o clang-7.fdata -w clang-7.yaml
+```
+Notice that we are passing `clang-7` to `perf2bolt`, which is the real binary
+that `clang` and `clang++` are symlinked to. The next step will optimize Clang
+using the generated profile:
+```bash
+$ llvm-bolt $CPATH/clang-7 -o $CPATH/clang-7.bolt -b clang-7.yaml \
+    -reorder-blocks=cache+ -reorder-functions=hfsort+ -split-functions=3 \
+    -split-all-cold -dyno-stats -icf=1 -use-gnu-stack
+```
+The output will look similar to the one below:
+```t
+...
+BOLT-INFO: enabling relocation mode
+BOLT-INFO: 11415 functions out of 104526 simple functions (10.9%) have non-empty execution profile.
+...
+BOLT-INFO: ICF folded 29144 out of 105177 functions in 8 passes. 82 functions had jump tables.
+BOLT-INFO: Removing all identical functions will save 5466.69 KB of code space. Folded functions were called 2131985 times based on profile.
+BOLT-INFO: basic block reordering modified layout of 7848 (10.32%) functions
+...
+   660155947 : executed forward branches (-2.3%)
+    48252553 : taken forward branches (-57.2%)
+   129897961 : executed backward branches (+13.8%)
+    52389551 : taken backward branches (-19.5%)
+    35650038 : executed unconditional branches (-33.2%)
+   128338874 : all function calls (=)
+    19010563 : indirect calls (=)
+     9918250 : PLT calls (=)
+  6113398840 : executed instructions (-0.6%)
+  1519537463 : executed load instructions (=)
+   943321306 : executed store instructions (=)
+    20467109 : taken jump table branches (=)
+   825703946 : total branches (-2.1%)
+   136292142 : taken branches (-41.1%)
+   689411804 : non-taken conditional branches (+12.6%)
+   100642104 : taken conditional branches (-43.4%)
+   790053908 : all conditional branches (=)
+...
+```
+The statistics in the output are based on the LBR profile collected with
+`perf`, and since we were using the `cycles` counter, its accuracy is
+affected. However, the relative improvement in `taken conditional branches`
+is a good indication that BOLT was able to straighten out the code even
+after PGO.
+
+## Measuring Compile-time Improvement
+
+`clang-7.bolt` can be used as a replacement for *PGO+LTO* Clang:
+```bash
+$ mv $CPATH/clang-7 $CPATH/clang-7.org
+$ ln -fs $CPATH/clang-7.bolt $CPATH/clang-7
+```
+Doing a new build of Clang using the new binary shows a significant overall
+build time reduction on a 48-core Haswell system:
+```bash
+$ ln -fs $CPATH/clang-7.org $CPATH/clang-7
+$ ninja clean && /bin/time -f %e ninja clang -j48
+202.72
+$ ln -fs $CPATH/clang-7.bolt $CPATH/clang-7
+$ ninja clean && /bin/time -f %e ninja clang -j48
+180.11
+```
+That's 22.61 seconds (or 12%) faster compared to the *PGO+LTO* build.
+Notice that we are measuring an improvement in the total build time, which
+includes the time spent in the linker. Compilation time improvements for
+individual files differ, and speedups over 15% are not uncommon.
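+
+Build-time measurements like these are sensitive to page cache state and
+machine load, so it can be worth repeating each timing a few times and
+comparing the best runs (a sketch using only the commands shown above):
+
+```bash
+$ for clang in clang-7.org clang-7.bolt; do
+    ln -fs $CPATH/$clang $CPATH/clang-7
+    for i in 1 2 3; do
+      ninja clean && /bin/time -f "$clang: %e" ninja clang -j48
+    done
+  done
+```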
+
+If we run BOLT on a Clang binary compiled without *PGO+LTO* (in which case the
+build finishes in 253.32 seconds), the gains we see are over 50 seconds (25%),
+but, as expected, the result is still slower than the *PGO+LTO+BOLT* build.
+
+## Source of the Wins
+
+We mentioned that Clang suffers from considerable instruction cache misses.
+This can be measured with `perf`:
+```bash
+$ ln -fs $CPATH/clang-7.org $CPATH/clang-7
+$ ninja clean && perf stat -e instructions,L1-icache-misses -- ninja clang -j48
+  ...
+  16,366,101,626,647      instructions
+     359,996,216,537      L1-icache-misses
+```
+That's about 22 instruction cache misses per thousand instructions. As a rule
+of thumb, if the application has over 10 misses per thousand instructions, it
+is a good indication that it will be improved by BOLT.
+Now let's see how many misses are in the BOLTed binary:
+```bash
+$ ln -fs $CPATH/clang-7.bolt $CPATH/clang-7
+$ ninja clean && perf stat -e instructions,L1-icache-misses -- ninja clang -j48
+  ...
+  16,319,818,488,769      instructions
+     244,888,677,972      L1-icache-misses
+```
+The number of misses per thousand instructions went down from 22 to 15,
+significantly reducing the number of stalls in the CPU front-end.
+Notice how the number of executed instructions stayed roughly the same. That's
+because we didn't run any optimizations beyond the ones affecting the code
+layout. In addition to instruction cache misses, BOLT also reduces branch
+mispredictions, iTLB misses, and misses in L2 and L3.
+
+## Using Clang for Other Applications
+
+We have collected the profile for Clang using its own source code. Would it be
+enough to speed up the compilation of other projects? We picked `mysqld`, an
+open-source database, to do the test.
+
+On our 48-core Haswell system using the *PGO+LTO* Clang, the build finished in
+136.06 seconds, while using the *PGO+LTO+BOLT* Clang it took 126.10 seconds.
+That's a noticeable improvement, but not as significant as the one we saw on
+Clang itself. This is partially because the number of instruction cache misses
+is slightly lower in this scenario: 19 vs. 22.
+Another reason is that Clang is run with a different set of options while
+building `mysqld` compared to the training run.
+
+Different options exercise different code paths, and
+if we trained without a specific option, we may have misplaced parts of the
+code responsible for handling it. To test this theory, we collected another
+`perf` profile while building `mysqld`, and merged it with the existing
+profile using the `merge-fdata` utility that comes with BOLT. Optimized with
+that profile, the *PGO+LTO+BOLT* Clang was able to perform the `mysqld` build
+in 124.74 seconds, i.e. 11 seconds or 9% faster compared to *PGO+LTO* Clang.
+The merged profile didn't make the original Clang compilation slower either,
+while the number of profiled functions in Clang increased from 11,415 to
+14,025.
+
+Ideally, the profile run should be done with a superset of all commonly used
+options. However, the main improvement is expected with just the basic set.
+
+## Summary
+
+In this tutorial we demonstrated how to use BOLT to improve the
+performance of the Clang compiler. Similarly, BOLT could be used to improve
+the performance of GCC, or any other application suffering from a high number
+of instruction cache misses.
+
+----
+# Appendix
+
+## Bootstrapping Clang-7 with PGO and LTO
+
+Below we describe the detailed steps to build Clang and make it ready for
+BOLT optimizations.
+If you already have the build set up, you can skip this section, except for
+the last step, which adds the `-Wl,-q` linker flag to the final build.
+
+### Getting Clang-7 Sources
+
+Set `$TOPLEV` to the directory of your preference where you would like to do
+the builds, e.g. `TOPLEV=~/clang-7/`. Follow with commands to clone the
+`release_70` branches of LLVM, Clang, the lld linker, and the compiler
+runtime:
+```bash
+$ cd ${TOPLEV}
+$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/llvm.git/ llvm
+$ cd llvm/tools
+$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/clang.git/
+$ cd ../projects
+$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/lld.git/
+$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/compiler-rt.git/
+```
+
+### Building Stage 1 Compiler
+
+Stage 1 will be the first build we are going to do, and we will be using the
+default system compiler to build Clang. If your system lacks a compiler, use
+your distribution's package manager to install one that supports C++11. In
+this example we are going to use GCC. In addition to the compiler,
+you will need the `cmake` and `ninja` packages.
+```bash
+$ mkdir ${TOPLEV}/stage1
+$ cd ${TOPLEV}/stage1
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_ASM_COMPILER=gcc \
+    -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage1/install
+$ ninja install
+```
+
+### Building Stage 2 Compiler With Instrumentation
+
+Using the freshly-baked stage 1 Clang compiler, we are going to build Clang
+with profile generation capabilities:
+```bash
+$ mkdir ${TOPLEV}/stage2-prof-gen
+$ cd ${TOPLEV}/stage2-prof-gen
+$ CPATH=${TOPLEV}/stage1/install/bin/
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+    -DLLVM_USE_LINKER=lld -DLLVM_BUILD_INSTRUMENTED=ON \
+    -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage2-prof-gen/install
+$ ninja install
+```
+
+### Generating Profile for PGO
+
+While there are many ways to obtain the profile data, we are going to use the
+source code already at our disposal, i.e. we are going to collect the profile
+while building Clang itself:
+```bash
+$ mkdir ${TOPLEV}/stage3-train
+$ cd ${TOPLEV}/stage3-train
+$ CPATH=${TOPLEV}/stage2-prof-gen/install/bin
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+    -DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage3-train/install
+$ ninja clang
+```
+Once the build is complete, the profile files will be saved under
+`${TOPLEV}/stage2-prof-gen/profiles`. We will merge them before they can be
+passed back into Clang:
+```bash
+$ cd ${TOPLEV}/stage2-prof-gen/profiles
+$ ${TOPLEV}/stage1/install/bin/llvm-profdata merge -output=clang.profdata *
+```
+
+### Building Clang with PGO and LTO
+
+Now the profile can be used to guide optimizations to produce better code for
+our scenario, i.e. building Clang. We will also enable link-time optimization
+to allow cross-module inlining and other optimizations. Finally, we are going
+to add one extra step that is useful for BOLT: a linker flag instructing it
+to preserve relocations in the output binary.
+Note that this flag does not affect the generated code or data used at
+runtime; it only writes metadata to the file on disk:
+```bash
+$ mkdir ${TOPLEV}/stage2-prof-use-lto
+$ cd ${TOPLEV}/stage2-prof-use-lto
+$ CPATH=${TOPLEV}/stage1/install/bin/
+$ export LDFLAGS="-Wl,-q"
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+    -DLLVM_ENABLE_LTO=Full -DLLVM_PROFDATA_FILE=${TOPLEV}/stage2-prof-gen/profiles/clang.profdata \
+    -DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage2-prof-use-lto/install
+$ ninja install
+```
+Now we have a Clang compiler that can build itself much faster. As we will
+see, it builds other applications faster as well, and, with BOLT, the compile
+time can be improved even further.
diff --git a/bolt/docs/RuntimeLibrary.md b/bolt/docs/RuntimeLibrary.md
new file mode 100644
index 000000000000..58d9497a195b
--- /dev/null
+++ b/bolt/docs/RuntimeLibrary.md
@@ -0,0 +1,37 @@
+# BOLT ORC-based linker
+
+A high-level view of the simple linker used to insert auxiliary/library code
+into the final binary produced by BOLT. This is built on top of LLVM's ORC
+infrastructure (the newest iteration of JIT support in LLVM).
+
+## Several levels of code injection
+
+When BOLT starts processing an input executable, its first task is to raise
+the binary to a low-level IR with a CFG. After this is done, we are ready to
+change code in this binary. Throughout BOLT's pipeline of code
+transformations, there are plenty of situations when we need to insert new
+code or fix existing code.
+
+If operating with small code changes inside a basic block, we typically defer
+this work to MCPlusBuilder. This is our target-independent interface to create
+new instructions, but it also contains some functions that may create code
+spanning multiple basic blocks (for instance, when doing indirect call
+promotion and unrolling an indirect call into a ladder of comparisons/direct
+calls). The implementation here usually boils down to programmatically
+creating new MCInst instructions while setting their opcodes according to the
+target list (see X86GenInstOpcodes.inc generated by tablegen in an LLVM
+build).
+
+However, this approach quickly becomes awkward if we want to insert a lot of
+code, especially if this code is frozen and never changes. In these
+situations, it is more convenient to have a runtime library with all the code
+you need to insert. This library defines some symbols and can be linked into
+the final binary. In this case, all you need to do in a BOLT transformation
+is to insert a call to your library.
+
+## The runtime library
+
+Currently, our runtime library is written in C++ and contains code that helps
+us instrument a binary.
+
+### Limitations
+Our library is not written in regular C++ as it is not linked against any
+other libraries (this means we cannot rely on anything defined in libstdc++,
+glibc, libgcc, etc.), but is self-sufficient. In runtime/CMakeLists.txt, we
+can see it is built with -ffreestanding, which tells the compiler not to
+assume that a standard runtime library is available.
+
+While this requires us to make our own syscalls, it does greatly simplify our
+linker, which is very limited and can only do basic function name resolution.
+Even so, this is a big improvement in comparison with programmatically
+generating the code in assembly language using MCInsts.
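+
+For illustration, here is roughly what "making our own syscalls" looks like.
+This sketch mirrors the `__write` wrapper in `runtime/common.h`: the Linux
+syscall number goes in `rax`, arguments in `rdi`/`rsi`/`rdx`, and `rcx`/`r11`
+are clobbered by the `syscall` instruction:
+
+```cpp
+// Freestanding replacement for write(2): no libc involved, just the
+// x86-64 Linux syscall ABI (syscall number 1 is write).
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $1, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd), "S"(buf), "d"(count)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+```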
+ +A few more quirks: + +* No BSS section: don't use uninitialized globals +* No dependencies on foreign code: self sufficient +* You should closely watch the generated bolt_rt object files, anything requiring fancy linker features will break. We only support bare bones .text, .data and nothing else. + +Read instr.cpp opening comment for more details. + + +## Linking + +While RewriteInstance::emitAndLink() will perform an initial link step to resolve all references of the input program, it will not start linking the runtime library right away. The input program lives in its own module that may end up with unresolved references to the runtime library. + +RewriteInstance::linkRuntime() has the job of actually reading individual .o files and adding them to the binary. We currently have a single .o file, so after it is read, ORC can finally resolve references from the first module to the newly inserted .o objects. + +This sequence of steps is done by calls to addObject() and emitAndFinalize(). The latter will trigger symbol resolution, relying on the symbol resolver provided by us when calling createLegacyLookupResolver(). diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt new file mode 100644 index 000000000000..cc679c31ff6d --- /dev/null +++ b/bolt/runtime/CMakeLists.txt @@ -0,0 +1,46 @@ +cmake_minimum_required(VERSION 3.1.0) + +include(CheckIncludeFiles) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +project(libbolt_rt_project) + +check_include_files(elf.h HAVE_ELF_H) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/config.h) + +add_library(bolt_rt_instr STATIC + instr.cpp + ${CMAKE_CURRENT_BINARY_DIR}/config.h + ) +add_library(bolt_rt_hugify STATIC + hugify.cpp + ${CMAKE_CURRENT_BINARY_DIR}/config.h + ) + +# Don't let the compiler think it can create calls to standard libs +target_compile_options(bolt_rt_instr PRIVATE -ffreestanding -fno-exceptions -fno-rtti -fPIE) +target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_compile_options(bolt_rt_hugify PRIVATE -ffreestanding -fno-exceptions -fno-rtti) +target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +install(TARGETS bolt_rt_instr DESTINATION lib) +install(TARGETS bolt_rt_hugify DESTINATION lib) + +if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*") + add_library(bolt_rt_instr_osx STATIC + instr.cpp + ${CMAKE_CURRENT_BINARY_DIR}/config.h + ) + target_include_directories(bolt_rt_instr_osx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + target_compile_options(bolt_rt_instr_osx PRIVATE + -target x86_64-apple-darwin19.6.0 + -ffreestanding + -fno-exceptions + -fno-rtti + -fno-stack-protector) + install(TARGETS bolt_rt_instr_osx DESTINATION lib) +endif() diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h new file mode 100644 index 000000000000..732ea3425179 --- /dev/null +++ b/bolt/runtime/common.h @@ -0,0 +1,499 @@ +//===-- common.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#if !defined(__APPLE__) + +#include +#include + +#else + +typedef __SIZE_TYPE__ size_t; +#define __SSIZE_TYPE__ \ + __typeof__(_Generic((__SIZE_TYPE__)0, unsigned long long int \ + : (long long int)0, unsigned long int \ + : (long int)0, unsigned int \ + : (int)0, unsigned short \ + : (short)0, unsigned char \ + : (signed char)0)) +typedef __SSIZE_TYPE__ ssize_t; + +typedef unsigned long long uint64_t; +typedef unsigned uint32_t; +typedef unsigned char uint8_t; + +typedef long long int64_t; +typedef int int32_t; + +#endif + +#include "config.h" + +#ifdef HAVE_ELF_H +#include +#endif + +// Save all registers while keeping 16B stack alignment +#define SAVE_ALL \ + "push %%rax\n" \ + "push %%rbx\n" \ + "push %%rcx\n" \ + "push %%rdx\n" \ + "push %%rdi\n" \ + "push %%rsi\n" \ + "push %%rbp\n" \ + "push %%r8\n" \ + "push %%r9\n" \ + "push %%r10\n" \ + "push %%r11\n" \ + "push %%r12\n" \ + "push %%r13\n" \ + "push %%r14\n" \ + "push %%r15\n" \ + "sub $8, %%rsp\n" + +// Mirrors SAVE_ALL +#define RESTORE_ALL \ + "add $8, %%rsp\n" \ + "pop %%r15\n" \ + "pop %%r14\n" \ + "pop %%r13\n" \ + "pop %%r12\n" \ + "pop %%r11\n" \ + "pop %%r10\n" \ + "pop %%r9\n" \ + "pop %%r8\n" \ + "pop %%rbp\n" \ + "pop %%rsi\n" \ + "pop %%rdi\n" \ + "pop %%rdx\n" \ + "pop %%rcx\n" \ + "pop %%rbx\n" \ + "pop %%rax\n" + +// Anonymous namespace covering everything but our library entry point +namespace { + +constexpr uint32_t BufSize = 10240; + +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + +uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; +#if defined(__APPLE__) +#define READ_SYSCALL 0x2000003 +#else +#define READ_SYSCALL 0 +#endif + __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(buf), "d"(count) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; +#if defined(__APPLE__) +#define WRITE_SYSCALL 0x2000004 +#else +#define WRITE_SYSCALL 1 +#endif + __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(buf), "d"(count) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, + uint64_t fd, uint64_t offset) { +#if defined(__APPLE__) +#define MMAP_SYSCALL 0x20000c5 +#else +#define MMAP_SYSCALL 9 +#endif + void *ret; + register uint64_t r8 asm("r8") = fd; + register uint64_t r9 asm("r9") = offset; + register uint64_t r10 asm("r10") = flags; + __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), + "r"(r9) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __munmap(void *addr, uint64_t size) { +#if defined(__APPLE__) +#define MUNMAP_SYSCALL 0x2000049 +#else +#define MUNMAP_SYSCALL 11 +#endif + uint64_t ret; + __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(size) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +#define SIG_BLOCK 0 +#define SIG_UNBLOCK 1 +#define SIG_SETMASK 2 + +static const uint64_t MaskAllSignals[] = {-1ULL}; + +uint64_t __sigprocmask(int how, const void *set, void *oldset) { +#if 
defined(__APPLE__) +#define SIGPROCMASK_SYSCALL 0x2000030 +#else +#define SIGPROCMASK_SYSCALL 14 +#endif + uint64_t ret; + register long r10 asm("r10") = sizeof(uint64_t); + __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(how), "S"(set), "d"(oldset), "r"(r10) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __exit(uint64_t code) { +#if defined(__APPLE__) +#define EXIT_SYSCALL 0x2000001 +#else +#define EXIT_SYSCALL 231 +#endif + uint64_t ret; + __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(code) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +// Helper functions for writing strings to the .fdata file. We intentionally +// avoid using libc names (lowercase memset) to make it clear it is our impl. + +/// Write number Num using Base to the buffer in OutBuf, returns a pointer to +/// the end of the string. +char *intToStr(char *OutBuf, uint64_t Num, uint32_t Base) { + const char *Chars = "0123456789abcdef"; + char Buf[21]; + char *Ptr = Buf; + while (Num) { + *Ptr++ = *(Chars + (Num % Base)); + Num /= Base; + } + if (Ptr == Buf) { + *OutBuf++ = '0'; + return OutBuf; + } + while (Ptr != Buf) { + *OutBuf++ = *--Ptr; + } + return OutBuf; +} + +/// Copy Str to OutBuf, returns a pointer to the end of the copied string +char *strCopy(char *OutBuf, const char *Str, int32_t Size = BufSize) { + while (*Str) { + *OutBuf++ = *Str++; + if (--Size <= 0) + return OutBuf; + } + return OutBuf; +} + +/// Compare two strings, at most Num bytes. +int strnCmp(const char *Str1, const char *Str2, size_t Num) { + while (Num && *Str1 && (*Str1 == *Str2)) { + Num--; + Str1++; + Str2++; + } + if (Num == 0) + return 0; + return *(unsigned char *)Str1 - *(unsigned char *)Str2; +} + +void memSet(char *Buf, char C, uint32_t Size) { + for (int I = 0; I < Size; ++I) + *Buf++ = C; +} + +void *memCpy(void *Dest, const void *Src, size_t Len) { + char *d = static_cast(Dest); + const char *s = static_cast(Src); + while (Len--) + *d++ = *s++; + return Dest; +} + +uint32_t strLen(const char *Str) { + uint32_t Size = 0; + while (*Str++) + ++Size; + return Size; +} + +void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) { + char Buf[BufSize]; + char *Ptr = Buf; + Ptr = strCopy(Ptr, Msg, BufSize - 23); + Ptr = intToStr(Ptr, Num, Base); + Ptr = strCopy(Ptr, "\n"); + __write(2, Buf, Ptr - Buf); +} + +void report(const char *Msg) { __write(2, Msg, strLen(Msg)); } + +unsigned long hexToLong(const char *str) { + unsigned long res = 0; + while (*str != '\0') { + res <<= 4; + if ('0' <= *str && *str <= '9') + res += *str++ - '0'; + else if ('a' <= *str && *str <= 'f') + res += *str++ - 'a' + 10; + else if ('A' <= *str && *str <= 'F') + res += *str++ - 'A' + 10; + else { + return 0; + } + } + return res; +} + +#if !defined(__APPLE__) +// We use a stack-allocated buffer for string manipulation in many pieces of +// this code, including the code that prints each line of the fdata file. This +// buffer needs to accomodate large function names, but shouldn't be arbitrarily +// large (dynamically allocated) for simplicity of our memory space usage. + +// Declare some syscall wrappers we use throughout this code to avoid linking +// against system libc. 
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { + uint64_t ret; + __asm__ __volatile__("movq $2, %%rax\n" + "syscall" + : "=a"(ret) + : "D"(pathname), "S"(flags), "d"(mode) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +struct dirent { + unsigned long d_ino; /* Inode number */ + unsigned long d_off; /* Offset to next linux_dirent */ + unsigned short d_reclen; /* Length of this linux_dirent */ + char d_name[]; /* Filename (null-terminated) */ + /* length is actually (d_reclen - 2 - + offsetof(struct linux_dirent, d_name)) */ +}; + +long __getdents(unsigned int fd, dirent *dirp, size_t count) { + long ret; + __asm__ __volatile__("movq $78, %%rax\n" + "syscall" + : "=a"(ret) + : "D"(fd), "S"(dirp), "d"(count) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { + uint64_t ret; + __asm__ __volatile__("movq $89, %%rax\n" + "syscall" + : "=a"(ret) + : "D"(pathname), "S"(buf), "d"(bufsize) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { + uint64_t ret; + __asm__ __volatile__("movq $8, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(pos), "d"(whence) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __close(uint64_t fd) { + uint64_t ret; + __asm__ __volatile__("movq $3, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __madvise(void *addr, size_t length, int advice) { + int ret; + __asm__ __volatile__("movq $28, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(length), "d"(advice) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +struct timespec { + uint64_t tv_sec; /* seconds */ + uint64_t tv_nsec; /* nanoseconds */ +}; + +uint64_t __nanosleep(const timespec *req, timespec *rem) { + uint64_t ret; + __asm__ __volatile__("movq $35, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(req), "S"(rem) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int64_t __fork() { + uint64_t ret; + __asm__ __volatile__("movq $57, %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __mprotect(void *addr, size_t len, int prot) { + int ret; + __asm__ __volatile__("movq $10, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(len), "d"(prot) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getpid() { + uint64_t ret; + __asm__ __volatile__("movq $39, %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getppid() { + uint64_t ret; + __asm__ __volatile__("movq $110, %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __setpgid(uint64_t pid, uint64_t pgid) { + int ret; + __asm__ __volatile__("movq $109, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid), "S"(pgid) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getpgid(uint64_t pid) { + uint64_t ret; + __asm__ __volatile__("movq $121, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __kill(uint64_t pid, int sig) { + int ret; + __asm__ __volatile__("movq $62, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid), "S"(sig) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +#endif + +void reportError(const char *Msg, uint64_t Size) { + __write(2, Msg, Size); + __exit(1); +} + +void assert(bool Assertion, const char *Msg) { + if (Assertion) + return; + char Buf[BufSize]; + char 
*Ptr = Buf;
+  Ptr = strCopy(Ptr, "Assertion failed: ");
+  Ptr = strCopy(Ptr, Msg, BufSize - 40);
+  Ptr = strCopy(Ptr, "\n");
+  reportError(Buf, Ptr - Buf);
+}
+
+/// 1B mutex accessed by lock xchg
+class Mutex {
+  volatile bool InUse{false};
+
+public:
+  bool acquire() {
+    bool Result = true;
+    asm volatile("lock; xchg %0, %1" : "+m"(InUse), "+r"(Result) : : "cc");
+    return !Result;
+  }
+  void release() { InUse = false; }
+};
+
+/// RAII wrapper for Mutex
+class Lock {
+  Mutex &M;
+  uint64_t SignalMask[1] = {};
+
+public:
+  Lock(Mutex &M) : M(M) {
+    __sigprocmask(SIG_BLOCK, MaskAllSignals, SignalMask);
+    while (!M.acquire()) {
+    }
+  }
+
+  ~Lock() {
+    M.release();
+    __sigprocmask(SIG_SETMASK, SignalMask, nullptr);
+  }
+};
+
+inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
+  return (Value + Align - 1) / Align * Align;
+}
+
+} // anonymous namespace
diff --git a/bolt/runtime/config.h.in b/bolt/runtime/config.h.in
new file mode 100644
index 000000000000..e52919dea83b
--- /dev/null
+++ b/bolt/runtime/config.h.in
@@ -0,0 +1 @@
+#cmakedefine HAVE_ELF_H
diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp
new file mode 100644
index 000000000000..aa3e1f78ccaf
--- /dev/null
+++ b/bolt/runtime/hugify.cpp
@@ -0,0 +1,127 @@
+//===-- hugify.cpp ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if !defined(__APPLE__)
+
+#include "common.h"
+#include <sys/mman.h>
+
+// Enables very verbose logging to stderr, useful when debugging
+//#define ENABLE_DEBUG
+
+// Function pointer to the init routine in the binary, so we can resume
+// regular execution of the function that we hooked.
+extern void (*__bolt_hugify_init_ptr)();
+
+// The __hot_start and __hot_end symbols are set by BOLT. We use them to
+// figure out the range to mark as huge pages.
+extern uint64_t __hot_start;
+extern uint64_t __hot_end;
+
+#ifdef MADV_HUGEPAGE
+/// Check whether the kernel supports THP via the corresponding sysfs entry.
+static bool has_pagecache_thp_support() {
+  char buf[256] = {0};
+  const char *madviseStr = "always [madvise] never";
+
+  int fd = __open("/sys/kernel/mm/transparent_hugepage/enabled",
+                  0 /* O_RDONLY */, 0);
+  if (fd < 0)
+    return false;
+
+  int64_t res = __read(fd, buf, 256);
+  if (res < 0)
+    return false;
+
+  int cmp = strnCmp(buf, madviseStr, strLen(madviseStr));
+  return cmp == 0;
+}
+
+static void hugify_for_old_kernel(uint8_t *from, uint8_t *to) {
+  size_t size = to - from;
+
+  uint8_t *mem = reinterpret_cast<uint8_t *>(
+      __mmap(0, size, 0x3 /* PROT_READ | PROT_WRITE */,
+             0x22 /* MAP_PRIVATE | MAP_ANONYMOUS */, -1, 0));
+
+  if (mem == (void *)MAP_FAILED) {
+    char msg[] = "Could not allocate memory for text move\n";
+    reportError(msg, sizeof(msg));
+  }
+#ifdef ENABLE_DEBUG
+  reportNumber("Allocated temporary space: ", (uint64_t)mem, 16);
+#endif
+
+  // Copy the hot code to a temporary location.
+  memCpy(mem, from, size);
+
+  // Map fresh anonymous pages over the existing hot code.
+  if (__mmap(reinterpret_cast<uint64_t>(from), size,
+             PROT_READ | PROT_WRITE | PROT_EXEC,
+             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1,
+             0) == (void *)MAP_FAILED) {
+    char msg[] = "failed to mmap memory for large page move, terminating\n";
+    reportError(msg, sizeof(msg));
+  }
+
+  // Mark the hot code page to be a huge page.
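+  // Note that MADV_HUGEPAGE is advisory: with THP set to "madvise", the
+  // kernel may collapse the range into 2MiB huge pages asynchronously and
+  // best-effort, so success below does not guarantee huge pages right away.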
+  if (__madvise(from, size, MADV_HUGEPAGE) == -1) {
+    char msg[] = "failed to allocate large page\n";
+    reportError(msg, sizeof(msg));
+  }
+
+  // Copy the hot code back.
+  memCpy(from, mem, size);
+
+  // Restore read and execute permissions, ignoring failure.
+  __mprotect(from, size, PROT_READ | PROT_EXEC);
+
+  __munmap(mem, size);
+}
+#endif
+
+extern "C" void __bolt_hugify_self_impl() {
+#ifdef MADV_HUGEPAGE
+  uint8_t *hotStart = (uint8_t *)&__hot_start;
+  uint8_t *hotEnd = (uint8_t *)&__hot_end;
+  // Make sure the start and end are aligned to huge page boundaries.
+  const size_t hugePageBytes = 2L * 1024 * 1024;
+  uint8_t *from = hotStart - ((intptr_t)hotStart & (hugePageBytes - 1));
+  uint8_t *to = hotEnd + (hugePageBytes - 1);
+  to -= (intptr_t)to & (hugePageBytes - 1);
+
+#ifdef ENABLE_DEBUG
+  reportNumber("[hugify] hot start: ", (uint64_t)hotStart, 16);
+  reportNumber("[hugify] hot end: ", (uint64_t)hotEnd, 16);
+  reportNumber("[hugify] aligned huge page from: ", (uint64_t)from, 16);
+  reportNumber("[hugify] aligned huge page to: ", (uint64_t)to, 16);
+#endif
+
+  if (!has_pagecache_thp_support()) {
+    hugify_for_old_kernel(from, to);
+    return;
+  }
+
+  if (__madvise(from, (to - from), MADV_HUGEPAGE) == -1) {
+    char msg[] = "failed to allocate large page\n";
+    // TODO: allow user to control the failure behavior.
+    reportError(msg, sizeof(msg));
+  }
+#endif
+}
+
+/// This hooks the ELF entry point; it needs to save all machine state.
+extern "C" __attribute((naked)) void __bolt_hugify_self() {
+  __asm__ __volatile__(SAVE_ALL
+                       "call __bolt_hugify_self_impl\n"
+                       RESTORE_ALL
+                       "jmp *__bolt_hugify_init_ptr(%%rip)\n"
+                       :::);
+}
+
+#endif
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
new file mode 100644
index 000000000000..160f9c1c66da
--- /dev/null
+++ b/bolt/runtime/instr.cpp
@@ -0,0 +1,1673 @@
+//===-- instr.cpp -----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// BOLT runtime instrumentation library for x86 Linux. Currently, BOLT does
+// not support linking modules with dependencies on one another into the final
+// binary (TODO?), which means this library has to be self-contained in a
+// single module.
+//
+// All extern declarations here need to be defined by BOLT itself. Those will
+// be undefined symbols that BOLT needs to resolve by emitting these symbols
+// with MCStreamer. Currently, Passes/Instrumentation.cpp is the pass
+// responsible for defining the symbols here, and these two files have a tight
+// coupling: one works statically when you run BOLT and the other at program
+// runtime when you run an instrumented binary. The main goal here is to
+// output an fdata file (BOLT profile) with the instrumentation counters
+// inserted by the static pass. Counters for indirect calls are an exception,
+// as we can't know them statically. These counters are created and managed
+// here. To allow this, we need a minimal framework for allocating memory
+// dynamically. We provide this with the BumpPtrAllocator class (not LLVM's,
+// but our own version of it).
+//
+// Since this code is intended to be inserted into any executable, we decided
+// to make it standalone and not depend on any external libraries (i.e.
+// language support libraries, such as glibc or stdc++). To allow this, we
+// provide a few light implementations of common OS interacting
+// functionalities using direct syscall wrappers. Our simple allocator doesn't
+// manage deallocations that fragment the memory space, so it's stack based.
+// This is the minimal framework provided here to allow processing
+// instrumented counters and writing fdata.
+//
+// In the C++ idiom used here, we never use or rely on constructors or
+// destructors for global objects. That's because those need support from the
+// linker in initialization/finalization code, and we want to keep our linker
+// very simple. Similarly, we don't create any global objects that are
+// zero-initialized, since those would need to go to .bss, which our simple
+// linker also doesn't support (TODO?).
+//
+//===----------------------------------------------------------------------===//
+
+#include "common.h"
+
+// Enables very verbose logging to stderr, useful when debugging
+//#define ENABLE_DEBUG
+
+#ifdef ENABLE_DEBUG
+#define DEBUG(X) \
+  { X; }
+#else
+#define DEBUG(X) \
+  {}
+#endif
+
+extern "C" {
+
+#if defined(__APPLE__)
+extern uint64_t *_bolt_instr_locations_getter();
+extern uint32_t _bolt_num_counters_getter();
+
+extern uint8_t *_bolt_instr_tables_getter();
+extern uint32_t _bolt_instr_num_funcs_getter();
+
+#else
+
+// Main counters inserted by instrumentation, incremented during runtime when
+// points of interest (locations) in the program are reached. Those are direct
+// calls and direct and indirect branches (local ones). There are also
+// counters for the execution of basic blocks that are spanning tree leaves
+// and need to be counted in order to infer the execution count of other edges
+// of the CFG.
+extern uint64_t __bolt_instr_locations[];
+extern uint32_t __bolt_num_counters;
+// Descriptions are serialized metadata about binary functions written by
+// BOLT, so we have a minimal understanding of the program structure. For a
+// reference on the exact format of this metadata, see *Description structs,
+// Location, InstrumentedNode and EntryNode.
+// Number of indirect call site descriptions
+extern uint32_t __bolt_instr_num_ind_calls;
+// Number of indirect call target descriptions
+extern uint32_t __bolt_instr_num_ind_targets;
+// Number of function descriptions
+extern uint32_t __bolt_instr_num_funcs;
+// Time to sleep across dumps (when we write the fdata profile to disk)
+extern uint32_t __bolt_instr_sleep_time;
+// Do not clear counters across dumps, rewrite file with the updated values
+extern bool __bolt_instr_no_counters_clear;
+// Wait until all forks of the instrumented process finish
+extern bool __bolt_instr_wait_forks;
+// Filename to dump data to
+extern char __bolt_instr_filename[];
+// Instrumented binary file path
+extern char __bolt_instr_binpath[];
+// If true, append current PID to the fdata filename when creating it so
+// different invocations of the same program can be differentiated.
+extern bool __bolt_instr_use_pid;
+// Functions that will be used to instrument indirect calls. The BOLT static
+// pass will identify indirect calls and modify them to load the address in
+// these trampolines and call this address instead. BOLT can't use direct
+// calls to our handlers because our addresses here are not known at analysis
+// time. We only support resolving dependencies from this file to the output
+// of BOLT, *not* the other way around.
+// TODO: We need better linking support to make that happen.
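+//
+// Conceptually (a sketch, not the exact code BOLT emits), an instrumented
+// indirect call site pushes the call site ID and the computed branch target
+// onto the stack and then does:
+//
+//   call *__bolt_ind_call_instr_pointer(%rip)
+//
+// so that instrumentIndirectCall() below can record the (call site, target)
+// pair before the original call proceeds.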
+extern void (*__bolt_ind_call_instr_pointer)();
+extern void (*__bolt_ind_tailcall_instr_pointer)();
+// Init/fini trampoline routines in the binary, so we can resume regular
+// execution of the functions that we hooked.
+extern void __bolt_start_trampoline();
+extern void __bolt_fini_trampoline();
+
+#endif
+
+}
+
+namespace {
+
+/// A simple allocator that mmaps a fixed size region and manages this space
+/// in a stack fashion, meaning you always deallocate the last element that
+/// was allocated. In practice, we don't need to deallocate individual
+/// elements. We monotonically increase our usage and then deallocate
+/// everything once we are done processing something.
+class BumpPtrAllocator {
+  /// This is written before each allocation and acts as a canary to detect
+  /// when a bug caused our program to cross allocation boundaries.
+  struct EntryMetadata {
+    uint64_t Magic;
+    uint64_t AllocSize;
+  };
+
+public:
+  void *allocate(size_t Size) {
+    Lock L(M);
+
+    if (StackBase == nullptr) {
+#if defined(__APPLE__)
+      int MAP_PRIVATE_MAP_ANONYMOUS = 0x1002;
+#else
+      int MAP_PRIVATE_MAP_ANONYMOUS = 0x22;
+#endif
+      StackBase = reinterpret_cast<uint8_t *>(
+          __mmap(0, MaxSize, 0x3 /* PROT_READ | PROT_WRITE */,
+                 Shared ? 0x21 /* MAP_SHARED | MAP_ANONYMOUS */
+                        : MAP_PRIVATE_MAP_ANONYMOUS,
+                 -1, 0));
+      StackSize = 0;
+    }
+
+    Size = alignTo(Size + sizeof(EntryMetadata), 16);
+    uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
+    auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
+    M->Magic = Magic;
+    M->AllocSize = Size;
+    StackSize += Size;
+    assert(StackSize < MaxSize, "allocator ran out of memory");
+    return AllocAddress;
+  }
+
+#ifdef DEBUG
+  /// Element-wise deallocation is only used for debugging to catch memory
+  /// bugs by checking magic bytes. Ordinarily, we reset the allocator once
+  /// we are done with it. Reset is done with clear(). There's no need
+  /// to deallocate each element individually.
+  void deallocate(void *Ptr) {
+    Lock L(M);
+    uint8_t MetadataOffset = sizeof(EntryMetadata);
+    auto *M = reinterpret_cast<EntryMetadata *>(
+        reinterpret_cast<uint8_t *>(Ptr) - MetadataOffset);
+    const uint8_t *StackTop = StackBase + StackSize + MetadataOffset;
+    // Validate size
+    if (Ptr != StackTop - M->AllocSize) {
+      // Failed validation, check if it is a pointer returned by operator new[]
+      MetadataOffset +=
+          sizeof(uint64_t); // Space for number of elements alloc'ed
+      M = reinterpret_cast<EntryMetadata *>(reinterpret_cast<uint8_t *>(Ptr) -
+                                            MetadataOffset);
+      // Ok, it failed both checks if this assertion fails. Stop the program,
+      // we have a memory bug.
+      assert(Ptr == StackTop - M->AllocSize,
+             "must deallocate the last element alloc'ed");
+    }
+    assert(M->Magic == Magic, "allocator magic is corrupt");
+    StackSize -= M->AllocSize;
+  }
+#else
+  void deallocate(void *) {}
+#endif
+
+  void clear() {
+    Lock L(M);
+    StackSize = 0;
+  }
+
+  /// Set mmap reservation size (only relevant before the first allocation)
+  void setMaxSize(uint64_t Size) { MaxSize = Size; }
+
+  /// Set mmap reservation privacy (only relevant before the first allocation)
+  void setShared(bool S) { Shared = S; }
+
+  void destroy() {
+    if (StackBase == nullptr)
+      return;
+    __munmap(StackBase, MaxSize);
+  }
+
+private:
+  static constexpr uint64_t Magic = 0x1122334455667788ull;
+  uint64_t MaxSize = 0xa00000;
+  uint8_t *StackBase{nullptr};
+  uint64_t StackSize{0};
+  bool Shared{false};
+  Mutex M;
+};
+
+/// Used for allocating indirect call instrumentation counters. Initialized by
+/// __bolt_instr_setup, our initialization routine.
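+/// Example (from __bolt_instr_setup below): objects are carved out of this
+/// allocator with the placement new operators defined next, e.g.
+///   GlobalWriteProfileMutex = new (GlobalAlloc, 0) Mutex();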
+BumpPtrAllocator GlobalAlloc;
+} // anonymous namespace
+
+// User-defined placement new operators. We only use those (as opposed to
+// overriding the regular operator new) so we can keep our allocator in the
+// stack instead of in a data section (global).
+void *operator new(size_t Sz, BumpPtrAllocator &A) { return A.allocate(Sz); }
+void *operator new(size_t Sz, BumpPtrAllocator &A, char C) {
+  auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
+  memSet(Ptr, C, Sz);
+  return Ptr;
+}
+void *operator new[](size_t Sz, BumpPtrAllocator &A) {
+  return A.allocate(Sz);
+}
+void *operator new[](size_t Sz, BumpPtrAllocator &A, char C) {
+  auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
+  memSet(Ptr, C, Sz);
+  return Ptr;
+}
+// Only called during exception unwinding (useless). We must manually dealloc.
+// C++ language weirdness
+void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); }
+
+namespace {
+
+/// Basic key-val atom stored in our hash
+struct SimpleHashTableEntryBase {
+  uint64_t Key;
+  uint64_t Val;
+};
+
+/// This hash table implementation starts by allocating a table of size
+/// InitialSize. When conflicts happen in this main table, it resolves
+/// them by chaining a new table of size IncSize. It never reallocs as our
+/// allocator doesn't support it. The keys are intended to be function
+/// pointers. There's no clever hash function (it's just x mod size, size
+/// being prime). I never tuned the coefficients in the modular equation
+/// (TODO). This is used for indirect calls (each call site has one of these,
+/// so it should have a small footprint) and for tallying call counts globally
+/// for each target to check if we missed the origin of some calls (this one
+/// is a large instantiation of this template, since it is global for all call
+/// sites).
+template <typename T = SimpleHashTableEntryBase, uint32_t InitialSize = 7,
+          uint32_t IncSize = 7>
+class SimpleHashTable {
+public:
+  using MapEntry = T;
+
+  /// Increment by 1 the value of \p Key. If it is not in this table, it will
+  /// be added to the table and its value set to 1.
+  void incrementVal(uint64_t Key, BumpPtrAllocator &Alloc) {
+    ++get(Key, Alloc).Val;
+  }
+
+  /// Basic member accessing interface. Here we pass the allocator explicitly
+  /// to avoid storing a pointer to it as part of this table (remember there
+  /// is one hash for each indirect call site, so we want to minimize our
+  /// footprint).
+  MapEntry &get(uint64_t Key, BumpPtrAllocator &Alloc) {
+    Lock L(M);
+    if (TableRoot)
+      return getEntry(TableRoot, Key, Key, Alloc, 0);
+    return firstAllocation(Key, Alloc);
+  }
+
+  /// Traverses all elements in the table
+  template <typename... Args>
+  void forEachElement(void (*Callback)(MapEntry &, Args...), Args... args) {
+    if (!TableRoot)
+      return;
+    return forEachElement(Callback, InitialSize, TableRoot, args...);
+  }
+
+  void resetCounters();
+
+private:
+  constexpr static uint64_t VacantMarker = 0;
+  constexpr static uint64_t FollowUpTableMarker = 0x8000000000000000ull;
+
+  MapEntry *TableRoot{nullptr};
+  Mutex M;
+
+  template <typename... Args>
+  void forEachElement(void (*Callback)(MapEntry &, Args...),
+                      uint32_t NumEntries, MapEntry *Entries, Args... args) {
+    for (uint32_t I = 0; I < NumEntries; ++I) {
+      MapEntry &Entry = Entries[I];
+      if (Entry.Key == VacantMarker)
+        continue;
+      if (Entry.Key & FollowUpTableMarker) {
+        forEachElement(Callback, IncSize,
+                       reinterpret_cast<MapEntry *>(Entry.Key &
+                                                    ~FollowUpTableMarker),
+                       args...);
+        continue;
+      }
+      Callback(Entry, args...);
+    }
+  }
+
+  MapEntry &firstAllocation(uint64_t Key, BumpPtrAllocator &Alloc) {
+    TableRoot = new (Alloc, 0) MapEntry[InitialSize];
+    MapEntry &Entry = TableRoot[Key % InitialSize];
+    Entry.Key = Key;
+    return Entry;
+  }
+
+  MapEntry &getEntry(MapEntry *Entries, uint64_t Key, uint64_t Selector,
+                     BumpPtrAllocator &Alloc, int CurLevel) {
+    const uint32_t NumEntries = CurLevel == 0 ? InitialSize : IncSize;
+    uint64_t Remainder = Selector / NumEntries;
+    Selector = Selector % NumEntries;
+    MapEntry &Entry = Entries[Selector];
+
+    // A hit
+    if (Entry.Key == Key) {
+      return Entry;
+    }
+
+    // Vacant - add new entry
+    if (Entry.Key == VacantMarker) {
+      Entry.Key = Key;
+      return Entry;
+    }
+
+    // Defer to the next level
+    if (Entry.Key & FollowUpTableMarker) {
+      return getEntry(
+          reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker),
+          Key, Remainder, Alloc, CurLevel + 1);
+    }
+
+    // Conflict - create the next level
+    MapEntry *NextLevelTbl = new (Alloc, 0) MapEntry[IncSize];
+    uint64_t CurEntrySelector = Entry.Key / InitialSize;
+    for (int I = 0; I < CurLevel; ++I)
+      CurEntrySelector /= IncSize;
+    CurEntrySelector = CurEntrySelector % IncSize;
+    NextLevelTbl[CurEntrySelector] = Entry;
+    Entry.Key = reinterpret_cast<uint64_t>(NextLevelTbl) | FollowUpTableMarker;
+    return getEntry(NextLevelTbl, Key, Remainder, Alloc, CurLevel + 1);
+  }
+};
+
+template <typename T> void resetIndCallCounter(T &Entry) {
+  Entry.Val = 0;
+}
+
+template <typename T, uint32_t InitialSize, uint32_t IncSize>
+void SimpleHashTable<T, InitialSize, IncSize>::resetCounters() {
+  Lock L(M);
+  forEachElement(resetIndCallCounter<T>);
+}
+
+/// Represents a hash table mapping a function target address to its counter.
+using IndirectCallHashTable = SimpleHashTable<>;
+
+/// Initialize with number 1 instead of 0 so we don't go into .bss. This is
+/// the global array of all hash tables storing indirect call destinations
+/// happening during runtime, one table per call site.
+IndirectCallHashTable *GlobalIndCallCounters{
+    reinterpret_cast<IndirectCallHashTable *>(1)};
+
+/// Don't allow reentrancy in the fdata writing phase - only one thread writes
+/// it
+Mutex *GlobalWriteProfileMutex{reinterpret_cast<Mutex *>(1)};
+
+/// Store the number of calls in addition to the target address (Key) and the
+/// frequency as perceived by the basic block counter (Val).
+struct CallFlowEntryBase : public SimpleHashTableEntryBase {
+  uint64_t Calls;
+};
+
+using CallFlowHashTableBase = SimpleHashTable<CallFlowEntryBase>;
+
+/// This is a large table indexing all possible call targets (indirect and
+/// direct ones). The goal is to find mismatches between the number of calls
+/// (for those calls we were able to track) and the entry basic block counter
+/// of the callee. In most cases, these two should be equal. If not, there are
+/// two possible scenarios here:
+///
+///  * Entry BB has higher frequency than all known calls to this function.
+///    In this case, we have dynamic library code or any uninstrumented code
+///    calling this function. We will write the profile for these untracked
+///    calls as having source "0 [unknown] 0" in the fdata file.
+///
+///  * Number of known calls is higher than the frequency of entry BB.
+///    This only happens when there is no counter for the entry BB, i.e. the
+///    callee function is not simple (in BOLT terms). We don't do anything
+///    special here and just ignore those (we still report all calls to the
+///    non-simple function, though).
+///
+class CallFlowHashTable : public CallFlowHashTableBase {
+public:
+  CallFlowHashTable(BumpPtrAllocator &Alloc) : Alloc(Alloc) {}
+
+  MapEntry &get(uint64_t Key) { return CallFlowHashTableBase::get(Key, Alloc); }
+
+private:
+  // Unlike the hash table for indirect call targets, we do store the
+  // allocator here since there is only one call flow hash and space overhead
+  // is negligible.
+  BumpPtrAllocator &Alloc;
+};
+
+///
+/// Description metadata emitted by BOLT to describe the program - refer to
+/// Passes/Instrumentation.cpp - Instrumentation::emitTablesAsELFNote()
+///
+struct Location {
+  uint32_t FunctionName;
+  uint32_t Offset;
+};
+
+struct CallDescription {
+  Location From;
+  uint32_t FromNode;
+  Location To;
+  uint32_t Counter;
+  uint64_t TargetAddress;
+};
+
+using IndCallDescription = Location;
+
+struct IndCallTargetDescription {
+  Location Loc;
+  uint64_t Address;
+};
+
+struct EdgeDescription {
+  Location From;
+  uint32_t FromNode;
+  Location To;
+  uint32_t ToNode;
+  uint32_t Counter;
+};
+
+struct InstrumentedNode {
+  uint32_t Node;
+  uint32_t Counter;
+};
+
+struct EntryNode {
+  uint64_t Node;
+  uint64_t Address;
+};
+
+struct FunctionDescription {
+  uint32_t NumLeafNodes;
+  const InstrumentedNode *LeafNodes;
+  uint32_t NumEdges;
+  const EdgeDescription *Edges;
+  uint32_t NumCalls;
+  const CallDescription *Calls;
+  uint32_t NumEntryNodes;
+  const EntryNode *EntryNodes;
+
+  /// Constructor will parse the serialized function metadata written by BOLT
+  FunctionDescription(const uint8_t *FuncDesc);
+
+  uint64_t getSize() const {
+    return 16 + NumLeafNodes * sizeof(InstrumentedNode) +
+           NumEdges * sizeof(EdgeDescription) +
+           NumCalls * sizeof(CallDescription) +
+           NumEntryNodes * sizeof(EntryNode);
+  }
+};
+
+/// The context is created when the fdata profile needs to be written to disk
+/// and we need to interpret our runtime counters. It contains pointers to the
+/// mmaped binary (only the BOLT written metadata section). Deserialization
+/// should be straightforward as most data is POD or an array of POD elements.
+/// This metadata is used to reconstruct function CFGs.
+struct ProfileWriterContext {
+  IndCallDescription *IndCallDescriptions;
+  IndCallTargetDescription *IndCallTargets;
+  uint8_t *FuncDescriptions;
+  char *Strings;  // String table with function names used in this binary
+  int FileDesc;   // File descriptor for the file on disk backing this
+                  // information in memory via mmap
+  void *MMapPtr;  // The mmap ptr
+  int MMapSize;   // The mmap size
+
+  /// Hash table storing all possible call destinations to detect untracked
+  /// calls and correctly report them as [unknown] in output fdata.
+  CallFlowHashTable *CallFlowTable;
+
+  /// Look up the sorted indirect call target vector to fetch function name
+  /// and offset for an arbitrary function pointer.
+  const IndCallTargetDescription *lookupIndCallTarget(uint64_t Target) const;
+};
+
+/// Performs a string comparison, returning zero if Str1 matches Str2.
+/// Compares at most Size characters.
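+/// Note: unlike strncmp, any mismatch yields 1, so no ordering is implied.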
+int compareStr(const char *Str1, const char *Str2, int Size) { + while (*Str1 == *Str2) { + if (*Str1 == '\0' || --Size == 0) + return 0; + ++Str1; + ++Str2; + } + return 1; +} + +/// Output Location to the fdata file +char *serializeLoc(const ProfileWriterContext &Ctx, char *OutBuf, + const Location Loc, uint32_t BufSize) { + // fdata location format: Type Name Offset + // Type 1 - regular symbol + OutBuf = strCopy(OutBuf, "1 "); + const char *Str = Ctx.Strings + Loc.FunctionName; + uint32_t Size = 25; + while (*Str) { + *OutBuf++ = *Str++; + if (++Size >= BufSize) + break; + } + assert(!*Str, "buffer overflow, function name too large"); + *OutBuf++ = ' '; + OutBuf = intToStr(OutBuf, Loc.Offset, 16); + *OutBuf++ = ' '; + return OutBuf; +} + +/// Read and deserialize a function description written by BOLT. \p FuncDesc +/// points at the beginning of the function metadata structure in the file. +/// See Instrumentation::emitTablesAsELFNote() +FunctionDescription::FunctionDescription(const uint8_t *FuncDesc) { + NumLeafNodes = *reinterpret_cast(FuncDesc); + DEBUG(reportNumber("NumLeafNodes = ", NumLeafNodes, 10)); + LeafNodes = reinterpret_cast(FuncDesc + 4); + + NumEdges = *reinterpret_cast( + FuncDesc + 4 + NumLeafNodes * sizeof(InstrumentedNode)); + DEBUG(reportNumber("NumEdges = ", NumEdges, 10)); + Edges = reinterpret_cast( + FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode)); + + NumCalls = *reinterpret_cast( + FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode) + + NumEdges * sizeof(EdgeDescription)); + DEBUG(reportNumber("NumCalls = ", NumCalls, 10)); + Calls = reinterpret_cast( + FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) + + NumEdges * sizeof(EdgeDescription)); + NumEntryNodes = *reinterpret_cast( + FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) + + NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription)); + DEBUG(reportNumber("NumEntryNodes = ", NumEntryNodes, 10)); + EntryNodes = reinterpret_cast( + FuncDesc + 16 + NumLeafNodes * sizeof(InstrumentedNode) + + NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription)); +} + +/// Read and mmap descriptions written by BOLT from the executable's notes +/// section +#if defined(HAVE_ELF_H) and !defined(__APPLE__) +#define BUF_SIZE 1024 +#define NAME_MAX 256 +#define ADDR_SIZE 32 + +void *__attribute__((noinline)) __get_pc() { + return __builtin_extract_return_addr(__builtin_return_address(0)); +} + +/// Get full path to the real binary by getting current virtual address +/// and searching for the appropriate link in address range in +/// /proc/self/map_files +static char *getBinaryPath() { + const char dirPath[] = "/proc/self/map_files/"; + static char target_path[NAME_MAX] = {}; + char buf[BUF_SIZE], start[ADDR_SIZE], end[ADDR_SIZE]; + char findBuf[NAME_MAX]; + char *strcp; + int lenS, lenE; + long nread; + unsigned long curAddr, saddr, eaddr; + struct dirent *d; + + if (__bolt_instr_binpath[0] != '\0') + return __bolt_instr_binpath; + + if (target_path[0] != '\0') + return target_path; + + curAddr = (unsigned long)__get_pc(); + uint64_t FDdir = __open(dirPath, + /*flags=*/0 /*O_RDONLY*/, + /*mode=*/0666); + assert(static_cast(FDdir) > 0, + "failed to open /proc/self/map_files"); + + for (;;) { + nread = __getdents(FDdir, (struct dirent *)buf, BUF_SIZE); + assert(static_cast(nread) != -1, "failed to get folder entries"); + if (nread == 0) + break; + for (long bpos = 0; bpos < nread; bpos += d->d_reclen) { + d = (struct dirent *)(buf + bpos); + strcp = d->d_name; + + lenS = 
0, lenE = 0;
+      while (*strcp != '-') {
+        if (*strcp == '\0') {
+          lenS = 0;
+          break;
+        }
+        ++strcp;
+        lenS++;
+      }
+      if (lenS == 0)
+        continue;
+      ++strcp;
+      while (*strcp != '\0') {
+        ++strcp;
+        lenE++;
+      }
+      assert(lenS + lenE < ADDR_SIZE, "file name too long");
+
+      strCopy(start, d->d_name, lenS);
+      strCopy(end, d->d_name + lenS + 1, lenE);
+      start[lenS] = '\0';
+      end[lenE] = '\0';
+      saddr = hexToLong(start);
+      eaddr = hexToLong(end);
+
+      if (curAddr >= saddr && curAddr <= eaddr) {
+        strCopy(findBuf, dirPath);
+        strCopy(findBuf + sizeof(dirPath) - 1, d->d_name);
+        findBuf[sizeof(dirPath) + lenS + lenE] = '\0';
+        lenS = __readlink(findBuf, target_path, sizeof(target_path));
+        assert(lenS != -1 && lenS != BUF_SIZE, "readlink error");
+        target_path[lenS] = '\0';
+        goto end;
+      }
+    }
+  }
+  return NULL;
+end:
+  return target_path;
+}
+
+ProfileWriterContext readDescriptions() {
+  ProfileWriterContext Result;
+  char *binPath = getBinaryPath();
+  assert(binPath[0] != '\0', "failed to find binary path");
+
+  uint64_t FD = __open(binPath,
+                       /*flags=*/0 /*O_RDONLY*/,
+                       /*mode=*/0666);
+  assert(static_cast<int64_t>(FD) > 0, "failed to open binary path");
+
+  Result.FileDesc = FD;
+
+  // mmap our binary to memory
+  uint64_t Size = __lseek(FD, 0, 2 /*SEEK_END*/);
+  uint8_t *BinContents = reinterpret_cast<uint8_t *>(
+      __mmap(0, Size, 0x1 /* PROT_READ */, 0x2 /* MAP_PRIVATE */, FD, 0));
+  Result.MMapPtr = BinContents;
+  Result.MMapSize = Size;
+  Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
+  Elf64_Shdr *Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff);
+  Elf64_Shdr *StringTblHeader = reinterpret_cast<Elf64_Shdr *>(
+      BinContents + Hdr->e_shoff + Hdr->e_shstrndx * Hdr->e_shentsize);
+
+  // Find .bolt.instr.tables with the data we need and set pointers to it
+  for (int I = 0; I < Hdr->e_shnum; ++I) {
+    char *SecName = reinterpret_cast<char *>(
+        BinContents + StringTblHeader->sh_offset + Shdr->sh_name);
+    if (compareStr(SecName, ".bolt.instr.tables", 64) != 0) {
+      Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff +
+                                            (I + 1) * Hdr->e_shentsize);
+      continue;
+    }
+    // Actual contents of the ELF note start after offset 20 decimal:
+    // Offset 0: Producer name size (4 bytes)
+    // Offset 4: Contents size (4 bytes)
+    // Offset 8: Note type (4 bytes)
+    // Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
+    // Offset 20: Contents
+    uint32_t IndCallDescSize =
+        *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 20);
+    uint32_t IndCallTargetDescSize = *reinterpret_cast<uint32_t *>(
+        BinContents + Shdr->sh_offset + 24 + IndCallDescSize);
+    uint32_t FuncDescSize =
+        *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 28 +
+                                      IndCallDescSize + IndCallTargetDescSize);
+    Result.IndCallDescriptions = reinterpret_cast<IndCallDescription *>(
+        BinContents + Shdr->sh_offset + 24);
+    Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
+        BinContents + Shdr->sh_offset + 28 + IndCallDescSize);
+    Result.FuncDescriptions = BinContents + Shdr->sh_offset + 32 +
+                              IndCallDescSize + IndCallTargetDescSize;
+    Result.Strings = reinterpret_cast<char *>(
+        BinContents + Shdr->sh_offset + 32 + IndCallDescSize +
+        IndCallTargetDescSize + FuncDescSize);
+    return Result;
+  }
+  const char ErrMsg[] =
+      "BOLT instrumentation runtime error: could not find section "
+      ".bolt.instr.tables\n";
+  reportError(ErrMsg, sizeof(ErrMsg));
+  return Result;
+}
+
+#else
+
+ProfileWriterContext readDescriptions() {
+  ProfileWriterContext Result;
+  uint8_t *Tables = _bolt_instr_tables_getter();
+  uint32_t IndCallDescSize = *reinterpret_cast<uint32_t *>(Tables);
+  uint32_t IndCallTargetDescSize = *reinterpret_cast<uint32_t *>(Tables + 4 +
IndCallDescSize); + uint32_t FuncDescSize = *reinterpret_cast( + Tables + 8 + IndCallDescSize + IndCallTargetDescSize); + Result.IndCallDescriptions = + reinterpret_cast(Tables + 4); + Result.IndCallTargets = reinterpret_cast( + Tables + 8 + IndCallDescSize); + Result.FuncDescriptions = + Tables + 12 + IndCallDescSize + IndCallTargetDescSize; + Result.Strings = reinterpret_cast( + Tables + 12 + IndCallDescSize + IndCallTargetDescSize + FuncDescSize); + return Result; +} + +#endif + +#if !defined(__APPLE__) +/// Debug by printing overall metadata global numbers to check it is sane +void printStats(const ProfileWriterContext &Ctx) { + char StatMsg[BufSize]; + char *StatPtr = StatMsg; + StatPtr = + strCopy(StatPtr, + "\nBOLT INSTRUMENTATION RUNTIME STATISTICS\n\nIndCallDescSize: "); + StatPtr = intToStr(StatPtr, + Ctx.FuncDescriptions - + reinterpret_cast(Ctx.IndCallDescriptions), + 10); + StatPtr = strCopy(StatPtr, "\nFuncDescSize: "); + StatPtr = intToStr( + StatPtr, + reinterpret_cast(Ctx.Strings) - Ctx.FuncDescriptions, 10); + StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_ind_calls: "); + StatPtr = intToStr(StatPtr, __bolt_instr_num_ind_calls, 10); + StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_funcs: "); + StatPtr = intToStr(StatPtr, __bolt_instr_num_funcs, 10); + StatPtr = strCopy(StatPtr, "\n"); + __write(2, StatMsg, StatPtr - StatMsg); +} +#endif + + +/// This is part of a simple CFG representation in memory, where we store +/// a dynamically sized array of input and output edges per node, and store +/// a dynamically sized array of nodes per graph. We also store the spanning +/// tree edges for that CFG in a separate array of nodes in +/// \p SpanningTreeNodes, while the regular nodes live in \p CFGNodes. +struct Edge { + uint32_t Node; // Index in nodes array regarding the destination of this edge + uint32_t ID; // Edge index in an array comprising all edges of the graph +}; + +/// A regular graph node or a spanning tree node +struct Node { + uint32_t NumInEdges{0}; // Input edge count used to size InEdge + uint32_t NumOutEdges{0}; // Output edge count used to size OutEdges + Edge *InEdges{nullptr}; // Created and managed by \p Graph + Edge *OutEdges{nullptr}; // ditto +}; + +/// Main class for CFG representation in memory. Manages object creation and +/// destruction, populates an array of CFG nodes as well as corresponding +/// spanning tree nodes. +struct Graph { + uint32_t NumNodes; + Node *CFGNodes; + Node *SpanningTreeNodes; + uint64_t *EdgeFreqs; + uint64_t *CallFreqs; + BumpPtrAllocator &Alloc; + const FunctionDescription &D; + + /// Reads a list of edges from function description \p D and builds + /// the graph from it. Allocates several internal dynamic structures that are + /// later destroyed by ~Graph() and uses \p Alloc. D.LeafNodes contain all + /// spanning tree leaf nodes descriptions (their counters). They are the seed + /// used to compute the rest of the missing edge counts in a bottom-up + /// traversal of the spanning tree. 
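+  ///
+  /// Worked example: if a node's instrumented outgoing edges carry counts
+  /// {5, 3} and its only instrumented incoming edge carries 6, then the node
+  /// frequency is 5 + 3 = 8 and the uninstrumented spanning-tree parent edge
+  /// must carry 8 - 6 = 2 for the flow through the node to balance.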
+ Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D, + const uint64_t *Counters, ProfileWriterContext &Ctx); + ~Graph(); + void dump() const; + +private: + void computeEdgeFrequencies(const uint64_t *Counters, + ProfileWriterContext &Ctx); + void dumpEdgeFreqs() const; +}; + +Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D, + const uint64_t *Counters, ProfileWriterContext &Ctx) + : Alloc(Alloc), D(D) { + DEBUG(reportNumber("G = 0x", (uint64_t)this, 16)); + // First pass to determine number of nodes + int32_t MaxNodes = -1; + CallFreqs = nullptr; + EdgeFreqs = nullptr; + for (int I = 0; I < D.NumEdges; ++I) { + if (static_cast(D.Edges[I].FromNode) > MaxNodes) + MaxNodes = D.Edges[I].FromNode; + if (static_cast(D.Edges[I].ToNode) > MaxNodes) + MaxNodes = D.Edges[I].ToNode; + } + + for (int I = 0; I < D.NumLeafNodes; ++I) { + if (static_cast(D.LeafNodes[I].Node) > MaxNodes) + MaxNodes = D.LeafNodes[I].Node; + } + for (int I = 0; I < D.NumCalls; ++I) { + if (static_cast(D.Calls[I].FromNode) > MaxNodes) + MaxNodes = D.Calls[I].FromNode; + } + // No nodes? Nothing to do + if (MaxNodes < 0) { + DEBUG(report("No nodes!\n")); + CFGNodes = nullptr; + SpanningTreeNodes = nullptr; + NumNodes = 0; + return; + } + ++MaxNodes; + DEBUG(reportNumber("NumNodes = ", MaxNodes, 10)); + NumNodes = static_cast(MaxNodes); + + // Initial allocations + CFGNodes = new (Alloc) Node[MaxNodes]; + + DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16)); + SpanningTreeNodes = new (Alloc) Node[MaxNodes]; + DEBUG(reportNumber("G->SpanningTreeNodes = 0x", + (uint64_t)SpanningTreeNodes, 16)); + + // Figure out how much to allocate to each vector (in/out edge sets) + for (int I = 0; I < D.NumEdges; ++I) { + CFGNodes[D.Edges[I].FromNode].NumOutEdges++; + CFGNodes[D.Edges[I].ToNode].NumInEdges++; + if (D.Edges[I].Counter != 0xffffffff) + continue; + + SpanningTreeNodes[D.Edges[I].FromNode].NumOutEdges++; + SpanningTreeNodes[D.Edges[I].ToNode].NumInEdges++; + } + + // Allocate in/out edge sets + for (int I = 0; I < MaxNodes; ++I) { + if (CFGNodes[I].NumInEdges > 0) + CFGNodes[I].InEdges = new (Alloc) Edge[CFGNodes[I].NumInEdges]; + if (CFGNodes[I].NumOutEdges > 0) + CFGNodes[I].OutEdges = new (Alloc) Edge[CFGNodes[I].NumOutEdges]; + if (SpanningTreeNodes[I].NumInEdges > 0) + SpanningTreeNodes[I].InEdges = + new (Alloc) Edge[SpanningTreeNodes[I].NumInEdges]; + if (SpanningTreeNodes[I].NumOutEdges > 0) + SpanningTreeNodes[I].OutEdges = + new (Alloc) Edge[SpanningTreeNodes[I].NumOutEdges]; + CFGNodes[I].NumInEdges = 0; + CFGNodes[I].NumOutEdges = 0; + SpanningTreeNodes[I].NumInEdges = 0; + SpanningTreeNodes[I].NumOutEdges = 0; + } + + // Fill in/out edge sets + for (int I = 0; I < D.NumEdges; ++I) { + const uint32_t Src = D.Edges[I].FromNode; + const uint32_t Dst = D.Edges[I].ToNode; + Edge *E = &CFGNodes[Src].OutEdges[CFGNodes[Src].NumOutEdges++]; + E->Node = Dst; + E->ID = I; + + E = &CFGNodes[Dst].InEdges[CFGNodes[Dst].NumInEdges++]; + E->Node = Src; + E->ID = I; + + if (D.Edges[I].Counter != 0xffffffff) + continue; + + E = &SpanningTreeNodes[Src] + .OutEdges[SpanningTreeNodes[Src].NumOutEdges++]; + E->Node = Dst; + E->ID = I; + + E = &SpanningTreeNodes[Dst] + .InEdges[SpanningTreeNodes[Dst].NumInEdges++]; + E->Node = Src; + E->ID = I; + } + + computeEdgeFrequencies(Counters, Ctx); +} + +Graph::~Graph() { + if (CallFreqs) + Alloc.deallocate(CallFreqs); + if (EdgeFreqs) + Alloc.deallocate(EdgeFreqs); + for (int I = NumNodes - 1; I >= 0; --I) { + if (SpanningTreeNodes[I].OutEdges) + 
Alloc.deallocate(SpanningTreeNodes[I].OutEdges); + if (SpanningTreeNodes[I].InEdges) + Alloc.deallocate(SpanningTreeNodes[I].InEdges); + if (CFGNodes[I].OutEdges) + Alloc.deallocate(CFGNodes[I].OutEdges); + if (CFGNodes[I].InEdges) + Alloc.deallocate(CFGNodes[I].InEdges); + } + if (SpanningTreeNodes) + Alloc.deallocate(SpanningTreeNodes); + if (CFGNodes) + Alloc.deallocate(CFGNodes); +} + +void Graph::dump() const { + reportNumber("Dumping graph with number of nodes: ", NumNodes, 10); + report(" Full graph:\n"); + for (int I = 0; I < NumNodes; ++I) { + const Node *N = &CFGNodes[I]; + reportNumber(" Node #", I, 10); + reportNumber(" InEdges total ", N->NumInEdges, 10); + for (int J = 0; J < N->NumInEdges; ++J) + reportNumber(" ", N->InEdges[J].Node, 10); + reportNumber(" OutEdges total ", N->NumOutEdges, 10); + for (int J = 0; J < N->NumOutEdges; ++J) + reportNumber(" ", N->OutEdges[J].Node, 10); + report("\n"); + } + report(" Spanning tree:\n"); + for (int I = 0; I < NumNodes; ++I) { + const Node *N = &SpanningTreeNodes[I]; + reportNumber(" Node #", I, 10); + reportNumber(" InEdges total ", N->NumInEdges, 10); + for (int J = 0; J < N->NumInEdges; ++J) + reportNumber(" ", N->InEdges[J].Node, 10); + reportNumber(" OutEdges total ", N->NumOutEdges, 10); + for (int J = 0; J < N->NumOutEdges; ++J) + reportNumber(" ", N->OutEdges[J].Node, 10); + report("\n"); + } +} + +void Graph::dumpEdgeFreqs() const { + reportNumber( + "Dumping edge frequencies for graph with num edges: ", D.NumEdges, 10); + for (int I = 0; I < D.NumEdges; ++I) { + reportNumber("* Src: ", D.Edges[I].FromNode, 10); + reportNumber(" Dst: ", D.Edges[I].ToNode, 10); + reportNumber(" Cnt: ", EdgeFreqs[I], 10); + } +} + +/// Auxiliary map structure for fast lookups of which calls map to each node of +/// the function CFG +struct NodeToCallsMap { + struct MapEntry { + uint32_t NumCalls; + uint32_t *Calls; + }; + MapEntry *Entries; + BumpPtrAllocator &Alloc; + const uint32_t NumNodes; + + NodeToCallsMap(BumpPtrAllocator &Alloc, const FunctionDescription &D, + uint32_t NumNodes) + : Alloc(Alloc), NumNodes(NumNodes) { + Entries = new (Alloc, 0) MapEntry[NumNodes]; + for (int I = 0; I < D.NumCalls; ++I) { + DEBUG(reportNumber("Registering call in node ", D.Calls[I].FromNode, 10)); + ++Entries[D.Calls[I].FromNode].NumCalls; + } + for (int I = 0; I < NumNodes; ++I) { + Entries[I].Calls = Entries[I].NumCalls ? new (Alloc) + uint32_t[Entries[I].NumCalls] + : nullptr; + Entries[I].NumCalls = 0; + } + for (int I = 0; I < D.NumCalls; ++I) { + MapEntry &Entry = Entries[D.Calls[I].FromNode]; + Entry.Calls[Entry.NumCalls++] = I; + } + } + + /// Set the frequency of all calls in node \p NodeID to Freq. However, if + /// the calls have their own counters and do not depend on the basic block + /// counter, this means they have landing pads and throw exceptions. In this + /// case, set their frequency with their counters and return the maximum + /// value observed in such counters. This will be used as the new frequency + /// at basic block entry. This is used to fix the CFG edge frequencies in the + /// presence of exceptions. 
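+  /// Example: given Freq = 100, a plain call in this node gets frequency
+  /// 100, while a call with a private counter reading 120 keeps 120, and
+  /// 120 is returned as the corrected frequency for the block entry.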
+ uint64_t visitAllCallsIn(uint32_t NodeID, uint64_t Freq, uint64_t *CallFreqs, + const FunctionDescription &D, + const uint64_t *Counters, + ProfileWriterContext &Ctx) const { + const MapEntry &Entry = Entries[NodeID]; + uint64_t MaxValue = 0ull; + for (int I = 0, E = Entry.NumCalls; I != E; ++I) { + const uint32_t CallID = Entry.Calls[I]; + DEBUG(reportNumber(" Setting freq for call ID: ", CallID, 10)); + const CallDescription &CallDesc = D.Calls[CallID]; + if (CallDesc.Counter == 0xffffffff) { + CallFreqs[CallID] = Freq; + DEBUG(reportNumber(" with : ", Freq, 10)); + } else { + const uint64_t CounterVal = Counters[CallDesc.Counter]; + CallFreqs[CallID] = CounterVal; + MaxValue = CounterVal > MaxValue ? CounterVal : MaxValue; + DEBUG(reportNumber(" with (private counter) : ", CounterVal, 10)); + } + DEBUG(reportNumber(" Address: 0x", CallDesc.TargetAddress, 16)); + if (CallFreqs[CallID] > 0) + Ctx.CallFlowTable->get(CallDesc.TargetAddress).Calls += + CallFreqs[CallID]; + } + return MaxValue; + } + + ~NodeToCallsMap() { + for (int I = NumNodes - 1; I >= 0; --I) { + if (Entries[I].Calls) + Alloc.deallocate(Entries[I].Calls); + } + Alloc.deallocate(Entries); + } +}; + +/// Fill an array with the frequency of each edge in the function represented +/// by G, as well as another array for each call. +void Graph::computeEdgeFrequencies(const uint64_t *Counters, + ProfileWriterContext &Ctx) { + if (NumNodes == 0) + return; + + EdgeFreqs = D.NumEdges ? new (Alloc, 0) uint64_t [D.NumEdges] : nullptr; + CallFreqs = D.NumCalls ? new (Alloc, 0) uint64_t [D.NumCalls] : nullptr; + + // Setup a lookup for calls present in each node (BB) + NodeToCallsMap *CallMap = new (Alloc) NodeToCallsMap(Alloc, D, NumNodes); + + // Perform a bottom-up, BFS traversal of the spanning tree in G. Edges in the + // spanning tree don't have explicit counters. We must infer their value using + // a linear combination of other counters (sum of counters of the outgoing + // edges minus sum of counters of the incoming edges). + uint32_t *Stack = new (Alloc) uint32_t [NumNodes]; + uint32_t StackTop = 0; + enum Status : uint8_t { S_NEW = 0, S_VISITING, S_VISITED }; + Status *Visited = new (Alloc, 0) Status[NumNodes]; + uint64_t *LeafFrequency = new (Alloc, 0) uint64_t[NumNodes]; + uint64_t *EntryAddress = new (Alloc, 0) uint64_t[NumNodes]; + + // Setup a fast lookup for frequency of leaf nodes, which have special + // basic block frequency instrumentation (they are not edge profiled). + for (int I = 0; I < D.NumLeafNodes; ++I) { + LeafFrequency[D.LeafNodes[I].Node] = Counters[D.LeafNodes[I].Counter]; + DEBUG({ + if (Counters[D.LeafNodes[I].Counter] > 0) { + reportNumber("Leaf Node# ", D.LeafNodes[I].Node, 10); + reportNumber(" Counter: ", Counters[D.LeafNodes[I].Counter], 10); + } + }); + } + for (int I = 0; I < D.NumEntryNodes; ++I) { + EntryAddress[D.EntryNodes[I].Node] = D.EntryNodes[I].Address; + DEBUG({ + reportNumber("Entry Node# ", D.EntryNodes[I].Node, 10); + reportNumber(" Address: ", D.EntryNodes[I].Address, 16); + }); + } + // Add all root nodes to the stack + for (int I = 0; I < NumNodes; ++I) { + if (SpanningTreeNodes[I].NumInEdges == 0) + Stack[StackTop++] = I; + } + // Empty stack? 
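+  // (No spanning-tree roots likely means malformed metadata: a valid tree
+  // always has a node without incoming edges. Give up on this function
+  // rather than traverse a broken graph.)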
+ if (StackTop == 0) { + DEBUG(report("Empty stack!\n")); + Alloc.deallocate(EntryAddress); + Alloc.deallocate(LeafFrequency); + Alloc.deallocate(Visited); + Alloc.deallocate(Stack); + CallMap->~NodeToCallsMap(); + Alloc.deallocate(CallMap); + if (CallFreqs) + Alloc.deallocate(CallFreqs); + if (EdgeFreqs) + Alloc.deallocate(EdgeFreqs); + EdgeFreqs = nullptr; + CallFreqs = nullptr; + return; + } + // Add all known edge counts, will infer the rest + for (int I = 0; I < D.NumEdges; ++I) { + const uint32_t C = D.Edges[I].Counter; + if (C == 0xffffffff) // inferred counter - we will compute its value + continue; + EdgeFreqs[I] = Counters[C]; + } + + while (StackTop > 0) { + const uint32_t Cur = Stack[--StackTop]; + DEBUG({ + if (Visited[Cur] == S_VISITING) + report("(visiting) "); + else + report("(new) "); + reportNumber("Cur: ", Cur, 10); + }); + + // This shouldn't happen in a tree + assert(Visited[Cur] != S_VISITED, "should not have visited nodes in stack"); + if (Visited[Cur] == S_NEW) { + Visited[Cur] = S_VISITING; + Stack[StackTop++] = Cur; + assert(StackTop <= NumNodes, "stack grew too large"); + for (int I = 0, E = SpanningTreeNodes[Cur].NumOutEdges; I < E; ++I) { + const uint32_t Succ = SpanningTreeNodes[Cur].OutEdges[I].Node; + Stack[StackTop++] = Succ; + assert(StackTop <= NumNodes, "stack grew too large"); + } + continue; + } + Visited[Cur] = S_VISITED; + + // Establish our node frequency based on outgoing edges, which should all be + // resolved by now. + int64_t CurNodeFreq = LeafFrequency[Cur]; + // Not a leaf? + if (!CurNodeFreq) { + for (int I = 0, E = CFGNodes[Cur].NumOutEdges; I != E; ++I) { + const uint32_t SuccEdge = CFGNodes[Cur].OutEdges[I].ID; + CurNodeFreq += EdgeFreqs[SuccEdge]; + } + } + if (CurNodeFreq < 0) + CurNodeFreq = 0; + + const uint64_t CallFreq = CallMap->visitAllCallsIn( + Cur, CurNodeFreq > 0 ? CurNodeFreq : 0, CallFreqs, D, Counters, Ctx); + + // Exception handling affected our output flow? Fix with calls info + DEBUG({ + if (CallFreq > CurNodeFreq) + report("Bumping node frequency with call info\n"); + }); + CurNodeFreq = CallFreq > CurNodeFreq ? CallFreq : CurNodeFreq; + + if (CurNodeFreq > 0) { + if (uint64_t Addr = EntryAddress[Cur]) { + DEBUG( + reportNumber(" Setting flow at entry point address 0x", Addr, 16)); + DEBUG(reportNumber(" with: ", CurNodeFreq, 10)); + Ctx.CallFlowTable->get(Addr).Val = CurNodeFreq; + } + } + + // No parent? Reached a tree root, limit to call frequency updating. + if (SpanningTreeNodes[Cur].NumInEdges == 0) { + continue; + } + + assert(SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent"); + const uint32_t Parent = SpanningTreeNodes[Cur].InEdges[0].Node; + const uint32_t ParentEdge = SpanningTreeNodes[Cur].InEdges[0].ID; + + // Calculate parent edge freq. + int64_t ParentEdgeFreq = CurNodeFreq; + for (int I = 0, E = CFGNodes[Cur].NumInEdges; I != E; ++I) { + const uint32_t PredEdge = CFGNodes[Cur].InEdges[I].ID; + ParentEdgeFreq -= EdgeFreqs[PredEdge]; + } + + // Sometimes the conservative CFG that BOLT builds will lead to incorrect + // flow computation. For example, in a BB that transitively calls the exit + // syscall, BOLT will add a fall-through successor even though it should not + // have any successors. So this block execution will likely be wrong. We + // tolerate this imperfection since this case should be quite infrequent. 
+    if (ParentEdgeFreq < 0) {
+      DEBUG(dumpEdgeFreqs());
+      DEBUG(report("WARNING: incorrect flow"));
+      ParentEdgeFreq = 0;
+    }
+    DEBUG(reportNumber("  Setting freq for ParentEdge: ", ParentEdge, 10));
+    DEBUG(reportNumber("  with ParentEdgeFreq: ", ParentEdgeFreq, 10));
+    EdgeFreqs[ParentEdge] = ParentEdgeFreq;
+  }
+
+  Alloc.deallocate(EntryAddress);
+  Alloc.deallocate(LeafFrequency);
+  Alloc.deallocate(Visited);
+  Alloc.deallocate(Stack);
+  CallMap->~NodeToCallsMap();
+  Alloc.deallocate(CallMap);
+  DEBUG(dumpEdgeFreqs());
+}
+
+/// Write to \p FD all of the edge profiles for function \p FuncDesc. Uses
+/// \p Alloc to allocate helper dynamic structures used to compute the profile
+/// for edges that we do not explicitly instrument.
+const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
+                                    const uint8_t *FuncDesc,
+                                    BumpPtrAllocator &Alloc) {
+  const FunctionDescription F(FuncDesc);
+  const uint8_t *next = FuncDesc + F.getSize();
+
+#if !defined(__APPLE__)
+  uint64_t *bolt_instr_locations = __bolt_instr_locations;
+#else
+  uint64_t *bolt_instr_locations = _bolt_instr_locations_getter();
+#endif
+
+  // Skip funcs we know are cold
+#ifndef ENABLE_DEBUG
+  uint64_t CountersFreq = 0;
+  for (int I = 0; I < F.NumLeafNodes; ++I) {
+    CountersFreq += bolt_instr_locations[F.LeafNodes[I].Counter];
+  }
+  if (CountersFreq == 0) {
+    for (int I = 0; I < F.NumEdges; ++I) {
+      const uint32_t C = F.Edges[I].Counter;
+      if (C == 0xffffffff)
+        continue;
+      CountersFreq += bolt_instr_locations[C];
+    }
+    if (CountersFreq == 0) {
+      for (int I = 0; I < F.NumCalls; ++I) {
+        const uint32_t C = F.Calls[I].Counter;
+        if (C == 0xffffffff)
+          continue;
+        CountersFreq += bolt_instr_locations[C];
+      }
+      if (CountersFreq == 0)
+        return next;
+    }
+  }
+#endif
+
+  Graph *G = new (Alloc) Graph(Alloc, F, bolt_instr_locations, Ctx);
+  DEBUG(G->dump());
+
+  if (!G->EdgeFreqs && !G->CallFreqs) {
+    G->~Graph();
+    Alloc.deallocate(G);
+    return next;
+  }
+
+  for (int I = 0; I < F.NumEdges; ++I) {
+    const uint64_t Freq = G->EdgeFreqs[I];
+    if (Freq == 0)
+      continue;
+    const EdgeDescription *Desc = &F.Edges[I];
+    char LineBuf[BufSize];
+    char *Ptr = LineBuf;
+    Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
+    Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
+    Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 22);
+    Ptr = intToStr(Ptr, Freq, 10);
+    *Ptr++ = '\n';
+    __write(FD, LineBuf, Ptr - LineBuf);
+  }
+
+  for (int I = 0; I < F.NumCalls; ++I) {
+    const uint64_t Freq = G->CallFreqs[I];
+    if (Freq == 0)
+      continue;
+    char LineBuf[BufSize];
+    char *Ptr = LineBuf;
+    const CallDescription *Desc = &F.Calls[I];
+    Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
+    Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
+    Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
+    Ptr = intToStr(Ptr, Freq, 10);
+    *Ptr++ = '\n';
+    __write(FD, LineBuf, Ptr - LineBuf);
+  }
+
+  G->~Graph();
+  Alloc.deallocate(G);
+  return next;
+}
+
+#if !defined(__APPLE__)
+const IndCallTargetDescription *
+ProfileWriterContext::lookupIndCallTarget(uint64_t Target) const {
+  uint32_t B = 0;
+  uint32_t E = __bolt_instr_num_ind_targets;
+  if (E == 0)
+    return nullptr;
+  do {
+    uint32_t I = (E - B) / 2 + B;
+    if (IndCallTargets[I].Address == Target)
+      return &IndCallTargets[I];
+    if (IndCallTargets[I].Address < Target)
+      B = I + 1;
+    else
+      E = I;
+  } while (B < E);
+  return nullptr;
+}
+
+/// Write a single indirect call pair to the fdata file
+void visitIndCallCounter(IndirectCallHashTable::MapEntry &Entry,
+                         int FD, int CallsiteID,
+                         ProfileWriterContext *Ctx) {
+  if (Entry.Val == 0)
+    return;
+  DEBUG(reportNumber("Target func 0x", Entry.Key, 16));
+  DEBUG(reportNumber("Target freq: ", Entry.Val, 10));
+  const IndCallDescription *CallsiteDesc =
+      &Ctx->IndCallDescriptions[CallsiteID];
+  const IndCallTargetDescription *TargetDesc =
+      Ctx->lookupIndCallTarget(Entry.Key);
+  if (!TargetDesc) {
+    DEBUG(report("Failed to look up indirect call target\n"));
+    char LineBuf[BufSize];
+    char *Ptr = LineBuf;
+    Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
+    Ptr = strCopy(Ptr, "0 [unknown] 0 0 ", BufSize - (Ptr - LineBuf) - 40);
+    Ptr = intToStr(Ptr, Entry.Val, 10);
+    *Ptr++ = '\n';
+    __write(FD, LineBuf, Ptr - LineBuf);
+    return;
+  }
+  Ctx->CallFlowTable->get(TargetDesc->Address).Calls += Entry.Val;
+  char LineBuf[BufSize];
+  char *Ptr = LineBuf;
+  Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
+  Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
+  Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
+  Ptr = intToStr(Ptr, Entry.Val, 10);
+  *Ptr++ = '\n';
+  __write(FD, LineBuf, Ptr - LineBuf);
+}
+
+/// Write to \p FD all of the indirect call profiles.
+void writeIndirectCallProfile(int FD, ProfileWriterContext &Ctx) {
+  for (int I = 0; I < __bolt_instr_num_ind_calls; ++I) {
+    DEBUG(reportNumber("IndCallsite #", I, 10));
+    GlobalIndCallCounters[I].forEachElement(visitIndCallCounter, FD, I, &Ctx);
+  }
+}
+
+/// Check a single call flow for a callee versus all known callers. If there
+/// are fewer callers than the callee expects, write the difference with
+/// source [unknown] in the profile.
+void visitCallFlowEntry(CallFlowHashTable::MapEntry &Entry, int FD,
+                        ProfileWriterContext *Ctx) {
+  DEBUG(reportNumber("Call flow entry address: 0x", Entry.Key, 16));
+  DEBUG(reportNumber("Calls: ", Entry.Calls, 10));
+  DEBUG(reportNumber("Reported entry frequency: ", Entry.Val, 10));
+  DEBUG({
+    if (Entry.Calls > Entry.Val)
+      report("  More calls than expected!\n");
+  });
+  if (Entry.Val <= Entry.Calls)
+    return;
+  DEBUG(reportNumber(
+      "  Balancing calls with traffic: ", Entry.Val - Entry.Calls, 10));
+  const IndCallTargetDescription *TargetDesc =
+      Ctx->lookupIndCallTarget(Entry.Key);
+  if (!TargetDesc) {
+    // There is probably something wrong with this callee and this should be
+    // investigated, but I don't want to assert and lose all data collected.
+    DEBUG(report("WARNING: failed to look up call target!\n"));
+    return;
+  }
+  char LineBuf[BufSize];
+  char *Ptr = LineBuf;
+  Ptr = strCopy(Ptr, "0 [unknown] 0 ", BufSize);
+  Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
+  Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
+  Ptr = intToStr(Ptr, Entry.Val - Entry.Calls, 10);
+  *Ptr++ = '\n';
+  __write(FD, LineBuf, Ptr - LineBuf);
+}
+
+/// Open the fdata file for writing and return a valid file descriptor,
+/// aborting the program upon failure.
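+/// E.g., with __bolt_instr_use_pid set and a (hypothetical) base name "prof",
+/// a process with PID 1234 ends up writing to "prof.1234.fdata".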
+int openProfile() {
+  // Build the profile name string by appending our PID
+  char Buf[BufSize];
+  char *Ptr = Buf;
+  uint64_t PID = __getpid();
+  Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
+  if (__bolt_instr_use_pid) {
+    Ptr = strCopy(Ptr, ".", BufSize - (Ptr - Buf + 1));
+    Ptr = intToStr(Ptr, PID, 10);
+    Ptr = strCopy(Ptr, ".fdata", BufSize - (Ptr - Buf + 1));
+  }
+  *Ptr++ = '\0';
+  uint64_t FD = __open(Buf,
+                       /*flags=*/0x241 /*O_WRONLY|O_TRUNC|O_CREAT*/,
+                       /*mode=*/0666);
+  if (static_cast<int64_t>(FD) < 0) {
+    report("Error while trying to open profile file for writing: ");
+    report(Buf);
+    reportNumber("\nFailed with error number: 0x",
+                 0 - static_cast<int64_t>(FD), 16);
+    __exit(1);
+  }
+  return FD;
+}
+
+#endif
+
+} // anonymous namespace
+
+#if !defined(__APPLE__)
+
+/// Reset all counters in case you want to start profiling a new phase of your
+/// program independently of prior phases.
+/// The address of this function is printed by BOLT and this can be called by
+/// any attached debugger during runtime. There is a useful oneliner for gdb:
+///
+///   gdb -p $(pgrep -xo PROCESSNAME) -ex 'p ((void(*)())0xdeadbeef)()' \
+///     -ex 'set confirm off' -ex quit
+///
+/// Where 0xdeadbeef is this function address and PROCESSNAME your binary file
+/// name.
+extern "C" void __bolt_instr_clear_counters() {
+  memSet(reinterpret_cast<char *>(__bolt_instr_locations), 0,
+         __bolt_num_counters * 8);
+  for (int I = 0; I < __bolt_instr_num_ind_calls; ++I) {
+    GlobalIndCallCounters[I].resetCounters();
+  }
+}
+
+/// This is the entry point for profile writing.
+/// There are three ways of getting here:
+///
+///  * Program execution ended, finalization methods are running and BOLT
+///    hooked into FINI from your binary dynamic section;
+///  * You used the sleep timer option and during initialization we forked
+///    a separate process that will call this function periodically;
+///  * BOLT prints this function address so you can attach a debugger and
+///    call this function directly to get your profile written to disk
+///    on demand.
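+/// (The gdb one-liner shown above for __bolt_instr_clear_counters works here
+/// as well, substituting this function's address.)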
+///
+extern "C" void __attribute((force_align_arg_pointer))
+__bolt_instr_data_dump() {
+  // Already dumping
+  if (!GlobalWriteProfileMutex->acquire())
+    return;
+
+  BumpPtrAllocator HashAlloc;
+  HashAlloc.setMaxSize(0x6400000);
+  ProfileWriterContext Ctx = readDescriptions();
+  Ctx.CallFlowTable = new (HashAlloc, 0) CallFlowHashTable(HashAlloc);
+
+  DEBUG(printStats(Ctx));
+
+  int FD = openProfile();
+
+  BumpPtrAllocator Alloc;
+  const uint8_t *FuncDesc = Ctx.FuncDescriptions;
+  for (int I = 0, E = __bolt_instr_num_funcs; I < E; ++I) {
+    FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
+    Alloc.clear();
+    DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
+  }
+  assert(FuncDesc == (void *)Ctx.Strings,
+         "FuncDesc ptr must be equal to stringtable");
+
+  writeIndirectCallProfile(FD, Ctx);
+  Ctx.CallFlowTable->forEachElement(visitCallFlowEntry, FD, &Ctx);
+
+  __close(FD);
+  __munmap(Ctx.MMapPtr, Ctx.MMapSize);
+  __close(Ctx.FileDesc);
+  HashAlloc.destroy();
+  GlobalWriteProfileMutex->release();
+  DEBUG(report("Finished writing profile.\n"));
+}
+
+/// Event loop for our child process spawned during setup to dump profile data
+/// at user-specified intervals
+void watchProcess() {
+  timespec ts, rem;
+  uint64_t Elapsed = 0ull;
+  uint64_t ppid;
+  if (__bolt_instr_wait_forks) {
+    // Store the parent pgid
+    ppid = -__getpgid(0);
+    // And leave the parent process group
+    __setpgid(0, 0);
+  } else {
+    // Store the parent pid
+    ppid = __getppid();
+    if (ppid == 1) {
+      // Parent is already dead
+      goto out;
+    }
+  }
+
+  ts.tv_sec = 1;
+  ts.tv_nsec = 0;
+  while (1) {
+    __nanosleep(&ts, &rem);
+    // This means our parent process or all its forks are dead,
+    // so there is no need for us to keep dumping.
+    if (__kill(ppid, 0) < 0) {
+      if (__bolt_instr_no_counters_clear)
+        __bolt_instr_data_dump();
+      break;
+    }
+
+    if (++Elapsed < __bolt_instr_sleep_time)
+      continue;
+
+    Elapsed = 0;
+    __bolt_instr_data_dump();
+    if (__bolt_instr_no_counters_clear == false)
+      __bolt_instr_clear_counters();
+  }
+
+out:;
+  DEBUG(report("My parent process is dead, bye!\n"));
+  __exit(0);
+}
+
+extern "C" void __bolt_instr_indirect_call();
+extern "C" void __bolt_instr_indirect_tailcall();
+
+/// Initialization code
+extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() {
+  const uint64_t CountersStart =
+      reinterpret_cast<uint64_t>(&__bolt_instr_locations[0]);
+  const uint64_t CountersEnd = alignTo(
+      reinterpret_cast<uint64_t>(&__bolt_instr_locations[__bolt_num_counters]),
+      0x1000);
+  DEBUG(reportNumber("replace mmap start: ", CountersStart, 16));
+  DEBUG(reportNumber("replace mmap stop: ", CountersEnd, 16));
+  assert(CountersEnd > CountersStart, "no counters");
+  // Map our counters to be shared instead of private, so we keep counting for
+  // forked processes
+  __mmap(CountersStart, CountersEnd - CountersStart,
+         0x3 /*PROT_READ|PROT_WRITE*/,
+         0x31 /*MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED*/, -1, 0);
+
+  __bolt_ind_call_instr_pointer = __bolt_instr_indirect_call;
+  __bolt_ind_tailcall_instr_pointer = __bolt_instr_indirect_tailcall;
+  // Conservatively reserve 100MiB of shared pages
+  GlobalAlloc.setMaxSize(0x6400000);
+  GlobalAlloc.setShared(true);
+  GlobalWriteProfileMutex = new (GlobalAlloc, 0) Mutex();
+  if (__bolt_instr_num_ind_calls > 0)
+    GlobalIndCallCounters =
+        new (GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls];
+
+  if (__bolt_instr_sleep_time != 0) {
+    // Move the instrumented process to its own process group
+    if (__bolt_instr_wait_forks)
+      __setpgid(0, 0);
+
+    if (long PID = __fork())
+      return;
+    watchProcess();
+  }
+}
+
+extern "C" __attribute((force_align_arg_pointer)) void
+instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) {
+  GlobalIndCallCounters[IndCallID].incrementVal(Target, GlobalAlloc);
+}
+
+/// We receive as in-stack arguments the identifier of the indirect call site
+/// as well as the target address for the call
+extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
+{
+  __asm__ __volatile__(SAVE_ALL
+                       "mov 0xa0(%%rsp), %%rdi\n"
+                       "mov 0x98(%%rsp), %%rsi\n"
+                       "call instrumentIndirectCall\n"
+                       RESTORE_ALL
+                       "ret\n"
+                       :::);
+}
+
+extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
+{
+  __asm__ __volatile__(SAVE_ALL
+                       "mov 0x98(%%rsp), %%rdi\n"
+                       "mov 0x90(%%rsp), %%rsi\n"
+                       "call instrumentIndirectCall\n"
+                       RESTORE_ALL
+                       "ret\n"
+                       :::);
+}
+
+/// This hooks the ELF entry point; it needs to save all machine state.
+extern "C" __attribute((naked)) void __bolt_instr_start()
+{
+  __asm__ __volatile__(SAVE_ALL
+                       "call __bolt_instr_setup\n"
+                       RESTORE_ALL
+                       "jmp __bolt_start_trampoline\n"
+                       :::);
+}
+
+/// This hooks into ELF's DT_FINI
+extern "C" void __bolt_instr_fini() {
+  __bolt_fini_trampoline();
+  if (__bolt_instr_sleep_time == 0)
+    __bolt_instr_data_dump();
+  DEBUG(report("Finished.\n"));
+}
+
+#endif
+
+#if defined(__APPLE__)
+
+extern "C" void __bolt_instr_data_dump() {
+  ProfileWriterContext Ctx = readDescriptions();
+
+  int FD = 2;
+  BumpPtrAllocator Alloc;
+  const uint8_t *FuncDesc = Ctx.FuncDescriptions;
+  uint32_t bolt_instr_num_funcs = _bolt_instr_num_funcs_getter();
+
+  for (int I = 0, E = bolt_instr_num_funcs; I < E; ++I) {
+    FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
+    Alloc.clear();
+    DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
+  }
+  assert(FuncDesc == (void *)Ctx.Strings,
+         "FuncDesc ptr must be equal to stringtable");
+}
+
+// On OSX/iOS the final symbol name of an extern "C" function/variable contains
+// one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
+extern "C"
+__attribute__((section("__TEXT,__setup")))
+__attribute__((force_align_arg_pointer))
+void _bolt_instr_setup() {
+  __asm__ __volatile__(SAVE_ALL :::);
+
+  report("Hello!\n");
+
+  __asm__ __volatile__(RESTORE_ALL :::);
+}
+
+extern "C"
+__attribute__((section("__TEXT,__fini")))
+__attribute__((force_align_arg_pointer))
+void _bolt_instr_fini() {
+  report("Bye!\n");
+  __bolt_instr_data_dump();
+}
+
+#endif
diff --git a/bolt/src/BinaryBasicBlock.cpp b/bolt/src/BinaryBasicBlock.cpp
new file mode 100644
index 000000000000..7e04279cd9a6
--- /dev/null
+++ b/bolt/src/BinaryBasicBlock.cpp
@@ -0,0 +1,623 @@
+//===--- BinaryBasicBlock.cpp - Interface for assembly-level basic block --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryBasicBlock.h"
+#include "BinaryContext.h"
+#include "BinaryFunction.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Errc.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+namespace llvm {
+namespace bolt {
+
+constexpr uint32_t BinaryBasicBlock::INVALID_OFFSET;
+
+bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) {
+  return LHS.Index < RHS.Index;
+}
+
+bool BinaryBasicBlock::hasCFG() const {
+  return getParent()->hasCFG();
+}
+
+bool BinaryBasicBlock::isEntryPoint() const {
+  return getParent()->isEntryPoint(*this);
+}
+
+bool BinaryBasicBlock::hasInstructions() const {
+  return getParent()->hasInstructions();
+}
+
+void BinaryBasicBlock::adjustNumPseudos(const MCInst &Inst, int Sign) {
+  BinaryContext &BC = Function->getBinaryContext();
+  if (BC.MIB->isPseudo(Inst))
+    NumPseudos += Sign;
+}
+
+BinaryBasicBlock::iterator BinaryBasicBlock::getFirstNonPseudo() {
+  const BinaryContext &BC = Function->getBinaryContext();
+  for (auto II = Instructions.begin(), E = Instructions.end(); II != E; ++II) {
+    if (!BC.MIB->isPseudo(*II))
+      return II;
+  }
+  return end();
+}
+
+BinaryBasicBlock::reverse_iterator BinaryBasicBlock::getLastNonPseudo() {
+  const BinaryContext &BC = Function->getBinaryContext();
+  for (auto RII = Instructions.rbegin(), E = Instructions.rend();
+       RII != E; ++RII) {
+    if (!BC.MIB->isPseudo(*RII))
+      return RII;
+  }
+  return rend();
+}
+
+bool BinaryBasicBlock::validateSuccessorInvariants() {
+  const MCInst *Inst = getLastNonPseudoInstr();
+  const JumpTable *JT = Inst ? Function->getJumpTable(*Inst) : nullptr;
+  BinaryContext &BC = Function->getBinaryContext();
+  bool Valid = true;
+
+  if (JT) {
+    // Note: for now we assume that successors do not reference labels from
+    // any overlapping jump tables. We only look at the entries for the jump
+    // table that is referenced at the last instruction.
+    const auto Range = JT->getEntriesForAddress(BC.MIB->getJumpTable(*Inst));
+    const std::vector<const MCSymbol *> Entries(&JT->Entries[Range.first],
+                                                &JT->Entries[Range.second]);
+    std::set<const MCSymbol *> UniqueSyms(Entries.begin(), Entries.end());
+    for (BinaryBasicBlock *Succ : Successors) {
+      auto Itr = UniqueSyms.find(Succ->getLabel());
+      if (Itr != UniqueSyms.end()) {
+        UniqueSyms.erase(Itr);
+      } else {
+        // Work on the assumption that jump table blocks don't
+        // have a conditional successor.
+        Valid = false;
+        errs() << "BOLT-WARNING: Jump table successor "
+               << Succ->getName()
+               << " not contained in the jump table.\n";
+      }
+    }
+    // If there are any leftover entries in the jump table, they
+    // must be one of the function end labels.
+    if (Valid) {
+      for (const MCSymbol *Sym : UniqueSyms) {
+        Valid &= (Sym == Function->getFunctionEndLabel() ||
+                  Sym == Function->getFunctionColdEndLabel());
+        if (!Valid) {
+          errs() << "BOLT-WARNING: Jump table contains illegal entry: "
+                 << Sym->getName() << "\n";
+        }
+      }
+    }
+  } else {
+    // Unknown control flow.
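+    // e.g. an indirect branch with no recovered jump table: its successors
+    // cannot be checked against a target list, so accept the block as-is.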
+ if (Inst && BC.MIB->isIndirectBranch(*Inst)) + return true; + + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + + if (analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) { + switch (Successors.size()) { + case 0: + Valid = !CondBranch && !UncondBranch; + break; + case 1: { + const bool HasCondBlock = CondBranch && + Function->getBasicBlockForLabel(BC.MIB->getTargetSymbol(*CondBranch)); + Valid = !CondBranch || !HasCondBlock; + break; + } + case 2: + Valid = + (CondBranch && + (TBB == getConditionalSuccessor(true)->getLabel() && + ((!UncondBranch && !FBB) || + (UncondBranch && + FBB == getConditionalSuccessor(false)->getLabel())))); + break; + } + } + } + if (!Valid) { + errs() << "BOLT-WARNING: CFG invalid in " << *getFunction() << " @ " + << getName() << "\n"; + if (JT) { + errs() << "Jump Table instruction addr = 0x" + << Twine::utohexstr(BC.MIB->getJumpTable(*Inst)) << "\n"; + JT->print(errs()); + } + getFunction()->dump(); + } + return Valid; +} + +BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const { + if (!Label && succ_size() == 1) + return *succ_begin(); + + for (BinaryBasicBlock *BB : successors()) { + if (BB->getLabel() == Label) + return BB; + } + + return nullptr; +} + +BinaryBasicBlock * +BinaryBasicBlock::getSuccessor(const MCSymbol *Label, + BinaryBranchInfo &BI) const { + auto BIIter = branch_info_begin(); + for (BinaryBasicBlock *BB : successors()) { + if (BB->getLabel() == Label) { + BI = *BIIter; + return BB; + } + ++BIIter; + } + + return nullptr; +} + +BinaryBasicBlock *BinaryBasicBlock::getLandingPad(const MCSymbol *Label) const { + for (BinaryBasicBlock *BB : landing_pads()) { + if (BB->getLabel() == Label) + return BB; + } + + return nullptr; +} + +int32_t BinaryBasicBlock::getCFIStateAtInstr(const MCInst *Instr) const { + assert( + getFunction()->getState() >= BinaryFunction::State::CFG && + "can only calculate CFI state when function is in or past the CFG state"); + + const std::vector &FDEProgram = + getFunction()->getFDEProgram(); + + // Find the last CFI preceding Instr in this basic block. + const MCInst *LastCFI = nullptr; + bool InstrSeen = (Instr == nullptr); + for (auto RII = Instructions.rbegin(), E = Instructions.rend(); + RII != E; ++RII) { + if (!InstrSeen) { + InstrSeen = (&*RII == Instr); + continue; + } + if (Function->getBinaryContext().MIB->isCFI(*RII)) { + LastCFI = &*RII; + break; + } + } + + assert(InstrSeen && "instruction expected in basic block"); + + // CFI state is the same as at basic block entry point. + if (!LastCFI) + return getCFIState(); + + // Fold all RememberState/RestoreState sequences, such as for: + // + // [ CFI #(K-1) ] + // RememberState (#K) + // .... + // RestoreState + // RememberState + // .... + // RestoreState + // [ GNU_args_size ] + // RememberState + // .... + // RestoreState <- LastCFI + // + // we return K - the most efficient state to (re-)generate. 
+ int64_t State = LastCFI->getOperand(0).getImm(); + while (State >= 0 && + FDEProgram[State].getOperation() == MCCFIInstruction::OpRestoreState) { + int32_t Depth = 1; + --State; + assert(State >= 0 && "first CFI cannot be RestoreState"); + while (Depth && State >= 0) { + const MCCFIInstruction &CFIInstr = FDEProgram[State]; + if (CFIInstr.getOperation() == MCCFIInstruction::OpRestoreState) { + ++Depth; + } else if (CFIInstr.getOperation() == MCCFIInstruction::OpRememberState) { + --Depth; + } + --State; + } + assert(Depth == 0 && "unbalanced RememberState/RestoreState stack"); + + // Skip any GNU_args_size. + while (State >= 0 && + FDEProgram[State].getOperation() == MCCFIInstruction::OpGnuArgsSize){ + --State; + } + } + + assert((State + 1 >= 0) && "miscalculated CFI state"); + return State + 1; +} + +void BinaryBasicBlock::addSuccessor(BinaryBasicBlock *Succ, + uint64_t Count, + uint64_t MispredictedCount) { + Successors.push_back(Succ); + BranchInfo.push_back({Count, MispredictedCount}); + Succ->Predecessors.push_back(this); +} + +void BinaryBasicBlock::replaceSuccessor(BinaryBasicBlock *Succ, + BinaryBasicBlock *NewSucc, + uint64_t Count, + uint64_t MispredictedCount) { + Succ->removePredecessor(this, /*Multiple=*/false); + auto I = succ_begin(); + auto BI = BranchInfo.begin(); + for (; I != succ_end(); ++I) { + assert(BI != BranchInfo.end() && "missing BranchInfo entry"); + if (*I == Succ) + break; + ++BI; + } + assert(I != succ_end() && "no such successor!"); + + *I = NewSucc; + *BI = BinaryBranchInfo{Count, MispredictedCount}; + NewSucc->addPredecessor(this); +} + +void BinaryBasicBlock::removeAllSuccessors() { + for (BinaryBasicBlock *SuccessorBB : successors()) { + SuccessorBB->removePredecessor(this); + } + Successors.clear(); + BranchInfo.clear(); +} + +void BinaryBasicBlock::removeSuccessor(BinaryBasicBlock *Succ) { + Succ->removePredecessor(this, /*Multiple=*/false); + auto I = succ_begin(); + auto BI = BranchInfo.begin(); + for (; I != succ_end(); ++I) { + assert(BI != BranchInfo.end() && "missing BranchInfo entry"); + if (*I == Succ) + break; + ++BI; + } + assert(I != succ_end() && "no such successor!"); + + Successors.erase(I); + BranchInfo.erase(BI); +} + +void BinaryBasicBlock::addPredecessor(BinaryBasicBlock *Pred) { + Predecessors.push_back(Pred); +} + +void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred, + bool Multiple) { + // Note: the predecessor could be listed multiple times. 
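+  // (e.g. when both the taken and the fall-through edge of Pred lead here;
+  // see removeDuplicateConditionalSuccessor().)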
+ bool Erased = false; + for (auto PredI = Predecessors.begin(); PredI != Predecessors.end(); ) { + if (*PredI == Pred) { + Erased = true; + PredI = Predecessors.erase(PredI); + if (!Multiple) + return; + } else { + ++PredI; + } + } + assert(Erased && "Pred is not a predecessor of this block!"); +} + +void BinaryBasicBlock::removeDuplicateConditionalSuccessor(MCInst *CondBranch) { + assert(succ_size() == 2 && Successors[0] == Successors[1] && + "conditional successors expected"); + + BinaryBasicBlock *Succ = Successors[0]; + const BinaryBranchInfo CondBI = BranchInfo[0]; + const BinaryBranchInfo UncondBI = BranchInfo[1]; + + eraseInstruction(findInstruction(CondBranch)); + + Successors.clear(); + BranchInfo.clear(); + + Successors.push_back(Succ); + + uint64_t Count = COUNT_NO_PROFILE; + if (CondBI.Count != COUNT_NO_PROFILE && UncondBI.Count != COUNT_NO_PROFILE) + Count = CondBI.Count + UncondBI.Count; + BranchInfo.push_back({Count, 0}); +} + +void BinaryBasicBlock::adjustExecutionCount(double Ratio) { + auto adjustedCount = [&](uint64_t Count) -> uint64_t { + double NewCount = Count * Ratio; + if (!NewCount && Count && (Ratio > 0.0)) + NewCount = 1; + return NewCount; + }; + + setExecutionCount(adjustedCount(getKnownExecutionCount())); + for (BinaryBranchInfo &BI : branch_info()) { + if (BI.Count != COUNT_NO_PROFILE) + BI.Count = adjustedCount(BI.Count); + if (BI.MispredictedCount != COUNT_INFERRED) + BI.MispredictedCount = adjustedCount(BI.MispredictedCount); + } +} + +bool BinaryBasicBlock::analyzeBranch(const MCSymbol *&TBB, + const MCSymbol *&FBB, + MCInst *&CondBranch, + MCInst *&UncondBranch) { + auto &MIB = Function->getBinaryContext().MIB; + return MIB->analyzeBranch(Instructions.begin(), + Instructions.end(), + TBB, + FBB, + CondBranch, + UncondBranch); +} + +bool BinaryBasicBlock::isMacroOpFusionPair(const_iterator I) const { + auto &MIB = Function->getBinaryContext().MIB; + ArrayRef Insts = Instructions; + return MIB->isMacroOpFusionPair(Insts.slice(I - begin())); +} + +BinaryBasicBlock::const_iterator +BinaryBasicBlock::getMacroOpFusionPair() const { + if (!Function->getBinaryContext().isX86()) + return end(); + + if (getNumNonPseudos() < 2 || succ_size() != 2) + return end(); + + auto RI = getLastNonPseudo(); + assert(RI != rend() && "cannot have an empty block with 2 successors"); + + BinaryContext &BC = Function->getBinaryContext(); + + // Skip instruction if it's an unconditional branch following + // a conditional one. + if (BC.MIB->isUnconditionalBranch(*RI)) + ++RI; + + if (!BC.MIB->isConditionalBranch(*RI)) + return end(); + + // Start checking with instruction preceding the conditional branch. + ++RI; + if (RI == rend()) + return end(); + + auto II = std::prev(RI.base()); // convert to a forward iterator + if (isMacroOpFusionPair(II)) + return II; + + return end(); +} + +MCInst *BinaryBasicBlock::getTerminatorBefore(MCInst *Pos) { + BinaryContext &BC = Function->getBinaryContext(); + auto Itr = rbegin(); + bool Check = Pos ? 
false : true; + MCInst *FirstTerminator = nullptr; + while (Itr != rend()) { + if (!Check) { + if (&*Itr == Pos) + Check = true; + ++Itr; + continue; + } + if (BC.MIB->isTerminator(*Itr)) + FirstTerminator = &*Itr; + ++Itr; + } + return FirstTerminator; +} + +bool BinaryBasicBlock::hasTerminatorAfter(MCInst *Pos) { + BinaryContext &BC = Function->getBinaryContext(); + auto Itr = rbegin(); + while (Itr != rend()) { + if (&*Itr == Pos) + return false; + if (BC.MIB->isTerminator(*Itr)) + return true; + ++Itr; + } + return false; +} + +bool BinaryBasicBlock::swapConditionalSuccessors() { + if (succ_size() != 2) + return false; + + std::swap(Successors[0], Successors[1]); + std::swap(BranchInfo[0], BranchInfo[1]); + return true; +} + +void BinaryBasicBlock::addBranchInstruction(const BinaryBasicBlock *Successor) { + assert(isSuccessor(Successor)); + BinaryContext &BC = Function->getBinaryContext(); + MCInst NewInst; + std::unique_lock Lock(BC.CtxMutex); + BC.MIB->createUncondBranch(NewInst, Successor->getLabel(), BC.Ctx.get()); + Instructions.emplace_back(std::move(NewInst)); +} + +void BinaryBasicBlock::addTailCallInstruction(const MCSymbol *Target) { + BinaryContext &BC = Function->getBinaryContext(); + MCInst NewInst; + BC.MIB->createTailCall(NewInst, Target, BC.Ctx.get()); + Instructions.emplace_back(std::move(NewInst)); +} + +uint32_t BinaryBasicBlock::getNumCalls() const { + uint32_t N = 0; + BinaryContext &BC = Function->getBinaryContext(); + for (const MCInst &Instr : Instructions) { + if (BC.MIB->isCall(Instr)) + ++N; + } + return N; +} + +uint32_t BinaryBasicBlock::getNumPseudos() const { +#ifndef NDEBUG + BinaryContext &BC = Function->getBinaryContext(); + uint32_t N = 0; + for (const MCInst &Instr : Instructions) { + if (BC.MIB->isPseudo(Instr)) + ++N; + } + if (N != NumPseudos) { + errs() << "BOLT-ERROR: instructions for basic block " << getName() + << " in function " << *Function << ": calculated pseudos " + << N << ", set pseudos " << NumPseudos << ", size " << size() + << '\n'; + llvm_unreachable("pseudos mismatch"); + } +#endif + return NumPseudos; +} + +ErrorOr> +BinaryBasicBlock::getBranchStats(const BinaryBasicBlock *Succ) const { + if (Function->hasValidProfile()) { + uint64_t TotalCount = 0; + uint64_t TotalMispreds = 0; + for (const BinaryBranchInfo &BI : BranchInfo) { + if (BI.Count != COUNT_NO_PROFILE) { + TotalCount += BI.Count; + TotalMispreds += BI.MispredictedCount; + } + } + + if (TotalCount > 0) { + auto Itr = std::find(Successors.begin(), Successors.end(), Succ); + assert(Itr != Successors.end()); + const BinaryBranchInfo &BI = BranchInfo[Itr - Successors.begin()]; + if (BI.Count && BI.Count != COUNT_NO_PROFILE) { + if (TotalMispreds == 0) TotalMispreds = 1; + return std::make_pair(double(BI.Count) / TotalCount, + double(BI.MispredictedCount) / TotalMispreds); + } + } + } + return make_error_code(llvm::errc::result_out_of_range); +} + +void BinaryBasicBlock::dump() const { + BinaryContext &BC = Function->getBinaryContext(); + if (Label) outs() << Label->getName() << ":\n"; + BC.printInstructions(outs(), Instructions.begin(), Instructions.end(), + getOffset()); + outs() << "preds:"; + for (auto itr = pred_begin(); itr != pred_end(); ++itr) { + outs() << " " << (*itr)->getName(); + } + outs() << "\nsuccs:"; + for (auto itr = succ_begin(); itr != succ_end(); ++itr) { + outs() << " " << (*itr)->getName(); + } + outs() << "\n"; +} + +uint64_t BinaryBasicBlock::estimateSize(const MCCodeEmitter *Emitter) const { + return 
Function->getBinaryContext().computeCodeSize(begin(), end(), Emitter); +} + +BinaryBasicBlock::BinaryBranchInfo & +BinaryBasicBlock::getBranchInfo(const BinaryBasicBlock &Succ) { + auto BI = branch_info_begin(); + for (BinaryBasicBlock *BB : successors()) { + if (&Succ == BB) + return *BI; + ++BI; + } + + llvm_unreachable("Invalid successor"); + return *BI; +} + +BinaryBasicBlock::BinaryBranchInfo & +BinaryBasicBlock::getBranchInfo(const MCSymbol *Label) { + auto BI = branch_info_begin(); + for (BinaryBasicBlock *BB : successors()) { + if (BB->getLabel() == Label) + return *BI; + ++BI; + } + + llvm_unreachable("Invalid successor"); + return *BI; +} + +BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) { + assert(II != end() && "expected iterator pointing to instruction"); + + BinaryBasicBlock *NewBlock = getFunction()->addBasicBlock(0); + + // Adjust successors/predecessors and propagate the execution count. + moveAllSuccessorsTo(NewBlock); + addSuccessor(NewBlock, getExecutionCount(), 0); + + // Set correct CFI state for the new block. + NewBlock->setCFIState(getCFIStateAtInstr(&*II)); + + // Move instructions over. + adjustNumPseudos(II, end(), -1); + NewBlock->addInstructions(II, end()); + Instructions.erase(II, end()); + + return NewBlock; +} + +void BinaryBasicBlock::updateOutputValues(const MCAsmLayout &Layout) { + if (!LocSyms) + return; + + const uint64_t BBAddress = getOutputAddressRange().first; + const uint64_t BBOffset = Layout.getSymbolOffset(*getLabel()); + for (const auto &LocSymKV : *LocSyms) { + const uint32_t InputFunctionOffset = LocSymKV.first; + const uint32_t OutputOffset = static_cast( + Layout.getSymbolOffset(*LocSymKV.second) - BBOffset); + getOffsetTranslationTable().emplace_back( + std::make_pair(OutputOffset, InputFunctionOffset)); + + // Update reverse (relative to BAT) address lookup table for function. + if (getFunction()->requiresAddressTranslation()) { + getFunction()->getInputOffsetToAddressMap().emplace( + std::make_pair(InputFunctionOffset, OutputOffset + BBAddress)); + } + } + LocSyms.reset(nullptr); +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h new file mode 100644 index 000000000000..e6c96c78117e --- /dev/null +++ b/bolt/src/BinaryBasicBlock.h @@ -0,0 +1,1080 @@ +//===--- BinaryBasicBlock.h - Interface for assembly-level basic block ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_BASIC_BLOCK_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_BASIC_BLOCK_H + +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +namespace llvm { +class MCCodeEmitter; + +namespace bolt { + +class BinaryFunction; + +/// The intention is to keep the structure similar to MachineBasicBlock as +/// we might switch to it at some point. +class BinaryBasicBlock { +public: + + /// Profile execution information for a given edge in CFG. 
+  ///
+  /// If MispredictedCount equals COUNT_INFERRED, then we have profile
+  /// data for a fall-through edge with a Count representing an inferred
+  /// execution count, i.e. the count we calculated internally, not the one
+  /// coming from profile data.
+  ///
+  /// For all other values of MispredictedCount, Count represents the number of
+  /// branch executions from a profile, and MispredictedCount is the number
+  /// of times the branch was mispredicted according to this profile.
+  struct BinaryBranchInfo {
+    uint64_t Count;
+    uint64_t MispredictedCount; /// number of branches mispredicted
+
+    bool operator<(const BinaryBranchInfo &Other) const {
+      return (Count < Other.Count) ||
+             (Count == Other.Count &&
+              MispredictedCount < Other.MispredictedCount);
+    }
+  };
+
+  static constexpr uint32_t INVALID_OFFSET =
+      std::numeric_limits<uint32_t>::max();
+
+private:
+  /// Vector of all instructions in the block.
+  std::vector<MCInst> Instructions;
+
+  /// CFG information.
+  std::vector<BinaryBasicBlock *> Predecessors;
+  std::vector<BinaryBasicBlock *> Successors;
+  std::vector<BinaryBasicBlock *> Throwers;
+  std::vector<BinaryBasicBlock *> LandingPads;
+
+  /// Each successor has a corresponding BranchInfo entry in the list.
+  std::vector<BinaryBranchInfo> BranchInfo;
+
+  /// Function that owns this basic block.
+  BinaryFunction *Function;
+
+  /// Label associated with the block.
+  MCSymbol *Label{nullptr};
+
+  /// [Begin, End) address range for this block in the output binary.
+  std::pair<uint64_t, uint64_t> OutputAddressRange{0, 0};
+
+  /// Original offset range of the basic block in the function.
+  std::pair<uint32_t, uint32_t> InputRange{INVALID_OFFSET, INVALID_OFFSET};
+
+  /// Map input offset (from function start) of an instruction to an output
+  /// symbol. Enables writing BOLT address translation tables used for mapping
+  /// control transfer in the output binary back to the original binary.
+  using LocSymsTy = std::vector<std::pair<uint32_t, const MCSymbol *>>;
+  std::unique_ptr<LocSymsTy> LocSyms;
+
+  /// After output/codegen, map output offsets of instructions in this basic
+  /// block to instruction offsets in the original function. Note that the
+  /// output basic block could be different from the input basic block.
+  /// We only map instructions of interest, such as calls and SDT markers.
+  ///
+  /// We store the offset array in a basic block to facilitate BAT tables
+  /// generation. Otherwise, the mapping could be done at function level.
+  using OffsetTranslationTableTy = std::vector<std::pair<uint32_t, uint32_t>>;
+  std::unique_ptr<OffsetTranslationTableTy> OffsetTranslationTable;
+
+  /// Alignment requirements for the block.
+  uint32_t Alignment{1};
+
+  /// Maximum number of bytes to use for alignment of the block.
+  uint32_t AlignmentMaxBytes{0};
+
+  /// Number of times this basic block was executed.
+  uint64_t ExecutionCount{COUNT_NO_PROFILE};
+
+  static constexpr unsigned InvalidIndex = ~0u;
+
+  /// Index to BasicBlocks vector in BinaryFunction.
+  unsigned Index{InvalidIndex};
+
+  /// Index in the current layout.
+  mutable unsigned LayoutIndex{InvalidIndex};
+
+  /// Number of pseudo instructions in this block.
+  uint32_t NumPseudos{0};
+
+  /// CFI state at the entry to this basic block.
+  int32_t CFIState{-1};
+
+  /// In cases where the parent function has been split, IsCold == true means
+  /// this BB will be allocated outside its parent function.
+  bool IsCold{false};
+
+  /// Indicates if the block could be outlined.
+  bool CanOutline{true};
+
+  /// Flag to indicate whether this block is valid or not. Invalid
+  /// blocks may contain out of date or incorrect information.
+  bool IsValid{true};
+
+private:
+  BinaryBasicBlock() = delete;
+  BinaryBasicBlock(const BinaryBasicBlock &) = delete;
+  BinaryBasicBlock &operator=(const BinaryBasicBlock &) = delete;
+
+  explicit BinaryBasicBlock(
+      BinaryFunction *Function,
+      MCSymbol *Label,
+      uint32_t Offset = INVALID_OFFSET)
+    : Function(Function), Label(Label) {
+    assert(Function && "Function must be non-null");
+    InputRange.first = Offset;
+  }
+
+  // Exclusively managed by BinaryFunction.
+  friend class BinaryFunction;
+  friend bool operator<(const BinaryBasicBlock &LHS,
+                        const BinaryBasicBlock &RHS);
+
+  /// Assign new label to the basic block.
+  void setLabel(MCSymbol *Symbol) {
+    Label = Symbol;
+  }
+
+public:
+  static constexpr uint64_t COUNT_INFERRED =
+      std::numeric_limits<uint64_t>::max();
+  static constexpr uint64_t COUNT_NO_PROFILE =
+      std::numeric_limits<uint64_t>::max();
+
+  // Instructions iterators.
+  using iterator = std::vector<MCInst>::iterator;
+  using const_iterator = std::vector<MCInst>::const_iterator;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+  bool empty() const { assert(hasInstructions());
+                       return Instructions.empty(); }
+  size_t size() const { assert(hasInstructions());
+                        return Instructions.size(); }
+  MCInst &front() { assert(hasInstructions());
+                    return Instructions.front(); }
+  MCInst &back() { assert(hasInstructions());
+                   return Instructions.back(); }
+  const MCInst &front() const { assert(hasInstructions());
+                                return Instructions.front(); }
+  const MCInst &back() const { assert(hasInstructions());
+                               return Instructions.back(); }
+
+  iterator begin() { assert(hasInstructions());
+                     return Instructions.begin(); }
+  const_iterator begin() const { assert(hasInstructions());
+                                 return Instructions.begin(); }
+  iterator end () { assert(hasInstructions());
+                    return Instructions.end(); }
+  const_iterator end () const { assert(hasInstructions());
+                                return Instructions.end(); }
+  reverse_iterator rbegin() { assert(hasInstructions());
+                              return Instructions.rbegin(); }
+  const_reverse_iterator rbegin() const { assert(hasInstructions());
+                                          return Instructions.rbegin(); }
+  reverse_iterator rend () { assert(hasInstructions());
+                             return Instructions.rend(); }
+  const_reverse_iterator rend () const { assert(hasInstructions());
+                                         return Instructions.rend(); }
+
+  // CFG iterators.
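+  //
+  // The typedefs below back the predecessors()/successors() ranges used
+  // to walk the CFG, e.g.:
+  //   for (BinaryBasicBlock *Succ : BB->successors())
+  //     ...;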
+  using pred_iterator = std::vector<BinaryBasicBlock *>::iterator;
+  using const_pred_iterator = std::vector<BinaryBasicBlock *>::const_iterator;
+  using succ_iterator = std::vector<BinaryBasicBlock *>::iterator;
+  using const_succ_iterator = std::vector<BinaryBasicBlock *>::const_iterator;
+  using throw_iterator = decltype(Throwers)::iterator;
+  using const_throw_iterator = decltype(Throwers)::const_iterator;
+  using lp_iterator = decltype(LandingPads)::iterator;
+  using const_lp_iterator = decltype(LandingPads)::const_iterator;
+
+  using pred_reverse_iterator = std::reverse_iterator<pred_iterator>;
+  using const_pred_reverse_iterator =
+      std::reverse_iterator<const_pred_iterator>;
+  using succ_reverse_iterator = std::reverse_iterator<succ_iterator>;
+  using const_succ_reverse_iterator =
+      std::reverse_iterator<const_succ_iterator>;
+
+  pred_iterator pred_begin() { return Predecessors.begin(); }
+  const_pred_iterator pred_begin() const { return Predecessors.begin(); }
+  pred_iterator pred_end() { return Predecessors.end(); }
+  const_pred_iterator pred_end() const { return Predecessors.end(); }
+  pred_reverse_iterator pred_rbegin()
+    { return Predecessors.rbegin(); }
+  const_pred_reverse_iterator pred_rbegin() const
+    { return Predecessors.rbegin(); }
+  pred_reverse_iterator pred_rend()
+    { return Predecessors.rend(); }
+  const_pred_reverse_iterator pred_rend() const
+    { return Predecessors.rend(); }
+  size_t pred_size() const {
+    return Predecessors.size();
+  }
+  bool pred_empty() const { return Predecessors.empty(); }
+
+  succ_iterator succ_begin() { return Successors.begin(); }
+  const_succ_iterator succ_begin() const { return Successors.begin(); }
+  succ_iterator succ_end() { return Successors.end(); }
+  const_succ_iterator succ_end() const { return Successors.end(); }
+  succ_reverse_iterator succ_rbegin()
+    { return Successors.rbegin(); }
+  const_succ_reverse_iterator succ_rbegin() const
+    { return Successors.rbegin(); }
+  succ_reverse_iterator succ_rend()
+    { return Successors.rend(); }
+  const_succ_reverse_iterator succ_rend() const
+    { return Successors.rend(); }
+  size_t succ_size() const {
+    return Successors.size();
+  }
+  bool succ_empty() const { return Successors.empty(); }
+
+  throw_iterator throw_begin() { return Throwers.begin(); }
+  const_throw_iterator throw_begin() const { return Throwers.begin(); }
+  throw_iterator throw_end() { return Throwers.end(); }
+  const_throw_iterator throw_end() const { return Throwers.end(); }
+  size_t throw_size() const {
+    return Throwers.size();
+  }
+  bool throw_empty() const { return Throwers.empty(); }
+  bool isLandingPad() const { return !Throwers.empty(); }
+
+  lp_iterator lp_begin() { return LandingPads.begin(); }
+  const_lp_iterator lp_begin() const { return LandingPads.begin(); }
+  lp_iterator lp_end() { return LandingPads.end(); }
+  const_lp_iterator lp_end() const { return LandingPads.end(); }
+  size_t lp_size() const {
+    return LandingPads.size();
+  }
+  bool lp_empty() const { return LandingPads.empty(); }
+
+  inline iterator_range<iterator> instructions() {
+    assert(hasInstructions());
+    return iterator_range<iterator>(begin(), end());
+  }
+  inline iterator_range<const_iterator> instructions() const {
+    assert(hasInstructions());
+    return iterator_range<const_iterator>(begin(), end());
+  }
+  inline iterator_range<pred_iterator> predecessors() {
+    assert(hasCFG());
+    return iterator_range<pred_iterator>(pred_begin(), pred_end());
+  }
+  inline iterator_range<const_pred_iterator> predecessors() const {
+    assert(hasCFG());
+    return iterator_range<const_pred_iterator>(pred_begin(), pred_end());
+  }
+  inline iterator_range<succ_iterator> successors() {
+    assert(hasCFG());
+    return iterator_range<succ_iterator>(succ_begin(), succ_end());
+  }
+  inline iterator_range<const_succ_iterator> successors() const {
+    assert(hasCFG());
+    return iterator_range<const_succ_iterator>(succ_begin(), succ_end());
+  }
+  inline iterator_range<throw_iterator> throwers() {
+    assert(hasCFG());
+    return iterator_range<throw_iterator>(throw_begin(), throw_end());
+  }
+  inline iterator_range<const_throw_iterator> throwers() const {
+    assert(hasCFG());
+    return iterator_range<const_throw_iterator>(throw_begin(), throw_end());
+  }
+  inline iterator_range<lp_iterator> landing_pads() {
+    assert(hasCFG());
+    return iterator_range<lp_iterator>(lp_begin(), lp_end());
+  }
+  inline iterator_range<const_lp_iterator> landing_pads() const {
+    assert(hasCFG());
+    return iterator_range<const_lp_iterator>(lp_begin(), lp_end());
+  }
+
+  // BranchInfo iterators.
+  using branch_info_iterator = std::vector<BinaryBranchInfo>::iterator;
+  using const_branch_info_iterator =
+      std::vector<BinaryBranchInfo>::const_iterator;
+  using branch_info_reverse_iterator =
+      std::reverse_iterator<branch_info_iterator>;
+  using const_branch_info_reverse_iterator =
+      std::reverse_iterator<const_branch_info_iterator>;
+
+  branch_info_iterator branch_info_begin() { return BranchInfo.begin(); }
+  branch_info_iterator branch_info_end() { return BranchInfo.end(); }
+  const_branch_info_iterator branch_info_begin() const {
+    return BranchInfo.begin();
+  }
+  const_branch_info_iterator branch_info_end() const {
+    return BranchInfo.end();
+  }
+  branch_info_reverse_iterator branch_info_rbegin() {
+    return BranchInfo.rbegin();
+  }
+  branch_info_reverse_iterator branch_info_rend() {
+    return BranchInfo.rend();
+  }
+  const_branch_info_reverse_iterator branch_info_rbegin() const {
+    return BranchInfo.rbegin();
+  }
+  const_branch_info_reverse_iterator branch_info_rend() const {
+    return BranchInfo.rend();
+  }
+
+  size_t branch_info_size() const { return BranchInfo.size(); }
+  bool branch_info_empty() const { return BranchInfo.empty(); }
+
+  inline iterator_range<branch_info_iterator> branch_info() {
+    return iterator_range<branch_info_iterator>(
+        BranchInfo.begin(), BranchInfo.end());
+  }
+  inline iterator_range<const_branch_info_iterator> branch_info() const {
+    return iterator_range<const_branch_info_iterator>(
+        BranchInfo.begin(), BranchInfo.end());
+  }
+
+  /// Get instruction at given index.
+  MCInst &getInstructionAtIndex(unsigned Index) {
+    return Instructions.at(Index);
+  }
+
+  const MCInst &getInstructionAtIndex(unsigned Index) const {
+    return Instructions.at(Index);
+  }
+
+  /// Return symbol marking the start of this basic block.
+  MCSymbol *getLabel() {
+    return Label;
+  }
+
+  /// Return symbol marking the start of this basic block (const version).
+  const MCSymbol *getLabel() const {
+    return Label;
+  }
+
+  /// Get successor with given \p Label if \p Label != nullptr.
+  /// Returns nullptr if no such successor is found.
+  /// If the \p Label == nullptr and the block has only one successor then
+  /// return the successor.
+  BinaryBasicBlock *getSuccessor(const MCSymbol *Label = nullptr) const;
+
+  /// Return the related branch info as well as the successor.
+  BinaryBasicBlock *getSuccessor(const MCSymbol *Label,
+                                 BinaryBranchInfo &BI) const;
+
+  /// If the basic block ends with a conditional branch (possibly followed by
+  /// an unconditional branch) and thus has 2 successors, return a successor
+  /// corresponding to a jump condition which could be true or false.
+  /// Return nullptr if the basic block does not have a conditional jump.
+  BinaryBasicBlock *getConditionalSuccessor(bool Condition) {
+    if (succ_size() != 2)
+      return nullptr;
+    return Successors[Condition == true ? 0 : 1];
+  }
+
+  const BinaryBasicBlock *getConditionalSuccessor(bool Condition) const {
+    return const_cast<BinaryBasicBlock *>(this)
+        ->getConditionalSuccessor(Condition);
+  }
+
+  /// Find the fallthrough successor for a block, or nullptr if there is
+  /// none.
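+  /// For a block with two successors this is the successor taken when the
+  /// branch condition is false; for a single-successor block it is simply
+  /// that successor.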
+ BinaryBasicBlock* getFallthrough() { + if (succ_size() == 2) + return getConditionalSuccessor(false); + else + return getSuccessor(); + } + + const BinaryBasicBlock *getFallthrough() const { + return const_cast(this)->getFallthrough(); + } + + /// Return branch info corresponding to a taken branch. + const BinaryBranchInfo &getTakenBranchInfo() const { + assert(BranchInfo.size() == 2 && + "could only be called for blocks with 2 successors"); + return BranchInfo[0]; + }; + + /// Return branch info corresponding to a fall-through branch. + const BinaryBranchInfo &getFallthroughBranchInfo() const { + assert(BranchInfo.size() == 2 && + "could only be called for blocks with 2 successors"); + return BranchInfo[1]; + }; + + /// Return branch info corresponding to an edge going to \p Succ basic block. + BinaryBranchInfo &getBranchInfo(const BinaryBasicBlock &Succ); + + /// Return branch info corresponding to an edge going to a basic block with + /// label \p Label. + BinaryBranchInfo &getBranchInfo(const MCSymbol *Label); + + /// Set branch information for the outgoing edge to block \p Succ. + void setSuccessorBranchInfo(const BinaryBasicBlock &Succ, + uint64_t Count, + uint64_t MispredictedCount) { + BinaryBranchInfo &BI = getBranchInfo(Succ); + BI.Count = Count; + BI.MispredictedCount = MispredictedCount; + } + + /// Try to compute the taken and misprediction frequencies for the given + /// successor. The result is an error if no information can be found. + ErrorOr> + getBranchStats(const BinaryBasicBlock *Succ) const; + + /// If the basic block ends with a conditional branch (possibly followed by + /// an unconditional branch) and thus has 2 successor, reverse the order of + /// its successors in CFG, update branch info, and return true. If the basic + /// block does not have 2 successors return false. + bool swapConditionalSuccessors(); + + /// Add an instruction with unconditional control transfer to \p Successor + /// basic block to the end of this basic block. + void addBranchInstruction(const BinaryBasicBlock *Successor); + + /// Add an instruction with tail call control transfer to \p Target + /// to the end of this basic block. + void addTailCallInstruction(const MCSymbol *Target); + + /// Return the number of call instructions in this basic block. + uint32_t getNumCalls() const; + + /// Get landing pad with given label. Returns nullptr if no such + /// landing pad is found. + BinaryBasicBlock *getLandingPad(const MCSymbol *Label) const; + + /// Return local name for the block. + StringRef getName() const { + return Label->getName(); + } + + /// Add instruction at the end of this basic block. + /// Returns iterator pointing to the inserted instruction. + iterator addInstruction(MCInst &&Inst) { + adjustNumPseudos(Inst, 1); + Instructions.emplace_back(Inst); + return std::prev(Instructions.end()); + } + + /// Add instruction at the end of this basic block. + /// Returns iterator pointing to the inserted instruction. + iterator addInstruction(const MCInst &Inst) { + adjustNumPseudos(Inst, 1); + Instructions.push_back(Inst); + return std::prev(Instructions.end()); + } + + /// Add a range of instructions to the end of this basic block. + template + void addInstructions(Itr Begin, Itr End) { + while (Begin != End) { + addInstruction(*Begin++); + } + } + + /// Add a range of instructions to the end of this basic block. + template + void addInstructions(RangeTy R) { + for (auto &I : R) + addInstruction(I); + } + + /// Add instruction before Pos in this basic block. 
+ template + Itr insertPseudoInstr(Itr Pos, MCInst &Instr) { + ++NumPseudos; + return Instructions.emplace(Pos, Instr); + } + + /// Return the number of pseudo instructions in the basic block. + uint32_t getNumPseudos() const; + + /// Return the number of emitted instructions for this basic block. + uint32_t getNumNonPseudos() const { + return size() - getNumPseudos(); + } + + /// Return iterator to the first non-pseudo instruction or end() + /// if no such instruction was found. + iterator getFirstNonPseudo(); + + /// Return a pointer to the first non-pseudo instruction in this basic + /// block. Returns nullptr if none exists. + MCInst *getFirstNonPseudoInstr() { + auto II = getFirstNonPseudo(); + return II == Instructions.end() ? nullptr : &*II; + } + + /// Return reverse iterator to the last non-pseudo instruction or rend() + /// if no such instruction was found. + reverse_iterator getLastNonPseudo(); + const_reverse_iterator getLastNonPseudo() const { + return const_cast(this)->getLastNonPseudo(); + } + + /// Return a pointer to the last non-pseudo instruction in this basic + /// block. Returns nullptr if none exists. + MCInst *getLastNonPseudoInstr() { + auto RII = getLastNonPseudo(); + return RII == Instructions.rend() ? nullptr : &*RII; + } + const MCInst *getLastNonPseudoInstr() const { + auto RII = getLastNonPseudo(); + return RII == Instructions.rend() ? nullptr : &*RII; + } + + /// Set CFI state at entry to this basic block. + void setCFIState(int32_t NewCFIState) { + assert((CFIState == -1 || NewCFIState == CFIState) && + "unexpected change of CFI state for basic block"); + CFIState = NewCFIState; + } + + /// Return CFI state (expected) at entry of this basic block. + int32_t getCFIState() const { + assert(CFIState >= 0 && "unknown CFI state"); + return CFIState; + } + + /// Calculate and return CFI state right before instruction \p Instr in + /// this basic block. If \p Instr is nullptr then return the state at + /// the end of the basic block. + int32_t getCFIStateAtInstr(const MCInst *Instr) const; + + /// Calculate and return CFI state after execution of this basic block. + /// The state depends on CFI state at entry and CFI instructions inside the + /// basic block. + int32_t getCFIStateAtExit() const { + return getCFIStateAtInstr(nullptr); + } + + /// Set minimum alignment for the basic block. + void setAlignment(uint32_t Align) { + Alignment = Align; + } + + /// Return required alignment for the block. + uint32_t getAlignment() const { + return Alignment; + } + + /// Set the maximum number of bytes to use for the block alignment. + void setAlignmentMaxBytes(uint32_t Value) { + AlignmentMaxBytes = Value; + } + + /// Return the maximum number of bytes to use for the block alignment. + uint32_t getAlignmentMaxBytes() const { + return AlignmentMaxBytes; + } + + /// Adds block to successor list, and also updates predecessor list for + /// successor block. + /// Set branch info for this path. + void addSuccessor(BinaryBasicBlock *Succ, + uint64_t Count = 0, + uint64_t MispredictedCount = 0); + + void addSuccessor(BinaryBasicBlock *Succ, const BinaryBranchInfo &BI) { + addSuccessor(Succ, BI.Count, BI.MispredictedCount); + } + + /// Add a range of successors. + template + void addSuccessors(Itr Begin, Itr End) { + while (Begin != End) { + addSuccessor(*Begin++); + } + } + + /// Add a range of successors with branch info. 
+ template + void addSuccessors(Itr Begin, Itr End, BrItr BrBegin, BrItr BrEnd) { + assert(std::distance(Begin, End) == std::distance(BrBegin, BrEnd)); + while (Begin != End) { + addSuccessor(*Begin++, *BrBegin++); + } + } + + /// Replace Succ with NewSucc. This routine is helpful for preserving + /// the order of conditional successors when editing the CFG. + void replaceSuccessor(BinaryBasicBlock *Succ, + BinaryBasicBlock *NewSucc, + uint64_t Count = 0, + uint64_t MispredictedCount = 0); + + /// Move all of this block's successors to a new block, and set the + /// execution count of this new block with our execution count. This is + /// useful when splitting a block in two. + void moveAllSuccessorsTo(BinaryBasicBlock *New) { + New->addSuccessors(successors().begin(), + successors().end(), + branch_info_begin(), + branch_info_end()); + removeAllSuccessors(); + + // Update the execution count on the new block. + New->setExecutionCount(getExecutionCount()); + } + + /// Remove /p Succ basic block from the list of successors. Update the + /// list of predecessors of /p Succ and update branch info. + void removeSuccessor(BinaryBasicBlock *Succ); + + /// Remove all successors of the basic block, and remove the block + /// from respective lists of predecessors. + void removeAllSuccessors(); + + /// Remove useless duplicate successors. When the conditional + /// successor is the same as the unconditional successor, we can + /// remove the conditional successor and branch instruction. + void removeDuplicateConditionalSuccessor(MCInst *CondBranch); + + /// Test if BB is a predecessor of this block. + bool isPredecessor(const BinaryBasicBlock *BB) const { + auto Itr = std::find(Predecessors.begin(), Predecessors.end(), BB); + return Itr != Predecessors.end(); + } + + /// Test if BB is a successor of this block. + bool isSuccessor(const BinaryBasicBlock *BB) const { + auto Itr = std::find(Successors.begin(), Successors.end(), BB); + return Itr != Successors.end(); + } + + /// Test if this BB has a valid execution count. + bool hasProfile() const { + return ExecutionCount != COUNT_NO_PROFILE; + } + + /// Return the information about the number of times this basic block was + /// executed. + /// + /// Return COUNT_NO_PROFILE if there's no profile info. + uint64_t getExecutionCount() const { + return ExecutionCount; + } + + /// Return the execution count for blocks with known profile. + /// Return 0 if the block has no profile. + uint64_t getKnownExecutionCount() const { + return !hasProfile() ? 0 : ExecutionCount; + } + + /// Set the execution count for this block. + void setExecutionCount(uint64_t Count) { + ExecutionCount = Count; + } + + /// Apply a given \p Ratio to the profile information of this basic block. + void adjustExecutionCount(double Ratio); + + /// Return true if the basic block is an entry point into the function + /// (either primary or secondary). + bool isEntryPoint() const; + + bool isValid() const { + return IsValid; + } + + void markValid(const bool Valid) { + IsValid = Valid; + } + + bool isCold() const { + return IsCold; + } + + void setIsCold(const bool Flag) { + IsCold = Flag; + } + + /// Return true if the block can be outlined. At the moment we disallow + /// outlining of blocks that can potentially throw exceptions or are + /// the beginning of a landing pad. The entry basic block also can + /// never be outlined. 
+ bool canOutline() const { + return CanOutline; + } + + void setCanOutline(const bool Flag) { + CanOutline = Flag; + } + + /// Erase pseudo instruction at a given iterator. + /// Return iterator following the removed instruction. + iterator erasePseudoInstruction(iterator II) { + --NumPseudos; + return Instructions.erase(II); + } + + /// Erase non-pseudo instruction at a given iterator \p II. + /// Return iterator following the removed instruction. + iterator eraseInstruction(iterator II) { + adjustNumPseudos(*II, -1); + return Instructions.erase(II); + } + + /// Erase instructions in the specified range. + template + void eraseInstructions(ItrType Begin, ItrType End) { + while (End > Begin) { + eraseInstruction(findInstruction(*--End)); + } + } + + /// Erase all instructions. + void clear() { + Instructions.clear(); + NumPseudos = 0; + } + + /// Retrieve iterator for \p Inst or return end iterator if instruction is not + /// from this basic block. + decltype(Instructions)::iterator findInstruction(const MCInst *Inst) { + if (Instructions.empty()) + return Instructions.end(); + size_t Index = Inst - &Instructions[0]; + return Index >= Instructions.size() ? Instructions.end() + : Instructions.begin() + Index; + } + + /// Replace instruction referenced by iterator \II with a sequence of + /// instructions defined by [\p Begin, \p End] range. + /// + /// Return iterator pointing to the first inserted instruction. + template + iterator replaceInstruction(iterator II, Itr Begin, Itr End) { + adjustNumPseudos(*II, -1); + adjustNumPseudos(Begin, End, 1); + + auto I = II - Instructions.begin(); + Instructions.insert(Instructions.erase(II), Begin, End); + return I + Instructions.begin(); + } + + iterator replaceInstruction(iterator II, + const std::vector &Replacement) { + return replaceInstruction(II, Replacement.begin(), Replacement.end()); + } + + /// Insert \p NewInst before \p At, which must be an existing instruction in + /// this BB. Return iterator pointing to the newly inserted instruction. + iterator insertInstruction(iterator At, MCInst &&NewInst) { + adjustNumPseudos(NewInst, 1); + return Instructions.emplace(At, std::move(NewInst)); + } + + iterator insertInstruction(iterator At, MCInst &NewInst) { + adjustNumPseudos(NewInst, 1); + return Instructions.emplace(At, NewInst); + } + + /// Helper to retrieve any terminators in \p BB before \p Pos. This is used + /// to skip CFI instructions and to retrieve the first terminator instruction + /// in basic blocks with two terminators (conditional jump and unconditional + /// jump). + MCInst *getTerminatorBefore(MCInst *Pos); + + /// Used to identify whether an instruction is before a terminator and whether + /// moving it to the end of the BB would render it dead code. + bool hasTerminatorAfter(MCInst *Pos); + + /// Split apart the instructions in this basic block starting at Inst. + /// The instructions following Inst are removed and returned in a vector. + std::vector splitInstructions(const MCInst *Inst) { + std::vector SplitInst; + + assert(!Instructions.empty()); + while(&Instructions.back() != Inst) { + SplitInst.push_back(Instructions.back()); + Instructions.pop_back(); + } + std::reverse(SplitInst.begin(), SplitInst.end()); + NumPseudos = 0; + adjustNumPseudos(Instructions.begin(), Instructions.end(), 1); + return SplitInst; + } + + /// Split basic block at the instruction pointed to by II. + /// All iterators pointing after II get invalidated. 
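+  ///
+  /// A sketch of the intended use: given a valid instruction iterator II
+  /// into block BB,
+  ///   BinaryBasicBlock *Tail = BB->splitAt(II);
+  /// leaves BB ending just before *II and moves *II and all following
+  /// instructions into the returned block.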
+ /// + /// Return the new basic block that starts with the instruction + /// at the split point. + BinaryBasicBlock *splitAt(iterator II); + + /// Sets address of the basic block in the output. + void setOutputStartAddress(uint64_t Address) { + OutputAddressRange.first = Address; + } + + /// Sets address past the end of the basic block in the output. + void setOutputEndAddress(uint64_t Address) { + OutputAddressRange.second = Address; + } + + /// Gets the memory address range of this BB in the input binary. + std::pair getInputAddressRange() const { + return InputRange; + } + + /// Gets the memory address range of this BB in the output binary. + std::pair getOutputAddressRange() const { + return OutputAddressRange; + } + + /// Update addresses of special instructions inside this basic block. + void updateOutputValues(const MCAsmLayout &Layout); + + /// Return mapping of input offsets to symbols in the output. + LocSymsTy &getLocSyms() { + return LocSyms ? *LocSyms : *(LocSyms = std::make_unique()); + } + + /// Return mapping of input offsets to symbols in the output. + const LocSymsTy &getLocSyms() const { + return const_cast(this)->getLocSyms(); + } + + /// Return offset translation table for the basic block. + OffsetTranslationTableTy &getOffsetTranslationTable() { + return OffsetTranslationTable ? + *OffsetTranslationTable : + *(OffsetTranslationTable = std::make_unique()); + } + + /// Return offset translation table for the basic block. + const OffsetTranslationTableTy &getOffsetTranslationTable() const { + return const_cast(this)->getOffsetTranslationTable(); + } + + /// Return size of the basic block in the output binary. + uint64_t getOutputSize() const { + return OutputAddressRange.second - OutputAddressRange.first; + } + + BinaryFunction *getFunction() const { + return Function; + } + + /// Analyze and interpret the terminators of this basic block. TBB must be + /// initialized with the original fall-through for this BB. + bool analyzeBranch(const MCSymbol *&TBB, + const MCSymbol *&FBB, + MCInst *&CondBranch, + MCInst *&UncondBranch); + + /// Return true if iterator \p I is pointing to the first instruction in + /// a pair that could be macro-fused. + bool isMacroOpFusionPair(const_iterator I) const; + + /// If the basic block has a pair of instructions suitable for macro-fusion, + /// return iterator to the first instruction of the pair. + /// Otherwise return end(). + const_iterator getMacroOpFusionPair() const; + + /// Printer required for printing dominator trees. + void printAsOperand(raw_ostream &OS, bool PrintType = true) { + if (PrintType) { + OS << "basic block "; + } + OS << getName(); + } + + /// A simple dump function for debugging. + void dump() const; + + /// Validate successor invariants for this BB. + bool validateSuccessorInvariants(); + + /// Return offset of the basic block from the function start on input. + uint32_t getInputOffset() const { + return InputRange.first; + } + + /// Return offset from the function start to location immediately past + /// the end of the basic block. + uint32_t getEndOffset() const { + return InputRange.second; + } + + /// Return size of the basic block on input. + uint32_t getOriginalSize() const { + return InputRange.second - InputRange.first; + } + + /// Returns an estimate of size of basic block during run time optionally + /// using a user-supplied emitter for lock-free multi-thread work. + /// MCCodeEmitter is not thread safe and each thread should operate with its + /// own copy of it. 
+ uint64_t estimateSize(const MCCodeEmitter *Emitter = nullptr) const; + + /// Return index in the current layout. The user is responsible for + /// making sure the indices are up to date, + /// e.g. by calling BinaryFunction::updateLayoutIndices(); + unsigned getLayoutIndex() const { + assert(isValid()); + return LayoutIndex; + } + + /// Set layout index. To be used by BinaryFunction. + void setLayoutIndex(unsigned Index) const { + LayoutIndex = Index; + } + + /// Needed by graph traits. + BinaryFunction *getParent() const { + return getFunction(); + } + + /// Return true if the containing function is in CFG state. + bool hasCFG() const; + + /// Return true if the containing function is in a state with instructions. + bool hasInstructions() const; + + /// Return offset of the basic block from the function start. + uint32_t getOffset() const { + return InputRange.first; + } + + /// Get the index of this basic block. + unsigned getIndex() const { + assert(isValid()); + return Index; + } + +private: + void adjustNumPseudos(const MCInst &Inst, int Sign); + + template + void adjustNumPseudos(Itr Begin, Itr End, int Sign) { + while (Begin != End) { + adjustNumPseudos(*Begin++, Sign); + } + } + + /// Adds predecessor to the BB. Most likely you don't need to call this. + void addPredecessor(BinaryBasicBlock *Pred); + + /// Remove predecessor of the basic block. Don't use directly, instead + /// use removeSuccessor() function. + /// If \p Multiple is set to true, it will remove all predecessors that + /// are equal to \p Pred. Otherwise, the first instance of \p Pred found + /// will be removed. This only matters in awkward, redundant CFGs. + void removePredecessor(BinaryBasicBlock *Pred, bool Multiple=true); + + /// Set end offset of this basic block. + void setEndOffset(uint32_t Offset) { + InputRange.second = Offset; + } + + /// Set the index of this basic block. + void setIndex(unsigned I) { + Index = I; + } + + template void clearList(T& List) { + T TempList; + TempList.swap(List); + } + + /// Release memory taken by CFG edges and instructions. + void releaseCFG() { + clearList(Predecessors); + clearList(Successors); + clearList(Throwers); + clearList(LandingPads); + clearList(BranchInfo); + clearList(Instructions); + } +}; + +/// Keep the size of the BinaryBasicBlock within a reasonable size class. 
+static_assert(sizeof(BinaryBasicBlock) <= 256, "");
+
+bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS);
+
+} // namespace bolt
+
+
+// GraphTraits specializations for basic block graphs (CFGs)
+template <> struct GraphTraits<bolt::BinaryBasicBlock *> {
+  using NodeRef = bolt::BinaryBasicBlock *;
+  using ChildIteratorType = bolt::BinaryBasicBlock::succ_iterator;
+
+  static NodeRef getEntryNode(bolt::BinaryBasicBlock *BB) { return BB; }
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->succ_begin();
+  }
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->succ_end();
+  }
+};
+
+template <> struct GraphTraits<const bolt::BinaryBasicBlock *> {
+  using NodeRef = const bolt::BinaryBasicBlock *;
+  using ChildIteratorType = bolt::BinaryBasicBlock::const_succ_iterator;
+
+  static NodeRef getEntryNode(const bolt::BinaryBasicBlock *BB) {
+    return BB;
+  }
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->succ_begin();
+  }
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->succ_end();
+  }
+};
+
+template <> struct GraphTraits<Inverse<bolt::BinaryBasicBlock *>> {
+  using NodeRef = bolt::BinaryBasicBlock *;
+  using ChildIteratorType = bolt::BinaryBasicBlock::pred_iterator;
+  static NodeRef getEntryNode(Inverse<bolt::BinaryBasicBlock *> G) {
+    return G.Graph;
+  }
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->pred_begin();
+  }
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->pred_end();
+  }
+};
+
+template <> struct GraphTraits<Inverse<const bolt::BinaryBasicBlock *>> {
+  using NodeRef = const bolt::BinaryBasicBlock *;
+  using ChildIteratorType = bolt::BinaryBasicBlock::const_pred_iterator;
+  static NodeRef getEntryNode(Inverse<const bolt::BinaryBasicBlock *> G) {
+    return G.Graph;
+  }
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->pred_begin();
+  }
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->pred_end();
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp
new file mode 100644
index 000000000000..15f756b8ddaf
--- /dev/null
+++ b/bolt/src/BinaryContext.cpp
@@ -0,0 +1,2179 @@
+//===--- BinaryContext.cpp - Interface for machine-level context ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryContext.h"
+#include "BinaryEmitter.h"
+#include "BinaryFunction.h"
+#include "NameResolver.h"
+#include "Utils.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Regex.h"
+#include
+#include
+
+using namespace llvm;
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+namespace opts {
+
+extern cl::OptionCategory BoltCategory;
+
+extern cl::opt<bool> AggregateOnly;
+extern cl::opt<bool> HotText;
+extern cl::opt<bool> HotData;
+extern cl::opt<bool> StrictMode;
+extern cl::opt<bool> UseOldText;
+extern cl::opt<unsigned> Verbosity;
+extern cl::opt<uint64_t> ExecutionCountThreshold;
+
+extern bool processAllFunctions();
+
+cl::opt<bool>
+NoHugePages("no-huge-pages",
+  cl::desc("use regular size pages for code alignment"),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+PrintDebugInfo("print-debug-info",
+  cl::desc("print debug info when printing functions"),
+  cl::Hidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+
+cl::opt<bool>
+PrintRelocations("print-relocations",
+  cl::desc("print relocations when printing functions/objects"),
+  cl::Hidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+PrintMemData("print-mem-data",
+  cl::desc("print memory data annotations when printing functions"),
+  cl::Hidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+BinaryContext::BinaryContext(std::unique_ptr<MCContext> Ctx,
+                             std::unique_ptr<DWARFContext> DwCtx,
+                             std::unique_ptr<Triple> TheTriple,
+                             const Target *TheTarget,
+                             std::string TripleName,
+                             std::unique_ptr<MCCodeEmitter> MCE,
+                             std::unique_ptr<MCObjectFileInfo> MOFI,
+                             std::unique_ptr<MCAsmInfo> AsmInfo,
+                             std::unique_ptr<MCInstrInfo> MII,
+                             std::unique_ptr<MCSubtargetInfo> STI,
+                             std::unique_ptr<MCInstPrinter> InstPrinter,
+                             std::unique_ptr<MCInstrAnalysis> MIA,
+                             std::unique_ptr<MCPlusBuilder> MIB,
+                             std::unique_ptr<MCRegisterInfo> MRI,
+                             std::unique_ptr<MCDisassembler> DisAsm)
+    : Ctx(std::move(Ctx)),
+      DwCtx(std::move(DwCtx)),
+      TheTriple(std::move(TheTriple)),
+      TheTarget(TheTarget),
+      TripleName(TripleName),
+      MCE(std::move(MCE)),
+      MOFI(std::move(MOFI)),
+      AsmInfo(std::move(AsmInfo)),
+      MII(std::move(MII)),
+      STI(std::move(STI)),
+      InstPrinter(std::move(InstPrinter)),
+      MIA(std::move(MIA)),
+      MIB(std::move(MIB)),
+      MRI(std::move(MRI)),
+      DisAsm(std::move(DisAsm)) {
+  Relocation::Arch = this->TheTriple->getArch();
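+  // Unless -no-huge-pages is given, align code at huge-page boundaries so
+  // the hot text can be backed by 2MiB pages, reducing iTLB pressure.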
+
+BinaryContext::~BinaryContext() {
+  for (BinarySection *Section : Sections) {
+    delete Section;
+  }
+  for (BinaryFunction *InjectedFunction : InjectedBinaryFunctions) {
+    delete InjectedFunction;
+  }
+  for (std::pair<const uint64_t, JumpTable *> JTI : JumpTables) {
+    delete JTI.second;
+  }
+  clearBinaryData();
+}
+
+extern MCPlusBuilder *createX86MCPlusBuilder(const MCInstrAnalysis *,
+                                             const MCInstrInfo *,
+                                             const MCRegisterInfo *);
+extern MCPlusBuilder *createAArch64MCPlusBuilder(const MCInstrAnalysis *,
+                                                 const MCInstrInfo *,
+                                                 const MCRegisterInfo *);
+
+namespace {
+
+MCPlusBuilder *createMCPlusBuilder(const Triple::ArchType Arch,
+                                   const MCInstrAnalysis *Analysis,
+                                   const MCInstrInfo *Info,
+                                   const MCRegisterInfo *RegInfo) {
+#ifdef X86_AVAILABLE
+  if (Arch == Triple::x86_64)
+    return createX86MCPlusBuilder(Analysis, Info, RegInfo);
+#endif
+
+#ifdef AARCH64_AVAILABLE
+  if (Arch == Triple::aarch64)
+    return createAArch64MCPlusBuilder(Analysis, Info, RegInfo);
+#endif
+
+  llvm_unreachable("architecture unsupported by MCPlusBuilder");
+}
+
+} // anonymous namespace
+
+/// Create BinaryContext matching the architecture and triple of the object
+/// file \p File.
+std::unique_ptr<BinaryContext>
+BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC,
+                                   std::unique_ptr<DWARFContext> DwCtx) {
+  StringRef ArchName = "";
+  StringRef FeaturesStr = "";
+  switch (File->getArch()) {
+  case llvm::Triple::x86_64:
+    ArchName = "x86-64";
+    FeaturesStr = "+nopl";
+    break;
+  case llvm::Triple::aarch64:
+    ArchName = "aarch64";
+    FeaturesStr = "+fp-armv8,+neon,+crypto,+dotprod,+crc,+lse,+ras,+rdm,"
+                  "+fullfp16,+spe,+fuse-aes,+rcpc";
+    break;
+  default:
+    errs() << "BOLT-ERROR: Unrecognized machine in ELF file.\n";
+    return nullptr;
+  }
+
+  auto TheTriple = std::make_unique<Triple>(File->makeTriple());
+  const std::string TripleName = TheTriple->str();
+
+  std::string Error;
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget(std::string(ArchName), *TheTriple, Error);
+  if (!TheTarget) {
+    errs() << "BOLT-ERROR: " << Error;
+    return nullptr;
+  }
+
+  std::unique_ptr<const MCRegisterInfo> MRI(
+      TheTarget->createMCRegInfo(TripleName));
+  if (!MRI) {
+    errs() << "BOLT-ERROR: no register info for target " << TripleName << "\n";
+    return nullptr;
+  }
+
+  // Set up disassembler.
+  std::unique_ptr<const MCAsmInfo> AsmInfo(
+      TheTarget->createMCAsmInfo(*MRI, TripleName, MCTargetOptions()));
+  if (!AsmInfo) {
+    errs() << "BOLT-ERROR: no assembly info for target " << TripleName << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<const MCSubtargetInfo> STI(
+      TheTarget->createMCSubtargetInfo(TripleName, "", FeaturesStr));
+  if (!STI) {
+    errs() << "BOLT-ERROR: no subtarget info for target " << TripleName << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<const MCInstrInfo> MII(TheTarget->createMCInstrInfo());
+  if (!MII) {
+    errs() << "BOLT-ERROR: no instruction info for target " << TripleName
+           << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<MCContext> Ctx(
+      new MCContext(*TheTriple, AsmInfo.get(), MRI.get(), STI.get()));
+  std::unique_ptr<MCObjectFileInfo> MOFI(
+      TheTarget->createMCObjectFileInfo(*Ctx, IsPIC));
+  Ctx->setObjectFileInfo(MOFI.get());
+  // We do not support the X86 large code model. Change this in the future.
+  bool Large = false;
+  if (TheTriple->getArch() == llvm::Triple::aarch64)
+    Large = true;
+  unsigned LSDAEncoding =
+      Large ? dwarf::DW_EH_PE_absptr : dwarf::DW_EH_PE_udata4;
+  unsigned TTypeEncoding =
+      Large ? dwarf::DW_EH_PE_absptr : dwarf::DW_EH_PE_udata4;
+  if (IsPIC) {
+    LSDAEncoding = dwarf::DW_EH_PE_pcrel |
+                   (Large ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4);
+    TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+                    (Large ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4);
+  }
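The DWARF exception-header encodings combined above are single bytes: the low nibble selects the value format and the high bits the application mode. A small sanity check of the two PIC combinations, using the constants from llvm/BinaryFormat/Dwarf.h:

```cpp
#include "llvm/BinaryFormat/Dwarf.h"
#include <cassert>

void ehEncodingDemo() {
  using namespace llvm;
  // Small code model, PIC: 4-byte signed value, PC-relative.
  assert((dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4) == 0x1B);
  // Large code model, PIC: 8-byte signed value, PC-relative, read
  // through an extra indirection.
  assert((dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
          dwarf::DW_EH_PE_sdata8) == 0x9C);
}
```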
+
+  std::unique_ptr<MCDisassembler> DisAsm(
+      TheTarget->createMCDisassembler(*STI, *Ctx));
+
+  if (!DisAsm) {
+    errs() << "BOLT-ERROR: no disassembler for target " << TripleName << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<const MCInstrAnalysis> MIA(
+      TheTarget->createMCInstrAnalysis(MII.get()));
+  if (!MIA) {
+    errs() << "BOLT-ERROR: failed to create instruction analysis for target "
+           << TripleName << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<MCPlusBuilder> MIB(createMCPlusBuilder(
+      TheTriple->getArch(), MIA.get(), MII.get(), MRI.get()));
+  if (!MIB) {
+    errs() << "BOLT-ERROR: failed to create instruction builder for target "
+           << TripleName << "\n";
+    return nullptr;
+  }
+
+  int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
+  std::unique_ptr<MCInstPrinter> InstructionPrinter(
+      TheTarget->createMCInstPrinter(*TheTriple, AsmPrinterVariant, *AsmInfo,
+                                     *MII, *MRI));
+  if (!InstructionPrinter) {
+    errs() << "BOLT-ERROR: no instruction printer for target " << TripleName
+           << '\n';
+    return nullptr;
+  }
+  InstructionPrinter->setPrintImmHex(true);
+
+  std::unique_ptr<MCCodeEmitter> MCE(
+      TheTarget->createMCCodeEmitter(*MII, *MRI, *Ctx));
+
+  // Make sure we don't miss any output on core dumps.
+  outs().SetUnbuffered();
+  errs().SetUnbuffered();
+  dbgs().SetUnbuffered();
+
+  auto BC = std::make_unique<BinaryContext>(
+      std::move(Ctx), std::move(DwCtx), std::move(TheTriple), TheTarget,
+      std::string(TripleName), std::move(MCE), std::move(MOFI),
+      std::move(AsmInfo), std::move(MII), std::move(STI),
+      std::move(InstructionPrinter), std::move(MIA), std::move(MIB),
+      std::move(MRI), std::move(DisAsm));
+
+  BC->TTypeEncoding = TTypeEncoding;
+  BC->LSDAEncoding = LSDAEncoding;
+
+  BC->MAB = std::unique_ptr<MCAsmBackend>(
+      BC->TheTarget->createMCAsmBackend(*BC->STI, *BC->MRI, MCTargetOptions()));
+
+  BC->setFilename(File->getFileName());
+
+  BC->HasFixedLoadAddress = !IsPIC;
+
+  return BC;
+}
+
+bool BinaryContext::forceSymbolRelocations(StringRef SymbolName) const {
+  if (opts::HotText && (SymbolName == "__hot_start" ||
+                        SymbolName == "__hot_end"))
+    return true;
+
+  if (opts::HotData && (SymbolName == "__hot_data_start" ||
+                        SymbolName == "__hot_data_end"))
+    return true;
+
+  if (SymbolName == "_end")
+    return true;
+
+  return false;
+}
+
+std::unique_ptr<MCObjectWriter>
+BinaryContext::createObjectWriter(raw_pwrite_stream &OS) {
+  return MAB->createObjectWriter(OS);
+}
+
+bool BinaryContext::validateObjectNesting() const {
+  auto Itr = BinaryDataMap.begin();
+  auto End = BinaryDataMap.end();
+  bool Valid = true;
+  while (Itr != End) {
+    auto Next = std::next(Itr);
+    while (Next != End &&
+           Itr->second->getSection() == Next->second->getSection() &&
+           Itr->second->containsRange(Next->second->getAddress(),
+                                      Next->second->getSize())) {
+      if (Next->second->Parent != Itr->second) {
+        errs() << "BOLT-WARNING: object nesting incorrect for:\n"
+               << "BOLT-WARNING:  " << *Itr->second << "\n"
+               << "BOLT-WARNING:  " << *Next->second << "\n";
+        Valid = false;
+      }
+      ++Next;
+    }
+    Itr = Next;
+  }
+  return Valid;
+}
+
+bool BinaryContext::validateHoles() const {
+  bool Valid = true;
+  for (BinarySection &Section : sections()) {
+    for (const Relocation &Rel : Section.relocations()) {
+      uint64_t RelAddr = Rel.Offset + Section.getAddress();
+      const BinaryData *BD = getBinaryDataContainingAddress(RelAddr);
+      if (!BD) {
+        errs() << "BOLT-WARNING: no BinaryData found for relocation at address"
+               << " 0x" << Twine::utohexstr(RelAddr) << " in "
+
<< Section.getName() << "\n"; + Valid = false; + } else if (!BD->getAtomicRoot()) { + errs() << "BOLT-WARNING: no atomic BinaryData found for relocation at " + << "address 0x" << Twine::utohexstr(RelAddr) << " in " + << Section.getName() << "\n"; + Valid = false; + } + } + } + return Valid; +} + +void BinaryContext::updateObjectNesting(BinaryDataMapType::iterator GAI) { + const uint64_t Address = GAI->second->getAddress(); + const uint64_t Size = GAI->second->getSize(); + + auto fixParents = + [&](BinaryDataMapType::iterator Itr, BinaryData *NewParent) { + BinaryData *OldParent = Itr->second->Parent; + Itr->second->Parent = NewParent; + ++Itr; + while (Itr != BinaryDataMap.end() && OldParent && + Itr->second->Parent == OldParent) { + Itr->second->Parent = NewParent; + ++Itr; + } + }; + + // Check if the previous symbol contains the newly added symbol. + if (GAI != BinaryDataMap.begin()) { + BinaryData *Prev = std::prev(GAI)->second; + while (Prev) { + if (Prev->getSection() == GAI->second->getSection() && + Prev->containsRange(Address, Size)) { + fixParents(GAI, Prev); + } else { + fixParents(GAI, nullptr); + } + Prev = Prev->Parent; + } + } + + // Check if the newly added symbol contains any subsequent symbols. + if (Size != 0) { + BinaryData *BD = GAI->second->Parent ? GAI->second->Parent : GAI->second; + auto Itr = std::next(GAI); + while (Itr != BinaryDataMap.end() && + BD->containsRange(Itr->second->getAddress(), + Itr->second->getSize())) { + Itr->second->Parent = BD; + ++Itr; + } + } +} + +iterator_range +BinaryContext::getSubBinaryData(BinaryData *BD) { + auto Start = std::next(BinaryDataMap.find(BD->getAddress())); + auto End = Start; + while (End != BinaryDataMap.end() && + BD->isAncestorOf(End->second)) { + ++End; + } + return make_range(Start, End); +} + +std::pair +BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF, + bool IsPCRel) { + uint64_t Addend = 0; + + if (isAArch64()) { + // Check if this is an access to a constant island and create bookkeeping + // to keep track of it and emit it later as part of this function. + if (MCSymbol *IslandSym = BF.getOrCreateIslandAccess(Address)) + return std::make_pair(IslandSym, Addend); + + // Detect custom code written in assembly that refers to arbitrary + // constant islands from other functions. Write this reference so we + // can pull this constant island and emit it as part of this function + // too. + auto IslandIter = AddressToConstantIslandMap.lower_bound(Address); + if (IslandIter != AddressToConstantIslandMap.end()) { + if (MCSymbol *IslandSym = + IslandIter->second->getOrCreateProxyIslandAccess(Address, BF)) { + /// Make this function depend on IslandIter->second because we have + /// a reference to its constant island. When emitting this function, + /// we will also emit IslandIter->second's constants. This only + /// happens in custom AArch64 assembly code. + BF.Islands.Dependency.insert(IslandIter->second); + BF.Islands.ProxySymbols[IslandSym] = IslandIter->second; + return std::make_pair(IslandSym, Addend); + } + } + } + + // Note that the address does not necessarily have to reside inside + // a section, it could be an absolute address too. + ErrorOr Section = getSectionForAddress(Address); + if (Section && Section->isText()) { + if (BF.containsAddress(Address, /*UseMaxSize=*/ isAArch64())) { + if (Address != BF.getAddress()) { + // The address could potentially escape. Mark it as another entry + // point into the function. 
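A concrete way such an internal address escapes is a GNU computed goto, where the address of a label inside a function body is stored in a data table; BOLT must then treat that label as a secondary entry point. An illustrative source fragment (not from this patch):

```cpp
// &&label (a GNU extension supported by GCC and Clang) takes the address
// of a point *inside* dispatch(); the table below holds text addresses
// that are not a function start, exactly the "escaped address" case
// handled above via addEntryPointAtOffset().
void dispatch(int Op) {
  static void *Targets[] = {&&handle_add, &&handle_sub};
  goto *Targets[Op & 1];
handle_add:
  return;
handle_sub:
  return;
}
```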
+ if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: potentially escaped address 0x" + << Twine::utohexstr(Address) << " in function " + << BF << '\n'; + } + BF.HasInternalLabelReference = true; + return std::make_pair( + BF.addEntryPointAtOffset(Address - BF.getAddress()), + Addend); + } + } else { + BF.InterproceduralReferences.insert(Address); + } + } + + // With relocations, catch jump table references outside of the basic block + // containing the indirect jump. + if (HasRelocations) { + const MemoryContentsType MemType = analyzeMemoryAt(Address, BF); + if (MemType == MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE && IsPCRel) { + const MCSymbol *Symbol = + getOrCreateJumpTable(BF, Address, JumpTable::JTT_PIC); + + return std::make_pair(Symbol, Addend); + } + } + + if (BinaryData *BD = getBinaryDataContainingAddress(Address)) { + return std::make_pair(BD->getSymbol(), Address - BD->getAddress()); + } + + // TODO: use DWARF info to get size/alignment here? + MCSymbol *TargetSymbol = getOrCreateGlobalSymbol(Address, "DATAat"); + LLVM_DEBUG(dbgs() << "Created symbol " << TargetSymbol->getName() << '\n'); + return std::make_pair(TargetSymbol, Addend); +} + +MemoryContentsType +BinaryContext::analyzeMemoryAt(uint64_t Address, BinaryFunction &BF) { + if (!isX86()) + return MemoryContentsType::UNKNOWN; + + ErrorOr Section = getSectionForAddress(Address); + if (!Section) { + // No section - possibly an absolute address. Since we don't allow + // internal function addresses to escape the function scope - we + // consider it a tail call. + if (opts::Verbosity > 1) { + errs() << "BOLT-WARNING: no section for address 0x" + << Twine::utohexstr(Address) << " referenced from function " + << BF << '\n'; + } + return MemoryContentsType::UNKNOWN; + } + + if (Section->isVirtual()) { + // The contents are filled at runtime. + return MemoryContentsType::UNKNOWN; + } + + // No support for jump tables in code yet. + if (Section->isText()) + return MemoryContentsType::UNKNOWN; + + // Start with checking for PIC jump table. We expect non-PIC jump tables + // to have high 32 bits set to 0. + if (analyzeJumpTable(Address, JumpTable::JTT_PIC, BF)) + return MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE; + + if (analyzeJumpTable(Address, JumpTable::JTT_NORMAL, BF)) + return MemoryContentsType::POSSIBLE_JUMP_TABLE; + + return MemoryContentsType::UNKNOWN; +} + +bool BinaryContext::analyzeJumpTable(const uint64_t Address, + const JumpTable::JumpTableType Type, + BinaryFunction &BF, + const uint64_t NextJTAddress, + JumpTable::OffsetsType *Offsets) { + // Is one of the targets __builtin_unreachable? + bool HasUnreachable = false; + + // Number of targets other than __builtin_unreachable. + uint64_t NumRealEntries = 0; + + constexpr uint64_t INVALID_OFFSET = std::numeric_limits::max(); + auto addOffset = [&](uint64_t Offset) { + if (Offsets) + Offsets->emplace_back(Offset); + }; + + auto isFragment = [](BinaryFunction &Fragment, + BinaryFunction &Parent) -> bool { + // Check if == .cold(.\d+)? + for (StringRef BFName : Parent.getNames()) { + std::string BFNamePrefix = Regex::escape(NameResolver::restore(BFName)); + std::string BFNameRegex = + Twine(BFNamePrefix, "\\.cold(\\.[0-9]+)?").str(); + if (Fragment.hasRestoredNameRegex(BFNameRegex)) + return true; + } + return false; + }; + + auto doesBelongToFunction = [&](const uint64_t Addr, + BinaryFunction *TargetBF) -> bool { + if (BF.containsAddress(Addr)) + return true; + // Nothing to do if we failed to identify the containing function. 
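Before moving on, the name matching performed by the isFragment lambda above is easiest to see on an example. For a hypothetical parent function named process_request, the constructed pattern is process_request\.cold(\.[0-9]+)?:

```cpp
#include "llvm/Support/Regex.h"
#include <cassert>

void fragmentNameDemo() {
  // Same shape as the pattern isFragment builds; the function name is
  // illustrative, and Regex::escape() would have escaped any special
  // characters in it first.
  llvm::Regex ColdRE("process_request\\.cold(\\.[0-9]+)?");
  assert(ColdRE.match("process_request.cold"));    // unnumbered fragment
  assert(ColdRE.match("process_request.cold.42")); // numbered fragment
}
```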
+ if (!TargetBF) + return false; + // Case 1: check if BF is a fragment and TargetBF is its parent. + if (BF.isFragment()) { + // BF is a fragment, but parent function is not registered. + // This means there's no direct jump between parent and fragment. + // Set parent link here in jump table analysis, based on function name + // matching heuristic. + if (!BF.getParentFragment() && isFragment(BF, *TargetBF)) + registerFragment(BF, *TargetBF); + return BF.getParentFragment() == TargetBF; + } + // Case 2: check if TargetBF is a fragment and BF is its parent. + if (TargetBF->isFragment()) { + // TargetBF is a fragment, but parent function is not registered. + // This means there's no direct jump between parent and fragment. + // Set parent link here in jump table analysis, based on function name + // matching heuristic. + if (!TargetBF->getParentFragment() && isFragment(*TargetBF, BF)) + registerFragment(*TargetBF, BF); + return TargetBF->getParentFragment() == &BF; + } + return false; + }; + + ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return false; + + // The upper bound is defined by containing object, section limits, and + // the next jump table in memory. + uint64_t UpperBound = Section->getEndAddress(); + const BinaryData *JumpTableBD = getBinaryDataAtAddress(Address); + if (JumpTableBD && JumpTableBD->getSize()) { + assert(JumpTableBD->getEndAddress() <= UpperBound && + "data object cannot cross a section boundary"); + UpperBound = JumpTableBD->getEndAddress(); + } + if (NextJTAddress) { + UpperBound = std::min(NextJTAddress, UpperBound); + } + + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: analyzeJumpTable in " << BF.getPrintName() + << '\n'); + const uint64_t EntrySize = getJumpTableEntrySize(Type); + for (uint64_t EntryAddress = Address; EntryAddress <= UpperBound - EntrySize; + EntryAddress += EntrySize) { + LLVM_DEBUG(dbgs() << " * Checking 0x" << Twine::utohexstr(EntryAddress) + << " -> "); + // Check if there's a proper relocation against the jump table entry. + if (HasRelocations) { + if (Type == JumpTable::JTT_PIC && + !DataPCRelocations.count(EntryAddress)) { + LLVM_DEBUG( + dbgs() << "FAIL: JTT_PIC table, no relocation for this address\n"); + break; + } + if (Type == JumpTable::JTT_NORMAL && !getRelocationAt(EntryAddress)) { + LLVM_DEBUG( + dbgs() + << "FAIL: JTT_NORMAL table, no relocation for this address\n"); + break; + } + } + + const uint64_t Value = (Type == JumpTable::JTT_PIC) + ? Address + *getSignedValueAtAddress(EntryAddress, EntrySize) + : *getPointerAtAddress(EntryAddress); + + // __builtin_unreachable() case. + if (Value == BF.getAddress() + BF.getSize()) { + addOffset(Value - BF.getAddress()); + HasUnreachable = true; + LLVM_DEBUG(dbgs() << "OK: __builtin_unreachable\n"); + continue; + } + + // Function or one of its fragments. + BinaryFunction *TargetBF = getBinaryFunctionContainingAddress(Value); + + // We assume that a jump table cannot have function start as an entry. + if (!doesBelongToFunction(Value, TargetBF) || Value == BF.getAddress()) { + LLVM_DEBUG({ + if (!BF.containsAddress(Value)) { + dbgs() << "FAIL: function doesn't contain this address\n"; + if (TargetBF) { + dbgs() << " ! function containing this address: " + << TargetBF->getPrintName() << '\n'; + if (TargetBF->isFragment()) + dbgs() << " ! is a fragment\n"; + BinaryFunction *TargetParent = TargetBF->getParentFragment(); + dbgs() << " ! its parent is " + << (TargetParent ? 
TargetParent->getPrintName() : "(none)") + << '\n'; + } + } + if (Value == BF.getAddress()) + dbgs() << "FAIL: jump table cannot have function start as an entry\n"; + }); + break; + } + + // Check there's an instruction at this offset. + if (TargetBF->getState() == BinaryFunction::State::Disassembled && + !TargetBF->getInstructionAtOffset(Value - TargetBF->getAddress())) { + LLVM_DEBUG(dbgs() << "FAIL: no instruction at this offset\n"); + break; + } + + ++NumRealEntries; + + if (TargetBF == &BF) { + // Address inside the function. + addOffset(Value - TargetBF->getAddress()); + LLVM_DEBUG(dbgs() << "OK: real entry\n"); + } else { + // Address in split fragment. + BF.setHasSplitJumpTable(true); + // Add invalid offset for proper identification of jump table size. + addOffset(INVALID_OFFSET); + LLVM_DEBUG(dbgs() << "OK: address in split fragment\n"); + } + } + + // It's a jump table if the number of real entries is more than 1, or there's + // one real entry and "unreachable" targets. If there are only multiple + // "unreachable" targets, then it's not a jump table. + return NumRealEntries + HasUnreachable >= 2; +} + +void BinaryContext::populateJumpTables() { + std::vector FuncsToSkip; + LLVM_DEBUG(dbgs() << "DataPCRelocations: " << DataPCRelocations.size() + << '\n'); + for (auto JTI = JumpTables.begin(), JTE = JumpTables.end(); JTI != JTE; + ++JTI) { + JumpTable *JT = JTI->second; + BinaryFunction &BF = *JT->Parent; + + if (!BF.isSimple()) + continue; + + uint64_t NextJTAddress = 0; + auto NextJTI = std::next(JTI); + if (NextJTI != JTE) { + NextJTAddress = NextJTI->second->getAddress(); + } + + const bool Success = analyzeJumpTable(JT->getAddress(), JT->Type, BF, + NextJTAddress, &JT->OffsetEntries); + if (!Success) { + dbgs() << "failed to analyze jump table in function " << BF << '\n'; + JT->print(dbgs()); + if (NextJTI != JTE) { + dbgs() << "next jump table at 0x" + << Twine::utohexstr(NextJTI->second->getAddress()) + << " belongs to function " << *NextJTI->second->Parent << '\n'; + NextJTI->second->print(dbgs()); + } + llvm_unreachable("jump table heuristic failure"); + } + + for (uint64_t EntryOffset : JT->OffsetEntries) { + if (EntryOffset == BF.getSize()) + BF.IgnoredBranches.emplace_back(EntryOffset, BF.getSize()); + else + BF.registerReferencedOffset(EntryOffset); + } + + // In strict mode, erase PC-relative relocation record. Later we check that + // all such records are erased and thus have been accounted for. + if (opts::StrictMode && JT->Type == JumpTable::JTT_PIC) { + for (uint64_t Address = JT->getAddress(); + Address < JT->getAddress() + JT->getSize(); + Address += JT->EntrySize) { + DataPCRelocations.erase(DataPCRelocations.find(Address)); + } + } + + // Mark to skip the function and all its fragments. + if (BF.hasSplitJumpTable()) + FuncsToSkip.push_back(&BF); + } + + if (opts::StrictMode && DataPCRelocations.size()) { + LLVM_DEBUG({ + dbgs() << DataPCRelocations.size() + << " unclaimed PC-relative relocations left in data:\n"; + for (uint64_t Reloc : DataPCRelocations) + dbgs() << Twine::utohexstr(Reloc) << '\n'; + }); + assert(0 && "unclaimed PC-relative relocations left in data\n"); + } + clearList(DataPCRelocations); + // Functions containing split jump tables need to be skipped with all + // fragments. 
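Stepping back to the entry decoding inside analyzeJumpTable: the two table flavors read memory differently, and a small model makes the difference explicit. These are hand-rolled stand-ins for getPointerAtAddress/getSignedValueAtAddress, assuming a little-endian 64-bit target:

```cpp
#include <cstdint>
#include <cstring>

// JTT_NORMAL: entries are absolute 8-byte pointers.
uint64_t decodeNormalEntry(const uint8_t *Entry) {
  uint64_t Target;
  std::memcpy(&Target, Entry, sizeof(Target));
  return Target;
}

// JTT_PIC: entries are 4-byte signed offsets added to the address of the
// jump table itself, mirroring "Address + *getSignedValueAtAddress(...)".
uint64_t decodePICEntry(uint64_t TableAddress, const uint8_t *Entry) {
  int32_t Delta;
  std::memcpy(&Delta, Entry, sizeof(Delta));
  return TableAddress + static_cast<int64_t>(Delta);
}
```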
+ for (BinaryFunction *BF : FuncsToSkip) { + BinaryFunction *ParentBF = + const_cast(BF->getTopmostFragment()); + LLVM_DEBUG(dbgs() << "Skipping " << ParentBF->getPrintName() + << " family\n"); + ParentBF->setIgnored(); + ParentBF->ignoreFragments(); + } +} + +MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, + Twine Prefix, + uint64_t Size, + uint16_t Alignment, + unsigned Flags) { + auto Itr = BinaryDataMap.find(Address); + if (Itr != BinaryDataMap.end()) { + assert(Itr->second->getSize() == Size || !Size); + return Itr->second->getSymbol(); + } + + std::string Name = (Prefix + "0x" + Twine::utohexstr(Address)).str(); + assert(!GlobalSymbols.count(Name) && "created name is not unique"); + return registerNameAtAddress(Name, Address, Size, Alignment, Flags); +} + +MCSymbol *BinaryContext::getOrCreateUndefinedGlobalSymbol(StringRef Name) { + return Ctx->getOrCreateSymbol(Name); +} + +BinaryFunction *BinaryContext::createBinaryFunction( + const std::string &Name, BinarySection &Section, uint64_t Address, + uint64_t Size, uint64_t SymbolSize, uint16_t Alignment) { + auto Result = BinaryFunctions.emplace( + Address, BinaryFunction(Name, Section, Address, Size, *this)); + assert(Result.second == true && "unexpected duplicate function"); + BinaryFunction *BF = &Result.first->second; + registerNameAtAddress(Name, Address, SymbolSize ? SymbolSize : Size, + Alignment); + setSymbolToFunctionMap(BF->getSymbol(), BF); + return BF; +} + +const MCSymbol * +BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address, + JumpTable::JumpTableType Type) { + if (JumpTable *JT = getJumpTableContainingAddress(Address)) { + assert(JT->Type == Type && "jump table types have to match"); + assert(JT->Parent == &Function && + "cannot re-use jump table of a different function"); + assert(Address == JT->getAddress() && "unexpected non-empty jump table"); + + return JT->getFirstLabel(); + } + + // Re-use the existing symbol if possible. + MCSymbol *JTLabel = nullptr; + if (BinaryData *Object = getBinaryDataAtAddress(Address)) { + if (!isInternalSymbolName(Object->getSymbol()->getName())) + JTLabel = Object->getSymbol(); + } + + const uint64_t EntrySize = getJumpTableEntrySize(Type); + if (!JTLabel) { + const std::string JumpTableName = generateJumpTableName(Function, Address); + JTLabel = registerNameAtAddress(JumpTableName, Address, 0, EntrySize); + } + + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: creating jump table " << JTLabel->getName() + << " in function " << Function << '\n'); + + JumpTable *JT = new JumpTable(*JTLabel, Address, EntrySize, Type, + JumpTable::LabelMapType{{0, JTLabel}}, Function, + *getSectionForAddress(Address)); + JumpTables.emplace(Address, JT); + + // Duplicate the entry for the parent function for easy access. 
+ Function.JumpTables.emplace(Address, JT); + + return JTLabel; +} + +std::pair +BinaryContext::duplicateJumpTable(BinaryFunction &Function, JumpTable *JT, + const MCSymbol *OldLabel) { + auto L = scopeLock(); + unsigned Offset = 0; + bool Found = false; + for (std::pair Elmt : JT->Labels) { + if (Elmt.second != OldLabel) + continue; + Offset = Elmt.first; + Found = true; + break; + } + assert(Found && "Label not found"); + MCSymbol *NewLabel = Ctx->createNamedTempSymbol("duplicatedJT"); + JumpTable *NewJT = + new JumpTable(*NewLabel, JT->getAddress(), JT->EntrySize, JT->Type, + JumpTable::LabelMapType{{Offset, NewLabel}}, Function, + *getSectionForAddress(JT->getAddress())); + NewJT->Entries = JT->Entries; + NewJT->Counts = JT->Counts; + uint64_t JumpTableID = ++DuplicatedJumpTables; + // Invert it to differentiate from regular jump tables whose IDs are their + // addresses in the input binary memory space + JumpTableID = ~JumpTableID; + JumpTables.emplace(JumpTableID, NewJT); + Function.JumpTables.emplace(JumpTableID, NewJT); + return std::make_pair(JumpTableID, NewLabel); +} + +std::string BinaryContext::generateJumpTableName(const BinaryFunction &BF, + uint64_t Address) { + size_t Id; + uint64_t Offset = 0; + if (const JumpTable *JT = BF.getJumpTableContainingAddress(Address)) { + Offset = Address - JT->getAddress(); + auto Itr = JT->Labels.find(Offset); + if (Itr != JT->Labels.end()) { + return std::string(Itr->second->getName()); + } + Id = JumpTableIds.at(JT->getAddress()); + } else { + Id = JumpTableIds[Address] = BF.JumpTables.size(); + } + return ("JUMP_TABLE/" + BF.getOneName().str() + "." + std::to_string(Id) + + (Offset ? ("." + std::to_string(Offset)) : "")); +} + +bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) { + // FIXME: aarch64 support is missing. + if (!isX86()) + return true; + + if (BF.getSize() == BF.getMaxSize()) + return true; + + ErrorOr> FunctionData = BF.getData(); + assert(FunctionData && "cannot get function as data"); + + uint64_t Offset = BF.getSize(); + MCInst Instr; + uint64_t InstrSize = 0; + uint64_t InstrAddress = BF.getAddress() + Offset; + using std::placeholders::_1; + + // Skip instructions that satisfy the predicate condition. + auto skipInstructions = [&](std::function Predicate) { + const uint64_t StartOffset = Offset; + for (; Offset < BF.getMaxSize(); + Offset += InstrSize, InstrAddress += InstrSize) { + if (!DisAsm->getInstruction(Instr, + InstrSize, + FunctionData->slice(Offset), + InstrAddress, + nulls())) + break; + if (!Predicate(Instr)) + break; + } + + return Offset - StartOffset; + }; + + // Skip a sequence of zero bytes. + auto skipZeros = [&]() { + const uint64_t StartOffset = Offset; + for (; Offset < BF.getMaxSize(); ++Offset) + if ((*FunctionData)[Offset] != 0) + break; + + return Offset - StartOffset; + }; + + // Accept the whole padding area filled with breakpoints. + auto isBreakpoint = std::bind(&MCPlusBuilder::isBreakpoint, MIB.get(), _1); + if (skipInstructions(isBreakpoint) && Offset == BF.getMaxSize()) + return true; + + auto isNoop = std::bind(&MCPlusBuilder::isNoop, MIB.get(), _1); + + // Some functions have a jump to the next function or to the padding area + // inserted after the body. 
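The isSkipJump predicate defined just below accepts those jumps. The remaining padding content reduces to simple byte patterns; a simplified model of what the scan tolerates on x86, where a breakpoint is the one-byte int3 (0xCC) and skipZeros eats zero fill (the real code additionally decodes nops and jumps with the disassembler):

```cpp
#include <cstddef>
#include <cstdint>

// Accept a padding tail made only of int3 bytes and zero bytes.
bool isTrivialPadding(const uint8_t *Bytes, size_t Size) {
  for (size_t I = 0; I < Size; ++I)
    if (Bytes[I] != 0xCC && Bytes[I] != 0x00)
      return false;
  return true;
}
```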
+ auto isSkipJump = [&](const MCInst &Instr) { + uint64_t TargetAddress = 0; + if (MIB->isUnconditionalBranch(Instr) && + MIB->evaluateBranch(Instr, InstrAddress, InstrSize, TargetAddress)) { + if (TargetAddress >= InstrAddress + InstrSize && + TargetAddress <= BF.getAddress() + BF.getMaxSize()) { + return true; + } + } + return false; + }; + + // Skip over nops, jumps, and zero padding. Allow interleaving (this happens). + while (skipInstructions(isNoop) || + skipInstructions(isSkipJump) || + skipZeros()) + ; + + if (Offset == BF.getMaxSize()) + return true; + + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: bad padding at address 0x" + << Twine::utohexstr(BF.getAddress() + BF.getSize()) + << " starting at offset " + << (Offset - BF.getSize()) << " in function " + << BF << '\n' + << FunctionData->slice(BF.getSize(), BF.getMaxSize() - BF.getSize()) + << '\n'; + } + + return false; +} + +void BinaryContext::adjustCodePadding() { + for (auto &BFI : BinaryFunctions) { + BinaryFunction &BF = BFI.second; + if (!shouldEmit(BF)) + continue; + + if (!hasValidCodePadding(BF)) { + if (HasRelocations) { + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: function " << BF + << " has invalid padding. Ignoring the function.\n"; + } + BF.setIgnored(); + } else { + BF.setMaxSize(BF.getSize()); + } + } + } +} + +MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, + uint64_t Address, + uint64_t Size, + uint16_t Alignment, + unsigned Flags) { + // Register the name with MCContext. + MCSymbol *Symbol = Ctx->getOrCreateSymbol(Name); + + auto GAI = BinaryDataMap.find(Address); + BinaryData *BD; + if (GAI == BinaryDataMap.end()) { + ErrorOr SectionOrErr = getSectionForAddress(Address); + BinarySection &Section = + SectionOrErr ? SectionOrErr.get() : absoluteSection(); + BD = new BinaryData(*Symbol, + Address, + Size, + Alignment ? Alignment : 1, + Section, + Flags); + GAI = BinaryDataMap.emplace(Address, BD).first; + GlobalSymbols[Name] = BD; + updateObjectNesting(GAI); + } else { + BD = GAI->second; + if (!BD->hasName(Name)) { + GlobalSymbols[Name] = BD; + BD->Symbols.push_back(Symbol); + } + } + + return Symbol; +} + +const BinaryData * +BinaryContext::getBinaryDataContainingAddressImpl(uint64_t Address) const { + auto NI = BinaryDataMap.lower_bound(Address); + auto End = BinaryDataMap.end(); + if ((NI != End && Address == NI->first) || + ((NI != BinaryDataMap.begin()) && (NI-- != BinaryDataMap.begin()))) { + if (NI->second->containsAddress(Address)) { + return NI->second; + } + + // If this is a sub-symbol, see if a parent data contains the address. + const BinaryData *BD = NI->second->getParent(); + while (BD) { + if (BD->containsAddress(Address)) + return BD; + BD = BD->getParent(); + } + } + return nullptr; +} + +bool BinaryContext::setBinaryDataSize(uint64_t Address, uint64_t Size) { + auto NI = BinaryDataMap.find(Address); + assert(NI != BinaryDataMap.end()); + if (NI == BinaryDataMap.end()) + return false; + // TODO: it's possible that a jump table starts at the same address + // as a larger blob of private data. When we set the size of the + // jump table, it might be smaller than the total blob size. In this + // case we just leave the original size since (currently) it won't really + // affect anything. See T26915981. 
+ assert((!NI->second->Size || NI->second->Size == Size || + (NI->second->isJumpTable() && NI->second->Size > Size)) && + "can't change the size of a symbol that has already had its " + "size set"); + if (!NI->second->Size) { + NI->second->Size = Size; + updateObjectNesting(NI); + return true; + } + return false; +} + +void BinaryContext::generateSymbolHashes() { + auto isPadding = [](const BinaryData &BD) { + StringRef Contents = BD.getSection().getContents(); + StringRef SymData = Contents.substr(BD.getOffset(), BD.getSize()); + return (BD.getName().startswith("HOLEat") || + SymData.find_first_not_of(0) == StringRef::npos); + }; + + uint64_t NumCollisions = 0; + for (auto &Entry : BinaryDataMap) { + BinaryData &BD = *Entry.second; + StringRef Name = BD.getName(); + + if (!isInternalSymbolName(Name)) + continue; + + // First check if a non-anonymous alias exists and move it to the front. + if (BD.getSymbols().size() > 1) { + auto Itr = std::find_if(BD.getSymbols().begin(), + BD.getSymbols().end(), + [&](const MCSymbol *Symbol) { + return !isInternalSymbolName(Symbol->getName()); + }); + if (Itr != BD.getSymbols().end()) { + size_t Idx = std::distance(BD.getSymbols().begin(), Itr); + std::swap(BD.getSymbols()[0], BD.getSymbols()[Idx]); + continue; + } + } + + // We have to skip 0 size symbols since they will all collide. + if (BD.getSize() == 0) { + continue; + } + + const uint64_t Hash = BD.getSection().hash(BD); + const size_t Idx = Name.find("0x"); + std::string NewName = (Twine(Name.substr(0, Idx)) + + "_" + Twine::utohexstr(Hash)).str(); + if (getBinaryDataByName(NewName)) { + // Ignore collisions for symbols that appear to be padding + // (i.e. all zeros or a "hole") + if (!isPadding(BD)) { + if (opts::Verbosity) { + errs() << "BOLT-WARNING: collision detected when hashing " << BD + << " with new name (" << NewName << "), skipping.\n"; + } + ++NumCollisions; + } + continue; + } + BD.Symbols.insert(BD.Symbols.begin(), + Ctx->getOrCreateSymbol(NewName)); + GlobalSymbols[NewName] = &BD; + } + if (NumCollisions) { + errs() << "BOLT-WARNING: " << NumCollisions + << " collisions detected while hashing binary objects"; + if (!opts::Verbosity) + errs() << ". Use -v=1 to see the list."; + errs() << '\n'; + } +} + +void BinaryContext::registerFragment(BinaryFunction &TargetFunction, + BinaryFunction &Function) const { + // Only a parent function (or a sibling) can reach its fragment. 
+ assert(!Function.IsFragment && + "only one cold fragment is supported at this time"); + if (BinaryFunction *TargetParent = TargetFunction.getParentFragment()) { + assert(TargetParent == &Function && "mismatching parent function"); + return; + } + TargetFunction.setParentFragment(Function); + Function.addFragment(TargetFunction); + if (!HasRelocations) { + TargetFunction.setSimple(false); + Function.setSimple(false); + } + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: marking " << TargetFunction + << " as a fragment of " << Function << '\n'; + } +} + +void BinaryContext::processInterproceduralReferences(BinaryFunction &Function) { + for (uint64_t Address : Function.InterproceduralReferences) { + if (!Address) + continue; + + BinaryFunction *TargetFunction = + getBinaryFunctionContainingAddress(Address); + if (&Function == TargetFunction) + continue; + + if (TargetFunction) { + if (TargetFunction->IsFragment) + registerFragment(*TargetFunction, Function); + if (uint64_t Offset = Address - TargetFunction->getAddress()) + TargetFunction->addEntryPointAtOffset(Offset); + + continue; + } + + // Check if address falls in function padding space - this could be + // unmarked data in code. In this case adjust the padding space size. + ErrorOr Section = getSectionForAddress(Address); + assert(Section && "cannot get section for referenced address"); + + if (!Section->isText()) + continue; + + // PLT requires special handling and could be ignored in this context. + StringRef SectionName = Section->getName(); + if (SectionName == ".plt" || SectionName == ".plt.got") + continue; + + if (opts::processAllFunctions()) { + errs() << "BOLT-ERROR: cannot process binaries with unmarked " + << "object in code at address 0x" + << Twine::utohexstr(Address) << " belonging to section " + << SectionName << " in current mode\n"; + exit(1); + } + + TargetFunction = + getBinaryFunctionContainingAddress(Address, + /*CheckPastEnd=*/false, + /*UseMaxSize=*/true); + // We are not going to overwrite non-simple functions, but for simple + // ones - adjust the padding size. + if (TargetFunction && TargetFunction->isSimple()) { + errs() << "BOLT-WARNING: function " << *TargetFunction + << " has an object detected in a padding region at address 0x" + << Twine::utohexstr(Address) << '\n'; + TargetFunction->setMaxSize(TargetFunction->getSize()); + } + } + + clearList(Function.InterproceduralReferences); +} + +void BinaryContext::postProcessSymbolTable() { + fixBinaryDataHoles(); + bool Valid = true; + for (auto &Entry : BinaryDataMap) { + BinaryData *BD = Entry.second; + if ((BD->getName().startswith("SYMBOLat") || + BD->getName().startswith("DATAat")) && + !BD->getParent() && + !BD->getSize() && + !BD->isAbsolute() && + BD->getSection()) { + errs() << "BOLT-WARNING: zero-sized top level symbol: " << *BD << "\n"; + Valid = false; + } + } + assert(Valid); + generateSymbolHashes(); +} + +void BinaryContext::foldFunction(BinaryFunction &ChildBF, + BinaryFunction &ParentBF) { + assert(!ChildBF.isMultiEntry() && !ParentBF.isMultiEntry() && + "cannot merge functions with multiple entry points"); + + std::unique_lock WriteCtxLock(CtxMutex, + std::defer_lock); + std::unique_lock WriteSymbolMapLock( + SymbolToFunctionMapMutex, std::defer_lock); + + const StringRef ChildName = ChildBF.getOneName(); + + // Move symbols over and update bookkeeping info. 
+  for (MCSymbol *Symbol : ChildBF.getSymbols()) {
+    ParentBF.getSymbols().push_back(Symbol);
+    WriteSymbolMapLock.lock();
+    SymbolToFunctionMap[Symbol] = &ParentBF;
+    WriteSymbolMapLock.unlock();
+    // NB: there's no need to update BinaryDataMap and GlobalSymbols.
+  }
+  ChildBF.getSymbols().clear();
+
+  // Move other names the child function is known under.
+  std::move(ChildBF.Aliases.begin(), ChildBF.Aliases.end(),
+            std::back_inserter(ParentBF.Aliases));
+  ChildBF.Aliases.clear();
+
+  if (HasRelocations) {
+    // Merge execution counts of ChildBF into those of ParentBF.
+    // Without relocations, we cannot reliably merge profiles as both functions
+    // continue to exist and either one can be executed.
+    ChildBF.mergeProfileDataInto(ParentBF);
+
+    std::shared_lock<std::shared_timed_mutex> ReadBfsLock(BinaryFunctionsMutex,
+                                                          std::defer_lock);
+    std::unique_lock<std::shared_timed_mutex> WriteBfsLock(BinaryFunctionsMutex,
+                                                           std::defer_lock);
+    // Remove ChildBF from the global set of functions in relocs mode.
+    ReadBfsLock.lock();
+    auto FI = BinaryFunctions.find(ChildBF.getAddress());
+    ReadBfsLock.unlock();
+
+    assert(FI != BinaryFunctions.end() && "function not found");
+    assert(&ChildBF == &FI->second && "function mismatch");
+
+    WriteBfsLock.lock();
+    ChildBF.clearDisasmState();
+    FI = BinaryFunctions.erase(FI);
+    WriteBfsLock.unlock();
+
+  } else {
+    // In non-relocation mode we keep the function, but rename it.
+    std::string NewName = "__ICF_" + ChildName.str();
+
+    WriteCtxLock.lock();
+    ChildBF.getSymbols().push_back(Ctx->getOrCreateSymbol(NewName));
+    WriteCtxLock.unlock();
+
+    ChildBF.setFolded(&ParentBF);
+  }
+}
+
+void BinaryContext::fixBinaryDataHoles() {
+  assert(validateObjectNesting() && "object nesting inconsistency detected");
+
+  for (BinarySection &Section : allocatableSections()) {
+    std::vector<std::pair<uint64_t, uint64_t>> Holes;
+
+    auto isNotHole = [&Section](const binary_data_iterator &Itr) {
+      BinaryData *BD = Itr->second;
+      bool isHole = (!BD->getParent() &&
+                     !BD->getSize() &&
+                     BD->isObject() &&
+                     (BD->getName().startswith("SYMBOLat0x") ||
+                      BD->getName().startswith("DATAat0x") ||
+                      BD->getName().startswith("ANONYMOUS")));
+      return !isHole && BD->getSection() == Section && !BD->getParent();
+    };
+
+    auto BDStart = BinaryDataMap.begin();
+    auto BDEnd = BinaryDataMap.end();
+    auto Itr = FilteredBinaryDataIterator(isNotHole, BDStart, BDEnd);
+    auto End = FilteredBinaryDataIterator(isNotHole, BDEnd, BDEnd);
+
+    uint64_t EndAddress = Section.getAddress();
+
+    while (Itr != End) {
+      if (Itr->second->getAddress() > EndAddress) {
+        uint64_t Gap = Itr->second->getAddress() - EndAddress;
+        Holes.emplace_back(EndAddress, Gap);
+      }
+      EndAddress = Itr->second->getEndAddress();
+      ++Itr;
+    }
+
+    if (EndAddress < Section.getEndAddress()) {
+      Holes.emplace_back(EndAddress, Section.getEndAddress() - EndAddress);
+    }
+
+    // If there is already a symbol at the start of the hole, grow that symbol
+    // to cover the rest. Otherwise, create a new symbol to cover the hole.
+    for (std::pair<uint64_t, uint64_t> &Hole : Holes) {
+      BinaryData *BD = getBinaryDataAtAddress(Hole.first);
+      if (BD) {
+        // BD->getSection() can be != Section if there are sections that
+        // overlap. In this case it is probably safe to just skip the holes
+        // since the overlapping section will not(?) have any symbols in it.
+        if (BD->getSection() == Section)
+          setBinaryDataSize(Hole.first, Hole.second);
+      } else {
+        getOrCreateGlobalSymbol(Hole.first, "HOLEat", Hole.second, 1);
+      }
+    }
+  }
+
+  assert(validateObjectNesting() && "object nesting inconsistency detected");
+  assert(validateHoles() && "top level hole detected in object map");
+}
+
+void BinaryContext::printGlobalSymbols(raw_ostream &OS) const {
+  const BinarySection *CurrentSection = nullptr;
+  bool FirstSection = true;
+
+  for (auto &Entry : BinaryDataMap) {
+    const BinaryData *BD = Entry.second;
+    const BinarySection &Section = BD->getSection();
+    if (FirstSection || Section != *CurrentSection) {
+      uint64_t Address, Size;
+      StringRef Name = Section.getName();
+      if (Section) {
+        Address = Section.getAddress();
+        Size = Section.getSize();
+      } else {
+        Address = BD->getAddress();
+        Size = BD->getSize();
+      }
+      OS << "BOLT-INFO: Section " << Name << ", "
+         << "0x" + Twine::utohexstr(Address) << ":"
+         << "0x" + Twine::utohexstr(Address + Size) << "/"
+         << Size << "\n";
+      CurrentSection = &Section;
+      FirstSection = false;
+    }
+
+    OS << "BOLT-INFO: ";
+    const BinaryData *P = BD->getParent();
+    while (P) {
+      OS << "  ";
+      P = P->getParent();
+    }
+    OS << *BD << "\n";
+  }
+}
+
+unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID,
+                                               const uint32_t SrcCUID,
+                                               unsigned FileIndex) {
+  DWARFCompileUnit *SrcUnit = DwCtx->getCompileUnitForOffset(SrcCUID);
+  const DWARFDebugLine::LineTable *LineTable =
+      DwCtx->getLineTableForUnit(SrcUnit);
+  const std::vector<DWARFDebugLine::FileNameEntry> &FileNames =
+      LineTable->Prologue.FileNames;
+  // Dir indexes start at 1, as DWARF file numbers, and a dir index 0
+  // means empty dir.
+  assert(FileIndex > 0 && FileIndex <= FileNames.size() &&
+         "FileIndex out of range for the compilation unit.");
+  StringRef Dir = "";
+  if (FileNames[FileIndex - 1].DirIdx != 0) {
+    if (Optional<const char *> DirName =
+            LineTable->Prologue
+                .IncludeDirectories[FileNames[FileIndex - 1].DirIdx - 1]
+                .getAsCString()) {
+      Dir = *DirName;
+    }
+  }
+  StringRef FileName = "";
+  if (Optional<const char *> FName =
+          FileNames[FileIndex - 1].Name.getAsCString())
+    FileName = *FName;
+  assert(FileName != "");
+  return cantFail(Ctx->getDwarfFile(Dir, FileName, 0, None, None, DestCUID));
+}
+
+std::vector<BinaryFunction *> BinaryContext::getSortedFunctions() {
+  std::vector<BinaryFunction *> SortedFunctions(BinaryFunctions.size());
+  std::transform(BinaryFunctions.begin(), BinaryFunctions.end(),
+                 SortedFunctions.begin(),
+                 [](std::pair<const uint64_t, BinaryFunction> &BFI) {
+                   return &BFI.second;
+                 });
+
+  std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(),
+                   [](const BinaryFunction *A, const BinaryFunction *B) {
+                     if (A->hasValidIndex() && B->hasValidIndex()) {
+                       return A->getIndex() < B->getIndex();
+                     }
+                     return A->hasValidIndex();
+                   });
+  return SortedFunctions;
+}
+
+std::vector<BinaryFunction *> BinaryContext::getAllBinaryFunctions() {
+  std::vector<BinaryFunction *> AllFunctions;
+  AllFunctions.reserve(BinaryFunctions.size() + InjectedBinaryFunctions.size());
+  std::transform(BinaryFunctions.begin(), BinaryFunctions.end(),
+                 std::back_inserter(AllFunctions),
+                 [](std::pair<const uint64_t, BinaryFunction> &BFI) {
+                   return &BFI.second;
+                 });
+  std::copy(InjectedBinaryFunctions.begin(), InjectedBinaryFunctions.end(),
+            std::back_inserter(AllFunctions));
+
+  return AllFunctions;
+}
+
+Optional<DWARFUnit *> BinaryContext::getDWOCU(uint64_t DWOId) {
+  auto Iter = DWOCUs.find(DWOId);
+  if (Iter == DWOCUs.end())
+    return None;
+
+  return Iter->second;
+}
+
+DWARFContext *BinaryContext::getDWOContext() {
+  if (DWOCUs.empty())
+    return nullptr;
+  return &DWOCUs.begin()->second->getContext();
+}
+
+/// Handles DWO sections that
can either be in .o, .dwo or .dwp files. +void BinaryContext::preprocessDWODebugInfo() { + for (const std::unique_ptr &CU : DwCtx->compile_units()) { + DWARFUnit *const DwarfUnit = CU.get(); + if (llvm::Optional DWOId = DwarfUnit->getDWOId()) { + DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false).getDwarfUnit(); + if (!DWOCU->isDWOUnit()) { + std::string DWOName = dwarf::toString( + DwarfUnit->getUnitDIE().find( + {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + outs() << "BOLT-WARNING: Debug Fission: DWO debug information for " + << DWOName + << " was not retrieved and won't be updated. Please check " + "relative path.\n"; + continue; + } + DWOCUs[*DWOId] = DWOCU; + } + } +} + +void BinaryContext::preprocessDebugInfo() { + struct CURange { + uint64_t LowPC; + uint64_t HighPC; + DWARFUnit *Unit; + + bool operator<(const CURange &Other) const { + return LowPC < Other.LowPC; + } + }; + + // Building a map of address ranges to CUs similar to .debug_aranges and use + // it to assign CU to functions. + std::vector AllRanges; + AllRanges.reserve(DwCtx->getNumCompileUnits()); + for (const std::unique_ptr &CU : DwCtx->compile_units()) { + Expected RangesOrError = + CU->getUnitDIE().getAddressRanges(); + if (!RangesOrError) { + consumeError(RangesOrError.takeError()); + continue; + } + for (DWARFAddressRange &Range : *RangesOrError) { + // Parts of the debug info could be invalidated due to corresponding code + // being removed from the binary by the linker. Hence we check if the + // address is a valid one. + if (containsAddress(Range.LowPC)) + AllRanges.emplace_back(CURange{Range.LowPC, Range.HighPC, CU.get()}); + } + } + + std::sort(AllRanges.begin(), AllRanges.end()); + for (auto &KV : BinaryFunctions) { + const uint64_t FunctionAddress = KV.first; + BinaryFunction &Function = KV.second; + + auto It = std::partition_point(AllRanges.begin(), AllRanges.end(), + [=](CURange R) { return R.HighPC <= FunctionAddress; }); + if (It != AllRanges.end() && It->LowPC <= FunctionAddress) { + Function.setDWARFUnit(It->Unit); + } + } + + // Populate MCContext with DWARF files from all units. + StringRef GlobalPrefix = AsmInfo->getPrivateGlobalPrefix(); + for (const std::unique_ptr &CU : DwCtx->compile_units()) { + const uint64_t CUID = CU->getOffset(); + const DWARFDebugLine::LineTable *LineTable = + DwCtx->getLineTableForUnit(CU.get()); + const std::vector &FileNames = + LineTable->Prologue.FileNames; + + // Assign a unique label to every line table, one per CU. + Ctx->getMCDwarfLineTable(CUID).setLabel( + Ctx->getOrCreateSymbol(GlobalPrefix + "line_table_start" + Twine(CUID))); + + // Make sure empty debug line tables are registered too. + if (FileNames.empty()) { + cantFail(Ctx->getDwarfFile("", "", 0, None, None, CUID)); + continue; + } + for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) { + // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 + // means empty dir. 
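A worked model of that off-by-one indexing, mirroring the lookup performed in the loop below, with hypothetical prologue contents (DWARF v4 numbering, where file and directory numbers start at 1 and DirIdx 0 means "no directory"):

```cpp
#include <string>
#include <vector>

struct FileEntryModel {
  std::string Name;
  unsigned DirIdx; // 0 = no directory
};

// E.g. with IncludeDirs = {"/src/lib", "/usr/include"} and a file entry
// {"stdio.h", DirIdx=2}, file number 2 resolves to "/usr/include/stdio.h".
std::string resolveFile(unsigned FileNumber, // 1-based, as in DWARF v4
                        const std::vector<std::string> &IncludeDirs,
                        const std::vector<FileEntryModel> &FileNames) {
  const FileEntryModel &FE = FileNames[FileNumber - 1];
  if (FE.DirIdx == 0)
    return FE.Name;
  return IncludeDirs[FE.DirIdx - 1] + "/" + FE.Name;
}
```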
+ StringRef Dir = ""; + if (FileNames[I].DirIdx != 0) + if (Optional DirName = + LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] + .getAsCString()) + Dir = *DirName; + StringRef FileName = ""; + if (Optional FName = FileNames[I].Name.getAsCString()) + FileName = *FName; + assert(FileName != ""); + cantFail(Ctx->getDwarfFile(Dir, FileName, 0, None, None, CUID)); + } + } + + preprocessDWODebugInfo(); +} + +bool BinaryContext::shouldEmit(const BinaryFunction &Function) const { + if (opts::processAllFunctions()) + return true; + + if (Function.isIgnored()) + return false; + + // In relocation mode we will emit non-simple functions with CFG. + // If the function does not have a CFG it should be marked as ignored. + return HasRelocations || Function.isSimple(); +} + +void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) { + uint32_t Operation = Inst.getOperation(); + switch (Operation) { + case MCCFIInstruction::OpSameValue: + OS << "OpSameValue Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpRememberState: + OS << "OpRememberState"; + break; + case MCCFIInstruction::OpRestoreState: + OS << "OpRestoreState"; + break; + case MCCFIInstruction::OpOffset: + OS << "OpOffset Reg" << Inst.getRegister() << " " << Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: + OS << "OpDefCfaRegister Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpDefCfaOffset: + OS << "OpDefCfaOffset " << Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfa: + OS << "OpDefCfa Reg" << Inst.getRegister() << " " << Inst.getOffset(); + break; + case MCCFIInstruction::OpRelOffset: + OS << "OpRelOffset Reg" << Inst.getRegister() << " " << Inst.getOffset(); + break; + case MCCFIInstruction::OpAdjustCfaOffset: + OS << "OfAdjustCfaOffset " << Inst.getOffset(); + break; + case MCCFIInstruction::OpEscape: + OS << "OpEscape"; + break; + case MCCFIInstruction::OpRestore: + OS << "OpRestore Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpUndefined: + OS << "OpUndefined Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpRegister: + OS << "OpRegister Reg" << Inst.getRegister() << " Reg" + << Inst.getRegister2(); + break; + case MCCFIInstruction::OpWindowSave: + OS << "OpWindowSave"; + break; + case MCCFIInstruction::OpGnuArgsSize: + OS << "OpGnuArgsSize"; + break; + default: + OS << "Op#" << Operation; + break; + } +} + +void BinaryContext::printInstruction(raw_ostream &OS, + const MCInst &Instruction, + uint64_t Offset, + const BinaryFunction* Function, + bool PrintMCInst, + bool PrintMemData, + bool PrintRelocations) const { + if (MIB->isEHLabel(Instruction)) { + OS << " EH_LABEL: " << *MIB->getTargetSymbol(Instruction) << '\n'; + return; + } + OS << format(" %08" PRIx64 ": ", Offset); + if (MIB->isCFI(Instruction)) { + uint32_t Offset = Instruction.getOperand(0).getImm(); + OS << "\t!CFI\t$" << Offset << "\t; "; + if (Function) + printCFI(OS, *Function->getCFIFor(Instruction)); + OS << "\n"; + return; + } + InstPrinter->printInst(&Instruction, 0, "", *STI, OS); + if (MIB->isCall(Instruction)) { + if (MIB->isTailCall(Instruction)) + OS << " # TAILCALL "; + if (MIB->isInvoke(Instruction)) { + const Optional EHInfo = MIB->getEHInfo(Instruction); + OS << " # handler: "; + if (EHInfo->first) + OS << *EHInfo->first; + else + OS << '0'; + OS << "; action: " << EHInfo->second; + const int64_t GnuArgsSize = MIB->getGnuArgsSize(Instruction); + if (GnuArgsSize >= 0) + OS << "; GNU_args_size = " << GnuArgsSize; + } + } else if 
(MIB->isIndirectBranch(Instruction)) { + if (uint64_t JTAddress = MIB->getJumpTable(Instruction)) { + OS << " # JUMPTABLE @0x" << Twine::utohexstr(JTAddress); + } else { + OS << " # UNKNOWN CONTROL FLOW"; + } + } + + MIB->printAnnotations(Instruction, OS); + + if (opts::PrintDebugInfo) { + DebugLineTableRowRef RowRef = + DebugLineTableRowRef::fromSMLoc(Instruction.getLoc()); + if (RowRef != DebugLineTableRowRef::NULL_ROW) { + const DWARFDebugLine::LineTable *LineTable; + if (Function && Function->getDWARFUnit() && + Function->getDWARFUnit()->getOffset() == RowRef.DwCompileUnitIndex) { + LineTable = Function->getDWARFLineTable(); + } else { + LineTable = DwCtx->getLineTableForUnit( + DwCtx->getCompileUnitForOffset(RowRef.DwCompileUnitIndex)); + } + assert(LineTable && + "line table expected for instruction with debug info"); + + const DWARFDebugLine::Row &Row = LineTable->Rows[RowRef.RowIndex - 1]; + StringRef FileName = ""; + if (Optional FName = + LineTable->Prologue.FileNames[Row.File - 1].Name.getAsCString()) + FileName = *FName; + OS << " # debug line " << FileName << ":" << Row.Line; + if (Row.Column) + OS << ":" << Row.Column; + if (Row.Discriminator) + OS << " discriminator:" << Row.Discriminator; + } + } + + if ((opts::PrintRelocations || PrintRelocations) && Function) { + const uint64_t Size = computeCodeSize(&Instruction, &Instruction + 1); + Function->printRelocations(OS, Offset, Size); + } + + OS << "\n"; + + if (PrintMCInst) { + Instruction.dump_pretty(OS, InstPrinter.get()); + OS << "\n"; + } +} + +ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) { + auto SI = AddressToSection.upper_bound(Address); + if (SI != AddressToSection.begin()) { + --SI; + uint64_t UpperBound = SI->first + SI->second->getSize(); + if (!SI->second->getSize()) + UpperBound += 1; + if (UpperBound > Address) + return *SI->second; + } + return std::make_error_code(std::errc::bad_address); +} + +ErrorOr +BinaryContext::getSectionNameForAddress(uint64_t Address) const { + if (ErrorOr Section = getSectionForAddress(Address)) { + return Section->getName(); + } + return std::make_error_code(std::errc::bad_address); +} + +BinarySection &BinaryContext::registerSection(BinarySection *Section) { + auto Res = Sections.insert(Section); + (void)Res; + assert(Res.second && "can't register the same section twice."); + + // Only register allocatable sections in the AddressToSection map. 
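The AddressToSection map populated below is what getSectionForAddress above searches, and its upper_bound-then-step-back idiom is worth spelling out. A minimal model with a plain std::map (the real code also treats zero-sized sections as one byte wide for this check):

```cpp
#include <cstdint>
#include <map>

struct SecModel { uint64_t Size; };

// Sections keyed by start address: take the first section starting after
// Address, step back one, and range-check, as getSectionForAddress does.
const SecModel *findSection(const std::map<uint64_t, SecModel> &Sections,
                            uint64_t Address) {
  auto SI = Sections.upper_bound(Address);
  if (SI == Sections.begin())
    return nullptr; // Address is below the first section
  --SI;             // candidate: greatest start address <= Address
  if (Address < SI->first + SI->second.Size)
    return &SI->second;
  return nullptr;
}
```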
+ if (Section->isAllocatable() && Section->getAddress()) + AddressToSection.insert(std::make_pair(Section->getAddress(), Section)); + NameToSection.insert( + std::make_pair(std::string(Section->getName()), Section)); + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: registering " << *Section << "\n"); + return *Section; +} + +BinarySection &BinaryContext::registerSection(SectionRef Section) { + return registerSection(new BinarySection(*this, Section)); +} + +BinarySection & +BinaryContext::registerSection(StringRef SectionName, + const BinarySection &OriginalSection) { + return registerSection(new BinarySection(*this, + SectionName, + OriginalSection)); +} + +BinarySection &BinaryContext::registerOrUpdateSection(StringRef Name, + unsigned ELFType, + unsigned ELFFlags, + uint8_t *Data, + uint64_t Size, + unsigned Alignment) { + auto NamedSections = getSectionByName(Name); + if (NamedSections.begin() != NamedSections.end()) { + assert(std::next(NamedSections.begin()) == NamedSections.end() && + "can only update unique sections"); + BinarySection *Section = NamedSections.begin()->second; + + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: updating " << *Section << " -> "); + const bool Flag = Section->isAllocatable(); + (void)Flag; + Section->update(Data, Size, Alignment, ELFType, ELFFlags); + LLVM_DEBUG(dbgs() << *Section << "\n"); + // FIXME: Fix section flags/attributes for MachO. + if (isELF()) + assert(Flag == Section->isAllocatable() && + "can't change section allocation status"); + return *Section; + } + + return registerSection(new BinarySection(*this, Name, Data, Size, Alignment, + ELFType, ELFFlags)); +} + +bool BinaryContext::deregisterSection(BinarySection &Section) { + BinarySection *SectionPtr = &Section; + auto Itr = Sections.find(SectionPtr); + if (Itr != Sections.end()) { + auto Range = AddressToSection.equal_range(SectionPtr->getAddress()); + while (Range.first != Range.second) { + if (Range.first->second == SectionPtr) { + AddressToSection.erase(Range.first); + break; + } + ++Range.first; + } + + auto NameRange = + NameToSection.equal_range(std::string(SectionPtr->getName())); + while (NameRange.first != NameRange.second) { + if (NameRange.first->second == SectionPtr) { + NameToSection.erase(NameRange.first); + break; + } + ++NameRange.first; + } + + Sections.erase(Itr); + delete SectionPtr; + return true; + } + return false; +} + +void BinaryContext::printSections(raw_ostream &OS) const { + for (BinarySection *const &Section : Sections) { + OS << "BOLT-INFO: " << *Section << "\n"; + } +} + +BinarySection &BinaryContext::absoluteSection() { + if (ErrorOr Section = getUniqueSectionByName("")) + return *Section; + return registerOrUpdateSection("", ELF::SHT_NULL, 0u); +} + +ErrorOr +BinaryContext::getUnsignedValueAtAddress(uint64_t Address, + size_t Size) const { + const ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return std::make_error_code(std::errc::bad_address); + + if (Section->isVirtual()) + return 0; + + DataExtractor DE(Section->getContents(), AsmInfo->isLittleEndian(), + AsmInfo->getCodePointerSize()); + auto ValueOffset = static_cast(Address - Section->getAddress()); + return DE.getUnsigned(&ValueOffset, Size); +} + +ErrorOr +BinaryContext::getSignedValueAtAddress(uint64_t Address, + size_t Size) const { + const ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return std::make_error_code(std::errc::bad_address); + + if (Section->isVirtual()) + return 0; + + DataExtractor DE(Section->getContents(), AsmInfo->isLittleEndian(), + 
AsmInfo->getCodePointerSize()); + auto ValueOffset = static_cast(Address - Section->getAddress()); + return DE.getSigned(&ValueOffset, Size); +} + +void BinaryContext::addRelocation(uint64_t Address, + MCSymbol *Symbol, + uint64_t Type, + uint64_t Addend, + uint64_t Value) { + ErrorOr Section = getSectionForAddress(Address); + assert(Section && "cannot find section for address"); + Section->addRelocation(Address - Section->getAddress(), + Symbol, + Type, + Addend, + Value); +} + +void BinaryContext::addDynamicRelocation(uint64_t Address, + MCSymbol *Symbol, + uint64_t Type, + uint64_t Addend, + uint64_t Value) { + ErrorOr Section = getSectionForAddress(Address); + assert(Section && "cannot find section for address"); + Section->addDynamicRelocation(Address - Section->getAddress(), + Symbol, + Type, + Addend, + Value); +} + +bool BinaryContext::removeRelocationAt(uint64_t Address) { + ErrorOr Section = getSectionForAddress(Address); + assert(Section && "cannot find section for address"); + return Section->removeRelocationAt(Address - Section->getAddress()); +} + +const Relocation *BinaryContext::getRelocationAt(uint64_t Address) { + ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return nullptr; + + return Section->getRelocationAt(Address - Section->getAddress()); +} + +const Relocation *BinaryContext::getDynamicRelocationAt(uint64_t Address) { + ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return nullptr; + + return Section->getDynamicRelocationAt(Address - Section->getAddress()); +} + +void BinaryContext::markAmbiguousRelocations(BinaryData &BD, + const uint64_t Address) { + auto setImmovable = [&](BinaryData &BD) { + BinaryData *Root = BD.getAtomicRoot(); + LLVM_DEBUG(if (Root->isMoveable()) { + dbgs() << "BOLT-DEBUG: setting " << *Root << " as immovable " + << "due to ambiguous relocation referencing 0x" + << Twine::utohexstr(Address) << '\n'; + }); + Root->setIsMoveable(false); + }; + + if (Address == BD.getAddress()) { + setImmovable(BD); + + // Set previous symbol as immovable + BinaryData *Prev = getBinaryDataContainingAddress(Address - 1); + if (Prev && Prev->getEndAddress() == BD.getAddress()) + setImmovable(*Prev); + } + + if (Address == BD.getEndAddress()) { + setImmovable(BD); + + // Set next symbol as immovable + BinaryData *Next = getBinaryDataContainingAddress(BD.getEndAddress()); + if (Next && Next->getAddress() == BD.getEndAddress()) + setImmovable(*Next); + } +} + +BinaryFunction *BinaryContext::getFunctionForSymbol(const MCSymbol *Symbol, + uint64_t *EntryDesc) { + std::shared_lock Lock(SymbolToFunctionMapMutex); + auto BFI = SymbolToFunctionMap.find(Symbol); + if (BFI == SymbolToFunctionMap.end()) + return nullptr; + + BinaryFunction *BF = BFI->second; + if (EntryDesc) + *EntryDesc = BF->getEntryIDForSymbol(Symbol); + + return BF; +} + +void BinaryContext::exitWithBugReport(StringRef Message, + const BinaryFunction &Function) const { + errs() << "=======================================\n"; + errs() << "BOLT is unable to proceed because it couldn't properly understand " + "this function.\n"; + errs() << "If you are running the most recent version of BOLT, you may " + "want to " + "report this and paste this dump.\nPlease check that there is no " + "sensitive contents being shared in this dump.\n"; + errs() << "\nOffending function: " << Function.getPrintName() << "\n\n"; + ScopedPrinter SP(errs()); + SP.printBinaryBlock("Function contents", *Function.getData()); + errs() << "\n"; + Function.dump(); + errs() << "ERROR: " << 
+void BinaryContext::exitWithBugReport(StringRef Message,
+                                      const BinaryFunction &Function) const {
+  errs() << "=======================================\n";
+  errs() << "BOLT is unable to proceed because it couldn't properly understand "
+            "this function.\n";
+  errs() << "If you are running the most recent version of BOLT, you may "
+            "want to report this and paste this dump.\nPlease check that "
+            "there is no sensitive content being shared in this dump.\n";
+  errs() << "\nOffending function: " << Function.getPrintName() << "\n\n";
+  ScopedPrinter SP(errs());
+  SP.printBinaryBlock("Function contents", *Function.getData());
+  errs() << "\n";
+  Function.dump();
+  errs() << "ERROR: " << Message;
+  errs() << "\n=======================================\n";
+  exit(1);
+}
+
+BinaryFunction *
+BinaryContext::createInjectedBinaryFunction(const std::string &Name,
+                                            bool IsSimple) {
+  InjectedBinaryFunctions.push_back(new BinaryFunction(Name, *this, IsSimple));
+  BinaryFunction *BF = InjectedBinaryFunctions.back();
+  setSymbolToFunctionMap(BF->getSymbol(), BF);
+  BF->CurrentState = BinaryFunction::State::CFG;
+  return BF;
+}
+
+std::pair<size_t, size_t>
+BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
+  // Adjust branch instruction to match the current layout.
+  if (FixBranches)
+    BF.fixBranches();
+
+  // Create local MC context to isolate the effect of ephemeral code emission.
+  IndependentCodeEmitter MCEInstance = createIndependentMCCodeEmitter();
+  MCContext *LocalCtx = MCEInstance.LocalCtx.get();
+  MCAsmBackend *MAB =
+      TheTarget->createMCAsmBackend(*STI, *MRI, MCTargetOptions());
+
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code);
+
+  std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(VecOS);
+  std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
+      *TheTriple, *LocalCtx, std::unique_ptr<MCAsmBackend>(MAB), std::move(OW),
+      std::unique_ptr<MCCodeEmitter>(MCEInstance.MCE.release()), *STI,
+      /*RelaxAll=*/false,
+      /*IncrementalLinkerCompatible=*/false,
+      /*DWARFMustBeAtTheEnd=*/false));
+
+  Streamer->InitSections(false);
+
+  MCSection *Section = MCEInstance.LocalMOFI->getTextSection();
+  Section->setHasInstructions(true);
+
+  // Create symbols in the LocalCtx so that they get destroyed with it.
+  MCSymbol *StartLabel = LocalCtx->createTempSymbol();
+  MCSymbol *EndLabel = LocalCtx->createTempSymbol();
+  MCSymbol *ColdStartLabel = LocalCtx->createTempSymbol();
+  MCSymbol *ColdEndLabel = LocalCtx->createTempSymbol();
+
+  Streamer->SwitchSection(Section);
+  Streamer->emitLabel(StartLabel);
+  emitFunctionBody(*Streamer, BF, /*EmitColdPart=*/false,
+                   /*EmitCodeOnly=*/true);
+  Streamer->emitLabel(EndLabel);
+
+  if (BF.isSplit()) {
+    MCSectionELF *ColdSection =
+        LocalCtx->getELFSection(BF.getColdCodeSectionName(), ELF::SHT_PROGBITS,
+                                ELF::SHF_EXECINSTR | ELF::SHF_ALLOC);
+    ColdSection->setHasInstructions(true);
+
+    Streamer->SwitchSection(ColdSection);
+    Streamer->emitLabel(ColdStartLabel);
+    emitFunctionBody(*Streamer, BF, /*EmitColdPart=*/true,
+                     /*EmitCodeOnly=*/true);
+    Streamer->emitLabel(ColdEndLabel);
+    // To avoid calling MCObjectStreamer::flushPendingLabels(), which is
+    // private.
+    Streamer->emitBytes(StringRef(""));
+    Streamer->SwitchSection(Section);
+  }
+
+  // To avoid calling MCObjectStreamer::flushPendingLabels(), which is private,
+  // or MCStreamer::Finish(), which does more than we want.
+  Streamer->emitBytes(StringRef(""));
+
+  MCAssembler &Assembler =
+      static_cast<MCObjectStreamer *>(Streamer.get())->getAssembler();
+  MCAsmLayout Layout(Assembler);
+  Assembler.layout(Layout);
+
+  const uint64_t HotSize =
+      Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel);
+  const uint64_t ColdSize = BF.isSplit()
+                                ? Layout.getSymbolOffset(*ColdEndLabel) -
+                                      Layout.getSymbolOffset(*ColdStartLabel)
+                                : 0ULL;
+  // Clean-up the effect of the code emission.
+  for (const MCSymbol &Symbol : Assembler.symbols()) {
+    MCSymbol *MutableSymbol = const_cast<MCSymbol *>(&Symbol);
+    MutableSymbol->setUndefined();
+    MutableSymbol->setIsRegistered(false);
+  }
+
+  return std::make_pair(HotSize, ColdSize);
+}
+
+bool BinaryContext::validateEncoding(const MCInst &Inst,
+                                     ArrayRef<uint8_t> InputEncoding) const {
+  SmallString<256> Code;
+  SmallVector<MCFixup, 4> Fixups;
+  raw_svector_ostream VecOS(Code);
+
+  MCE->encodeInstruction(Inst, VecOS, Fixups, *STI);
+  auto EncodedData = ArrayRef<uint8_t>((uint8_t *)Code.data(), Code.size());
+  if (InputEncoding != EncodedData) {
+    if (opts::Verbosity > 1) {
+      errs() << "BOLT-WARNING: mismatched encoding detected\n"
+             << "  input: " << InputEncoding << '\n'
+             << "  output: " << EncodedData << '\n';
+    }
+    return false;
+  }
+
+  return true;
+}
+
+uint64_t BinaryContext::getHotThreshold() const {
+  static uint64_t Threshold = 0;
+  if (Threshold == 0) {
+    Threshold = std::max(
+        (uint64_t)opts::ExecutionCountThreshold,
+        NumProfiledFuncs ? SumExecutionCount / (2 * NumProfiledFuncs) : 1);
+  }
+  return Threshold;
+}
+
+BinaryFunction *
+BinaryContext::getBinaryFunctionContainingAddress(uint64_t Address,
+                                                  bool CheckPastEnd,
+                                                  bool UseMaxSize) {
+  auto FI = BinaryFunctions.upper_bound(Address);
+  if (FI == BinaryFunctions.begin())
+    return nullptr;
+  --FI;
+
+  const uint64_t UsedSize =
+      UseMaxSize ? FI->second.getMaxSize() : FI->second.getSize();
+
+  if (Address >= FI->first + UsedSize + (CheckPastEnd ? 1 : 0))
+    return nullptr;
+
+  return &FI->second;
+}
+
+BinaryFunction *
+BinaryContext::getBinaryFunctionAtAddress(uint64_t Address) {
+  // First, try to find a function starting at the given address. If the
+  // function was folded, this will get us the original folded function if it
+  // wasn't removed from the list, e.g. in non-relocation mode.
+  auto BFI = BinaryFunctions.find(Address);
+  if (BFI != BinaryFunctions.end()) {
+    return &BFI->second;
+  }
+
+  // We might have folded the function matching the object at the given
+  // address. In such a case, we look for a function matching the symbol
+  // registered at the original address. The new function (the one that the
+  // original was folded into) will hold the symbol.
+  if (const BinaryData *BD = getBinaryDataAtAddress(Address)) {
+    uint64_t EntryID = 0;
+    BinaryFunction *BF = getFunctionForSymbol(BD->getSymbol(), &EntryID);
+    if (BF && EntryID == 0)
+      return BF;
+  }
+  return nullptr;
+}
+
+DebugAddressRangesVector BinaryContext::translateModuleAddressRanges(
+    const DWARFAddressRangesVector &InputRanges) const {
+  DebugAddressRangesVector OutputRanges;
+
+  for (const DWARFAddressRange Range : InputRanges) {
+    auto BFI = BinaryFunctions.lower_bound(Range.LowPC);
+    while (BFI != BinaryFunctions.end()) {
+      const BinaryFunction &Function = BFI->second;
+      if (Function.getAddress() >= Range.HighPC)
+        break;
+      const DebugAddressRangesVector FunctionRanges =
+          Function.getOutputAddressRanges();
+      std::move(std::begin(FunctionRanges),
+                std::end(FunctionRanges),
+                std::back_inserter(OutputRanges));
+      std::advance(BFI, 1);
+    }
+  }
+
+  return OutputRanges;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h
new file mode 100644
index 000000000000..792b1a809208
--- /dev/null
+++ b/bolt/src/BinaryContext.h
@@ -0,0 +1,1260 @@
+//===--- BinaryContext.h - Interface for machine-level context -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Context for processing binary executables in files and/or memory.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H
+
+#include "BinaryData.h"
+#include "BinarySection.h"
+#include "DebugData.h"
+#include "JumpTable.h"
+#include "MCPlusBuilder.h"
+#include "RuntimeLibs/RuntimeLibrary.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCPseudoProbe.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <functional>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <system_error>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace llvm {
+class MCDisassembler;
+class MCInstPrinter;
+
+using namespace object;
+
+namespace bolt {
+
+class BinaryFunction;
+class ExecutableFileMemoryManager;
+
+/// Information on loadable part of the file.
+struct SegmentInfo {
+  uint64_t Address;    /// Address of the segment in memory.
+  uint64_t Size;       /// Size of the segment in memory.
+  uint64_t FileOffset; /// Offset in the file.
+  uint64_t FileSize;   /// Size in file.
+  uint64_t Alignment;  /// Alignment of the segment.
+
+  void print(raw_ostream &OS) const {
+    OS << "SegmentInfo { Address: 0x"
+       << Twine::utohexstr(Address) << ", Size: 0x"
+       << Twine::utohexstr(Size) << ", FileOffset: 0x"
+       << Twine::utohexstr(FileOffset) << ", FileSize: 0x"
+       << Twine::utohexstr(FileSize) << ", Alignment: 0x"
+       << Twine::utohexstr(Alignment) << "}";
+  };
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const SegmentInfo &SegInfo) {
+  SegInfo.print(OS);
+  return OS;
+}
+
+enum class MemoryContentsType : char {
+  UNKNOWN = 0,             /// Unknown contents.
+  POSSIBLE_JUMP_TABLE,     /// Possibly a non-PIC jump table.
+  POSSIBLE_PIC_JUMP_TABLE, /// Possibly a PIC jump table.
+};
+
+/// Helper function to truncate a \p Value to given size in \p Bytes.
+inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
+  return Value & ((uint64_t)(int64_t)-1 >> (64 - Bytes * 8));
+}
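The mask arithmetic is easier to verify with concrete values; a few illustrative checks (editorial, not part of the patch):

    // (uint64_t)(int64_t)-1 is 0xFFFFFFFFFFFFFFFF; shifting it right by
    // (64 - Bytes * 8) leaves exactly the low Bytes bytes of the mask set.
    assert(truncateToSize(0x1234, /*Bytes=*/1) == 0x34);   // mask 0xFF
    assert(truncateToSize(0x1234, /*Bytes=*/2) == 0x1234); // mask 0xFFFF
    assert(truncateToSize(-1, /*Bytes=*/4) == 0xFFFFFFFF); // high bits dropped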
+/// Filter iterator.
+template <typename ItrType,
+          typename PredType = std::function<bool(const ItrType &)>>
+class FilterIterator
+    : public std::iterator<std::bidirectional_iterator_tag,
+                           typename std::iterator_traits<ItrType>::value_type> {
+  using Iterator = FilterIterator;
+  using T = typename std::iterator_traits<ItrType>::reference;
+  using PointerT = typename std::iterator_traits<ItrType>::pointer;
+
+  PredType Pred;
+  ItrType Itr, End;
+
+  void prev() {
+    while (!Pred(--Itr))
+      ;
+  }
+  void next() {
+    ++Itr;
+    nextMatching();
+  }
+  void nextMatching() {
+    while (Itr != End && !Pred(Itr))
+      ++Itr;
+  }
+
+public:
+  Iterator &operator++() { next(); return *this; }
+  Iterator &operator--() { prev(); return *this; }
+  Iterator operator++(int) { auto Tmp(*this); next(); return Tmp; }
+  Iterator operator--(int) { auto Tmp(*this); prev(); return Tmp; }
+  bool operator==(const Iterator &Other) const { return Itr == Other.Itr; }
+  bool operator!=(const Iterator &Other) const { return !operator==(Other); }
+  T operator*() { return *Itr; }
+  PointerT operator->() { return &operator*(); }
+  FilterIterator(PredType Pred, ItrType Itr, ItrType End)
+      : Pred(Pred), Itr(Itr), End(End) {
+    nextMatching();
+  }
+};
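FilterIterator is the machinery behind the filtered ranges declared later in this header (sections(), textSections(), getBinaryDataForSection(), and friends). A standalone sketch of the pattern, with a hypothetical container and predicate:

    // Iterate only the even elements of a vector through FilterIterator.
    std::vector<int> V = {1, 2, 3, 4, 5, 6};
    auto IsEven = [](const std::vector<int>::iterator &I) {
      return *I % 2 == 0;
    };
    using FI = FilterIterator<std::vector<int>::iterator>;
    for (auto I = FI(IsEven, V.begin(), V.end()),
              E = FI(IsEven, V.end(), V.end());
         I != E; ++I)
      outs() << *I << ' '; // prints: 2 4 6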
+class BinaryContext {
+  BinaryContext() = delete;
+
+  /// Name of the binary file the context originated from.
+  std::string Filename;
+
+  /// Unique build ID if available for the binary.
+  Optional<std::string> FileBuildID;
+
+  /// Set of all sections.
+  struct CompareSections {
+    bool operator()(const BinarySection *A, const BinarySection *B) const {
+      return *A < *B;
+    }
+  };
+  using SectionSetType = std::set<BinarySection *, CompareSections>;
+  SectionSetType Sections;
+
+  using SectionIterator = pointee_iterator<SectionSetType::iterator>;
+  using SectionConstIterator = pointee_iterator<SectionSetType::const_iterator>;
+
+  using FilteredSectionIterator = FilterIterator<SectionIterator>;
+  using FilteredSectionConstIterator = FilterIterator<SectionConstIterator>;
+
+  /// Map virtual address to a section. It is possible to have more than one
+  /// section mapped to the same address, e.g. non-allocatable sections.
+  using AddressToSectionMapType = std::multimap<uint64_t, BinarySection *>;
+  AddressToSectionMapType AddressToSection;
+
+  /// Multimap of section name to BinarySection object. Some binaries
+  /// have multiple sections with the same name.
+  using NameToSectionMapType = std::multimap<std::string, BinarySection *>;
+  NameToSectionMapType NameToSection;
+
+  /// Low level section registration.
+  BinarySection &registerSection(BinarySection *Section);
+
+  /// Store all functions in the binary, sorted by original address.
+  std::map<uint64_t, BinaryFunction> BinaryFunctions;
+
+  /// A mutex that is used to control parallel accesses to BinaryFunctions.
+  mutable std::shared_timed_mutex BinaryFunctionsMutex;
+
+  /// Functions injected by BOLT.
+  std::vector<BinaryFunction *> InjectedBinaryFunctions;
+
+  /// Jump tables for all functions mapped by address.
+  std::map<uint64_t, JumpTable *> JumpTables;
+
+  /// Locations of PC-relative relocations in data objects.
+  std::unordered_set<uint64_t> DataPCRelocations;
+
+  /// Used in duplicateJumpTable() to uniquely identify a JT clone.
+  /// Start our IDs with a high number so getJumpTableContainingAddress checks
+  /// with size won't overflow.
+  uint32_t DuplicatedJumpTables{0x10000000};
+
+  /// The runtime library.
+  std::unique_ptr<RuntimeLibrary> RtLibrary;
+
+  /// DWP Context.
+  std::shared_ptr<DWARFContext> DWPContext;
+
+  /// A map of DWO Ids to CUs.
+  using DWOIdToCUMapType = std::unordered_map<uint64_t, DWARFUnit *>;
+  DWOIdToCUMapType DWOCUs;
+
+  /// Preprocess DWO debug information.
+  void preprocessDWODebugInfo();
+
+public:
+  static std::unique_ptr<BinaryContext>
+  createBinaryContext(const ObjectFile *File, bool IsPIC,
+                      std::unique_ptr<DWARFContext> DwCtx);
+
+  /// Given DWOId returns CU if it exists in DWOCUs.
+  Optional<DWARFUnit *> getDWOCU(uint64_t DWOId);
+
+  /// Returns DWOContext if it exists.
+  DWARFContext *getDWOContext();
+
+  /// Get the number of DWO CUs in the map.
+  uint32_t getNumDWOCUs() { return DWOCUs.size(); }
+
+  /// [start memory address] -> [segment info] mapping.
+  std::map<uint64_t, SegmentInfo> SegmentMapInfo;
+
+  /// Symbols that are expected to be undefined in MCContext during emission.
+  std::unordered_set<MCSymbol *> UndefinedSymbols;
+
+  /// [name] -> [BinaryData*] map used for global symbol resolution.
+  using SymbolMapType = StringMap<BinaryData *>;
+  SymbolMapType GlobalSymbols;
+
+  /// [address] -> [BinaryData], ...
+  /// Addresses never change.
+  /// Note: it is important that clients do not hold on to instances of
+  /// BinaryData* while the map is still being modified during BinaryFunction
+  /// disassembly. This is because of the possibility that a regular
+  /// BinaryData is later discovered to be a JumpTable.
+  using BinaryDataMapType = std::map<uint64_t, BinaryData *>;
+  using binary_data_iterator = BinaryDataMapType::iterator;
+  using binary_data_const_iterator = BinaryDataMapType::const_iterator;
+  BinaryDataMapType BinaryDataMap;
+
+  using FilteredBinaryDataConstIterator =
+      FilterIterator<binary_data_const_iterator>;
+  using FilteredBinaryDataIterator = FilterIterator<binary_data_iterator>;
+
+  /// Memory manager for sections and segments. Used to communicate with ORC
+  /// among other things.
+  std::shared_ptr<ExecutableFileMemoryManager> EFMM;
+
+  StringRef getFilename() const { return Filename; }
+  void setFilename(StringRef Name) { Filename = std::string(Name); }
+
+  Optional<StringRef> getFileBuildID() const {
+    if (FileBuildID) {
+      return StringRef(*FileBuildID);
+    }
+
+    return NoneType();
+  }
+  void setFileBuildID(StringRef ID) { FileBuildID = std::string(ID); }
+
+  bool hasSymbolsWithFileName() const {
+    return HasSymbolsWithFileName;
+  }
+  void setHasSymbolsWithFileName(bool Value) {
+    HasSymbolsWithFileName = Value;
+  }
+
+  /// Return true if relocations against symbol with a given name
+  /// must be created.
+  bool forceSymbolRelocations(StringRef SymbolName) const;
+
+  uint64_t getNumUnusedProfiledObjects() const {
+    return NumUnusedProfiledObjects;
+  }
+  void setNumUnusedProfiledObjects(uint64_t N) {
+    NumUnusedProfiledObjects = N;
+  }
+
+  RuntimeLibrary *getRuntimeLibrary() { return RtLibrary.get(); }
+  void setRuntimeLibrary(std::unique_ptr<RuntimeLibrary> Lib) {
+    assert(!RtLibrary && "Cannot set runtime library twice.");
+    RtLibrary = std::move(Lib);
+  }
+
+  /// Return BinaryFunction containing a given \p Address or nullptr if
+  /// no registered function contains the \p Address.
+  ///
+  /// In a binary, a function has somewhat vague boundaries. E.g. a function
+  /// can refer to the first byte past the end of the function, and it will
+  /// still be referring to this function, not the function following it in
+  /// the address space. Thus we have the following flags that allow a lookup
+  /// where the caller has more context for the search.
+  ///
+  /// If \p CheckPastEnd is true and the \p Address falls on a byte
+  /// immediately following the last byte of some function and there's no
+  /// other function that starts there, then return the function as the one
+  /// containing the \p Address. This is useful when we need to locate
+  /// functions for references pointing immediately past a function body.
+  ///
+  /// If \p UseMaxSize is true, then include the space between this function
+  /// body and the next object in address ranges that we check.
+  BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address,
+                                                     bool CheckPastEnd = false,
+                                                     bool UseMaxSize = false);
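The two flags are easiest to see on a concrete layout. Assume a hypothetical function foo occupying [0x1000, 0x1020) with max size 0x40 (i.e. 0x20 bytes of padding before the next object) and no function starting at 0x1020:

    BC.getBinaryFunctionContainingAddress(0x101f);                  // foo
    BC.getBinaryFunctionContainingAddress(0x1020);                  // nullptr
    BC.getBinaryFunctionContainingAddress(0x1020,
                                          /*CheckPastEnd=*/true);   // foo
    BC.getBinaryFunctionContainingAddress(0x1030,
                                          /*CheckPastEnd=*/false,
                                          /*UseMaxSize=*/true);     // foo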
+  /// Return a BinaryFunction that starts at a given \p Address.
+  BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address);
+
+  const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) const {
+    return const_cast<BinaryContext *>(this)->
+        getBinaryFunctionAtAddress(Address);
+  }
+
+  /// Return size of an entry for the given jump table \p Type.
+  uint64_t getJumpTableEntrySize(JumpTable::JumpTableType Type) const {
+    return Type == JumpTable::JTT_PIC ? 4 : AsmInfo->getCodePointerSize();
+  }
+
+  /// Return JumpTable containing a given \p Address.
+  JumpTable *getJumpTableContainingAddress(uint64_t Address) {
+    auto JTI = JumpTables.upper_bound(Address);
+    if (JTI == JumpTables.begin())
+      return nullptr;
+    --JTI;
+    if (JTI->first + JTI->second->getSize() > Address)
+      return JTI->second;
+    if (JTI->second->getSize() == 0 && JTI->first == Address)
+      return JTI->second;
+    return nullptr;
+  }
+
+  unsigned getDWARFEncodingSize(unsigned Encoding) {
+    switch (Encoding & 0x0f) {
+    default: llvm_unreachable("unknown encoding");
+    case dwarf::DW_EH_PE_absptr:
+    case dwarf::DW_EH_PE_signed:
+      return AsmInfo->getCodePointerSize();
+    case dwarf::DW_EH_PE_udata2:
+    case dwarf::DW_EH_PE_sdata2:
+      return 2;
+    case dwarf::DW_EH_PE_udata4:
+    case dwarf::DW_EH_PE_sdata4:
+      return 4;
+    case dwarf::DW_EH_PE_udata8:
+    case dwarf::DW_EH_PE_sdata8:
+      return 8;
+    }
+  }
+
+  /// [MCSymbol] -> [BinaryFunction]
+  ///
+  /// As we fold identical functions, multiple symbols can point
+  /// to the same BinaryFunction.
+  std::unordered_map<const MCSymbol *, BinaryFunction *> SymbolToFunctionMap;
+
+  /// A mutex that is used to control parallel accesses to SymbolToFunctionMap.
+  mutable std::shared_timed_mutex SymbolToFunctionMapMutex;
+
+  /// Look up the symbol entry that contains the given \p Address (based on
+  /// the start address and size for each symbol). Returns a pointer to
+  /// the BinaryData for that symbol. If no data is found, nullptr is returned.
+  const BinaryData *getBinaryDataContainingAddressImpl(uint64_t Address) const;
+
+  /// Update the Parent fields in BinaryDatas after adding a new entry into
+  /// \p BinaryDataMap.
+  void updateObjectNesting(BinaryDataMapType::iterator GAI);
+
+  /// Validate that if object address ranges overlap that the object with
+  /// the larger range is a parent of the object with the smaller range.
+  bool validateObjectNesting() const;
+
+  /// Validate that there are no top level "holes" in each section
+  /// and that all relocations with a section are mapped to a valid
+  /// top level BinaryData.
+  bool validateHoles() const;
+
+  /// Produce output address ranges based on input ranges for some module.
+  DebugAddressRangesVector translateModuleAddressRanges(
+      const DWARFAddressRangesVector &InputRanges) const;
+
+  /// Get a bogus "absolute" section that will be associated with all
+  /// absolute BinaryDatas.
+  BinarySection &absoluteSection();
+
+  /// Process "holes" in between known BinaryData objects. For now,
+  /// symbols are padded with the space before the next BinaryData object.
+  void fixBinaryDataHoles();
+
+  /// Generate names based on data hashes for unknown symbols.
+  void generateSymbolHashes();
+
+  /// Construct BinaryFunction object and add it to internal maps.
+  BinaryFunction *createBinaryFunction(const std::string &Name,
+                                       BinarySection &Section,
+                                       uint64_t Address,
+                                       uint64_t Size,
+                                       uint64_t SymbolSize = 0,
+                                       uint16_t Alignment = 0);
+
+  /// Return all functions for this rewrite instance.
+  std::map<uint64_t, BinaryFunction> &getBinaryFunctions() {
+    return BinaryFunctions;
+  }
+
+  /// Return all functions for this rewrite instance.
+  const std::map<uint64_t, BinaryFunction> &getBinaryFunctions() const {
+    return BinaryFunctions;
+  }
+
+  /// Create BOLT-injected function.
+  BinaryFunction *createInjectedBinaryFunction(const std::string &Name,
+                                               bool IsSimple = true);
+
+  std::vector<BinaryFunction *> &getInjectedBinaryFunctions() {
+    return InjectedBinaryFunctions;
+  }
+
+  /// Return vector with all functions, i.e. include functions from the input
+  /// binary and functions created by BOLT.
+  std::vector<BinaryFunction *> getAllBinaryFunctions();
+
+  /// Construct a jump table for \p Function at \p Address or return an
+  /// existing one at that location.
+  ///
+  /// May create an embedded jump table and return its label as the second
+  /// element of the pair.
+  const MCSymbol *getOrCreateJumpTable(BinaryFunction &Function,
+                                       uint64_t Address,
+                                       JumpTable::JumpTableType Type);
+
+  /// Analyze a possible jump table of type \p Type at a given \p Address.
+  /// \p BF is a function referencing the jump table.
+  /// Return true if the jump table was detected at \p Address, and false
+  /// otherwise.
+  ///
+  /// If \p NextJTAddress is different from zero, it is used as an upper
+  /// bound for jump table memory layout.
+  ///
+  /// Optionally, populate \p Offsets with jump table entries. The entries
+  /// could be partially populated if the jump table detection fails.
+  bool analyzeJumpTable(const uint64_t Address,
+                        const JumpTable::JumpTableType Type,
+                        BinaryFunction &BF,
+                        const uint64_t NextJTAddress = 0,
+                        JumpTable::OffsetsType *Offsets = nullptr);
+
+  /// After jump table locations are established, this function will populate
+  /// their OffsetEntries based on memory contents.
+  void populateJumpTables();
+
+  /// Returns a jump table ID and label pointing to the duplicated jump table.
+  /// Ordinarily, jump tables are identified by their address in the input
+  /// binary. We return an ID with the high bit set to differentiate it from
+  /// regular addresses, avoiding conflicts with standard jump tables.
+  std::pair<uint64_t, const MCSymbol *>
+  duplicateJumpTable(BinaryFunction &Function, JumpTable *JT,
+                     const MCSymbol *OldLabel);
+
+  /// Generate a unique name for jump table at a given \p Address belonging
+  /// to function \p BF.
+  std::string generateJumpTableName(const BinaryFunction &BF,
+                                    uint64_t Address);
+
+  /// Return true if the array of bytes represents a valid code padding.
+  bool hasValidCodePadding(const BinaryFunction &BF);
+
+  /// Verify padding area between functions, and adjust max function size
+  /// accordingly.
+  void adjustCodePadding();
+
+  /// Regular page size.
+  static constexpr unsigned RegularPageSize = 0x1000;
+
+  /// Huge page size to use.
+  static constexpr unsigned HugePageSize = 0x200000;
+
+  /// Map address to a constant island owner (constant data in code section).
+  std::map<uint64_t, BinaryFunction *> AddressToConstantIslandMap;
+
+  /// A map from jump table address to insertion order. Used for generating
+  /// jump table names.
+  std::map<uint64_t, uint64_t> JumpTableIds;
+
+  std::unique_ptr<MCContext> Ctx;
+
+  /// A mutex that is used to control parallel accesses to Ctx.
+  mutable std::shared_timed_mutex CtxMutex;
+  std::unique_lock<std::shared_timed_mutex> scopeLock() const {
+    return std::unique_lock<std::shared_timed_mutex>(CtxMutex);
+  }
+
+  std::unique_ptr<DWARFContext> DwCtx;
+
+  std::unique_ptr<Triple> TheTriple;
+
+  const Target *TheTarget;
+
+  std::string TripleName;
+
+  std::unique_ptr<MCCodeEmitter> MCE;
+
+  std::unique_ptr<MCObjectFileInfo> MOFI;
+
+  std::unique_ptr<const MCAsmInfo> AsmInfo;
+
+  std::unique_ptr<const MCInstrInfo> MII;
+
+  std::unique_ptr<const MCSubtargetInfo> STI;
+
+  std::unique_ptr<MCInstPrinter> InstPrinter;
+
+  std::unique_ptr<const MCInstrAnalysis> MIA;
+
+  std::unique_ptr<MCPlusBuilder> MIB;
+
+  std::unique_ptr<const MCRegisterInfo> MRI;
+
+  std::unique_ptr<MCDisassembler> DisAsm;
+
+  std::unique_ptr<MCAsmBackend> MAB;
+
+  /// Indicates if relocations are available for usage.
+  bool HasRelocations{false};
+
+  /// Is the binary always loaded at a fixed address. Shared objects and
+  /// position-independent executables (PIEs) are examples of binaries that
+  /// will have HasFixedLoadAddress set to false.
+  bool HasFixedLoadAddress{true};
+
+  /// Set to true if the binary contains PT_DYNAMIC header.
+  bool hasDynamicHeader{false};
+
+  /// Set to true if the binary contains PT_INTERP header.
+  bool hasInterpHeader{false};
+
+  /// Indicates if any of local symbols used for functions or data objects
+  /// have an origin file name available.
+  bool HasSymbolsWithFileName{false};
+
+  /// Sum of execution count of all functions.
+  uint64_t SumExecutionCount{0};
+
+  /// Number of functions with profile information.
+  uint64_t NumProfiledFuncs{0};
+
+  /// Number of objects in profile whose profile was ignored.
+  uint64_t NumUnusedProfiledObjects{0};
+
+  /// Total hotness score according to profiling data for this binary.
+  uint64_t TotalScore{0};
+
+  /// Binary-wide stats for macro-fusion.
+  uint64_t MissedMacroFusionPairs{0};
+  uint64_t MissedMacroFusionExecCount{0};
+
+  // Address of the first allocated segment.
+  uint64_t FirstAllocAddress{std::numeric_limits<uint64_t>::max()};
+
+  /// Track next available address for new allocatable sections.
+  /// RewriteInstance sets this prior to running BOLT passes, so layout passes
+  /// are aware of the final addresses functions will have.
+  uint64_t LayoutStartAddress{0};
+
+  /// Old .text info.
+  uint64_t OldTextSectionAddress{0};
+  uint64_t OldTextSectionOffset{0};
+  uint64_t OldTextSectionSize{0};
+
+  /// Address of the code/function that is executed before any other code in
+  /// the binary.
+  Optional<uint64_t> StartFunctionAddress;
+
+  /// Address of the code/function that is going to be executed right before
+  /// the execution of the binary is completed.
+  Optional<uint64_t> FiniFunctionAddress;
+
+  /// Page alignment used for code layout.
+  uint64_t PageAlign{HugePageSize};
+
+  /// True if the binary requires immediate relocation processing.
+  bool RequiresZNow{false};
+
+  /// List of functions that always trap.
+  std::vector<const BinaryFunction *> TrappedFunctions;
+
+  /// Map SDT locations to SDT markers info.
+  std::unordered_map<uint64_t, SDTMarkerInfo> SDTMarkers;
+
+  /// Map linux kernel program locations/instructions to their pointers in
+  /// special linux kernel sections.
+  std::unordered_map<uint64_t, std::vector<LKInstructionMarkerInfo>> LKMarkers;
+
+  /// PseudoProbe decoder.
+  MCPseudoProbeDecoder ProbeDecoder;
+
+  /// DWARF encoding. Available encoding types defined in BinaryFormat/Dwarf.h
+  /// enum Constants, e.g. DW_EH_PE_omit.
+  unsigned TTypeEncoding = dwarf::DW_EH_PE_omit;
+  unsigned LSDAEncoding = dwarf::DW_EH_PE_omit;
+
+  BinaryContext(std::unique_ptr<MCContext> Ctx,
+                std::unique_ptr<DWARFContext> DwCtx,
+                std::unique_ptr<Triple> TheTriple,
+                const Target *TheTarget,
+                std::string TripleName,
+                std::unique_ptr<MCCodeEmitter> MCE,
+                std::unique_ptr<MCObjectFileInfo> MOFI,
+                std::unique_ptr<const MCAsmInfo> AsmInfo,
+                std::unique_ptr<const MCInstrInfo> MII,
+                std::unique_ptr<const MCSubtargetInfo> STI,
+                std::unique_ptr<MCInstPrinter> InstPrinter,
+                std::unique_ptr<const MCInstrAnalysis> MIA,
+                std::unique_ptr<MCPlusBuilder> MIB,
+                std::unique_ptr<const MCRegisterInfo> MRI,
+                std::unique_ptr<MCDisassembler> DisAsm);
+
+  ~BinaryContext();
+
+  std::unique_ptr<MCObjectWriter> createObjectWriter(raw_pwrite_stream &OS);
+
+  bool isELF() const {
+    return TheTriple->isOSBinFormatELF();
+  }
+
+  bool isMachO() const {
+    return TheTriple->isOSBinFormatMachO();
+  }
+
+  bool isAArch64() const {
+    return TheTriple->getArch() == llvm::Triple::aarch64;
+  }
+
+  bool isX86() const {
+    return TheTriple->getArch() == llvm::Triple::x86 ||
+           TheTriple->getArch() == llvm::Triple::x86_64;
+  }
+
+  bool isExecutable() const {
+    // TODO: handle static-pie binaries case
+    return hasInterpHeader;
+  }
+
+  /// Iterate over all BinaryData.
+  iterator_range<binary_data_const_iterator> getBinaryData() const {
+    return make_range(BinaryDataMap.begin(), BinaryDataMap.end());
+  }
+
+  /// Iterate over all BinaryData.
+  iterator_range<binary_data_iterator> getBinaryData() {
+    return make_range(BinaryDataMap.begin(), BinaryDataMap.end());
+  }
+
+  /// Iterate over all BinaryData associated with the given \p Section.
+  iterator_range<FilteredBinaryDataConstIterator>
+  getBinaryDataForSection(const BinarySection &Section) const {
+    auto Begin = BinaryDataMap.lower_bound(Section.getAddress());
+    if (Begin != BinaryDataMap.begin()) {
+      --Begin;
+    }
+    auto End = BinaryDataMap.upper_bound(Section.getEndAddress());
+    auto pred =
+        [&Section](const binary_data_const_iterator &Itr) -> bool {
+      return Itr->second->getSection() == Section;
+    };
+    return make_range(FilteredBinaryDataConstIterator(pred, Begin, End),
+                      FilteredBinaryDataConstIterator(pred, End, End));
+  }
+
+  /// Iterate over all BinaryData associated with the given \p Section.
+  iterator_range<FilteredBinaryDataIterator>
+  getBinaryDataForSection(BinarySection &Section) {
+    auto Begin = BinaryDataMap.lower_bound(Section.getAddress());
+    if (Begin != BinaryDataMap.begin()) {
+      --Begin;
+    }
+    auto End = BinaryDataMap.upper_bound(Section.getEndAddress());
+    auto pred = [&Section](const binary_data_iterator &Itr) -> bool {
+      return Itr->second->getSection() == Section;
+    };
+    return make_range(FilteredBinaryDataIterator(pred, Begin, End),
+                      FilteredBinaryDataIterator(pred, End, End));
+  }
+
+  /// Iterate over all the sub-symbols of \p BD (if any).
+  iterator_range<binary_data_iterator> getSubBinaryData(BinaryData *BD);
+
+  /// Clear the global symbol address -> name(s) map.
+  void clearBinaryData() {
+    GlobalSymbols.clear();
+    for (auto &Entry : BinaryDataMap) {
+      delete Entry.second;
+    }
+    BinaryDataMap.clear();
+  }
+
+  /// Process \p Address reference from code in function \p BF.
+  /// \p IsPCRel indicates if the reference is PC-relative.
+  /// Return <Symbol, Addend> pair corresponding to the \p Address.
+  std::pair<const MCSymbol *, uint64_t> handleAddressRef(uint64_t Address,
+                                                         BinaryFunction &BF,
+                                                         bool IsPCRel);
+
+  /// Analyze memory contents at the given \p Address and return the type of
+  /// memory contents (such as a possible jump table).
+  MemoryContentsType analyzeMemoryAt(uint64_t Address, BinaryFunction &BF);
+
+  /// Return a value of the global \p Symbol or an error if the value
+  /// was not set.
+  ErrorOr<uint64_t> getSymbolValue(const MCSymbol &Symbol) const {
+    const BinaryData *BD = getBinaryDataByName(Symbol.getName());
+    if (!BD)
+      return std::make_error_code(std::errc::bad_address);
+    return BD->getAddress();
+  }
+
+  /// Return a global symbol registered at a given \p Address and \p Size.
+  /// If no symbol exists, create one with unique name using \p Prefix.
+  /// If there are multiple symbols registered at the \p Address, then
+  /// return the first one.
+  MCSymbol *getOrCreateGlobalSymbol(uint64_t Address,
+                                    Twine Prefix,
+                                    uint64_t Size = 0,
+                                    uint16_t Alignment = 0,
+                                    unsigned Flags = 0);
+
+  /// Create a global symbol without registering an address.
+  MCSymbol *getOrCreateUndefinedGlobalSymbol(StringRef Name);
+
+  /// Register a symbol with \p Name at a given \p Address using \p Size,
+  /// \p Alignment, and \p Flags. See llvm::SymbolRef::Flags for the definition
+  /// of \p Flags.
+  MCSymbol *registerNameAtAddress(StringRef Name,
+                                  uint64_t Address,
+                                  uint64_t Size,
+                                  uint16_t Alignment,
+                                  unsigned Flags = 0);
+
+  /// Return BinaryData registered at a given \p Address or nullptr if no
+  /// global symbol was registered at the location.
+  const BinaryData *getBinaryDataAtAddress(uint64_t Address) const {
+    auto NI = BinaryDataMap.find(Address);
+    return NI != BinaryDataMap.end() ? NI->second : nullptr;
+  }
+
+  BinaryData *getBinaryDataAtAddress(uint64_t Address) {
+    auto NI = BinaryDataMap.find(Address);
+    return NI != BinaryDataMap.end() ? NI->second : nullptr;
+  }
+
+  /// Look up the symbol entry that contains the given \p Address (based on
+  /// the start address and size for each symbol). Returns a pointer to
+  /// the BinaryData for that symbol. If no data is found, nullptr is returned.
+  const BinaryData *
+  getBinaryDataContainingAddress(uint64_t Address) const {
+    return getBinaryDataContainingAddressImpl(Address);
+  }
+
+  BinaryData *getBinaryDataContainingAddress(uint64_t Address) {
+    return const_cast<BinaryData *>(
+        getBinaryDataContainingAddressImpl(Address));
+  }
+
+  /// Return BinaryData for the given \p Name or nullptr if no
+  /// global symbol with that name exists.
+  const BinaryData *getBinaryDataByName(StringRef Name) const {
+    auto Itr = GlobalSymbols.find(Name);
+    return Itr != GlobalSymbols.end() ? Itr->second : nullptr;
+  }
+
+  BinaryData *getBinaryDataByName(StringRef Name) {
+    auto Itr = GlobalSymbols.find(Name);
+    return Itr != GlobalSymbols.end() ? Itr->second : nullptr;
+  }
+
+  /// Return true if \p SymbolName was generated internally and was not present
+  /// in the input binary.
+  bool isInternalSymbolName(const StringRef Name) {
+    return Name.startswith("SYMBOLat") ||
+           Name.startswith("DATAat") ||
+           Name.startswith("HOLEat");
+  }
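isInternalSymbolName() relies on the convention that every symbol BOLT invents embeds one of these prefixes followed by its location. A sketch of the idea (the exact generated spelling comes from getOrCreateGlobalSymbol() and the hash naming in generateSymbolHashes(); the name shown is illustrative):

    // A BOLT-generated symbol for an anonymous object at a hypothetical
    // address is named along the lines of "SYMBOLat0x400200", so:
    MCSymbol *S = BC.getOrCreateGlobalSymbol(0x400200, "SYMBOLat");
    assert(BC.isInternalSymbolName(S->getName()));
    assert(!BC.isInternalSymbolName("main"));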
+  MCSymbol *getHotTextStartSymbol() const {
+    return Ctx->getOrCreateSymbol("__hot_start");
+  }
+
+  MCSymbol *getHotTextEndSymbol() const {
+    return Ctx->getOrCreateSymbol("__hot_end");
+  }
+
+  MCSection *getTextSection() const {
+    return MOFI->getTextSection();
+  }
+
+  /// Return code section with a given name.
+  MCSection *getCodeSection(StringRef SectionName) const {
+    if (isELF())
+      return Ctx->getELFSection(SectionName, ELF::SHT_PROGBITS,
+                                ELF::SHF_EXECINSTR | ELF::SHF_ALLOC);
+    else
+      return Ctx->getMachOSection("__TEXT", SectionName,
+                                  MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                  SectionKind::getText());
+  }
+
+  /// Return data section with a given name.
+  MCSection *getDataSection(StringRef SectionName) const {
+    return Ctx->getELFSection(SectionName, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+  }
+
+  /// \name Pre-assigned Section Names
+  /// @{
+
+  const char *getMainCodeSectionName() const {
+    return ".text";
+  }
+
+  const char *getColdCodeSectionName() const {
+    return ".text.cold";
+  }
+
+  const char *getHotTextMoverSectionName() const {
+    return ".text.mover";
+  }
+
+  const char *getInjectedCodeSectionName() const {
+    return ".text.injected";
+  }
+
+  const char *getInjectedColdCodeSectionName() const {
+    return ".text.injected.cold";
+  }
+
+  ErrorOr<BinarySection &> getGdbIndexSection() const {
+    return getUniqueSectionByName(".gdb_index");
+  }
+
+  /// @}
+
+  /// Register \p TargetFunction as fragment of \p Function.
+  void registerFragment(BinaryFunction &TargetFunction,
+                        BinaryFunction &Function) const;
+
+  /// Resolve inter-procedural dependencies from \p Function.
+  void processInterproceduralReferences(BinaryFunction &Function);
+
+  /// Perform any necessary post processing on the symbol table after
+  /// function disassembly is complete. This processing fixes top
+  /// level data holes and makes sure the symbol table is valid.
+  /// It also assigns all memory profiling info to the appropriate
+  /// BinaryData objects.
+  void postProcessSymbolTable();
+
+  /// Set the size of the global symbol located at \p Address. Return
+  /// false if no symbol exists, true otherwise.
+  bool setBinaryDataSize(uint64_t Address, uint64_t Size);
+
+  /// Print the global symbol table.
+  void printGlobalSymbols(raw_ostream &OS) const;
+
+  /// Register information about the given \p Section so we can look up
+  /// sections by address.
+  BinarySection &registerSection(SectionRef Section);
+
+  /// Register a copy of \p OriginalSection under a different name.
+  BinarySection &registerSection(StringRef SectionName,
+                                 const BinarySection &OriginalSection);
+
+  /// Register or update the information for the section with the given
+  /// \p Name. If the section already exists, the information in the
+  /// section will be updated with the new data.
+  BinarySection &registerOrUpdateSection(StringRef Name,
+                                         unsigned ELFType,
+                                         unsigned ELFFlags,
+                                         uint8_t *Data = nullptr,
+                                         uint64_t Size = 0,
+                                         unsigned Alignment = 1);
+
+  /// Register the information for the note (non-allocatable) section
+  /// with the given \p Name. If the section already exists, the
+  /// information in the section will be updated with the new data.
+  BinarySection &
+  registerOrUpdateNoteSection(StringRef Name,
+                              uint8_t *Data = nullptr,
+                              uint64_t Size = 0,
+                              unsigned Alignment = 1,
+                              bool IsReadOnly = true,
+                              unsigned ELFType = ELF::SHT_PROGBITS) {
+    return registerOrUpdateSection(Name, ELFType,
+                                   BinarySection::getFlags(IsReadOnly),
+                                   Data, Size, Alignment);
+  }
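A hypothetical sketch of the note-section path, registering a small non-allocatable blob (the section name and contents are made up; this mirrors the way BOLT registers new metadata sections):

    // Register a read-only, non-allocatable section holding a copied buffer.
    StringRef Contents = "example-metadata";
    auto *Data = new uint8_t[Contents.size()];
    memcpy(Data, Contents.data(), Contents.size());
    BinarySection &Note = BC.registerOrUpdateNoteSection(
        ".note.bolt_example", Data, Contents.size(), /*Alignment=*/1);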
+  /// Remove the given \p Section from the set of all sections. Return
+  /// true if the section was removed (and deleted), otherwise false.
+  bool deregisterSection(BinarySection &Section);
+
+  /// Iterate over all registered sections.
+  iterator_range<FilteredSectionIterator> sections() {
+    auto notNull = [](const SectionIterator &Itr) {
+      return (bool)*Itr;
+    };
+    return make_range(FilteredSectionIterator(notNull,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(notNull,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Iterate over all registered sections.
+  iterator_range<FilteredSectionConstIterator> sections() const {
+    return const_cast<BinaryContext *>(this)->sections();
+  }
+
+  /// Iterate over all registered allocatable sections.
+  iterator_range<FilteredSectionIterator> allocatableSections() {
+    auto isAllocatable = [](const SectionIterator &Itr) {
+      return *Itr && Itr->isAllocatable();
+    };
+    return make_range(FilteredSectionIterator(isAllocatable,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(isAllocatable,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Iterate over all registered code sections.
+  iterator_range<FilteredSectionIterator> textSections() {
+    auto isText = [](const SectionIterator &Itr) {
+      return *Itr && Itr->isAllocatable() && Itr->isText();
+    };
+    return make_range(FilteredSectionIterator(isText,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(isText,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Iterate over all registered allocatable sections.
+  iterator_range<FilteredSectionConstIterator> allocatableSections() const {
+    return const_cast<BinaryContext *>(this)->allocatableSections();
+  }
+
+  /// Iterate over all registered non-allocatable sections.
+  iterator_range<FilteredSectionIterator> nonAllocatableSections() {
+    auto notAllocated = [](const SectionIterator &Itr) {
+      return *Itr && !Itr->isAllocatable();
+    };
+    return make_range(FilteredSectionIterator(notAllocated,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(notAllocated,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Iterate over all registered non-allocatable sections.
+  iterator_range<FilteredSectionConstIterator> nonAllocatableSections() const {
+    return const_cast<BinaryContext *>(this)->nonAllocatableSections();
+  }
+
+  /// Iterate over all allocatable relocation sections.
+  iterator_range<FilteredSectionIterator> allocatableRelaSections() {
+    auto isAllocatableRela = [](const SectionIterator &Itr) {
+      return *Itr && Itr->isAllocatable() && Itr->isRela();
+    };
+    return make_range(FilteredSectionIterator(isAllocatableRela,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(isAllocatableRela,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Check if the address belongs to this binary's static allocation space.
+  bool containsAddress(uint64_t Address) const {
+    return Address >= FirstAllocAddress && Address < LayoutStartAddress;
+  }
+
+  /// Return section name containing the given \p Address.
+  ErrorOr<StringRef> getSectionNameForAddress(uint64_t Address) const;
+
+  /// Print all sections.
+  void printSections(raw_ostream &OS) const;
+
+  /// Return largest section containing the given \p Address. These
+  /// functions only work for allocatable sections, i.e. ones with non-zero
+  /// addresses.
+  ErrorOr<BinarySection &> getSectionForAddress(uint64_t Address);
+  ErrorOr<const BinarySection &> getSectionForAddress(uint64_t Address) const {
+    return const_cast<BinaryContext *>(this)->getSectionForAddress(Address);
+  }
+
+  /// Return section(s) associated with given \p Name.
+  iterator_range<NameToSectionMapType::iterator>
+  getSectionByName(StringRef Name) {
+    return make_range(NameToSection.equal_range(std::string(Name)));
+  }
+  iterator_range<NameToSectionMapType::const_iterator>
+  getSectionByName(StringRef Name) const {
+    return make_range(NameToSection.equal_range(std::string(Name)));
+  }
+
+  /// Return the unique section associated with given \p Name.
+  /// If there is more than one section with the same name, return an error
+  /// object.
+  ErrorOr<BinarySection &> getUniqueSectionByName(StringRef SectionName) const {
+    auto Sections = getSectionByName(SectionName);
+    if (Sections.begin() != Sections.end() &&
+        std::next(Sections.begin()) == Sections.end())
+      return *Sections.begin()->second;
+    return std::make_error_code(std::errc::bad_address);
+  }
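All of these ranges are the FilterIterator from earlier in this header instantiated with a different predicate, so typical use is a plain range-for. A sketch against a populated BinaryContext `BC`:

    // Dump the names of all executable sections.
    for (BinarySection &Section : BC.textSections())
      outs() << Section.getName() << '\n';

    // Unique-name lookup returns ErrorOr, so a miss is testable:
    if (ErrorOr<BinarySection &> Text = BC.getUniqueSectionByName(".text"))
      outs() << ".text at 0x" << Twine::utohexstr(Text->getAddress()) << '\n';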
+  /// Return an unsigned value of \p Size stored at \p Address. The address
+  /// has to be a valid statically allocated address for the binary.
+  ErrorOr<uint64_t> getUnsignedValueAtAddress(uint64_t Address,
+                                              size_t Size) const;
+
+  /// Return a signed value of \p Size stored at \p Address. The address has
+  /// to be a valid statically allocated address for the binary.
+  ErrorOr<int64_t> getSignedValueAtAddress(uint64_t Address,
+                                           size_t Size) const;
+
+  /// Special case of getUnsignedValueAtAddress() that uses a pointer size.
+  ErrorOr<uint64_t> getPointerAtAddress(uint64_t Address) const {
+    return getUnsignedValueAtAddress(Address, AsmInfo->getCodePointerSize());
+  }
+
+  /// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is
+  /// then removed from the list of functions \p BFs. The profile data of
+  /// \p ChildBF is merged into that of \p ParentBF. This function is thread
+  /// safe.
+  void foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF);
+
+  /// Add a Section relocation at a given \p Address.
+  void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type,
+                     uint64_t Addend = 0, uint64_t Value = 0);
+
+  /// Return a relocation registered at a given \p Address, or nullptr if
+  /// there is no relocation at such address.
+  const Relocation *getRelocationAt(uint64_t Address);
+
+  /// Register a presence of PC-relative relocation at the given \p Address.
+  void addPCRelativeDataRelocation(uint64_t Address) {
+    DataPCRelocations.emplace(Address);
+  }
+
+  /// Register dynamic relocation at \p Address.
+  void addDynamicRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type,
+                            uint64_t Addend, uint64_t Value = 0);
+
+  /// Return a dynamic relocation registered at a given \p Address, or nullptr
+  /// if there is no dynamic relocation at such address.
+  const Relocation *getDynamicRelocationAt(uint64_t Address);
+
+  /// Remove registered relocation at a given \p Address.
+  bool removeRelocationAt(uint64_t Address);
+
+  /// This function makes sure that symbols referenced by ambiguous relocations
+  /// are marked as immovable. For now, if a section relocation points at the
+  /// boundary between two symbols then those symbols are marked as immovable.
+  void markAmbiguousRelocations(BinaryData &BD, const uint64_t Address);
+
+  /// Return BinaryFunction corresponding to \p Symbol. If \p EntryDesc is not
+  /// nullptr, set it to the entry discriminator corresponding to \p Symbol
+  /// (0 for single-entry functions). This function is thread safe.
+  BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol,
+                                       uint64_t *EntryDesc = nullptr);
+
+  const BinaryFunction *getFunctionForSymbol(
+      const MCSymbol *Symbol, uint64_t *EntryDesc = nullptr) const {
+    return const_cast<BinaryContext *>(this)->
+        getFunctionForSymbol(Symbol, EntryDesc);
+  }
+
+  /// Associate the symbol \p Sym with the function \p BF for lookups with
+  /// getFunctionForSymbol().
+  void setSymbolToFunctionMap(const MCSymbol *Sym, BinaryFunction *BF) {
+    SymbolToFunctionMap[Sym] = BF;
+  }
+
+  /// Populate some internal data structures with debug info.
+  void preprocessDebugInfo();
+
+  /// Add a filename entry from SrcCUID to DestCUID.
+  unsigned addDebugFilenameToUnit(const uint32_t DestCUID,
+                                  const uint32_t SrcCUID,
+                                  unsigned FileIndex);
+
+  /// Return functions in output layout order.
+  std::vector<BinaryFunction *> getSortedFunctions();
+
+  /// Do the best effort to calculate the size of the function by emitting
+  /// its code, and relaxing branch instructions. By default, branch
+  /// instructions are updated to match the layout. Pass \p FixBranches set to
+  /// false if the branches are known to be up to date with the code layout.
+  ///
+  /// Return the pair where the first size is for the main part, and the
+  /// second size is for the cold one.
+  std::pair<size_t, size_t>
+  calculateEmittedSize(BinaryFunction &BF, bool FixBranches = true);
+
+  /// Calculate the size of the instruction \p Inst optionally using a
+  /// user-supplied emitter for lock-free multi-thread work. MCCodeEmitter is
+  /// not thread-safe and each thread should operate with its own copy of it.
+  uint64_t
+  computeInstructionSize(const MCInst &Inst,
+                         const MCCodeEmitter *Emitter = nullptr) const {
+    if (!Emitter)
+      Emitter = this->MCE.get();
+    SmallString<256> Code;
+    SmallVector<MCFixup, 4> Fixups;
+    raw_svector_ostream VecOS(Code);
+    Emitter->encodeInstruction(Inst, VecOS, Fixups, *STI);
+    return Code.size();
+  }
+
+  /// Compute the native code size for a range of instructions.
+  /// Note: this can be imprecise with respect to the final binary since it
+  /// happens prior to relaxation, and with respect to the original binary
+  /// because of opcode shortening. MCCodeEmitter is not thread-safe and each
+  /// thread should operate with its own copy of it.
+  template <typename Itr>
+  uint64_t computeCodeSize(Itr Beg, Itr End,
+                           const MCCodeEmitter *Emitter = nullptr) const {
+    uint64_t Size = 0;
+    while (Beg != End) {
+      if (!MIB->isPseudo(*Beg))
+        Size += computeInstructionSize(*Beg, Emitter);
+      ++Beg;
+    }
+    return Size;
+  }
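Since the shared MCE cannot be used concurrently, a worker thread is expected to pair computeCodeSize() with its own emitter from createIndependentMCCodeEmitter(), declared further down in this class. A sketch, with `Instructions` standing in for any container of MCInst:

    // Per-thread sizing without touching the shared code emitter.
    BinaryContext::IndependentCodeEmitter Local =
        BC.createIndependentMCCodeEmitter();
    uint64_t Bytes = BC.computeCodeSize(Instructions.begin(),
                                        Instructions.end(), Local.MCE.get());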
+  /// Verify that assembling instruction \p Inst results in the same sequence
+  /// of bytes as \p Encoding.
+  bool validateEncoding(const MCInst &Instruction,
+                        ArrayRef<uint8_t> Encoding) const;
+
+  /// Return a function execution count threshold for determining whether
+  /// the function is 'hot'. Consider it hot if count is above the average exec
+  /// count of profiled functions.
+  uint64_t getHotThreshold() const;
+
+  /// Return true if instruction \p Inst requires an offset for further
+  /// processing (e.g. assigning a profile).
+  bool keepOffsetForInstruction(const MCInst &Inst) const {
+    if (MIB->isCall(Inst) || MIB->isBranch(Inst) || MIB->isReturn(Inst) ||
+        MIB->isPrefix(Inst) || MIB->isIndirectBranch(Inst)) {
+      return true;
+    }
+    return false;
+  }
+
+  /// Return true if the function should be emitted to the output file.
+  bool shouldEmit(const BinaryFunction &Function) const;
+
+  /// Print the string name for a CFI operation.
+  static void printCFI(raw_ostream &OS, const MCCFIInstruction &Inst);
+
+  /// Print a single MCInst in native format. If Function is non-null,
+  /// the instruction will be annotated with CFI and possibly DWARF line table
+  /// info.
+  /// If PrintMCInst is true, the instruction is also printed in the
+  /// architecture independent format.
+  void printInstruction(raw_ostream &OS,
+                        const MCInst &Instruction,
+                        uint64_t Offset = 0,
+                        const BinaryFunction *Function = nullptr,
+                        bool PrintMCInst = false,
+                        bool PrintMemData = false,
+                        bool PrintRelocations = false) const;
+
+  /// Print a range of instructions.
+  template <typename Itr>
+  uint64_t printInstructions(raw_ostream &OS,
+                             Itr Begin,
+                             Itr End,
+                             uint64_t Offset = 0,
+                             const BinaryFunction *Function = nullptr,
+                             bool PrintMCInst = false,
+                             bool PrintMemData = false,
+                             bool PrintRelocations = false) const {
+    while (Begin != End) {
+      printInstruction(OS, *Begin, Offset, Function, PrintMCInst,
+                       PrintMemData, PrintRelocations);
+      Offset += computeCodeSize(Begin, Begin + 1);
+      ++Begin;
+    }
+    return Offset;
+  }
+
+  void exitWithBugReport(StringRef Message,
+                         const BinaryFunction &Function) const;
+
+  struct IndependentCodeEmitter {
+    std::unique_ptr<MCObjectFileInfo> LocalMOFI;
+    std::unique_ptr<MCContext> LocalCtx;
+    std::unique_ptr<MCCodeEmitter> MCE;
+  };
+
+  /// Encapsulates an independent MCCodeEmitter that doesn't share resources
+  /// with the main one available through BinaryContext::MCE, managed by
+  /// BinaryContext.
+  /// This is intended to create a lock-free environment for an auxiliary
+  /// thread that needs to perform work with an MCCodeEmitter that can be
+  /// transient or won't be used in the main code emitter.
+  IndependentCodeEmitter createIndependentMCCodeEmitter() const {
+    IndependentCodeEmitter MCEInstance;
+    MCEInstance.LocalCtx.reset(
+        new MCContext(*TheTriple, AsmInfo.get(), MRI.get(), STI.get()));
+    MCEInstance.LocalMOFI.reset(
+        TheTarget->createMCObjectFileInfo(*MCEInstance.LocalCtx.get(),
+                                          /*PIC=*/!HasFixedLoadAddress));
+    MCEInstance.LocalCtx->setObjectFileInfo(MCEInstance.LocalMOFI.get());
+    MCEInstance.MCE.reset(
+        TheTarget->createMCCodeEmitter(*MII, *MRI, *MCEInstance.LocalCtx));
+    return MCEInstance;
+  }
+
+  /// Create an MCStreamer instance.
+  std::unique_ptr<MCStreamer>
+  createStreamer(llvm::raw_pwrite_stream &OS) const {
+    MCCodeEmitter *MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *Ctx);
+    MCAsmBackend *MAB =
+        TheTarget->createMCAsmBackend(*STI, *MRI, MCTargetOptions());
+    std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(OS);
+    std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
+        *TheTriple, *Ctx, std::unique_ptr<MCAsmBackend>(MAB), std::move(OW),
+        std::unique_ptr<MCCodeEmitter>(MCE), *STI,
+        /* RelaxAll */ false,
+        /* IncrementalLinkerCompatible */ false,
+        /* DWARFMustBeAtTheEnd */ false));
+    return Streamer;
+  }
+};
+
+template <typename T, typename = std::enable_if_t<sizeof(T) == 1>>
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const ArrayRef<T> &ByteArray) {
+  const char *Sep = "";
+  for (const auto Byte : ByteArray) {
+    OS << Sep << format("%.2x", Byte);
+    Sep = " ";
+  }
+  return OS;
+}
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinaryData.cpp b/bolt/src/BinaryData.cpp
new file mode 100644
index 000000000000..8a2fbe41ef80
--- /dev/null
+++ b/bolt/src/BinaryData.cpp
@@ -0,0 +1,159 @@
+//===--- BinaryData.cpp - Representation of section data objects ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryData.h"
+#include "BinarySection.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Regex.h"
+
+using namespace llvm;
+using namespace bolt;
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+namespace opts {
+extern cl::OptionCategory BoltCategory;
+extern cl::opt<unsigned> Verbosity;
+
+cl::opt<bool>
+PrintSymbolAliases("print-aliases",
+  cl::desc("print aliases when printing objects"),
+  cl::Hidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+}
+
+bool BinaryData::isAbsolute() const {
+  return Flags & SymbolRef::SF_Absolute;
+}
+
+bool BinaryData::isMoveable() const {
+  return (!isAbsolute() &&
+          (IsMoveable &&
+           (!Parent || isTopLevelJumpTable())));
+}
+
+void BinaryData::merge(const BinaryData *Other) {
+  assert(!Size || !Other->Size || Size == Other->Size);
+  assert(Address == Other->Address);
+  assert(*Section == *Other->Section);
+  assert(OutputOffset == Other->OutputOffset);
+  assert(OutputSection == Other->OutputSection);
+  Symbols.insert(Symbols.end(), Other->Symbols.begin(), Other->Symbols.end());
+  Flags |= Other->Flags;
+  if (!Size)
+    Size = Other->Size;
+}
+
+bool BinaryData::hasName(StringRef Name) const {
+  for (const MCSymbol *Symbol : Symbols) {
+    if (Name == Symbol->getName())
+      return true;
+  }
+  return false;
+}
+
+bool BinaryData::hasNameRegex(StringRef NameRegex) const {
+  Regex MatchName(NameRegex);
+  for (const MCSymbol *Symbol : Symbols) {
+    if (MatchName.match(Symbol->getName()))
+      return true;
+  }
+  return false;
+}
+
+bool BinaryData::nameStartsWith(StringRef Prefix) const {
+  for (const MCSymbol *Symbol : Symbols) {
+    if (Symbol->getName().startswith(Prefix))
+      return true;
+  }
+  return false;
+}
+
+StringRef BinaryData::getSectionName() const {
+  return getSection().getName();
+}
+
+StringRef BinaryData::getOutputSectionName() const {
+  return getOutputSection().getName();
+}
+
+uint64_t BinaryData::getOutputAddress() const {
+  assert(OutputSection->getOutputAddress());
+  return OutputSection->getOutputAddress() + OutputOffset;
+}
+
+uint64_t BinaryData::getOffset() const {
+  return Address - getSection().getAddress();
+}
+
+void BinaryData::setSection(BinarySection &NewSection) {
+  if (OutputSection == Section)
+    OutputSection = &NewSection;
+  Section = &NewSection;
+}
+
+bool BinaryData::isMoved() const {
+  return (getOffset() != OutputOffset || OutputSection != Section);
+}
+
+void BinaryData::print(raw_ostream &OS) const {
+  printBrief(OS);
+}
+
+void BinaryData::printBrief(raw_ostream &OS) const {
+  OS << "(";
+
+  if (isJumpTable())
+    OS << "jump-table: ";
+  else
+    OS << "object: ";
+
+  OS << getName();
+
+  if ((opts::PrintSymbolAliases || opts::Verbosity > 1) && Symbols.size() > 1) {
+    OS << ", aliases:";
+    for (unsigned I = 1u; I < Symbols.size(); ++I) {
" (" : ", ") << Symbols[I]->getName(); + } + OS << ")"; + } + + if (Parent) { + OS << " (parent: "; + Parent->printBrief(OS); + OS << ")"; + } + + OS << ", 0x" << Twine::utohexstr(getAddress()) + << ":0x" << Twine::utohexstr(getEndAddress()) + << "/" << getSize() << "/" << getAlignment() + << "/0x" << Twine::utohexstr(Flags); + + OS << ")"; +} + +BinaryData::BinaryData(MCSymbol &Symbol, + uint64_t Address, + uint64_t Size, + uint16_t Alignment, + BinarySection &Section, + unsigned Flags) +: Section(&Section), + Address(Address), + Size(Size), + Alignment(Alignment), + Flags(Flags), + OutputSection(&Section), + OutputOffset(getOffset()) +{ + Symbols.push_back(&Symbol); +} diff --git a/bolt/src/BinaryData.h b/bolt/src/BinaryData.h new file mode 100644 index 000000000000..563d30fc33c6 --- /dev/null +++ b/bolt/src/BinaryData.h @@ -0,0 +1,246 @@ +//===--- BinaryData.h - Representation of section data objects -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H + +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +namespace llvm { +namespace bolt { + +class BinarySection; + +/// \p BinaryData represents an indivisible part of a data section section. +/// BinaryData's may contain sub-components, e.g. jump tables but they are +/// considered to be part of the parent symbol in terms of divisibility and +/// reordering. +class BinaryData { + friend class BinaryContext; + /// Non-null if this BinaryData is contained in a larger BinaryData object, + /// i.e. the start and end addresses are contained within another object. + BinaryData *Parent{nullptr}; + + // non-copyable + BinaryData() = delete; + BinaryData(const BinaryData &) = delete; + BinaryData &operator=(const BinaryData &) = delete; + +protected: + /// All symbols associated with this data. + std::vector Symbols; + + /// Section this data belongs to. + BinarySection *Section{nullptr}; + + /// Start address of this symbol. + uint64_t Address{0}; + /// Size of this data (can be 0). + uint64_t Size{0}; + /// Alignment of this data. + uint16_t Alignment{1}; + + bool IsMoveable{true}; + + /// Symbol flags (same as llvm::SymbolRef::Flags) + unsigned Flags{0}; + + /// Output section for this data if it has been moved from the original + /// section. + BinarySection *OutputSection{nullptr}; + + /// The offset of this symbol in the output section. This is different + /// from \p Address - Section.getAddress() when the data has been reordered. 
diff --git a/bolt/src/BinaryData.h b/bolt/src/BinaryData.h
new file mode 100644
index 000000000000..563d30fc33c6
--- /dev/null
+++ b/bolt/src/BinaryData.h
@@ -0,0 +1,246 @@
+//===--- BinaryData.h - Representation of section data objects -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+namespace llvm {
+namespace bolt {
+
+class BinarySection;
+
+/// \p BinaryData represents an indivisible part of a data section. BinaryData
+/// objects may contain sub-components, e.g. jump tables, but they are
+/// considered to be part of the parent symbol in terms of divisibility and
+/// reordering.
+class BinaryData {
+  friend class BinaryContext;
+  /// Non-null if this BinaryData is contained in a larger BinaryData object,
+  /// i.e. the start and end addresses are contained within another object.
+  BinaryData *Parent{nullptr};
+
+  // non-copyable
+  BinaryData() = delete;
+  BinaryData(const BinaryData &) = delete;
+  BinaryData &operator=(const BinaryData &) = delete;
+
+protected:
+  /// All symbols associated with this data.
+  std::vector<MCSymbol *> Symbols;
+
+  /// Section this data belongs to.
+  BinarySection *Section{nullptr};
+
+  /// Start address of this symbol.
+  uint64_t Address{0};
+  /// Size of this data (can be 0).
+  uint64_t Size{0};
+  /// Alignment of this data.
+  uint16_t Alignment{1};
+
+  bool IsMoveable{true};
+
+  /// Symbol flags (same as llvm::SymbolRef::Flags).
+  unsigned Flags{0};
+
+  /// Output section for this data if it has been moved from the original
+  /// section.
+  BinarySection *OutputSection{nullptr};
+
+  /// The offset of this symbol in the output section. This is different
+  /// from \p Address - Section.getAddress() when the data has been reordered.
+  uint64_t OutputOffset{0};
+
+  BinaryData *getRootData() {
+    BinaryData *BD = this;
+    while (BD->Parent)
+      BD = BD->Parent;
+    return BD;
+  }
+
+public:
+  BinaryData(BinaryData &&) = default;
+  BinaryData(MCSymbol &Symbol,
+             uint64_t Address,
+             uint64_t Size,
+             uint16_t Alignment,
+             BinarySection &Section,
+             unsigned Flags = 0);
+  virtual ~BinaryData() { }
+
+  virtual bool isJumpTable() const { return false; }
+  virtual bool isObject() const { return !isJumpTable(); }
+  virtual void merge(const BinaryData *Other);
+
+  bool isTopLevelJumpTable() const {
+    return (isJumpTable() &&
+            (!Parent || (!Parent->Parent && Parent->isObject())));
+  }
+
+  // BinaryData that is considered atomic and potentially moveable. All
+  // MemInfo data and relocations should be wrt. to atomic data.
+  bool isAtomic() const {
+    return isTopLevelJumpTable() || !Parent;
+  }
+
+  iterator_range<std::vector<MCSymbol *>::const_iterator> symbols() const {
+    return make_range(Symbols.begin(), Symbols.end());
+  }
+
+  StringRef getName() const { return getSymbol()->getName(); }
+
+  MCSymbol *getSymbol() { return Symbols.front(); }
+  const MCSymbol *getSymbol() const { return Symbols.front(); }
+
+  const std::vector<MCSymbol *> &getSymbols() const { return Symbols; }
+  std::vector<MCSymbol *> &getSymbols() { return Symbols; }
+
+  bool hasName(StringRef Name) const;
+  bool hasNameRegex(StringRef Name) const;
+  bool nameStartsWith(StringRef Prefix) const;
+
+  bool hasSymbol(const MCSymbol *Symbol) const {
+    return std::find(Symbols.begin(), Symbols.end(), Symbol) != Symbols.end();
+  }
+
+  bool isAbsolute() const;
+  bool isMoveable() const;
+
+  uint64_t getAddress() const { return Address; }
+  uint64_t getEndAddress() const { return Address + Size; }
+  uint64_t getOffset() const;
+  uint64_t getSize() const { return Size; }
+  uint16_t getAlignment() const { return Alignment; }
+
+  BinarySection &getSection() { return *Section; }
+  const BinarySection &getSection() const { return *Section; }
+  StringRef getSectionName() const;
+
+  BinarySection &getOutputSection() { return *OutputSection; }
+  const BinarySection &getOutputSection() const { return *OutputSection; }
+  StringRef getOutputSectionName() const;
+  uint64_t getOutputAddress() const;
+  uint64_t getOutputOffset() const { return OutputOffset; }
+  uint64_t getOutputSize() const { return Size; }
+
+  bool isMoved() const;
+  bool containsAddress(uint64_t Address) const {
+    return ((getAddress() <= Address && Address < getEndAddress()) ||
+            (getAddress() == Address && !getSize()));
+  }
+  bool containsRange(uint64_t Address, uint64_t Size) const {
+    return containsAddress(Address) && Address + Size <= getEndAddress();
+  }
+
+  const BinaryData *getParent() const {
+    return Parent;
+  }
+
+  const BinaryData *getRootData() const {
+    const BinaryData *BD = this;
+    while (BD->Parent)
+      BD = BD->Parent;
+    return BD;
+  }
+
+  BinaryData *getAtomicRoot() {
+    BinaryData *BD = this;
+    while (!BD->isAtomic() && BD->Parent)
+      BD = BD->Parent;
+    return BD;
+  }
+
+  const BinaryData *getAtomicRoot() const {
+    const BinaryData *BD = this;
+    while (!BD->isAtomic() && BD->Parent)
+      BD = BD->Parent;
+    return BD;
+  }
+
+  bool isAncestorOf(const BinaryData *BD) const {
+    return Parent && (Parent == BD || Parent->isAncestorOf(BD));
+  }
+
+  void setIsMoveable(bool Flag) { IsMoveable = Flag; }
+  void setSection(BinarySection &NewSection);
+  void setOutputSection(BinarySection &NewSection) {
+    OutputSection = &NewSection;
+  }
+  void setOutputOffset(uint64_t Offset) { OutputOffset = Offset; }
+
+  void setIsMoveable(bool Flag) { IsMoveable = Flag; }
+  void setSection(BinarySection &NewSection);
+  void setOutputSection(BinarySection &NewSection) {
+    OutputSection = &NewSection;
+  }
+  void setOutputOffset(uint64_t Offset) { OutputOffset = Offset; }
+  void setOutputLocation(BinarySection &NewSection, uint64_t NewOffset) {
+    setOutputSection(NewSection);
+    setOutputOffset(NewOffset);
+  }
+
+  virtual void printBrief(raw_ostream &OS) const;
+  virtual void print(raw_ostream &OS) const;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const BinaryData &BD) {
+  BD.printBrief(OS);
+  return OS;
+}
+
+/// Address access info used for memory profiling.
+struct AddressAccess {
+  BinaryData *MemoryObject; /// Object accessed or nullptr
+  uint64_t Offset;          /// Offset within the object or absolute address
+  uint64_t Count;           /// Number of accesses
+  bool operator==(const AddressAccess &Other) const {
+    return MemoryObject == Other.MemoryObject &&
+           Offset == Other.Offset &&
+           Count == Other.Count;
+  }
+};
+
+/// Aggregated memory access info per instruction.
+struct MemoryAccessProfile {
+  uint64_t NextInstrOffset;
+  SmallVector<AddressAccess, 8> AddressAccessInfo;
+  bool operator==(const MemoryAccessProfile &Other) const {
+    return NextInstrOffset == Other.NextInstrOffset &&
+           AddressAccessInfo == Other.AddressAccessInfo;
+  }
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const bolt::MemoryAccessProfile &MAP) {
+  std::string TempString;
+  raw_string_ostream SS(TempString);
+
+  const char *Sep = "\n    ";
+  uint64_t TotalCount = 0;
+  for (const AddressAccess &AccessInfo : MAP.AddressAccessInfo) {
+    SS << Sep << "{ ";
+    if (AccessInfo.MemoryObject)
+      SS << AccessInfo.MemoryObject->getName() << " + ";
+    SS << "0x" << Twine::utohexstr(AccessInfo.Offset) << ": "
+       << AccessInfo.Count << " }";
+    Sep = ",\n    ";
+    TotalCount += AccessInfo.Count;
+  }
+  SS.flush();
+
+  OS << TotalCount << " total counts : " << TempString;
+  return OS;
+}
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
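As a quick reference for the printer defined above, here is a self-contained use with made-up offsets and counts; a null MemoryObject stands for an absolute address:

    #include "BinaryData.h"
    using namespace llvm;
    using namespace bolt;

    void printSampleProfile(raw_ostream &OS) {
      MemoryAccessProfile MAP;
      MAP.NextInstrOffset = 0x10;
      MAP.AddressAccessInfo.push_back({/*MemoryObject=*/nullptr,
                                       /*Offset=*/0x400200, /*Count=*/40});
      MAP.AddressAccessInfo.push_back({/*MemoryObject=*/nullptr,
                                       /*Offset=*/0x400208, /*Count=*/2});
      // Prints: 42 total counts :
      //             { 0x400200: 40 },
      //             { 0x400208: 2 }
      OS << MAP;
    }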
diff --git a/bolt/src/BinaryEmitter.cpp b/bolt/src/BinaryEmitter.cpp
new file mode 100644
index 000000000000..76ef4b8159bc
--- /dev/null
+++ b/bolt/src/BinaryEmitter.cpp
@@ -0,0 +1,1027 @@
+//===--- BinaryEmitter.cpp - collection of functions to emit code and data ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryEmitter.h"
+#include "BinaryContext.h"
+#include "BinaryFunction.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/SMLoc.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory BoltCategory;
+extern cl::OptionCategory BoltOptCategory;
+extern cl::OptionCategory BoltRelocCategory;
+
+extern cl::opt<unsigned> AlignText;
+extern cl::opt<bool> HotText;
+extern cl::opt<JumpTableSupportLevel> JumpTables;
+extern cl::opt<bool> PreserveBlocksAlignment;
+extern cl::opt<bool> PrintCacheMetrics;
+extern cl::opt<bool> UpdateDebugSections;
+extern cl::opt<unsigned> Verbosity;
+
+cl::opt<bool>
+AlignBlocks("align-blocks",
+  cl::desc("align basic blocks"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<MacroFusionType>
+AlignMacroOpFusion("align-macro-fusion",
+  cl::desc("fix instruction alignment for macro-fusion (x86 relocation mode)"),
+  cl::init(MFT_HOT),
+  cl::values(clEnumValN(MFT_NONE, "none",
+               "do not insert alignment no-ops for macro-fusion"),
+             clEnumValN(MFT_HOT, "hot",
+               "only insert alignment no-ops on hot execution paths (default)"),
+             clEnumValN(MFT_ALL, "all",
+               "always align instructions to allow macro-fusion")),
+  cl::ZeroOrMore,
+  cl::cat(BoltRelocCategory));
+
+static cl::list<std::string>
+BreakFunctionNames("break-funcs",
+  cl::CommaSeparated,
+  cl::desc("list of functions to core dump on (debugging)"),
+  cl::value_desc("func1,func2,func3,..."),
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::list<std::string>
+FunctionPadSpec("pad-funcs",
+  cl::CommaSeparated,
+  cl::desc("list of functions to pad with amount of bytes"),
+  cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."),
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+MarkFuncs("mark-funcs",
+  cl::desc("mark function boundaries with break instruction to make "
+           "sure we don't accidentally cross them"),
+  cl::ReallyHidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+PrintJumpTables("print-jump-tables",
+  cl::desc("print jump tables"),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+X86AlignBranchBoundaryHotOnly("x86-align-branch-boundary-hot-only",
+  cl::desc("only apply branch boundary alignment in hot code"),
+  cl::init(true),
+  cl::cat(BoltOptCategory));
+
+size_t padFunction(const BinaryFunction &Function) {
+  static std::map<std::string, size_t> FunctionPadding;
+
+  if (FunctionPadding.empty() && !FunctionPadSpec.empty()) {
+    for (std::string &Spec : FunctionPadSpec) {
+      size_t N = Spec.find(':');
+      if (N == std::string::npos)
+        continue;
+      std::string Name = Spec.substr(0, N);
+      size_t Padding = std::stoull(Spec.substr(N+1));
+      FunctionPadding[Name] = Padding;
+    }
+  }
+
+  for (auto &FPI : FunctionPadding) {
+    std::string Name = FPI.first;
+    size_t Padding = FPI.second;
+    if (Function.hasNameRegex(Name)) {
+      return Padding;
+    }
+  }
+
+  return 0;
+}
+
+} // namespace opts
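The `-pad-funcs` spec parsed by `padFunction()` above maps a function-name regex to a byte count, e.g. `-pad-funcs=foo:256,bar.*:128`. A standalone mirror of the parsing step (hypothetical helper name, same logic as above):

    #include <cstddef>
    #include <map>
    #include <string>
    #include <vector>

    std::map<std::string, size_t>
    parsePadSpec(const std::vector<std::string> &Specs) {
      std::map<std::string, size_t> Padding;
      for (const std::string &Spec : Specs) {
        size_t N = Spec.find(':');
        if (N == std::string::npos)
          continue;  // malformed entry: silently skipped, as above
        Padding[Spec.substr(0, N)] = std::stoull(Spec.substr(N + 1));
      }
      return Padding;  // {"foo" -> 256, "bar.*" -> 128}
    }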
+
+namespace {
+using JumpTable = bolt::JumpTable;
+
+class BinaryEmitter {
+private:
+  BinaryEmitter(const BinaryEmitter &) = delete;
+  BinaryEmitter &operator=(const BinaryEmitter &) = delete;
+
+  MCStreamer &Streamer;
+  BinaryContext &BC;
+
+public:
+  BinaryEmitter(MCStreamer &Streamer, BinaryContext &BC)
+    : Streamer(Streamer),
+      BC(BC) {}
+
+  /// Emit all code and data.
+  void emitAll(StringRef OrgSecPrefix);
+
+  /// Emit function code. The caller is responsible for emitting function
+  /// symbol(s) and setting the section to emit the code to.
+  void emitFunctionBody(BinaryFunction &BF, bool EmitColdPart,
+                        bool EmitCodeOnly = false);
+
+private:
+  /// Emit function code.
+  void emitFunctions();
+
+  /// Emit a single function.
+  bool emitFunction(BinaryFunction &BF, bool EmitColdPart);
+
+  /// Helper for emitFunctionBody to write data inside a function
+  /// (used for AArch64)
+  void emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
+                           BinaryFunction *OnBehalfOf = nullptr);
+
+  /// Emit jump tables for the function.
+  void emitJumpTables(const BinaryFunction &BF);
+
+  /// Emit jump table data. Callee supplies sections for the data.
+  void emitJumpTable(const JumpTable &JT, MCSection *HotSection,
+                     MCSection *ColdSection);
+
+  /// Emit exception handling ranges for the function.
+  void emitLSDA(BinaryFunction &BF, bool EmitColdPart);
+
+  /// Emit line number information corresponding to \p NewLoc. \p PrevLoc
+  /// provides a context for de-duplication of line number info.
+  /// \p FirstInstr indicates if \p NewLoc represents the first instruction
+  /// in a sequence, such as a function fragment.
+  ///
+  /// Return new current location which is either \p NewLoc or \p PrevLoc.
+  SMLoc emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc, SMLoc PrevLoc,
+                     bool FirstInstr);
+
+  /// Emit debug line information for functions that were not emitted.
+  void emitDebugLineInfoForOriginalFunctions();
+
+  /// Emit data sections that have code references in them.
+  void emitDataSections(StringRef OrgSecPrefix);
+};
+
+} // anonymous namespace
+
+void BinaryEmitter::emitAll(StringRef OrgSecPrefix) {
+  Streamer.InitSections(false);
+
+  if (opts::UpdateDebugSections && BC.isELF()) {
+    // Force the emission of debug line info into an allocatable section to
+    // ensure RuntimeDyld will process it without the ProcessAllSections flag.
+    //
+    // NB: on MachO all sections are required for execution, hence no need
+    // to change flags/attributes.
+    MCSectionELF *ELFDwarfLineSection =
+        static_cast<MCSectionELF *>(BC.MOFI->getDwarfLineSection());
+    ELFDwarfLineSection->setFlags(ELF::SHF_ALLOC);
+  }
+
+  if (RuntimeLibrary *RtLibrary = BC.getRuntimeLibrary()) {
+    RtLibrary->emitBinary(BC, Streamer);
+  }
+
+  BC.getTextSection()->setAlignment(Align(opts::AlignText));
+
+  emitFunctions();
+
+  if (opts::UpdateDebugSections)
+    emitDebugLineInfoForOriginalFunctions();
+
+  emitDataSections(OrgSecPrefix);
+
+  Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("_end"));
+}
+
+void BinaryEmitter::emitFunctions() {
+  auto emit = [&](const std::vector<BinaryFunction *> &Functions) {
+    const bool HasProfile = BC.NumProfiledFuncs > 0;
+    const bool OriginalAllowAutoPadding = Streamer.getAllowAutoPadding();
+    for (BinaryFunction *Function : Functions) {
+      if (!BC.shouldEmit(*Function)) {
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "BOLT: generating code for function \"" << *Function
+                        << "\" : " << Function->getFunctionNumber() << '\n');
+
+      // Was any part of the function emitted?
+      bool Emitted = false;
+
+      // Turn off Intel JCC Erratum mitigation for cold code if requested
+      if (HasProfile && opts::X86AlignBranchBoundaryHotOnly &&
+          !Function->hasValidProfile())
+        Streamer.setAllowAutoPadding(false);
+
+      Emitted |= emitFunction(*Function, /*EmitColdPart=*/false);
+
+      if (Function->isSplit()) {
+        if (opts::X86AlignBranchBoundaryHotOnly)
+          Streamer.setAllowAutoPadding(false);
+        Emitted |= emitFunction(*Function, /*EmitColdPart=*/true);
+      }
+      Streamer.setAllowAutoPadding(OriginalAllowAutoPadding);
+
+      if (Emitted)
+        Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics);
+    }
+  };
+
+  // Mark the start of hot text.
+  if (opts::HotText) {
+    Streamer.SwitchSection(BC.getTextSection());
+    Streamer.emitLabel(BC.getHotTextStartSymbol());
+  }
+
+  // Emit functions in sorted order.
+  std::vector<BinaryFunction *> SortedFunctions = BC.getSortedFunctions();
+  emit(SortedFunctions);
+
+  // Emit functions added by BOLT.
+  emit(BC.getInjectedBinaryFunctions());
+
+  // Mark the end of hot text.
+  if (opts::HotText) {
+    Streamer.SwitchSection(BC.getTextSection());
+    Streamer.emitLabel(BC.getHotTextEndSymbol());
+  }
+}
+
+bool BinaryEmitter::emitFunction(BinaryFunction &Function, bool EmitColdPart) {
+  if (Function.size() == 0)
+    return false;
+
+  if (Function.getState() == BinaryFunction::State::Empty)
+    return false;
+
+  MCSection *Section =
+      BC.getCodeSection(EmitColdPart ? Function.getColdCodeSectionName()
+                                     : Function.getCodeSectionName());
+  Streamer.SwitchSection(Section);
+  Section->setHasInstructions(true);
+  BC.Ctx->addGenDwarfSection(Section);
+
+  if (BC.HasRelocations) {
+    Streamer.emitCodeAlignment(BinaryFunction::MinAlign);
+    uint16_t MaxAlignBytes = EmitColdPart
+      ? Function.getMaxColdAlignmentBytes()
+      : Function.getMaxAlignmentBytes();
+    if (MaxAlignBytes > 0)
+      Streamer.emitCodeAlignment(Function.getAlignment(), MaxAlignBytes);
+  } else {
+    Streamer.emitCodeAlignment(Function.getAlignment());
+  }
+
+  MCContext &Context = Streamer.getContext();
+  const MCAsmInfo *MAI = Context.getAsmInfo();
+
+  MCSymbol *StartSymbol = nullptr;
+
+  // Emit all symbols associated with the main function entry.
+  if (!EmitColdPart) {
+    StartSymbol = Function.getSymbol();
+    for (MCSymbol *Symbol : Function.getSymbols()) {
+      Streamer.emitSymbolAttribute(Symbol, MCSA_ELF_TypeFunction);
+      Streamer.emitLabel(Symbol);
+    }
+  } else {
+    StartSymbol = Function.getColdSymbol();
+    Streamer.emitSymbolAttribute(StartSymbol, MCSA_ELF_TypeFunction);
+    Streamer.emitLabel(StartSymbol);
+  }
+
+  // Emit CFI start
+  if (Function.hasCFI()) {
+    Streamer.emitCFIStartProc(/*IsSimple=*/false);
+    if (Function.getPersonalityFunction() != nullptr) {
+      Streamer.emitCFIPersonality(Function.getPersonalityFunction(),
+                                  Function.getPersonalityEncoding());
+    }
+    MCSymbol *LSDASymbol =
+        EmitColdPart ? Function.getColdLSDASymbol() : Function.getLSDASymbol();
+    if (LSDASymbol) {
+      Streamer.emitCFILsda(LSDASymbol, BC.LSDAEncoding);
+    } else {
+      Streamer.emitCFILsda(0, dwarf::DW_EH_PE_omit);
+    }
+    // Emit CFI instructions relative to the CIE
+    for (const MCCFIInstruction &CFIInstr : Function.cie()) {
+      // Only write CIE CFI insns that LLVM will not already emit
+      const std::vector<MCCFIInstruction> &FrameInstrs =
+          MAI->getInitialFrameState();
+      if (std::find(FrameInstrs.begin(), FrameInstrs.end(), CFIInstr) ==
+          FrameInstrs.end())
+        Streamer.emitCFIInstruction(CFIInstr);
+    }
+  }
+
+  assert((Function.empty() || !(*Function.begin()).isCold()) &&
+         "first basic block should never be cold");
+
+  // Emit UD2 at the beginning if requested by user.
+  if (!opts::BreakFunctionNames.empty()) {
+    for (std::string &Name : opts::BreakFunctionNames) {
+      if (Function.hasNameRegex(Name)) {
+        Streamer.emitIntValue(0x0B0F, 2); // UD2: 0F 0B
+        break;
+      }
+    }
+  }
+
+  // Emit code.
+  emitFunctionBody(Function, EmitColdPart, /*EmitCodeOnly=*/false);
+
+  // Emit padding if requested.
+  if (size_t Padding = opts::padFunction(Function)) {
+    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: padding function " << Function
+                      << " with " << Padding << " bytes\n");
+    Streamer.emitFill(Padding, MAI->getTextAlignFillValue());
+  }
+
+  if (opts::MarkFuncs) {
+    Streamer.emitIntValue(MAI->getTrapFillValue(), 1);
+  }
+
+  // Emit CFI end
+  if (Function.hasCFI())
+    Streamer.emitCFIEndProc();
+
+  MCSymbol *EndSymbol = EmitColdPart ? Function.getFunctionColdEndLabel()
+                                     : Function.getFunctionEndLabel();
+  Streamer.emitLabel(EndSymbol);
+
+  if (MAI->hasDotTypeDotSizeDirective()) {
+    const MCExpr *SizeExpr = MCBinaryExpr::createSub(
+        MCSymbolRefExpr::create(EndSymbol, Context),
+        MCSymbolRefExpr::create(StartSymbol, Context), Context);
+    Streamer.emitELFSize(StartSymbol, SizeExpr);
+  }
+
+  // Exception handling info for the function.
+  emitLSDA(Function, EmitColdPart);
+
+  if (!EmitColdPart && opts::JumpTables > JTS_NONE)
+    emitJumpTables(Function);
+
+  return true;
+}
+
+void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, bool EmitColdPart,
+                                     bool EmitCodeOnly) {
+  if (!EmitCodeOnly && EmitColdPart && BF.hasConstantIsland())
+    BF.duplicateConstantIslands();
+
+  // Track the first emitted instruction with debug info.
+  bool FirstInstr = true;
+  for (BinaryBasicBlock *BB : BF.layout()) {
+    if (EmitColdPart != BB->isCold())
+      continue;
+
+    if ((opts::AlignBlocks || opts::PreserveBlocksAlignment)
+        && BB->getAlignment() > 1) {
+      Streamer.emitCodeAlignment(BB->getAlignment(),
+                                 BB->getAlignmentMaxBytes());
+    }
+    Streamer.emitLabel(BB->getLabel());
+    if (!EmitCodeOnly) {
+      if (MCSymbol *EntrySymbol = BF.getSecondaryEntryPointSymbol(*BB)) {
+        Streamer.emitLabel(EntrySymbol);
+      }
+    }
+
+    // Check if special alignment for macro-fusion is needed.
+    bool MayNeedMacroFusionAlignment =
+      (opts::AlignMacroOpFusion == MFT_ALL) ||
+      (opts::AlignMacroOpFusion == MFT_HOT &&
+       BB->getKnownExecutionCount());
+    BinaryBasicBlock::const_iterator MacroFusionPair;
+    if (MayNeedMacroFusionAlignment) {
+      MacroFusionPair = BB->getMacroOpFusionPair();
+      if (MacroFusionPair == BB->end())
+        MayNeedMacroFusionAlignment = false;
+    }
+
+    SMLoc LastLocSeen;
+    // Remember if the last instruction emitted was a prefix.
+    bool LastIsPrefix = false;
+    for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+      MCInst &Instr = *I;
+
+      if (EmitCodeOnly && BC.MIB->isPseudo(Instr))
+        continue;
+
+      // Handle pseudo instructions.
+      if (BC.MIB->isEHLabel(Instr)) {
+        const MCSymbol *Label = BC.MIB->getTargetSymbol(Instr);
+        assert(Instr.getNumOperands() >= 1 && Label &&
+               "bad EH_LABEL instruction");
+        Streamer.emitLabel(const_cast<MCSymbol *>(Label));
+        continue;
+      }
+      if (BC.MIB->isCFI(Instr)) {
+        Streamer.emitCFIInstruction(*BF.getCFIFor(Instr));
+        continue;
+      }
+
+      // Handle macro-fusion alignment. If we emitted a prefix as
+      // the last instruction, we should've already emitted the associated
+      // alignment hint, so don't emit it twice.
+      if (MayNeedMacroFusionAlignment && !LastIsPrefix &&
+          I == MacroFusionPair) {
+        // This assumes the second instruction in the macro-op pair will get
+        // assigned to its own MCRelaxableFragment. Since all JCC instructions
+        // are relaxable, we should be safe.
+        Streamer.emitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64);
+      }
+
+      if (!EmitCodeOnly && opts::UpdateDebugSections && BF.getDWARFUnit()) {
+        LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen,
+                                   FirstInstr);
+        FirstInstr = false;
+      }
+
+      // Prepare to tag this location with a label if we need to keep track of
+      // the location of calls/returns for BOLT address translation maps
+      if (!EmitCodeOnly && BF.requiresAddressTranslation() &&
+          BC.MIB->hasAnnotation(Instr, "Offset")) {
+        const auto Offset = BC.MIB->getAnnotationAs<uint32_t>(Instr, "Offset");
+        MCSymbol *LocSym = BC.Ctx->createTempSymbol();
+        Streamer.emitLabel(LocSym);
+        BB->getLocSyms().emplace_back(Offset, LocSym);
+      }
+
+      Streamer.emitInstruction(Instr, *BC.STI);
+      LastIsPrefix = BC.MIB->isPrefix(Instr);
+    }
+  }
+
+  if (!EmitCodeOnly)
+    emitConstantIslands(BF, EmitColdPart);
+}
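The macro-fusion alignment handled above exists because an x86 cmp/test+jcc pair only macro-fuses when both instructions fall within the same 64-byte window; `emitNeverAlignCodeAtEnd(64)` asks the assembler to pad so the pair is not split across that boundary. A simplified model of the constraint (my formulation, not BOLT code):

    #include <cstdint>

    // True if a macro-op pair starting at PairStart with total size PairSize
    // stays within a single 64-byte window and therefore remains fusible.
    bool pairStaysFusible(uint64_t PairStart, uint64_t PairSize) {
      uint64_t LastByte = PairStart + PairSize - 1;
      return (PairStart / 64) == (LastByte / 64);
    }
    // e.g. pairStaysFusible(0x3e, 5) == false: the pair crosses offset 0x40.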
+
+void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
+                                        BinaryFunction *OnBehalfOf) {
+  BinaryFunction::IslandInfo &Islands = BF.getIslandInfo();
+  if (Islands.DataOffsets.empty() && Islands.Dependency.empty())
+    return;
+
+  if (!OnBehalfOf) {
+    if (!EmitColdPart)
+      Streamer.emitLabel(BF.getFunctionConstantIslandLabel());
+    else
+      Streamer.emitLabel(BF.getFunctionColdConstantIslandLabel());
+  }
+
+  assert((!OnBehalfOf || Islands.Proxies[OnBehalfOf].size() > 0) &&
+         "spurious OnBehalfOf constant island emission");
+
+  assert(!BF.isInjected() &&
+         "injected functions should not have constant islands");
+
+  // Raw contents of the section containing the function.
+  StringRef SectionContents = BF.getOriginSection()->getContents();
+
+  // Raw contents of the function.
+  StringRef FunctionContents =
+      SectionContents.substr(
+          BF.getAddress() - BF.getOriginSection()->getAddress(),
+          BF.getMaxSize());
+
+  if (opts::Verbosity && !OnBehalfOf)
+    outs() << "BOLT-INFO: emitting constant island for function " << BF
+           << "\n";
+
+  // We split the island into smaller blocks and output labels between them.
+  auto IS = Islands.Offsets.begin();
+  for (auto DataIter = Islands.DataOffsets.begin();
+       DataIter != Islands.DataOffsets.end();
+       ++DataIter) {
+    uint64_t FunctionOffset = *DataIter;
+    uint64_t EndOffset = 0ULL;
+
+    // Determine size of this data chunk
+    auto NextData = std::next(DataIter);
+    auto CodeIter = Islands.CodeOffsets.lower_bound(*DataIter);
+    if (CodeIter == Islands.CodeOffsets.end() &&
+        NextData == Islands.DataOffsets.end()) {
+      EndOffset = BF.getMaxSize();
+    } else if (CodeIter == Islands.CodeOffsets.end()) {
+      EndOffset = *NextData;
+    } else if (NextData == Islands.DataOffsets.end()) {
+      EndOffset = *CodeIter;
+    } else {
+      EndOffset = (*CodeIter > *NextData) ? *NextData : *CodeIter;
+    }
+
+    if (FunctionOffset == EndOffset)
+      continue; // Size is zero, nothing to emit
+
+    // Emit labels, relocs and data
+    while (IS != Islands.Offsets.end() && IS->first < EndOffset) {
+      auto NextLabelOffset =
+          IS == Islands.Offsets.end() ? EndOffset : IS->first;
+      auto NextRelOffset = EndOffset;
+      auto NextStop = std::min(NextLabelOffset, NextRelOffset);
+      assert(NextStop <= EndOffset && "internal overflow error");
+      if (FunctionOffset < NextStop) {
+        Streamer.emitBytes(FunctionContents.slice(FunctionOffset, NextStop));
+        FunctionOffset = NextStop;
+      }
+      if (IS != Islands.Offsets.end() && FunctionOffset == IS->first) {
+        // This is slightly complex code to decide which label to emit. We
+        // have 4 cases to handle: regular symbol, cold symbol, and a regular
+        // or cold symbol being emitted on behalf of an external function.
+        if (!OnBehalfOf) {
+          if (!EmitColdPart) {
+            LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label "
+                              << IS->second->getName() << " at offset 0x"
+                              << Twine::utohexstr(IS->first) << '\n');
+            if (IS->second->isUndefined())
+              Streamer.emitLabel(IS->second);
+            else
+              assert(BF.hasName(std::string(IS->second->getName())));
+          } else if (Islands.ColdSymbols.count(IS->second) != 0) {
+            LLVM_DEBUG(dbgs()
+                       << "BOLT-DEBUG: emitted label "
+                       << Islands.ColdSymbols[IS->second]->getName() << '\n');
+            if (Islands.ColdSymbols[IS->second]->isUndefined())
+              Streamer.emitLabel(Islands.ColdSymbols[IS->second]);
+          }
+        } else {
+          if (!EmitColdPart) {
+            if (MCSymbol *Sym = Islands.Proxies[OnBehalfOf][IS->second]) {
+              LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label "
+                                << Sym->getName() << '\n');
+              Streamer.emitLabel(Sym);
+            }
+          } else if (MCSymbol *Sym =
+                         Islands.ColdProxies[OnBehalfOf][IS->second]) {
+            LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << Sym->getName()
+                              << '\n');
+            Streamer.emitLabel(Sym);
+          }
+        }
+        ++IS;
+      }
+    }
+    assert(FunctionOffset <= EndOffset && "overflow error");
+    if (FunctionOffset < EndOffset) {
+      Streamer.emitBytes(FunctionContents.slice(FunctionOffset, EndOffset));
+    }
+  }
+  assert(IS == Islands.Offsets.end() && "some symbols were not emitted!");
+
+  if (OnBehalfOf)
+    return;
+  // Now emit constant islands from other functions that we may have used in
+  // this function.
+  for (BinaryFunction *ExternalFunc : Islands.Dependency) {
+    emitConstantIslands(*ExternalFunc, EmitColdPart, &BF);
+  }
+}
+
+SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc,
+                                  SMLoc PrevLoc, bool FirstInstr) {
+  DWARFUnit *FunctionCU = BF.getDWARFUnit();
+  const DWARFDebugLine::LineTable *FunctionLineTable = BF.getDWARFLineTable();
+  assert(FunctionCU && "cannot emit line info for function without CU");
+
+  DebugLineTableRowRef RowReference = DebugLineTableRowRef::fromSMLoc(NewLoc);
+
+  // Check if no new line info needs to be emitted.
+  if (RowReference == DebugLineTableRowRef::NULL_ROW ||
+      NewLoc.getPointer() == PrevLoc.getPointer())
+    return PrevLoc;
+
+  unsigned CurrentFilenum = 0;
+  const DWARFDebugLine::LineTable *CurrentLineTable = FunctionLineTable;
+
+  // If the CU id from the current instruction location does not
+  // match the CU id from the current function, it means that we
+  // have come across some inlined code. We must look up the CU
+  // for the instruction's original function and get the line table
+  // from that.
+  const uint64_t FunctionUnitIndex = FunctionCU->getOffset();
+  const uint32_t CurrentUnitIndex = RowReference.DwCompileUnitIndex;
+  if (CurrentUnitIndex != FunctionUnitIndex) {
+    CurrentLineTable = BC.DwCtx->getLineTableForUnit(
+        BC.DwCtx->getCompileUnitForOffset(CurrentUnitIndex));
+    // Add filename from the inlined function to the current CU.
+    CurrentFilenum =
+        BC.addDebugFilenameToUnit(FunctionUnitIndex, CurrentUnitIndex,
+            CurrentLineTable->Rows[RowReference.RowIndex - 1].File);
+  }
+
+  const DWARFDebugLine::Row &CurrentRow =
+      CurrentLineTable->Rows[RowReference.RowIndex - 1];
+  if (!CurrentFilenum)
+    CurrentFilenum = CurrentRow.File;
+
+  unsigned Flags = (DWARF2_FLAG_IS_STMT * CurrentRow.IsStmt) |
+                   (DWARF2_FLAG_BASIC_BLOCK * CurrentRow.BasicBlock) |
+                   (DWARF2_FLAG_PROLOGUE_END * CurrentRow.PrologueEnd) |
+                   (DWARF2_FLAG_EPILOGUE_BEGIN * CurrentRow.EpilogueBegin);
+
+  // Always emit is_stmt at the beginning of a function fragment.
+  if (FirstInstr)
+    Flags |= DWARF2_FLAG_IS_STMT;
+
+  BC.Ctx->setCurrentDwarfLoc(
+      CurrentFilenum,
+      CurrentRow.Line,
+      CurrentRow.Column,
+      Flags,
+      CurrentRow.Isa,
+      CurrentRow.Discriminator);
+  BC.Ctx->setDwarfCompileUnitID(FunctionUnitIndex);
+
+  return NewLoc;
+}
+
+void BinaryEmitter::emitJumpTables(const BinaryFunction &BF) {
+  MCSection *ReadOnlySection = BC.MOFI->getReadOnlySection();
+  MCSection *ReadOnlyColdSection = BC.MOFI->getContext().getELFSection(
+      ".rodata.cold", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+  if (!BF.hasJumpTables())
+    return;
+
+  if (opts::PrintJumpTables) {
+    outs() << "BOLT-INFO: jump tables for function " << BF << ":\n";
+  }
+
+  for (auto &JTI : BF.jumpTables()) {
+    JumpTable &JT = *JTI.second;
+    if (opts::PrintJumpTables)
+      JT.print(outs());
+    if ((opts::JumpTables == JTS_BASIC || !BF.isSimple()) &&
+        BC.HasRelocations) {
+      JT.updateOriginal();
+    } else {
+      MCSection *HotSection, *ColdSection;
+      if (opts::JumpTables == JTS_BASIC) {
+        // In non-relocation mode we have to emit jump tables in local
+        // sections. This way we only overwrite them when the corresponding
+        // function is overwritten.
+        std::string Name = ".local." + JT.Labels[0]->getName().str();
+        std::replace(Name.begin(), Name.end(), '/', '.');
+        BinarySection &Section =
+            BC.registerOrUpdateSection(Name, ELF::SHT_PROGBITS,
+                                       ELF::SHF_ALLOC);
+        Section.setAnonymous(true);
+        JT.setOutputSection(Section);
+        HotSection = BC.getDataSection(Name);
+        ColdSection = HotSection;
+      } else {
+        if (BF.isSimple()) {
+          HotSection = ReadOnlySection;
+          ColdSection = ReadOnlyColdSection;
+        } else {
+          HotSection = BF.hasProfile() ? ReadOnlySection
+                                       : ReadOnlyColdSection;
+          ColdSection = HotSection;
+        }
+      }
+      emitJumpTable(JT, HotSection, ColdSection);
+    }
+  }
+}
+
+void BinaryEmitter::emitJumpTable(const JumpTable &JT, MCSection *HotSection,
+                                  MCSection *ColdSection) {
+  // Pre-process entries for aggressive splitting.
+  // Each label represents a separate switch table and gets its own count
+  // determining its destination.
+  std::map<MCSymbol *, uint64_t> LabelCounts;
+  if (opts::JumpTables > JTS_SPLIT && !JT.Counts.empty()) {
+    MCSymbol *CurrentLabel = JT.Labels.at(0);
+    uint64_t CurrentLabelCount = 0;
+    for (unsigned Index = 0; Index < JT.Entries.size(); ++Index) {
+      auto LI = JT.Labels.find(Index * JT.EntrySize);
+      if (LI != JT.Labels.end()) {
+        LabelCounts[CurrentLabel] = CurrentLabelCount;
+        CurrentLabel = LI->second;
+        CurrentLabelCount = 0;
+      }
+      CurrentLabelCount += JT.Counts[Index].Count;
+    }
+    LabelCounts[CurrentLabel] = CurrentLabelCount;
+  } else {
+    Streamer.SwitchSection(JT.Count > 0 ? HotSection : ColdSection);
+    Streamer.emitValueToAlignment(JT.EntrySize);
+  }
+  MCSymbol *LastLabel = nullptr;
+  uint64_t Offset = 0;
"as part of larger jump table\n" : "\n")); + if (!LabelCounts.empty()) { + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: jump table count: " + << LabelCounts[LI->second] << '\n'); + if (LabelCounts[LI->second] > 0) { + Streamer.SwitchSection(HotSection); + } else { + Streamer.SwitchSection(ColdSection); + } + Streamer.emitValueToAlignment(JT.EntrySize); + } + Streamer.emitLabel(LI->second); + LastLabel = LI->second; + } + if (JT.Type == JumpTable::JTT_NORMAL) { + Streamer.emitSymbolValue(Entry, JT.OutputEntrySize); + } else { // JTT_PIC + const MCSymbolRefExpr *JTExpr = + MCSymbolRefExpr::create(LastLabel, Streamer.getContext()); + const MCSymbolRefExpr *E = + MCSymbolRefExpr::create(Entry, Streamer.getContext()); + const MCBinaryExpr *Value = + MCBinaryExpr::createSub(E, JTExpr, Streamer.getContext()); + Streamer.emitValue(Value, JT.EntrySize); + } + Offset += JT.EntrySize; + } +} + +// The code is based on EHStreamer::emitExceptionTable(). +void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) { + const std::vector *Sites = + EmitColdPart ? &BF.getColdCallSites() : &BF.getCallSites(); + if (Sites->empty()) { + return; + } + + // Calculate callsite table size. Size of each callsite entry is: + // + // sizeof(start) + sizeof(length) + sizeof(LP) + sizeof(uleb128(action)) + // + // or + // + // sizeof(dwarf::DW_EH_PE_data4) * 3 + sizeof(uleb128(action)) + uint64_t CallSiteTableLength = Sites->size() * 4 * 3; + for (const BinaryFunction::CallSite &CallSite : *Sites) { + CallSiteTableLength += getULEB128Size(CallSite.Action); + } + + Streamer.SwitchSection(BC.MOFI->getLSDASection()); + + const unsigned TTypeEncoding = BC.TTypeEncoding; + const unsigned TTypeEncodingSize = BC.getDWARFEncodingSize(TTypeEncoding); + const uint16_t TTypeAlignment = 4; + + // Type tables have to be aligned at 4 bytes. + Streamer.emitValueToAlignment(TTypeAlignment); + + // Emit the LSDA label. + MCSymbol *LSDASymbol = + EmitColdPart ? BF.getColdLSDASymbol() : BF.getLSDASymbol(); + assert(LSDASymbol && "no LSDA symbol set"); + Streamer.emitLabel(LSDASymbol); + + // Corresponding FDE start. + const MCSymbol *StartSymbol = EmitColdPart ? BF.getColdSymbol() + : BF.getSymbol(); + + // Emit the LSDA header. + + // If LPStart is omitted, then the start of the FDE is used as a base for + // landing pad displacements. Then if a cold fragment starts with + // a landing pad, this means that the first landing pad offset will be 0. + // As a result, the exception handling runtime will ignore this landing pad + // because zero offset denotes the absence of a landing pad. + // For this reason, when the binary has fixed starting address we emit LPStart + // as 0 and output the absolute value of the landing pad in the table. + // + // If the base address can change, we cannot use absolute addresses for + // landing pads (at least not without runtime relocations). Hence, we fall + // back to emitting landing pads relative to the FDE start. + // As we are emitting label differences, we have to guarantee both labels are + // defined in the same section and hence cannot place the landing pad into a + // cold fragment when the corresponding call site is in the hot fragment. + // Because of this issue and the previously described issue of possible + // zero-offset landing pad we disable splitting of exception-handling + // code for shared objects. 
+
+// The code is based on EHStreamer::emitExceptionTable().
+void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) {
+  const std::vector<BinaryFunction::CallSite> *Sites =
+      EmitColdPart ? &BF.getColdCallSites() : &BF.getCallSites();
+  if (Sites->empty()) {
+    return;
+  }
+
+  // Calculate callsite table size. Size of each callsite entry is:
+  //
+  //  sizeof(start) + sizeof(length) + sizeof(LP) + sizeof(uleb128(action))
+  //
+  // or
+  //
+  //  sizeof(dwarf::DW_EH_PE_data4) * 3 + sizeof(uleb128(action))
+  uint64_t CallSiteTableLength = Sites->size() * 4 * 3;
+  for (const BinaryFunction::CallSite &CallSite : *Sites) {
+    CallSiteTableLength += getULEB128Size(CallSite.Action);
+  }
+
+  Streamer.SwitchSection(BC.MOFI->getLSDASection());
+
+  const unsigned TTypeEncoding = BC.TTypeEncoding;
+  const unsigned TTypeEncodingSize = BC.getDWARFEncodingSize(TTypeEncoding);
+  const uint16_t TTypeAlignment = 4;
+
+  // Type tables have to be aligned at 4 bytes.
+  Streamer.emitValueToAlignment(TTypeAlignment);
+
+  // Emit the LSDA label.
+  MCSymbol *LSDASymbol =
+      EmitColdPart ? BF.getColdLSDASymbol() : BF.getLSDASymbol();
+  assert(LSDASymbol && "no LSDA symbol set");
+  Streamer.emitLabel(LSDASymbol);
+
+  // Corresponding FDE start.
+  const MCSymbol *StartSymbol = EmitColdPart ? BF.getColdSymbol()
+                                             : BF.getSymbol();
+
+  // Emit the LSDA header.
+
+  // If LPStart is omitted, then the start of the FDE is used as a base for
+  // landing pad displacements. Then if a cold fragment starts with
+  // a landing pad, this means that the first landing pad offset will be 0.
+  // As a result, the exception handling runtime will ignore this landing pad
+  // because a zero offset denotes the absence of a landing pad.
+  // For this reason, when the binary has a fixed starting address we emit
+  // LPStart as 0 and output the absolute value of the landing pad in the
+  // table.
+  //
+  // If the base address can change, we cannot use absolute addresses for
+  // landing pads (at least not without runtime relocations). Hence, we fall
+  // back to emitting landing pads relative to the FDE start.
+  // As we are emitting label differences, we have to guarantee both labels
+  // are defined in the same section and hence cannot place the landing pad
+  // into a cold fragment when the corresponding call site is in the hot
+  // fragment. Because of this issue and the previously described issue of a
+  // possible zero-offset landing pad, we disable splitting of
+  // exception-handling code for shared objects.
+  std::function<void(const MCSymbol *)> emitLandingPad;
+  if (BC.HasFixedLoadAddress) {
+    Streamer.emitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
+    Streamer.emitIntValue(0, 4);                      // LPStart
+    emitLandingPad = [&](const MCSymbol *LPSymbol) {
+      if (!LPSymbol)
+        Streamer.emitIntValue(0, 4);
+      else
+        Streamer.emitSymbolValue(LPSymbol, 4);
+    };
+  } else {
+    assert(!EmitColdPart &&
+           "cannot have exceptions in cold fragment for shared object");
+    Streamer.emitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format
+    emitLandingPad = [&](const MCSymbol *LPSymbol) {
+      if (!LPSymbol)
+        Streamer.emitIntValue(0, 4);
+      else
+        Streamer.emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4);
+    };
+  }
+
+  Streamer.emitIntValue(TTypeEncoding, 1); // TType format
+
+  // See the comment in EHStreamer::emitExceptionTable() on why we use
+  // uleb128 encoding (which can use a variable number of bytes to encode
+  // the same value) to ensure the type info table is properly aligned at
+  // 4 bytes without iteratively fixing sizes of the tables.
+  unsigned CallSiteTableLengthSize = getULEB128Size(CallSiteTableLength);
+  unsigned TTypeBaseOffset =
+      sizeof(int8_t) +                      // Call site format
+      CallSiteTableLengthSize +             // Call site table length size
+      CallSiteTableLength +                 // Call site table length
+      BF.getLSDAActionTable().size() +      // Actions table size
+      BF.getLSDATypeTable().size() * TTypeEncodingSize; // Types table size
+  unsigned TTypeBaseOffsetSize = getULEB128Size(TTypeBaseOffset);
+  unsigned TotalSize =
+      sizeof(int8_t) +                      // LPStart format
+      sizeof(int8_t) +                      // TType format
+      TTypeBaseOffsetSize +                 // TType base offset size
+      TTypeBaseOffset;                      // TType base offset
+  unsigned SizeAlign = (4 - TotalSize) & 3;
+
+  // Account for any extra padding that will be added to the call site table
+  // length.
+  Streamer.emitULEB128IntValue(TTypeBaseOffset,
+                               /*PadTo=*/TTypeBaseOffsetSize + SizeAlign);
+
+  // Emit the landing pad call site table. We use signed data4 since we can
+  // emit a landing pad in a different part of the split function that could
+  // appear earlier in the address space than LPStart.
+  Streamer.emitIntValue(dwarf::DW_EH_PE_sdata4, 1);
+  Streamer.emitULEB128IntValue(CallSiteTableLength);
+
+  for (const BinaryFunction::CallSite &CallSite : *Sites) {
+    const MCSymbol *BeginLabel = CallSite.Start;
+    const MCSymbol *EndLabel = CallSite.End;
+
+    assert(BeginLabel && "start EH label expected");
+    assert(EndLabel && "end EH label expected");
+
+    // Start of the range is emitted relative to the start of current
+    // function split part.
+    Streamer.emitAbsoluteSymbolDiff(BeginLabel, StartSymbol, 4);
+    Streamer.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4);
+    emitLandingPad(CallSite.LP);
+    Streamer.emitULEB128IntValue(CallSite.Action);
+  }
+
+  // Write out action, type, and type index tables at the end.
+  //
+  // For action and type index tables there's no need to change the original
+  // table format unless we are doing function splitting, in which case we
+  // can split and optimize the tables.
+  //
+  // For the type table we (re-)encode the table using TTypeEncoding matching
+  // the current assembler mode.
+  for (uint8_t const &Byte : BF.getLSDAActionTable()) {
+    Streamer.emitIntValue(Byte, 1);
+  }
+
+  const BinaryFunction::LSDATypeTableTy &TypeTable =
+      (TTypeEncoding & dwarf::DW_EH_PE_indirect) ? BF.getLSDATypeAddressTable()
+                                                 : BF.getLSDATypeTable();
+  assert(TypeTable.size() == BF.getLSDATypeTable().size() &&
+         "indirect type table size mismatch");
+
+  for (int Index = TypeTable.size() - 1; Index >= 0; --Index) {
+    const uint64_t TypeAddress = TypeTable[Index];
+    switch (TTypeEncoding & 0x70) {
+    default:
+      llvm_unreachable("unsupported TTypeEncoding");
+    case dwarf::DW_EH_PE_absptr:
+      Streamer.emitIntValue(TypeAddress, TTypeEncodingSize);
+      break;
+    case dwarf::DW_EH_PE_pcrel: {
+      if (TypeAddress) {
+        const MCSymbol *TypeSymbol =
+            BC.getOrCreateGlobalSymbol(TypeAddress, "TI", 0, TTypeAlignment);
+        MCSymbol *DotSymbol = BC.Ctx->createNamedTempSymbol();
+        Streamer.emitLabel(DotSymbol);
+        const MCBinaryExpr *SubDotExpr = MCBinaryExpr::createSub(
+            MCSymbolRefExpr::create(TypeSymbol, *BC.Ctx),
+            MCSymbolRefExpr::create(DotSymbol, *BC.Ctx), *BC.Ctx);
+        Streamer.emitValue(SubDotExpr, TTypeEncodingSize);
+      } else {
+        Streamer.emitIntValue(0, TTypeEncodingSize);
+      }
+      break;
+    }
+    }
+  }
+  for (uint8_t const &Byte : BF.getLSDATypeIndexTable()) {
+    Streamer.emitIntValue(Byte, 1);
+  }
+}
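A worked instance of the size bookkeeping above, with made-up numbers: three call sites with actions {0, 1, 3} give a call-site table of 3 * 12 + 3 = 39 bytes, since each action value below 128 encodes as a single ULEB128 byte. The helper mirrors `llvm::getULEB128Size()`:

    #include <cstdint>
    #include <vector>

    static unsigned ulebSize(uint64_t Value) {   // mirrors llvm::getULEB128Size
      unsigned Size = 0;
      do {
        Value >>= 7;
        ++Size;
      } while (Value);
      return Size;
    }

    uint64_t callSiteTableLength(const std::vector<uint64_t> &Actions) {
      uint64_t Length = Actions.size() * 4 * 3;  // start/length/LP: 4 bytes each
      for (uint64_t Action : Actions)
        Length += ulebSize(Action);
      return Length;                             // {0, 1, 3} -> 36 + 3 = 39
    }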
+
+void BinaryEmitter::emitDebugLineInfoForOriginalFunctions() {
+  for (auto &It : BC.getBinaryFunctions()) {
+    const BinaryFunction &Function = It.second;
+
+    // If the function was emitted, its line info was emitted with it.
+    if (Function.isEmitted())
+      continue;
+
+    DWARFUnit *Unit = Function.getDWARFUnit();
+    const DWARFDebugLine::LineTable *LineTable = Function.getDWARFLineTable();
+
+    if (!LineTable)
+      continue; // nothing to update for this function
+
+    std::vector<uint32_t> Results;
+    MCSection *FunctionSection =
+        BC.getCodeSection(Function.getCodeSectionName());
+
+    uint64_t Address = It.first;
+    if (LineTable->lookupAddressRange({Address, 0}, Function.getMaxSize(),
+                                      Results)) {
+      MCLineSection &OutputLineTable =
+          BC.Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections();
+      for (uint32_t RowIndex : Results) {
+        const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
+        BC.Ctx->setCurrentDwarfLoc(
+            Row.File,
+            Row.Line,
+            Row.Column,
+            (DWARF2_FLAG_IS_STMT * Row.IsStmt) |
+            (DWARF2_FLAG_BASIC_BLOCK * Row.BasicBlock) |
+            (DWARF2_FLAG_PROLOGUE_END * Row.PrologueEnd) |
+            (DWARF2_FLAG_EPILOGUE_BEGIN * Row.EpilogueBegin),
+            Row.Isa,
+            Row.Discriminator,
+            Row.Address.Address);
+        MCDwarfLoc Loc = BC.Ctx->getCurrentDwarfLoc();
+        BC.Ctx->clearDwarfLocSeen();
+        OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
+                                     FunctionSection);
+      }
+      // Add an empty entry past the end of the function
+      // for the end_sequence mark.
+      BC.Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0,
+                                 Address + Function.getMaxSize());
+      MCDwarfLoc Loc = BC.Ctx->getCurrentDwarfLoc();
+      BC.Ctx->clearDwarfLocSeen();
+      OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
+                                   FunctionSection);
+    } else {
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: function " << Function
+                        << " has no associated line number information\n");
+    }
+  }
+}
+
+void BinaryEmitter::emitDataSections(StringRef OrgSecPrefix) {
+  for (BinarySection &Section : BC.sections()) {
+    if (!Section.hasRelocations() || !Section.hasSectionRef())
+      continue;
+
+    StringRef SectionName = Section.getName();
+    std::string EmitName = Section.isReordered()
+      ? std::string(Section.getOutputName())
+      : OrgSecPrefix.str() + std::string(SectionName);
+    Section.emitAsData(Streamer, EmitName);
+    Section.clearRelocations();
+  }
+}
+
+namespace llvm {
+namespace bolt {
+
+void emitBinaryContext(MCStreamer &Streamer, BinaryContext &BC,
+                       StringRef OrgSecPrefix) {
+  BinaryEmitter(Streamer, BC).emitAll(OrgSecPrefix);
+}
+
+void emitFunctionBody(MCStreamer &Streamer, BinaryFunction &BF,
+                      bool EmitColdPart, bool EmitCodeOnly) {
+  BinaryEmitter(Streamer, BF.getBinaryContext()).
+      emitFunctionBody(BF, EmitColdPart, EmitCodeOnly);
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/BinaryEmitter.h b/bolt/src/BinaryEmitter.h
new file mode 100644
index 000000000000..d6bc6077e764
--- /dev/null
+++ b/bolt/src/BinaryEmitter.h
@@ -0,0 +1,39 @@
+//===--- BinaryEmitter.h - collection of functions to emit code and data --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_EMITTER_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_EMITTER_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+class MCStreamer;
+
+namespace bolt {
+class BinaryContext;
+class BinaryFunction;
+
+/// Emit all code and data in the BinaryContext \p BC.
+///
+/// \p OrgSecPrefix is used to modify name of emitted original sections
+/// contained in \p BC. This is done to distinguish them from sections emitted
+/// by LLVM backend.
+void emitBinaryContext(MCStreamer &Streamer, BinaryContext &BC,
+                       StringRef OrgSecPrefix = "");
+
+/// Emit \p BF function code. The caller is responsible for emitting function
+/// symbol(s) and setting the section to emit the code to.
+void emitFunctionBody(MCStreamer &Streamer, BinaryFunction &BF,
+                      bool EmitColdPart, bool EmitCodeOnly = false);
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
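A hypothetical driver-side call into the interface above; the `.bolt.org` prefix is illustrative of how original input sections can be renamed so they do not clash with sections produced by the LLVM backend:

    #include "BinaryEmitter.h"

    void emitEverything(llvm::MCStreamer &Streamer,
                        llvm::bolt::BinaryContext &BC) {
      // Original (input) sections are re-emitted under ".bolt.org<name>".
      llvm::bolt::emitBinaryContext(Streamer, BC, /*OrgSecPrefix=*/".bolt.org");
    }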
diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp
new file mode 100644
index 000000000000..4eea2a247905
--- /dev/null
+++ b/bolt/src/BinaryFunction.cpp
@@ -0,0 +1,4474 @@
+//===--- BinaryFunction.cpp - Interface for machine-level function --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryFunction.h"
+#include "BinaryBasicBlock.h"
+#include "DynoStats.h"
+#include "MCPlusBuilder.h"
+#include "NameResolver.h"
+#include "NameShortener.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/edit_distance.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cxxabi.h>
+#include <functional>
+#include <numeric>
+#include <stack>
+#include <string>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory BoltCategory;
+extern cl::OptionCategory BoltOptCategory;
+extern cl::OptionCategory BoltRelocCategory;
+
+extern cl::opt<bool> EnableBAT;
+extern cl::opt<bool> Instrument;
+extern cl::opt<bool> StrictMode;
+extern cl::opt<bool> UpdateDebugSections;
+extern cl::opt<unsigned> Verbosity;
+
+extern bool processAllFunctions();
+
+cl::opt<bool>
+CheckEncoding("check-encoding",
+  cl::desc("perform verification of LLVM instruction encoding/decoding. "
+           "Every instruction in the input is decoded and re-encoded. "
+           "If the resulting bytes do not match the input, a warning message "
+           "is printed."),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+DotToolTipCode("dot-tooltip-code",
+  cl::desc("add basic block instructions as tool tips on nodes"),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+cl::opt<JumpTableSupportLevel>
+JumpTables("jump-tables",
+  cl::desc("jump tables support (default=basic)"),
+  cl::init(JTS_BASIC),
+  cl::values(
+      clEnumValN(JTS_NONE, "none",
+                 "do not optimize functions with jump tables"),
+      clEnumValN(JTS_BASIC, "basic",
+                 "optimize functions with jump tables"),
+      clEnumValN(JTS_MOVE, "move",
+                 "move jump tables to a separate section"),
+      clEnumValN(JTS_SPLIT, "split",
+                 "split jump tables section into hot and cold based on "
+                 "function execution frequency"),
+      clEnumValN(JTS_AGGRESSIVE, "aggressive",
+                 "aggressively split jump tables section based on usage "
+                 "of the tables")),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+static cl::opt<bool>
+NoScan("no-scan",
+  cl::desc("do not scan cold functions for external references (may result "
+           "in slower binary)"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+PreserveBlocksAlignment("preserve-blocks-alignment",
+  cl::desc("try to preserve basic block alignment"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+PrintDynoStats("dyno-stats",
+  cl::desc("print execution info based on profile"),
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+PrintDynoStatsOnly("print-dyno-stats-only",
+  cl::desc("while printing functions output dyno-stats and skip "
+           "instructions"),
+  cl::init(false),
+  cl::Hidden,
+  cl::cat(BoltCategory));
cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltCategory)); + +cl::opt +TimeBuild("time-build", + cl::desc("print time spent constructing binary functions"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +cl::opt +TrapOnAVX512("trap-avx512", + cl::desc("in relocation mode trap upon entry to any function that uses " + "AVX-512 instructions"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +bool shouldPrint(const BinaryFunction &Function) { + if (Function.isIgnored()) + return false; + + if (PrintOnly.empty()) + return true; + + for (std::string &Name : opts::PrintOnly) { + if (Function.hasNameRegex(Name)) { + return true; + } + } + + return false; +} + +} // namespace opts + +namespace llvm { +namespace bolt { + +constexpr unsigned BinaryFunction::MinAlign; + +namespace { + +template +bool emptyRange(const R &Range) { + return Range.begin() == Range.end(); +} + +/// Gets debug line information for the instruction located at the given +/// address in the original binary. The SMLoc's pointer is used +/// to point to this information, which is represented by a +/// DebugLineTableRowRef. The returned pointer is null if no debug line +/// information for this instruction was found. +SMLoc findDebugLineInformationForInstructionAt(uint64_t Address, + DWARFUnit *Unit, const DWARFDebugLine::LineTable *LineTable) { + // We use the pointer in SMLoc to store an instance of DebugLineTableRowRef, + // which occupies 64 bits. Thus, we can only proceed if the struct fits into + // the pointer itself. + assert( + sizeof(decltype(SMLoc().getPointer())) >= sizeof(DebugLineTableRowRef) && + "Cannot fit instruction debug line information into SMLoc's pointer"); + + SMLoc NullResult = DebugLineTableRowRef::NULL_ROW.toSMLoc(); + uint32_t RowIndex = LineTable->lookupAddress( + {Address, object::SectionedAddress::UndefSection}); + if (RowIndex == LineTable->UnknownRowIndex) + return NullResult; + + assert(RowIndex < LineTable->Rows.size() && + "Line Table lookup returned invalid index."); + + decltype(SMLoc().getPointer()) Ptr; + DebugLineTableRowRef *InstructionLocation = + reinterpret_cast(&Ptr); + + InstructionLocation->DwCompileUnitIndex = Unit->getOffset(); + InstructionLocation->RowIndex = RowIndex + 1; + + return SMLoc::getFromPointer(Ptr); +} + +std::string buildSectionName(StringRef Prefix, StringRef Name, + const BinaryContext &BC) { + if (BC.isELF()) + return (Prefix + Name).str(); + static NameShortener NS; + return (Prefix + Twine(NS.getID(Name))).str(); +} + +} // namespace + +std::string BinaryFunction::buildCodeSectionName(StringRef Name, + const BinaryContext &BC) { + return buildSectionName(BC.isELF() ? ".local.text." : ".l.text.", Name, BC); +} + +std::string BinaryFunction::buildColdCodeSectionName(StringRef Name, + const BinaryContext &BC) { + return buildSectionName(BC.isELF() ? ".local.cold.text." 
: ".l.c.text.", Name, + BC); +} + +uint64_t BinaryFunction::Count = 0; + +Optional +BinaryFunction::hasNameRegex(const StringRef Name) const { + const std::string RegexName = (Twine("^") + StringRef(Name) + "$").str(); + Regex MatchName(RegexName); + Optional Match = forEachName( + [&MatchName](StringRef Name) { return MatchName.match(Name); }); + + return Match; +} + +Optional +BinaryFunction::hasRestoredNameRegex(const StringRef Name) const { + const std::string RegexName = (Twine("^") + StringRef(Name) + "$").str(); + Regex MatchName(RegexName); + Optional Match = forEachName([&MatchName](StringRef Name) { + return MatchName.match(NameResolver::restore(Name)); + }); + + return Match; +} + +std::string BinaryFunction::getDemangledName() const { + StringRef MangledName = NameResolver::restore(getOneName()); + int Status = 0; + char *const Name = + abi::__cxa_demangle(MangledName.str().c_str(), 0, 0, &Status); + const std::string NameStr(Status == 0 ? Name : MangledName); + ::free(Name); + return NameStr; +} + +BinaryBasicBlock * +BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { + if (Offset > Size) + return nullptr; + + if (BasicBlockOffsets.empty()) + return nullptr; + + /* + * This is commented out because it makes BOLT too slow. + * assert(std::is_sorted(BasicBlockOffsets.begin(), + * BasicBlockOffsets.end(), + * CompareBasicBlockOffsets()))); + */ + auto I = std::upper_bound(BasicBlockOffsets.begin(), + BasicBlockOffsets.end(), + BasicBlockOffset(Offset, nullptr), + CompareBasicBlockOffsets()); + assert(I != BasicBlockOffsets.begin() && "first basic block not at offset 0"); + --I; + BinaryBasicBlock *BB = I->second; + return (Offset < BB->getOffset() + BB->getOriginalSize()) ? BB : nullptr; +} + +void BinaryFunction::markUnreachableBlocks() { + std::stack Stack; + + for (BinaryBasicBlock *BB : layout()) { + BB->markValid(false); + } + + // Add all entries and landing pads as roots. + for (BinaryBasicBlock *BB : BasicBlocks) { + if (isEntryPoint(*BB) || BB->isLandingPad()) { + Stack.push(BB); + BB->markValid(true); + continue; + } + // FIXME: + // Also mark BBs with indirect jumps as reachable, since we do not + // support removing unused jump tables yet (T29418024 / GH-issue20) + for (const MCInst &Inst : *BB) { + if (BC.MIB->getJumpTable(Inst)) { + Stack.push(BB); + BB->markValid(true); + break; + } + } + } + + // Determine reachable BBs from the entry point + while (!Stack.empty()) { + BinaryBasicBlock *BB = Stack.top(); + Stack.pop(); + for (BinaryBasicBlock *Succ : BB->successors()) { + if (Succ->isValid()) + continue; + Succ->markValid(true); + Stack.push(Succ); + } + } +} + +// Any unnecessary fallthrough jumps revealed after calling eraseInvalidBBs +// will be cleaned up by fixBranches(). +std::pair BinaryFunction::eraseInvalidBBs() { + BasicBlockOrderType NewLayout; + unsigned Count = 0; + uint64_t Bytes = 0; + for (BinaryBasicBlock *BB : layout()) { + if (BB->isValid()) { + NewLayout.push_back(BB); + } else { + assert(!isEntryPoint(*BB) && "all entry blocks must be valid"); + ++Count; + Bytes += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + BasicBlocksLayout = std::move(NewLayout); + + BasicBlockListType NewBasicBlocks; + for (auto I = BasicBlocks.begin(), E = BasicBlocks.end(); I != E; ++I) { + BinaryBasicBlock *BB = *I; + if (BB->isValid()) { + NewBasicBlocks.push_back(BB); + } else { + // Make sure the block is removed from the list of predecessors. 
+
+void BinaryFunction::markUnreachableBlocks() {
+  std::stack<BinaryBasicBlock *> Stack;
+
+  for (BinaryBasicBlock *BB : layout()) {
+    BB->markValid(false);
+  }
+
+  // Add all entries and landing pads as roots.
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    if (isEntryPoint(*BB) || BB->isLandingPad()) {
+      Stack.push(BB);
+      BB->markValid(true);
+      continue;
+    }
+    // FIXME:
+    // Also mark BBs with indirect jumps as reachable, since we do not
+    // support removing unused jump tables yet (T29418024 / GH-issue20)
+    for (const MCInst &Inst : *BB) {
+      if (BC.MIB->getJumpTable(Inst)) {
+        Stack.push(BB);
+        BB->markValid(true);
+        break;
+      }
+    }
+  }
+
+  // Determine reachable BBs from the entry point
+  while (!Stack.empty()) {
+    BinaryBasicBlock *BB = Stack.top();
+    Stack.pop();
+    for (BinaryBasicBlock *Succ : BB->successors()) {
+      if (Succ->isValid())
+        continue;
+      Succ->markValid(true);
+      Stack.push(Succ);
+    }
+  }
+}
+
+// Any unnecessary fallthrough jumps revealed after calling eraseInvalidBBs
+// will be cleaned up by fixBranches().
+std::pair<unsigned, uint64_t> BinaryFunction::eraseInvalidBBs() {
+  BasicBlockOrderType NewLayout;
+  unsigned Count = 0;
+  uint64_t Bytes = 0;
+  for (BinaryBasicBlock *BB : layout()) {
+    if (BB->isValid()) {
+      NewLayout.push_back(BB);
+    } else {
+      assert(!isEntryPoint(*BB) && "all entry blocks must be valid");
+      ++Count;
+      Bytes += BC.computeCodeSize(BB->begin(), BB->end());
+    }
+  }
+  BasicBlocksLayout = std::move(NewLayout);
+
+  BasicBlockListType NewBasicBlocks;
+  for (auto I = BasicBlocks.begin(), E = BasicBlocks.end(); I != E; ++I) {
+    BinaryBasicBlock *BB = *I;
+    if (BB->isValid()) {
+      NewBasicBlocks.push_back(BB);
+    } else {
+      // Make sure the block is removed from the list of predecessors.
+      BB->removeAllSuccessors();
+      DeletedBasicBlocks.push_back(BB);
+    }
+  }
+  BasicBlocks = std::move(NewBasicBlocks);
+
+  assert(BasicBlocks.size() == BasicBlocksLayout.size());
+
+  // Update CFG state if needed
+  if (Count > 0)
+    recomputeLandingPads();
+
+  return std::make_pair(Count, Bytes);
+}
+
+bool BinaryFunction::isForwardCall(const MCSymbol *CalleeSymbol) const {
+  // This function should work properly before and after function reordering.
+  // In order to accomplish this, we use the function index (if it is valid).
+  // If the function indices are not valid, we fall back to the original
+  // addresses. This should be ok because the functions without valid indices
+  // should have been ordered with a stable sort.
+  const BinaryFunction *CalleeBF = BC.getFunctionForSymbol(CalleeSymbol);
+  if (CalleeBF) {
+    if (CalleeBF->isInjected())
+      return true;
+
+    if (hasValidIndex() && CalleeBF->hasValidIndex()) {
+      return getIndex() < CalleeBF->getIndex();
+    } else if (hasValidIndex() && !CalleeBF->hasValidIndex()) {
+      return true;
+    } else if (!hasValidIndex() && CalleeBF->hasValidIndex()) {
+      return false;
+    } else {
+      return getAddress() < CalleeBF->getAddress();
+    }
+  } else {
+    // Absolute symbol.
+    ErrorOr<uint64_t> CalleeAddressOrError = BC.getSymbolValue(*CalleeSymbol);
+    assert(CalleeAddressOrError && "unregistered symbol found");
+    return *CalleeAddressOrError > getAddress();
+  }
+}
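Distilled form of the ordering rule above (a hypothetical standalone helper, not BOLT API): indexed functions compare by index, an indexed function always precedes an unindexed one, and two unindexed functions fall back to their original addresses:

    #include <cstdint>

    bool callsForward(bool CallerHasIndex, uint64_t CallerIndex,
                      uint64_t CallerAddr, bool CalleeHasIndex,
                      uint64_t CalleeIndex, uint64_t CalleeAddr) {
      if (CallerHasIndex && CalleeHasIndex)
        return CallerIndex < CalleeIndex;
      if (CallerHasIndex)
        return true;   // indexed caller precedes unindexed callee
      if (CalleeHasIndex)
        return false;  // unindexed caller follows indexed callee
      return CallerAddr < CalleeAddr;
    }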
+
+void BinaryFunction::dump(bool PrintInstructions) const {
+  print(dbgs(), "", PrintInstructions);
+}
+
+void BinaryFunction::print(raw_ostream &OS, std::string Annotation,
+                           bool PrintInstructions) const {
+  if (!opts::shouldPrint(*this))
+    return;
+
+  StringRef SectionName =
+      OriginSection ? OriginSection->getName() : "<no section>";
+  OS << "Binary Function \"" << *this << "\" " << Annotation << " {";
+  std::vector<StringRef> AllNames = getNames();
+  if (AllNames.size() > 1) {
+    OS << "\n  All names   : ";
+    const char *Sep = "";
+    for (const StringRef Name : AllNames) {
+      OS << Sep << Name;
+      Sep = "\n                ";
+    }
+  }
+  OS << "\n  Number      : "   << FunctionNumber
+     << "\n  State       : "   << CurrentState
+     << "\n  Address     : 0x" << Twine::utohexstr(Address)
+     << "\n  Size        : 0x" << Twine::utohexstr(Size)
+     << "\n  MaxSize     : 0x" << Twine::utohexstr(MaxSize)
+     << "\n  Offset      : 0x" << Twine::utohexstr(FileOffset)
+     << "\n  Section     : "   << SectionName
+     << "\n  Orc Section : "   << getCodeSectionName()
+     << "\n  LSDA        : 0x" << Twine::utohexstr(getLSDAAddress())
+     << "\n  IsSimple    : "   << IsSimple
+     << "\n  IsMultiEntry: "   << isMultiEntry()
+     << "\n  IsSplit     : "   << isSplit()
+     << "\n  BB Count    : "   << size();
+
+  if (HasFixedIndirectBranch) {
+    OS << "\n  HasFixedIndirectBranch : true";
+  }
+  if (HasUnknownControlFlow) {
+    OS << "\n  Unknown CF  : true";
+  }
+  if (getPersonalityFunction()) {
+    OS << "\n  Personality : " << getPersonalityFunction()->getName();
+  }
+  if (IsFragment) {
+    OS << "\n  IsFragment  : true";
+  }
+  if (isFolded()) {
+    OS << "\n  FoldedInto  : " << *getFoldedIntoFunction();
+  }
+  if (ParentFragment) {
+    OS << "\n  Parent      : " << *ParentFragment;
+  }
+  if (!Fragments.empty()) {
+    OS << "\n  Fragments   : ";
+    const char *Sep = "";
+    for (BinaryFunction *Frag : Fragments) {
+      OS << Sep << *Frag;
+      Sep = ", ";
+    }
+  }
+  if (hasCFG()) {
+    OS << "\n  Hash        : " << Twine::utohexstr(computeHash());
+  }
+  if (isMultiEntry()) {
+    OS << "\n  Secondary Entry Points : ";
+    const char *Sep = "";
+    for (const std::pair<const MCSymbol *, MCSymbol *> &KV :
+         SecondaryEntryPoints) {
+      OS << Sep << KV.second->getName();
+      Sep = ", ";
+    }
+  }
+  if (FrameInstructions.size()) {
+    OS << "\n  CFI Instrs  : " << FrameInstructions.size();
+  }
+  if (BasicBlocksLayout.size()) {
+    OS << "\n  BB Layout   : ";
+    const char *Sep = "";
+    for (BinaryBasicBlock *BB : BasicBlocksLayout) {
+      OS << Sep << BB->getName();
+      Sep = ", ";
+    }
+  }
+  if (ImageAddress)
+    OS << "\n  Image       : 0x" << Twine::utohexstr(ImageAddress);
+  if (ExecutionCount != COUNT_NO_PROFILE) {
+    OS << "\n  Exec Count  : " << ExecutionCount;
+    OS << "\n  Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f);
+  }
+
+  if (opts::PrintDynoStats && !BasicBlocksLayout.empty()) {
+    OS << '\n';
+    DynoStats dynoStats = getDynoStats(*this);
+    OS << dynoStats;
+  }
+
+  OS << "\n}\n";
+
+  if (opts::PrintDynoStatsOnly || !PrintInstructions || !BC.InstPrinter)
+    return;
+
+  // Offset of the instruction in function.
+  uint64_t Offset = 0;
+
+  if (BasicBlocks.empty() && !Instructions.empty()) {
+    // Print before CFG was built.
+    for (const std::pair<const uint32_t, MCInst> &II : Instructions) {
+      Offset = II.first;
+
+      // Print label if exists at this offset.
+      auto LI = Labels.find(Offset);
+      if (LI != Labels.end()) {
+        if (const MCSymbol *EntrySymbol =
+                getSecondaryEntryPointSymbol(LI->second))
+          OS << EntrySymbol->getName() << " (Entry Point):\n";
+        OS << LI->second->getName() << ":\n";
+      }
+
+      BC.printInstruction(OS, II.second, Offset, this);
+    }
+  }
+
+  for (uint32_t I = 0, E = BasicBlocksLayout.size(); I != E; ++I) {
+    BinaryBasicBlock *BB = BasicBlocksLayout[I];
+    if (I != 0 &&
+        BB->isCold() != BasicBlocksLayout[I - 1]->isCold())
+      OS << "-------   HOT-COLD SPLIT POINT   -------\n\n";
+
+    OS << BB->getName() << " ("
+       << BB->size() << " instructions, align : " << BB->getAlignment()
+       << ")\n";
+
+    if (isEntryPoint(*BB)) {
+      if (MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB))
+        OS << "  Secondary Entry Point: " << EntrySymbol->getName() << '\n';
+      else
+        OS << "  Entry Point\n";
+    }
+
+    if (BB->isLandingPad())
+      OS << "  Landing Pad\n";
+
+    uint64_t BBExecCount = BB->getExecutionCount();
+    if (hasValidProfile()) {
+      OS << "  Exec Count : ";
+      if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE)
+        OS << BBExecCount << '\n';
+      else
+        OS << "<unknown>\n";
+    }
+    if (BB->getCFIState() >= 0) {
+      OS << "  CFI State : " << BB->getCFIState() << '\n';
+    }
+    if (opts::EnableBAT) {
+      OS << "  Input offset: " << Twine::utohexstr(BB->getInputOffset())
+         << "\n";
+    }
+    if (!BB->pred_empty()) {
+      OS << "  Predecessors: ";
+      const char *Sep = "";
+      for (BinaryBasicBlock *Pred : BB->predecessors()) {
+        OS << Sep << Pred->getName();
+        Sep = ", ";
+      }
+      OS << '\n';
+    }
+    if (!BB->throw_empty()) {
+      OS << "  Throwers: ";
+      const char *Sep = "";
+      for (BinaryBasicBlock *Throw : BB->throwers()) {
+        OS << Sep << Throw->getName();
+        Sep = ", ";
+      }
+      OS << '\n';
+    }
+
+    Offset = alignTo(Offset, BB->getAlignment());
+
+    // Note: offsets are imprecise since this is happening prior to
+    // relaxation.
+    Offset = BC.printInstructions(OS, BB->begin(), BB->end(), Offset, this);
+
+    if (!BB->succ_empty()) {
+      OS << "  Successors: ";
+      // For more than 2 successors, sort them based on frequency.
+      std::vector<uint64_t> Indices(BB->succ_size());
+      std::iota(Indices.begin(), Indices.end(), 0);
+      if (BB->succ_size() > 2 && BB->getKnownExecutionCount()) {
+        std::stable_sort(Indices.begin(), Indices.end(),
+                         [&](const uint64_t A, const uint64_t B) {
+                           return BB->BranchInfo[B] < BB->BranchInfo[A];
+                         });
+      }
+      const char *Sep = "";
+      for (unsigned I = 0; I < Indices.size(); ++I) {
+        BinaryBasicBlock *Succ = BB->Successors[Indices[I]];
+        BinaryBasicBlock::BinaryBranchInfo &BI = BB->BranchInfo[Indices[I]];
+        OS << Sep << Succ->getName();
+        if (ExecutionCount != COUNT_NO_PROFILE &&
+            BI.MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) {
+          OS << " (mispreds: " << BI.MispredictedCount
+             << ", count: " << BI.Count << ")";
+        } else if (ExecutionCount != COUNT_NO_PROFILE &&
+                   BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
+          OS << " (inferred count: " << BI.Count << ")";
+        }
+        Sep = ", ";
+      }
+      OS << '\n';
+    }
+
+    if (!BB->lp_empty()) {
+      OS << "  Landing Pads: ";
+      const char *Sep = "";
+      for (BinaryBasicBlock *LP : BB->landing_pads()) {
+        OS << Sep << LP->getName();
+        if (ExecutionCount != COUNT_NO_PROFILE) {
+          OS << " (count: " << LP->getExecutionCount() << ")";
+        }
+        Sep = ", ";
+      }
+      OS << '\n';
+    }
+
+    // In CFG_Finalized state we can miscalculate CFI state at exit.
+    if (CurrentState == State::CFG) {
+      const int32_t CFIStateAtExit = BB->getCFIStateAtExit();
+      if (CFIStateAtExit >= 0)
+        OS << "  CFI State: " << CFIStateAtExit << '\n';
+    }
+
+    OS << '\n';
+  }
+
+  // Dump new exception ranges for the function.
+  if (!CallSites.empty()) {
+    OS << "EH table:\n";
+    for (const CallSite &CSI : CallSites) {
+      OS << "  [" << *CSI.Start << ", " << *CSI.End << ") landing pad : ";
+      if (CSI.LP)
+        OS << *CSI.LP;
+      else
+        OS << "0";
+      OS << ", action : " << CSI.Action << '\n';
+    }
+    OS << '\n';
+  }
+
+  // Print all jump tables.
+  for (const std::pair<const uint64_t, JumpTable *> &JTI : JumpTables) {
+    JTI.second->print(OS);
+  }
+
+  OS << "DWARF CFI Instructions:\n";
+  if (OffsetToCFI.size()) {
+    // Pre-buildCFG information
+    for (const std::pair<const uint32_t, uint32_t> &Elmt : OffsetToCFI) {
+      OS << format("    %08x:\t", Elmt.first);
+      assert(Elmt.second < FrameInstructions.size() &&
+             "Incorrect CFI offset");
+      BinaryContext::printCFI(OS, FrameInstructions[Elmt.second]);
+      OS << "\n";
+    }
+  } else {
+    // Post-buildCFG information
+    for (uint32_t I = 0, E = FrameInstructions.size(); I != E; ++I) {
+      const MCCFIInstruction &CFI = FrameInstructions[I];
+      OS << format("    %d:\t", I);
+      BinaryContext::printCFI(OS, CFI);
+      OS << "\n";
+    }
+  }
+  if (FrameInstructions.empty())
+    OS << "    <empty>\n";
+
+  OS << "End of Function \"" << *this << "\"\n\n";
+}
+
+void BinaryFunction::printRelocations(raw_ostream &OS,
+                                      uint64_t Offset,
+                                      uint64_t Size) const {
+  const char *Sep = " # Relocs: ";
+
+  auto RI = Relocations.lower_bound(Offset);
+  while (RI != Relocations.end() && RI->first < Offset + Size) {
+    OS << Sep << "(R: " << RI->second << ")";
+    Sep = ", ";
+    ++RI;
+  }
+}
+
+IndirectBranchType
+BinaryFunction::processIndirectBranch(MCInst &Instruction,
+                                      unsigned Size,
+                                      uint64_t Offset,
+                                      uint64_t &TargetAddress) {
+  const unsigned PtrSize = BC.AsmInfo->getCodePointerSize();
+
+  // The instruction referencing memory used by the branch instruction.
+  // It could be the branch instruction itself or one of the instructions
+  // setting the value of the register used by the branch.
+  MCInst *MemLocInstr;
+
+  // Address of the table referenced by MemLocInstr. Could be either an
+  // array of function pointers, or a jump table.
+  uint64_t ArrayStart = 0;
+
+  unsigned BaseRegNum, IndexRegNum;
+  int64_t DispValue;
+  const MCExpr *DispExpr;
+
+  // In AArch64, identify the instruction adding the PC-relative offset to
+  // jump table entries to correctly decode it.
+  MCInst *PCRelBaseInstr;
+  uint64_t PCRelAddr = 0;
+
+  auto Begin = Instructions.begin();
+  if (BC.isAArch64()) {
+    PreserveNops = BC.HasRelocations;
+    // Start at the last label as an approximation of the current basic block.
+    // This is a heuristic, since the full set of labels has yet to be
+    // determined.
+    for (auto LI = Labels.rbegin(); LI != Labels.rend(); ++LI) {
+      auto II = Instructions.find(LI->first);
+      if (II != Instructions.end()) {
+        Begin = II;
+        break;
+      }
+    }
+  }
+
+  IndirectBranchType BranchType =
+      BC.MIB->analyzeIndirectBranch(Instruction,
+                                    Begin,
+                                    Instructions.end(),
+                                    PtrSize,
+                                    MemLocInstr,
+                                    BaseRegNum,
+                                    IndexRegNum,
+                                    DispValue,
+                                    DispExpr,
+                                    PCRelBaseInstr);
+
+  if (BranchType == IndirectBranchType::UNKNOWN && !MemLocInstr)
+    return BranchType;
+
+  if (MemLocInstr != &Instruction)
+    IndexRegNum = BC.MIB->getNoRegister();
+
+  if (BC.isAArch64()) {
+    const MCSymbol *Sym = BC.MIB->getTargetSymbol(*PCRelBaseInstr, 1);
+    assert(Sym && "Symbol extraction failed");
+    ErrorOr<uint64_t> SymValueOrError = BC.getSymbolValue(*Sym);
+    if (SymValueOrError) {
+      PCRelAddr = *SymValueOrError;
+    } else {
+      for (std::pair<const uint32_t, MCSymbol *> &Elmt : Labels) {
+        if (Elmt.second == Sym) {
+          PCRelAddr = Elmt.first + getAddress();
+          break;
+        }
+      }
+    }
+    uint64_t InstrAddr = 0;
+    for (auto II = Instructions.rbegin(); II != Instructions.rend(); ++II) {
+      if (&II->second == PCRelBaseInstr) {
+        InstrAddr = II->first + getAddress();
+        break;
+      }
+    }
+    assert(InstrAddr != 0 && "instruction not found");
+    // We do this to avoid spurious references to code locations outside this
+    // function (for example, if the indirect jump lives in the last basic
+    // block of the function, it will create a reference to the next
+    // function). This replaces a symbol reference with an immediate.
+    BC.MIB->replaceMemOperandDisp(*PCRelBaseInstr,
+                                  MCOperand::createImm(PCRelAddr - InstrAddr));
+    // FIXME: Disable full jump table processing for AArch64 until we have a
+    // proper way of determining the jump table limits.
+    return IndirectBranchType::UNKNOWN;
+  }
+
+  // RIP-relative addressing should be converted to symbol form by now
+  // in processed instructions (but not in jump).
+  if (DispExpr) {
+    const MCSymbol *TargetSym;
+    uint64_t TargetOffset;
+    std::tie(TargetSym, TargetOffset) = BC.MIB->getTargetSymbolInfo(DispExpr);
+    ErrorOr<uint64_t> SymValueOrError = BC.getSymbolValue(*TargetSym);
+    assert(SymValueOrError && "global symbol needs a value");
+    ArrayStart = *SymValueOrError + TargetOffset;
+    BaseRegNum = BC.MIB->getNoRegister();
+    if (BC.isAArch64()) {
+      ArrayStart &= ~0xFFFULL;
+      ArrayStart += DispValue & 0xFFFULL;
+    }
+  } else {
+    ArrayStart = static_cast<uint64_t>(DispValue);
+  }
+
+  if (BaseRegNum == BC.MRI->getProgramCounter())
+    ArrayStart += getAddress() + Offset + Size;
+
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: addressed memory is 0x"
+                    << Twine::utohexstr(ArrayStart) << '\n');
+
+  ErrorOr<BinarySection &> Section = BC.getSectionForAddress(ArrayStart);
+  if (!Section) {
+    // No section - possibly an absolute address. Since we don't allow
+    // internal function addresses to escape the function scope - we
+    // consider it a tail call.
+    if (opts::Verbosity >= 1) {
+      errs() << "BOLT-WARNING: no section for address 0x"
+             << Twine::utohexstr(ArrayStart) << " referenced from function "
+             << *this << '\n';
+    }
+    return IndirectBranchType::POSSIBLE_TAIL_CALL;
+  }
+  if (Section->isVirtual()) {
+    // The contents are filled at runtime.
+    return IndirectBranchType::POSSIBLE_TAIL_CALL;
+  }
+
+  if (BranchType == IndirectBranchType::POSSIBLE_FIXED_BRANCH) {
+    ErrorOr<uint64_t> Value = BC.getPointerAtAddress(ArrayStart);
+    if (!Value)
+      return IndirectBranchType::UNKNOWN;
+
+    if (!BC.getSectionForAddress(ArrayStart)->isReadOnly())
+      return IndirectBranchType::UNKNOWN;
+
+    outs() << "BOLT-INFO: fixed indirect branch detected in " << *this
+           << " at 0x" << Twine::utohexstr(getAddress() + Offset)
+           << " referencing data at 0x" << Twine::utohexstr(ArrayStart)
+           << " the destination value is 0x" << Twine::utohexstr(*Value)
+           << '\n';
+
+    TargetAddress = *Value;
+    return BranchType;
+  }
+
+  // Check if there's already a jump table registered at this address.
+  MemoryContentsType MemType;
+  if (JumpTable *JT = BC.getJumpTableContainingAddress(ArrayStart)) {
+    switch (JT->Type) {
+    case JumpTable::JTT_NORMAL:
+      MemType = MemoryContentsType::POSSIBLE_JUMP_TABLE;
+      break;
+    case JumpTable::JTT_PIC:
+      MemType = MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE;
+      break;
+    }
+  } else {
+    MemType = BC.analyzeMemoryAt(ArrayStart, *this);
+  }
+
+  // Check that jump table type in instruction pattern matches memory contents.
+  JumpTable::JumpTableType JTType;
+  if (BranchType == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE) {
+    if (MemType != MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE)
+      return IndirectBranchType::UNKNOWN;
+    JTType = JumpTable::JTT_PIC;
+  } else {
+    if (MemType == MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE)
+      return IndirectBranchType::UNKNOWN;
+
+    if (MemType == MemoryContentsType::UNKNOWN)
+      return IndirectBranchType::POSSIBLE_TAIL_CALL;
+
+    BranchType = IndirectBranchType::POSSIBLE_JUMP_TABLE;
+    JTType = JumpTable::JTT_NORMAL;
+  }
+
+  // Convert the instruction into jump table branch.
+  const MCSymbol *JTLabel = BC.getOrCreateJumpTable(*this, ArrayStart, JTType);
+  BC.MIB->replaceMemOperandDisp(*MemLocInstr, JTLabel, BC.Ctx.get());
+  BC.MIB->setJumpTable(Instruction, ArrayStart, IndexRegNum);
+
+  JTSites.emplace_back(Offset, ArrayStart);
+
+  return BranchType;
+}
+
+MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address,
+                                                bool CreatePastEnd) {
+  const uint64_t Offset = Address - getAddress();
+
+  if ((Offset == getSize()) && CreatePastEnd)
+    return getFunctionEndLabel();
+
+  auto LI = Labels.find(Offset);
+  if (LI != Labels.end())
+    return LI->second;
+
+  // For AArch64, check if this address is part of a constant island.
+  if (BC.isAArch64()) {
+    if (MCSymbol *IslandSym = getOrCreateIslandAccess(Address)) {
+      return IslandSym;
+    }
+  }
+
+  MCSymbol *Label = BC.Ctx->createNamedTempSymbol();
+  Labels[Offset] = Label;
+
+  return Label;
+}
+
+ErrorOr<ArrayRef<uint8_t>> BinaryFunction::getData() const {
+  BinarySection &Section = *getOriginSection();
+  assert(Section.containsRange(getAddress(), getMaxSize()) &&
+         "wrong section for function");
+
+  if (!Section.isText() || Section.isVirtual() || !Section.getSize()) {
+    return std::make_error_code(std::errc::bad_address);
+  }
+
+  StringRef SectionContents = Section.getContents();
+
+  assert(SectionContents.size() == Section.getSize() &&
+         "section size mismatch");
+
+  // Function offset from the section start.
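+  // That is, the raw bytes returned below live at
+  //   SectionContents.data() + (getAddress() - Section.getAddress())
+  // and span getMaxSize() bytes (the code plus any trailing padding).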
+  uint64_t Offset = getAddress() - Section.getAddress();
+  auto *Bytes = reinterpret_cast<const uint8_t *>(SectionContents.data());
+  return ArrayRef<uint8_t>(Bytes + Offset, getMaxSize());
+}
+
+size_t BinaryFunction::getSizeOfDataInCodeAt(uint64_t Offset) const {
+  if (Islands.DataOffsets.find(Offset) == Islands.DataOffsets.end())
+    return 0;
+
+  auto Iter = Islands.CodeOffsets.upper_bound(Offset);
+  if (Iter != Islands.CodeOffsets.end()) {
+    return *Iter - Offset;
+  }
+  return getSize() - Offset;
+}
+
+bool BinaryFunction::isZeroPaddingAt(uint64_t Offset) const {
+  ArrayRef<uint8_t> FunctionData = *getData();
+  uint64_t EndOfCode = getSize();
+  auto Iter = Islands.DataOffsets.upper_bound(Offset);
+  if (Iter != Islands.DataOffsets.end())
+    EndOfCode = *Iter;
+  for (uint64_t I = Offset; I < EndOfCode; ++I) {
+    if (FunctionData[I] != 0) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool BinaryFunction::disassemble() {
+  NamedRegionTimer T("disassemble", "Disassemble function", "buildfuncs",
+                     "Build Binary Functions", opts::TimeBuild);
+  ErrorOr<ArrayRef<uint8_t>> ErrorOrFunctionData = getData();
+  assert(ErrorOrFunctionData && "function data is not available");
+  ArrayRef<uint8_t> FunctionData = *ErrorOrFunctionData;
+  assert(FunctionData.size() == getMaxSize() &&
+         "function size does not match raw data size");
+
+  auto &Ctx = BC.Ctx;
+  auto &MIB = BC.MIB;
+
+  // Insert a label at the beginning of the function. This will be our first
+  // basic block.
+  Labels[0] = Ctx->createNamedTempSymbol("BB0");
+
+  auto handlePCRelOperand =
+      [&](MCInst &Instruction, uint64_t Address, uint64_t Size) {
+    uint64_t TargetAddress = 0;
+    if (!MIB->evaluateMemOperandTarget(Instruction, TargetAddress, Address,
+                                       Size)) {
+      errs() << "BOLT-ERROR: PC-relative operand can't be evaluated:\n";
+      BC.InstPrinter->printInst(&Instruction, 0, "", *BC.STI, errs());
+      errs() << '\n';
+      Instruction.dump_pretty(errs(), BC.InstPrinter.get());
+      errs() << '\n';
+      errs() << "BOLT-ERROR: cannot handle PC-relative operand at 0x"
+             << Twine::utohexstr(Address)
+             << ". Skipping function " << *this << ".\n";
+      if (BC.HasRelocations)
+        exit(1);
+      IsSimple = false;
+      return;
+    }
+    if (TargetAddress == 0 && opts::Verbosity >= 1) {
+      outs() << "BOLT-INFO: PC-relative operand is zero in function " << *this
+             << '\n';
+    }
+
+    const MCSymbol *TargetSymbol;
+    uint64_t TargetOffset;
+    std::tie(TargetSymbol, TargetOffset) =
+        BC.handleAddressRef(TargetAddress, *this, /*IsPCRel*/ true);
+    const MCExpr *Expr = MCSymbolRefExpr::create(TargetSymbol,
+                                                 MCSymbolRefExpr::VK_None,
+                                                 *BC.Ctx);
+    if (TargetOffset) {
+      const MCConstantExpr *Offset =
+          MCConstantExpr::create(TargetOffset, *BC.Ctx);
+      Expr = MCBinaryExpr::createAdd(Expr, Offset, *BC.Ctx);
+    }
+    MIB->replaceMemOperandDisp(
+        Instruction, MCOperand::createExpr(BC.MIB->getTargetExprFor(
+                         Instruction,
+                         Expr,
+                         *BC.Ctx, 0)));
+  };
+
+  // Used to fix the target of linker-generated AArch64 stubs with no
+  // relocation info.
+  auto fixStubTarget = [&](MCInst &LoadLowBits, MCInst &LoadHiBits,
+                           uint64_t Target) {
+    const MCSymbol *TargetSymbol;
+    uint64_t Addend = 0;
+    std::tie(TargetSymbol, Addend) = BC.handleAddressRef(Target, *this, true);
+
+    int64_t Val;
+    MIB->replaceImmWithSymbolRef(LoadHiBits, TargetSymbol, Addend, Ctx.get(),
+                                 Val, ELF::R_AARCH64_ADR_PREL_PG_HI21);
+    MIB->replaceImmWithSymbolRef(LoadLowBits, TargetSymbol, Addend, Ctx.get(),
+                                 Val, ELF::R_AARCH64_ADD_ABS_LO12_NC);
+  };
+
+  auto handleExternalReference = [&](MCInst &Instruction, uint64_t Size,
+                                     uint64_t Offset, uint64_t TargetAddress,
+                                     bool &IsCall) -> MCSymbol * {
+    const bool IsCondBranch = MIB->isConditionalBranch(Instruction);
+    const uint64_t AbsoluteInstrAddr = getAddress() + Offset;
+    MCSymbol *TargetSymbol = nullptr;
+    InterproceduralReferences.insert(TargetAddress);
+    if (opts::Verbosity >= 2 && !IsCall && Size == 2 && !BC.HasRelocations) {
+      errs() << "BOLT-WARNING: relaxed tail call detected at 0x"
+             << Twine::utohexstr(AbsoluteInstrAddr) << " in function " << *this
+             << ". Code size will be increased.\n";
+    }
+
+    assert(!MIB->isTailCall(Instruction) &&
+           "synthetic tail call instruction found");
+
+    // This is a call regardless of the opcode.
+    // Assign proper opcode for tail calls, so that they could be
+    // treated as calls.
+    if (!IsCall) {
+      if (!MIB->convertJmpToTailCall(Instruction)) {
+        assert(IsCondBranch && "unknown tail call instruction");
+        if (opts::Verbosity >= 2) {
+          errs() << "BOLT-WARNING: conditional tail call detected in "
+                 << "function " << *this << " at 0x"
+                 << Twine::utohexstr(AbsoluteInstrAddr) << ".\n";
+        }
+      }
+      IsCall = true;
+    }
+
+    TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "FUNCat");
+    if (opts::Verbosity >= 2 && TargetAddress == 0) {
+      // We actually see calls to address 0 in presence of weak
+      // symbols originating from libraries. This code is never meant
+      // to be executed.
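+      // (Illustrative scenario: a call to an undefined weak external
+      // resolves to address 0 at static link time; the call site is guarded
+      // by a symbol-presence check at run time and never executes.)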
+ outs() << "BOLT-INFO: Function " << *this + << " has a call to address zero.\n"; + } + + return TargetSymbol; + }; + + auto handleIndirectBranch = [&](MCInst &Instruction, uint64_t Size, + uint64_t Offset) { + uint64_t IndirectTarget = 0; + IndirectBranchType Result = + processIndirectBranch(Instruction, Size, Offset, IndirectTarget); + switch (Result) { + default: + llvm_unreachable("unexpected result"); + case IndirectBranchType::POSSIBLE_TAIL_CALL: { + bool Result = MIB->convertJmpToTailCall(Instruction); + (void)Result; + assert(Result); + break; + } + case IndirectBranchType::POSSIBLE_JUMP_TABLE: + case IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE: + if (opts::JumpTables == JTS_NONE) + IsSimple = false; + break; + case IndirectBranchType::POSSIBLE_FIXED_BRANCH: { + if (containsAddress(IndirectTarget)) { + const MCSymbol *TargetSymbol = getOrCreateLocalLabel(IndirectTarget); + Instruction.clear(); + MIB->createUncondBranch(Instruction, TargetSymbol, BC.Ctx.get()); + TakenBranches.emplace_back(Offset, IndirectTarget - getAddress()); + HasFixedIndirectBranch = true; + } else { + MIB->convertJmpToTailCall(Instruction); + InterproceduralReferences.insert(IndirectTarget); + } + break; + } + case IndirectBranchType::UNKNOWN: + // Keep processing. We'll do more checks and fixes in + // postProcessIndirectBranches(). + UnknownIndirectBranchOffsets.emplace(Offset); + break; + } + }; + + // Check for linker veneers, which lack relocations and need manual + // adjustments. + auto handleAArch64IndirectCall = [&](MCInst &Instruction, uint64_t Offset) { + const uint64_t AbsoluteInstrAddr = getAddress() + Offset; + MCInst *TargetHiBits, *TargetLowBits; + uint64_t TargetAddress; + if (MIB->matchLinkerVeneer(Instructions.begin(), Instructions.end(), + AbsoluteInstrAddr, Instruction, TargetHiBits, + TargetLowBits, TargetAddress)) { + MIB->addAnnotation(Instruction, "AArch64Veneer", true); + + uint8_t Counter = 0; + for (auto It = std::prev(Instructions.end()); Counter != 2; + --It, ++Counter) { + MIB->addAnnotation(It->second, "AArch64Veneer", true); + } + + fixStubTarget(*TargetLowBits, *TargetHiBits, TargetAddress); + } + }; + + uint64_t Size = 0; // instruction size + for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) { + MCInst Instruction; + const uint64_t AbsoluteInstrAddr = getAddress() + Offset; + + // Check for data inside code and ignore it + if (const size_t DataInCodeSize = getSizeOfDataInCodeAt(Offset)) { + Size = DataInCodeSize; + continue; + } + + if (!BC.DisAsm->getInstruction(Instruction, + Size, + FunctionData.slice(Offset), + AbsoluteInstrAddr, + nulls())) { + // Functions with "soft" boundaries, e.g. coming from assembly source, + // can have 0-byte padding at the end. + if (isZeroPaddingAt(Offset)) + break; + + errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x" + << Twine::utohexstr(Offset) << " (address 0x" + << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " + << *this << '\n'; + // Some AVX-512 instructions could not be disassembled at all. + if (BC.HasRelocations && opts::TrapOnAVX512 && BC.isX86()) { + setTrapOnEntry(); + BC.TrappedFunctions.push_back(this); + } else { + setIgnored(); + } + + break; + } + + // Check integrity of LLVM assembler/disassembler. 
+    if (opts::CheckEncoding && !BC.MIB->isBranch(Instruction) &&
+        !BC.MIB->isCall(Instruction) && !BC.MIB->isNoop(Instruction)) {
+      if (!BC.validateEncoding(Instruction, FunctionData.slice(Offset, Size))) {
+        errs() << "BOLT-WARNING: mismatching LLVM encoding detected in "
+               << "function " << *this << " for instruction :\n";
+        BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
+        errs() << '\n';
+      }
+    }
+
+    // Special handling for AVX-512 instructions.
+    if (MIB->hasEVEXEncoding(Instruction)) {
+      if (BC.HasRelocations && opts::TrapOnAVX512) {
+        setTrapOnEntry();
+        BC.TrappedFunctions.push_back(this);
+        break;
+      }
+
+      // Check if our disassembly is correct and matches the assembler output.
+      if (!BC.validateEncoding(Instruction, FunctionData.slice(Offset, Size))) {
+        if (opts::Verbosity >= 1) {
+          errs() << "BOLT-WARNING: internal assembler/disassembler error "
+                    "detected for AVX512 instruction:\n";
+          BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
+          errs() << " in function " << *this << '\n';
+        }
+
+        setIgnored();
+        break;
+      }
+    }
+
+    // Check if there's a relocation associated with this instruction.
+    bool UsedReloc = false;
+    for (auto Itr = Relocations.lower_bound(Offset),
+              ItrE = Relocations.lower_bound(Offset + Size);
+         Itr != ItrE; ++Itr) {
+      const Relocation &Relocation = Itr->second;
+
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: replacing immediate 0x"
+                        << Twine::utohexstr(Relocation.Value)
+                        << " with relocation"
+                           " against "
+                        << Relocation.Symbol << "+" << Relocation.Addend
+                        << " in function " << *this
+                        << " for instruction at offset 0x"
+                        << Twine::utohexstr(Offset) << '\n');
+
+      // Process reference to the primary symbol.
+      if (!Relocation.isPCRelative())
+        BC.handleAddressRef(Relocation.Value - Relocation.Addend,
+                            *this,
+                            /*IsPCRel*/ false);
+
+      int64_t Value = Relocation.Value;
+      const bool Result = BC.MIB->replaceImmWithSymbolRef(
+          Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value,
+          Relocation.Type);
+      (void)Result;
+      assert(Result && "cannot replace immediate with relocation");
+
+      // For AArch64, if we replaced an immediate with a symbol from a
+      // relocation, we mark it so we do not try to further process a
+      // PC-relative operand. All we need is the symbol.
+      if (BC.isAArch64())
+        UsedReloc = true;
+
+      // Make sure we replaced the correct immediate (instruction
+      // can have multiple immediate operands).
+      if (BC.isX86()) {
+        assert(truncateToSize(static_cast<uint64_t>(Value),
+                              Relocation::getSizeForType(Relocation.Type)) ==
+                   truncateToSize(Relocation.Value,
+                                  Relocation::getSizeForType(Relocation.Type)) &&
+               "immediate value mismatch in function");
+      }
+    }
+
+    // Convert instruction to a shorter version that could be relaxed if
+    // needed.
+    MIB->shortenInstruction(Instruction);
+
+    if (MIB->isBranch(Instruction) || MIB->isCall(Instruction)) {
+      uint64_t TargetAddress = 0;
+      if (MIB->evaluateBranch(Instruction, AbsoluteInstrAddr, Size,
+                              TargetAddress)) {
+        // Check if the target is within the same function. Otherwise it's
+        // a call, possibly a tail call.
+        //
+        // If the target *is* the function address it could be either a branch
+        // or a recursive call.
+        bool IsCall = MIB->isCall(Instruction);
+        const bool IsCondBranch = MIB->isConditionalBranch(Instruction);
+        MCSymbol *TargetSymbol = nullptr;
+
+        if (BC.MIB->isUnsupportedBranch(Instruction.getOpcode())) {
+          setIgnored();
+          if (BinaryFunction *TargetFunc =
+                  BC.getBinaryFunctionContainingAddress(TargetAddress))
+            TargetFunc->setIgnored();
+        }
+
+        if (IsCall && containsAddress(TargetAddress)) {
+          if (TargetAddress == getAddress()) {
+            // Recursive call.
+            TargetSymbol = getSymbol();
+          } else {
+            if (BC.isX86()) {
+              // Dangerous old-style x86 PIC code. We may need to freeze this
+              // function, so preserve the function as is for now.
+              PreserveNops = true;
+            } else {
+              errs() << "BOLT-WARNING: internal call detected at 0x"
+                     << Twine::utohexstr(AbsoluteInstrAddr) << " in function "
+                     << *this << ". Skipping.\n";
+              IsSimple = false;
+            }
+          }
+        }
+
+        if (!TargetSymbol) {
+          // Create either local label or external symbol.
+          if (containsAddress(TargetAddress)) {
+            TargetSymbol = getOrCreateLocalLabel(TargetAddress);
+          } else {
+            if (TargetAddress == getAddress() + getSize() &&
+                TargetAddress < getAddress() + getMaxSize()) {
+              // Result of __builtin_unreachable().
+              LLVM_DEBUG(dbgs() << "BOLT-DEBUG: jump past end detected at 0x"
+                                << Twine::utohexstr(AbsoluteInstrAddr)
+                                << " in function " << *this
+                                << " : replacing with nop.\n");
+              BC.MIB->createNoop(Instruction);
+              if (IsCondBranch) {
+                // Register branch offset for profile validation.
+                IgnoredBranches.emplace_back(Offset, Offset + Size);
+              }
+              goto add_instruction;
+            }
+            // May update Instruction and IsCall
+            TargetSymbol = handleExternalReference(Instruction, Size, Offset,
+                                                   TargetAddress, IsCall);
+          }
+        }
+
+        if (!IsCall) {
+          // Add taken branch info.
+          TakenBranches.emplace_back(Offset, TargetAddress - getAddress());
+        }
+        BC.MIB->replaceBranchTarget(Instruction, TargetSymbol, &*Ctx);
+
+        // Mark CTC.
+        if (IsCondBranch && IsCall) {
+          MIB->setConditionalTailCall(Instruction, TargetAddress);
+        }
+      } else {
+        // Could not evaluate branch. Should be an indirect call or an
+        // indirect branch. Bail out on the latter case.
+        if (MIB->isIndirectBranch(Instruction))
+          handleIndirectBranch(Instruction, Size, Offset);
+        // Indirect call. We only need to fix it if the operand is RIP-relative.
+        if (IsSimple && MIB->hasPCRelOperand(Instruction))
+          handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size);
+
+        if (BC.isAArch64())
+          handleAArch64IndirectCall(Instruction, Offset);
+      }
+    } else if (MIB->hasPCRelOperand(Instruction) && !UsedReloc)
+      handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size);
+
+add_instruction:
+    if (getDWARFLineTable()) {
+      Instruction.setLoc(
+          findDebugLineInformationForInstructionAt(AbsoluteInstrAddr,
+                                                   getDWARFUnit(),
+                                                   getDWARFLineTable()));
+    }
+
+    // Record offset of the instruction for profile matching.
+    if (BC.keepOffsetForInstruction(Instruction)) {
+      MIB->addAnnotation(Instruction, "Offset", static_cast<uint32_t>(Offset));
+    }
+
+    addInstruction(Offset, std::move(Instruction));
+  }
+
+  clearList(Relocations);
+
+  if (!IsSimple) {
+    clearList(Instructions);
+    return false;
+  }
+
+  updateState(State::Disassembled);
+
+  return true;
+}
+
+bool BinaryFunction::scanExternalRefs() {
+  bool Success = true;
+  bool DisassemblyFailed = false;
+
+  // Ignore pseudo functions.
+  if (isPseudo())
+    return Success;
+
+  if (opts::NoScan) {
+    clearList(Relocations);
+    clearList(ExternallyReferencedOffsets);
+
+    return false;
+  }
+
+  // List of external references for this function.
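+  // They are accumulated here first and only committed to the origin section
+  // once the whole function scans successfully (see the DisassemblyFailed
+  // check near the end of this function), so a late failure leaves the
+  // section's pending relocations untouched.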
+  std::vector<Relocation> FunctionRelocations;
+
+  static BinaryContext::IndependentCodeEmitter Emitter =
+      BC.createIndependentMCCodeEmitter();
+
+  ErrorOr<ArrayRef<uint8_t>> ErrorOrFunctionData = getData();
+  assert(ErrorOrFunctionData && "function data is not available");
+  ArrayRef<uint8_t> FunctionData = *ErrorOrFunctionData;
+  assert(FunctionData.size() == getMaxSize() &&
+         "function size does not match raw data size");
+
+  uint64_t Size = 0; // instruction size
+  for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) {
+    // Check for data inside code and ignore it
+    if (const size_t DataInCodeSize = getSizeOfDataInCodeAt(Offset)) {
+      Size = DataInCodeSize;
+      continue;
+    }
+
+    const uint64_t AbsoluteInstrAddr = getAddress() + Offset;
+    MCInst Instruction;
+    if (!BC.DisAsm->getInstruction(Instruction,
+                                   Size,
+                                   FunctionData.slice(Offset),
+                                   AbsoluteInstrAddr,
+                                   nulls())) {
+      if (opts::Verbosity >= 1 && !isZeroPaddingAt(Offset)) {
+        errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x"
+               << Twine::utohexstr(Offset) << " (address 0x"
+               << Twine::utohexstr(AbsoluteInstrAddr) << ") in function "
+               << *this << '\n';
+      }
+      Success = false;
+      DisassemblyFailed = true;
+      break;
+    }
+
+    // Return true if we can skip handling the Target function reference.
+    auto ignoreFunctionRef = [&](const BinaryFunction &Target) {
+      if (&Target == this)
+        return true;
+
+      // Note that later we may decide not to emit Target function. In that
+      // case, we conservatively create references that will be ignored or
+      // resolved to the same function.
+      if (!BC.shouldEmit(Target))
+        return true;
+
+      return false;
+    };
+
+    // Return true if we can ignore reference to the symbol.
+    auto ignoreReference = [&](const MCSymbol *TargetSymbol) {
+      if (!TargetSymbol)
+        return true;
+
+      if (BC.forceSymbolRelocations(TargetSymbol->getName()))
+        return false;
+
+      BinaryFunction *TargetFunction = BC.getFunctionForSymbol(TargetSymbol);
+      if (!TargetFunction)
+        return true;
+
+      return ignoreFunctionRef(*TargetFunction);
+    };
+
+    // Detect if the instruction references an address.
+    // Without relocations, we can only trust PC-relative address modes.
+    uint64_t TargetAddress = 0;
+    bool IsPCRel = false;
+    bool IsBranch = false;
+    if (BC.MIB->hasPCRelOperand(Instruction)) {
+      if (BC.MIB->evaluateMemOperandTarget(Instruction, TargetAddress,
+                                           AbsoluteInstrAddr, Size)) {
+        IsPCRel = true;
+      }
+    } else if (BC.MIB->isCall(Instruction) || BC.MIB->isBranch(Instruction)) {
+      if (BC.MIB->evaluateBranch(Instruction, AbsoluteInstrAddr, Size,
+                                 TargetAddress)) {
+        IsBranch = true;
+      }
+    }
+
+    MCSymbol *TargetSymbol = nullptr;
+
+    // Create an entry point at reference address if needed.
+    BinaryFunction *TargetFunction =
+        BC.getBinaryFunctionContainingAddress(TargetAddress);
+    if (TargetFunction && !ignoreFunctionRef(*TargetFunction)) {
+      const uint64_t FunctionOffset =
+          TargetAddress - TargetFunction->getAddress();
+      TargetSymbol = FunctionOffset
+                         ? TargetFunction->addEntryPointAtOffset(FunctionOffset)
+                         : TargetFunction->getSymbol();
+    }
+
+    // Can't find more references and not creating relocations.
+    if (!BC.HasRelocations)
+      continue;
+
+    // Create a relocation against the TargetSymbol as the symbol might get
+    // moved.
+    if (TargetSymbol) {
+      if (IsBranch) {
+        BC.MIB->replaceBranchTarget(Instruction, TargetSymbol,
+                                    Emitter.LocalCtx.get());
+      } else if (IsPCRel) {
+        const MCExpr *Expr = MCSymbolRefExpr::create(TargetSymbol,
+                                                     MCSymbolRefExpr::VK_None,
+                                                     *Emitter.LocalCtx.get());
+        BC.MIB->replaceMemOperandDisp(
+            Instruction, MCOperand::createExpr(BC.MIB->getTargetExprFor(
+                             Instruction,
+                             Expr,
+                             *Emitter.LocalCtx.get(), 0)));
+      }
+    }
+
+    // Create more relocations based on input file relocations.
+    bool HasRel = false;
+    for (auto Itr = Relocations.lower_bound(Offset),
+              ItrE = Relocations.lower_bound(Offset + Size);
+         Itr != ItrE; ++Itr) {
+      Relocation &Relocation = Itr->second;
+      if (ignoreReference(Relocation.Symbol))
+        continue;
+
+      int64_t Value = Relocation.Value;
+      const bool Result =
+          BC.MIB->replaceImmWithSymbolRef(Instruction,
+                                          Relocation.Symbol,
+                                          Relocation.Addend,
+                                          Emitter.LocalCtx.get(),
+                                          Value,
+                                          Relocation.Type);
+      (void)Result;
+      assert(Result && "cannot replace immediate with relocation");
+
+      HasRel = true;
+    }
+
+    if (!TargetSymbol && !HasRel)
+      continue;
+
+    // Emit the instruction using temp emitter and generate relocations.
+    SmallString<256> Code;
+    SmallVector<MCFixup, 4> Fixups;
+    raw_svector_ostream VecOS(Code);
+    Emitter.MCE->encodeInstruction(Instruction, VecOS, Fixups, *BC.STI);
+
+    // Create relocation for every fixup.
+    for (const MCFixup &Fixup : Fixups) {
+      Optional<Relocation> Rel = BC.MIB->createRelocation(Fixup, *BC.MAB);
+      if (!Rel) {
+        Success = false;
+        continue;
+      }
+
+      if (Relocation::getSizeForType(Rel->Type) < 4) {
+        // If the instruction uses a short form, then we might not be able
+        // to handle the rewrite without relaxation, and hence cannot reliably
+        // create an external reference relocation.
+        Success = false;
+        continue;
+      }
+      Rel->Offset += getAddress() - getOriginSection()->getAddress() + Offset;
+      FunctionRelocations.push_back(*Rel);
+    }
+
+    if (!Success)
+      break;
+  }
+
+  // Add relocations unless disassembly failed for this function.
+  if (!DisassemblyFailed) {
+    for (Relocation &Rel : FunctionRelocations) {
+      getOriginSection()->addPendingRelocation(Rel);
+    }
+  }
+
+  // Inform BinaryContext that this function's symbols will not be defined
+  // and relocations should not be created against them.
+  if (BC.HasRelocations) {
+    for (std::pair<const uint32_t, MCSymbol *> &LI : Labels) {
+      BC.UndefinedSymbols.insert(LI.second);
+    }
+    if (FunctionEndLabel) {
+      BC.UndefinedSymbols.insert(FunctionEndLabel);
+    }
+  }
+
+  clearList(Relocations);
+  clearList(ExternallyReferencedOffsets);
+
+  if (Success && BC.HasRelocations) {
+    HasExternalRefRelocations = true;
+  }
+
+  if (opts::Verbosity >= 1 && !Success) {
+    outs() << "BOLT-INFO: failed to scan refs for " << *this << '\n';
+  }
+
+  return Success;
+}
+
+void BinaryFunction::postProcessEntryPoints() {
+  if (!isSimple())
+    return;
+
+  for (auto &KV : Labels) {
+    MCSymbol *Label = KV.second;
+    if (!getSecondaryEntryPointSymbol(Label))
+      continue;
+
+    // In non-relocation mode there's potentially an external undetectable
+    // reference to the entry point and hence we cannot move this entry
+    // point. Optimizing without moving could be difficult.
+    if (!BC.HasRelocations)
+      setSimple(false);
+
+    const uint32_t Offset = KV.first;
+
+    // If we are at Offset 0 and there is no instruction associated with it,
+    // this means this is an empty function. Just ignore. If we find an
+    // instruction at this offset, this entry point is valid.
+    if (!Offset || getInstructionAtOffset(Offset)) {
+      continue;
+    }
+
+    // On AArch64 there are legitimate reasons to have references past the
+    // end of the function, e.g. jump tables.
+    if (BC.isAArch64() && Offset == getSize()) {
+      continue;
+    }
+
+    errs() << "BOLT-WARNING: reference in the middle of instruction "
+              "detected in function " << *this
+           << " at offset 0x" << Twine::utohexstr(Offset) << '\n';
+    if (BC.HasRelocations) {
+      setIgnored();
+    }
+    setSimple(false);
+    return;
+  }
+}
+
+void BinaryFunction::postProcessJumpTables() {
+  // Create labels for all entries.
+  for (auto &JTI : JumpTables) {
+    JumpTable &JT = *JTI.second;
+    if (JT.Type == JumpTable::JTT_PIC && opts::JumpTables == JTS_BASIC) {
+      opts::JumpTables = JTS_MOVE;
+      outs() << "BOLT-INFO: forcing -jump-tables=move as PIC jump table was "
+                "detected in function " << *this << '\n';
+    }
+    for (unsigned I = 0; I < JT.OffsetEntries.size(); ++I) {
+      MCSymbol *Label =
+          getOrCreateLocalLabel(getAddress() + JT.OffsetEntries[I],
+                                /*CreatePastEnd*/ true);
+      JT.Entries.push_back(Label);
+    }
+
+    const uint64_t BDSize =
+        BC.getBinaryDataAtAddress(JT.getAddress())->getSize();
+    if (!BDSize) {
+      BC.setBinaryDataSize(JT.getAddress(), JT.getSize());
+    } else {
+      assert(BDSize >= JT.getSize() &&
+             "jump table cannot be larger than the containing object");
+    }
+  }
+
+  // Add TakenBranches from JumpTables.
+  //
+  // We want to do it after initial processing since we don't know jump tables'
+  // boundaries until we process them all.
+  for (auto &JTSite : JTSites) {
+    const uint64_t JTSiteOffset = JTSite.first;
+    const uint64_t JTAddress = JTSite.second;
+    const JumpTable *JT = getJumpTableContainingAddress(JTAddress);
+    assert(JT && "cannot find jump table for address");
+
+    uint64_t EntryOffset = JTAddress - JT->getAddress();
+    while (EntryOffset < JT->getSize()) {
+      uint64_t TargetOffset = JT->OffsetEntries[EntryOffset / JT->EntrySize];
+      if (TargetOffset < getSize()) {
+        TakenBranches.emplace_back(JTSiteOffset, TargetOffset);
+
+        if (opts::StrictMode)
+          registerReferencedOffset(TargetOffset);
+      }
+
+      EntryOffset += JT->EntrySize;
+
+      // A label at the next entry means the end of this jump table.
+      if (JT->Labels.count(EntryOffset))
+        break;
+    }
+  }
+  clearList(JTSites);
+
+  // Free memory used by jump table offsets.
+  for (auto &JTI : JumpTables) {
+    JumpTable &JT = *JTI.second;
+    clearList(JT.OffsetEntries);
+  }
+
+  // Conservatively populate all possible destinations for unknown indirect
+  // branches.
+  if (opts::StrictMode && hasInternalReference()) {
+    for (uint64_t Offset : UnknownIndirectBranchOffsets) {
+      for (uint64_t PossibleDestination : ExternallyReferencedOffsets) {
+        // Ignore __builtin_unreachable().
+        if (PossibleDestination == getSize())
+          continue;
+        TakenBranches.emplace_back(Offset, PossibleDestination);
+      }
+    }
+  }
+
+  // Remove duplicate branches. We can get a bunch of them from jump tables.
+  // Without doing jump table value profiling we don't have a use for extra
+  // (duplicate) branches.
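+  // For example, a 16-entry jump table whose entries all point at one of two
+  // targets contributes 16 TakenBranches, but only two distinct
+  // (source, destination) pairs survive the sort/unique pass below.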
+  std::sort(TakenBranches.begin(), TakenBranches.end());
+  auto NewEnd = std::unique(TakenBranches.begin(), TakenBranches.end());
+  TakenBranches.erase(NewEnd, TakenBranches.end());
+}
+
+bool BinaryFunction::postProcessIndirectBranches(
+    MCPlusBuilder::AllocatorIdTy AllocId) {
+  auto addUnknownControlFlow = [&](BinaryBasicBlock &BB) {
+    HasUnknownControlFlow = true;
+    BB.removeAllSuccessors();
+    for (uint64_t PossibleDestination : ExternallyReferencedOffsets) {
+      if (BinaryBasicBlock *SuccBB = getBasicBlockAtOffset(PossibleDestination))
+        BB.addSuccessor(SuccBB);
+    }
+  };
+
+  uint64_t NumIndirectJumps = 0;
+  MCInst *LastIndirectJump = nullptr;
+  BinaryBasicBlock *LastIndirectJumpBB = nullptr;
+  uint64_t LastJT = 0;
+  uint16_t LastJTIndexReg = BC.MIB->getNoRegister();
+  for (BinaryBasicBlock *BB : layout()) {
+    for (MCInst &Instr : *BB) {
+      if (!BC.MIB->isIndirectBranch(Instr))
+        continue;
+
+      // If there's an indirect branch in a single-block function -
+      // it must be a tail call.
+      if (layout_size() == 1) {
+        BC.MIB->convertJmpToTailCall(Instr);
+        return true;
+      }
+
+      ++NumIndirectJumps;
+
+      if (opts::StrictMode && !hasInternalReference()) {
+        BC.MIB->convertJmpToTailCall(Instr);
+        break;
+      }
+
+      // Validate the tail call or jump table assumptions now that we know
+      // basic block boundaries.
+      if (BC.MIB->isTailCall(Instr) || BC.MIB->getJumpTable(Instr)) {
+        const unsigned PtrSize = BC.AsmInfo->getCodePointerSize();
+        MCInst *MemLocInstr;
+        unsigned BaseRegNum, IndexRegNum;
+        int64_t DispValue;
+        const MCExpr *DispExpr;
+        MCInst *PCRelBaseInstr;
+        IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
+            Instr, BB->begin(), BB->end(), PtrSize, MemLocInstr, BaseRegNum,
+            IndexRegNum, DispValue, DispExpr, PCRelBaseInstr);
+        if (Type != IndirectBranchType::UNKNOWN || MemLocInstr != nullptr)
+          continue;
+
+        if (!opts::StrictMode)
+          return false;
+
+        if (BC.MIB->isTailCall(Instr)) {
+          BC.MIB->convertTailCallToJmp(Instr);
+        } else {
+          LastIndirectJump = &Instr;
+          LastIndirectJumpBB = BB;
+          LastJT = BC.MIB->getJumpTable(Instr);
+          LastJTIndexReg = BC.MIB->getJumpTableIndexReg(Instr);
+          BC.MIB->unsetJumpTable(Instr);
+
+          JumpTable *JT = BC.getJumpTableContainingAddress(LastJT);
+          if (JT->Type == JumpTable::JTT_NORMAL) {
+            // Invalidating the jump table may also invalidate other jump table
+            // boundaries. Until we have/need a support for this, mark the
+            // function as non-simple.
+            LLVM_DEBUG(dbgs() << "BOLT-DEBUG: rejected jump table reference "
+                              << JT->getName() << " in " << *this << '\n');
+            return false;
+          }
+        }
+
+        addUnknownControlFlow(*BB);
+        continue;
+      }
+
+      // If this block contains an epilogue code and has an indirect branch,
+      // then most likely it's a tail call. Otherwise, we cannot tell for sure
+      // what it is and conservatively reject the function's CFG.
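+      // Illustrative x86 shape of the case handled below: an indirect jump
+      // preceded by stack-frame teardown, e.g.
+      //   leave            ; or: popq %rbp
+      //   jmpq *%rax       ; treated as a tail call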
+      bool IsEpilogue = false;
+      for (const MCInst &Instr : *BB) {
+        if (BC.MIB->isLeave(Instr) || BC.MIB->isPop(Instr)) {
+          IsEpilogue = true;
+          break;
+        }
+      }
+      if (IsEpilogue) {
+        BC.MIB->convertJmpToTailCall(Instr);
+        BB->removeAllSuccessors();
+        continue;
+      }
+
+      if (opts::Verbosity >= 2) {
+        outs() << "BOLT-INFO: rejected potential indirect tail call in "
+               << "function " << *this << " in basic block "
+               << BB->getName() << ".\n";
+        LLVM_DEBUG(BC.printInstructions(dbgs(), BB->begin(), BB->end(),
+                                        BB->getOffset(), this, true));
+      }
+
+      if (!opts::StrictMode)
+        return false;
+
+      addUnknownControlFlow(*BB);
+    }
+  }
+
+  if (HasInternalLabelReference)
+    return false;
+
+  // If there's only one jump table, and one indirect jump, and no other
+  // references, then we should be able to derive the jump table even if we
+  // fail to match the pattern.
+  if (HasUnknownControlFlow && NumIndirectJumps == 1 &&
+      JumpTables.size() == 1 && LastIndirectJump) {
+    BC.MIB->setJumpTable(*LastIndirectJump, LastJT, LastJTIndexReg, AllocId);
+    HasUnknownControlFlow = false;
+
+    // Re-populate successors based on the jump table.
+    std::set<const MCSymbol *> JTLabels;
+    LastIndirectJumpBB->removeAllSuccessors();
+    const JumpTable *JT = getJumpTableContainingAddress(LastJT);
+    for (const MCSymbol *Label : JT->Entries) {
+      JTLabels.emplace(Label);
+    }
+    for (const MCSymbol *Label : JTLabels) {
+      BinaryBasicBlock *BB = getBasicBlockForLabel(Label);
+      // Ignore __builtin_unreachable().
+      if (!BB) {
+        assert(Label == getFunctionEndLabel() && "if no BB found, must be end");
+        continue;
+      }
+      LastIndirectJumpBB->addSuccessor(BB);
+    }
+  }
+
+  if (HasFixedIndirectBranch)
+    return false;
+
+  if (HasUnknownControlFlow && !BC.HasRelocations)
+    return false;
+
+  return true;
+}
+
+void BinaryFunction::recomputeLandingPads() {
+  updateBBIndices(0);
+
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    BB->LandingPads.clear();
+    BB->Throwers.clear();
+  }
+
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    std::unordered_set<const BinaryBasicBlock *> BBLandingPads;
+    for (MCInst &Instr : *BB) {
+      if (!BC.MIB->isInvoke(Instr))
+        continue;
+
+      const Optional<MCPlus::MCLandingPad> EHInfo = BC.MIB->getEHInfo(Instr);
+      if (!EHInfo || !EHInfo->first)
+        continue;
+
+      BinaryBasicBlock *LPBlock = getBasicBlockForLabel(EHInfo->first);
+      if (!BBLandingPads.count(LPBlock)) {
+        BBLandingPads.insert(LPBlock);
+        BB->LandingPads.emplace_back(LPBlock);
+        LPBlock->Throwers.emplace_back(BB);
+      }
+    }
+  }
+}
+
+bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
+  auto &MIB = BC.MIB;
+
+  if (!isSimple()) {
+    assert(!BC.HasRelocations &&
+           "cannot process file with non-simple function in relocs mode");
+    return false;
+  }
+
+  if (CurrentState != State::Disassembled)
+    return false;
+
+  assert(BasicBlocks.empty() && "basic block list should be empty");
+  assert((Labels.find(0) != Labels.end()) &&
+         "first instruction should always have a label");
+
+  // Create basic blocks in the original layout order:
+  //
+  //  * Every instruction with associated label marks
+  //    the beginning of a basic block.
+  //  * Conditional instruction marks the end of a basic block,
+  //    except when the following instruction is an
+  //    unconditional branch, and the unconditional branch is not
+  //    a destination of another branch. In the latter case, the
+  //    basic block will consist of a single unconditional branch
+  //    (missed "double-jump" optimization).
+  //
+  // Created basic blocks are sorted in layout order since they are
+  // created in the same order as instructions, and instructions are
+  // sorted by offsets.
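+  // Illustrative x86 instance of the exception above: the `jmp` carries no
+  // label, so it stays in the same basic block as the preceding conditional
+  // branch instead of opening a new one.
+  //   jne  .Ltarget        ; conditional branch
+  //   jmp  .Lother         ; not a branch destination itself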
+  BinaryBasicBlock *InsertBB = nullptr;
+  BinaryBasicBlock *PrevBB = nullptr;
+  bool IsLastInstrNop = false;
+  uint64_t LastInstrOffset = 0;
+
+  auto addCFIPlaceholders =
+      [this](uint64_t CFIOffset, BinaryBasicBlock *InsertBB) {
+        for (auto FI = OffsetToCFI.lower_bound(CFIOffset),
+                  FE = OffsetToCFI.upper_bound(CFIOffset);
+             FI != FE; ++FI) {
+          addCFIPseudo(InsertBB, InsertBB->end(), FI->second);
+        }
+      };
+
+  // For profiling purposes we need to save the offset of the last instruction
+  // in the basic block. But in certain cases we don't know whether an
+  // instruction is the last one in the block until later, and we have to go
+  // back and update its offset.
+  auto updateOffset = [&](uint64_t Offset) {
+    assert(PrevBB && PrevBB != InsertBB && "invalid previous block");
+    MCInst *PrevInstr = PrevBB->getLastNonPseudoInstr();
+    if (PrevInstr && !MIB->hasAnnotation(*PrevInstr, "Offset"))
+      MIB->addAnnotation(*PrevInstr, "Offset", static_cast<uint32_t>(Offset),
+                         AllocatorId);
+  };
+
+  for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) {
+    const uint32_t Offset = I->first;
+    MCInst &Instr = I->second;
+
+    auto LI = Labels.find(Offset);
+    if (LI != Labels.end()) {
+      // Always create new BB at branch destination.
+      PrevBB = InsertBB;
+      InsertBB = addBasicBlock(LI->first, LI->second,
+                               opts::PreserveBlocksAlignment && IsLastInstrNop);
+      if (PrevBB)
+        updateOffset(LastInstrOffset);
+    }
+
+    const uint64_t InstrInputAddr = I->first + Address;
+    bool IsSDTMarker =
+        MIB->isNoop(Instr) && BC.SDTMarkers.count(InstrInputAddr);
+    bool IsLKMarker = BC.LKMarkers.count(InstrInputAddr);
+    if (IsSDTMarker || IsLKMarker) {
+      HasSDTMarker = true;
+      LLVM_DEBUG(dbgs() << "SDTMarker or LKMarker detected in the input at : "
+                        << utohexstr(InstrInputAddr) << "\n");
+      if (!MIB->hasAnnotation(Instr, "Offset")) {
+        MIB->addAnnotation(Instr, "Offset", static_cast<uint32_t>(Offset),
+                           AllocatorId);
+      }
+    }
+
+    // Ignore nops except SDT markers. We use nops to derive alignment of the
+    // next basic block. It will not always work, as some blocks are naturally
+    // aligned, but it's just part of heuristic for block alignment.
+    if (MIB->isNoop(Instr) && !PreserveNops && !IsSDTMarker && !IsLKMarker) {
+      IsLastInstrNop = true;
+      continue;
+    }
+    if (!InsertBB) {
+      // It must be a fallthrough or unreachable code. Create a new block unless
+      // we see an unconditional branch following a conditional one. The latter
+      // should not be a conditional tail call.
+      assert(PrevBB && "no previous basic block for a fall through");
+      MCInst *PrevInstr = PrevBB->getLastNonPseudoInstr();
+      assert(PrevInstr && "no previous instruction for a fall through");
+      if (MIB->isUnconditionalBranch(Instr) &&
+          !MIB->isUnconditionalBranch(*PrevInstr) &&
+          !MIB->getConditionalTailCall(*PrevInstr)) {
+        // Temporarily restore inserter basic block.
+        InsertBB = PrevBB;
+      } else {
+        MCSymbol *Label;
+        {
+          auto L = BC.scopeLock();
+          Label = BC.Ctx->createNamedTempSymbol("FT");
+        }
+        InsertBB = addBasicBlock(
+            Offset, Label, opts::PreserveBlocksAlignment && IsLastInstrNop);
+        updateOffset(LastInstrOffset);
+      }
+    }
+    if (Offset == 0) {
+      // Add associated CFI pseudos in the first offset (0).
+      addCFIPlaceholders(0, InsertBB);
+    }
+
+    const bool IsBlockEnd = MIB->isTerminator(Instr);
+    IsLastInstrNop = MIB->isNoop(Instr);
+    LastInstrOffset = Offset;
+    InsertBB->addInstruction(std::move(Instr));
+
+    // Add associated CFI instrs. We always add the CFI instruction that is
+    // located immediately after this instruction, since the next CFI
+    // instruction reflects the change in state caused by this instruction.
+    auto NextInstr = std::next(I);
+    uint64_t CFIOffset;
+    if (NextInstr != E)
+      CFIOffset = NextInstr->first;
+    else
+      CFIOffset = getSize();
+
+    // Note: this potentially invalidates instruction pointers/iterators.
+    addCFIPlaceholders(CFIOffset, InsertBB);
+
+    if (IsBlockEnd) {
+      PrevBB = InsertBB;
+      InsertBB = nullptr;
+    }
+  }
+
+  if (BasicBlocks.empty()) {
+    setSimple(false);
+    return false;
+  }
+
+  // Intermediate dump.
+  LLVM_DEBUG(print(dbgs(), "after creating basic blocks"));
+
+  // TODO: handle properly calls to no-return functions,
+  // e.g. exit(3), etc. Otherwise we'll see false fall-through
+  // blocks.
+
+  for (std::pair<uint32_t, uint32_t> &Branch : TakenBranches) {
+    LLVM_DEBUG(dbgs() << "registering branch [0x"
+                      << Twine::utohexstr(Branch.first) << "] -> [0x"
+                      << Twine::utohexstr(Branch.second) << "]\n");
+    BinaryBasicBlock *FromBB = getBasicBlockContainingOffset(Branch.first);
+    BinaryBasicBlock *ToBB = getBasicBlockAtOffset(Branch.second);
+    if (!FromBB || !ToBB) {
+      if (!FromBB)
+        errs() << "BOLT-ERROR: cannot find BB containing the branch.\n";
+      if (!ToBB)
+        errs() << "BOLT-ERROR: cannot find BB containing branch destination.\n";
+      BC.exitWithBugReport("disassembly failed - inconsistent branch found.",
+                           *this);
+    }
+
+    FromBB->addSuccessor(ToBB);
+  }
+
+  // Add fall-through branches.
+  PrevBB = nullptr;
+  bool IsPrevFT = false; // Is previous block a fall-through.
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    if (IsPrevFT) {
+      PrevBB->addSuccessor(BB);
+    }
+    if (BB->empty()) {
+      IsPrevFT = true;
+      PrevBB = BB;
+      continue;
+    }
+
+    MCInst *LastInstr = BB->getLastNonPseudoInstr();
+    assert(LastInstr &&
+           "should have non-pseudo instruction in non-empty block");
+
+    if (BB->succ_size() == 0) {
+      // Since there are no existing successors, we know the last instruction
+      // is not a conditional branch. Thus if it's a terminator, it shouldn't
+      // be a fall-through.
+      //
+      // Conditional tail call is a special case since we don't add a taken
+      // branch successor for it.
+      IsPrevFT = !MIB->isTerminator(*LastInstr) ||
+                 MIB->getConditionalTailCall(*LastInstr);
+    } else if (BB->succ_size() == 1) {
+      IsPrevFT = MIB->isConditionalBranch(*LastInstr);
+    } else {
+      IsPrevFT = false;
+    }
+
+    PrevBB = BB;
+  }
+
+  if (!IsPrevFT) {
+    // Possibly a call that does not return.
+    LLVM_DEBUG(dbgs() << "last block was marked as a fall-through in " << *this
+                      << '\n');
+  }
+
+  // Assign landing pads and throwers info.
+  recomputeLandingPads();
+
+  // Assign CFI information to each BB entry.
+  annotateCFIState();
+
+  // Annotate invoke instructions with GNU_args_size data.
+  propagateGnuArgsSizeInfo(AllocatorId);
+
+  // Set the basic block layout to the original order and set end offsets.
+  PrevBB = nullptr;
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    BasicBlocksLayout.emplace_back(BB);
+    if (PrevBB)
+      PrevBB->setEndOffset(BB->getOffset());
+    PrevBB = BB;
+  }
+  PrevBB->setEndOffset(getSize());
+
+  updateLayoutIndices();
+
+  normalizeCFIState();
+
+  // Clean-up memory taken by intermediate structures.
+  //
+  // NB: don't clear Labels list as we may need them if we mark the function
+  //     as non-simple later in the process of discovering extra entry points.
+  clearList(Instructions);
+  clearList(OffsetToCFI);
+  clearList(TakenBranches);
+
+  // Update the state.
+  CurrentState = State::CFG;
+
+  // Make any necessary adjustments for indirect branches.
+  if (!postProcessIndirectBranches(AllocatorId)) {
+    if (opts::Verbosity) {
+      errs() << "BOLT-WARNING: failed to post-process indirect branches for "
+             << *this << '\n';
+    }
+    // In relocation mode we want to keep processing the function but avoid
+    // optimizing it.
+    setSimple(false);
+  }
+
+  clearList(ExternallyReferencedOffsets);
+  clearList(UnknownIndirectBranchOffsets);
+
+  return true;
+}
+
+void BinaryFunction::postProcessCFG() {
+  if (isSimple() && !BasicBlocks.empty()) {
+    // Convert conditional tail call branches to conditional branches that jump
+    // to a tail call.
+    removeConditionalTailCalls();
+
+    postProcessProfile();
+
+    // Eliminate inconsistencies between branch instructions and CFG.
+    postProcessBranches();
+  }
+
+  calculateMacroOpFusionStats();
+
+  // The final cleanup of intermediate structures.
+  clearList(IgnoredBranches);
+
+  // Remove "Offset" annotations, unless we need an address-translation table
+  // later. This has no cost, since annotations are allocated by a bumpptr
+  // allocator and won't be released anyway until late in the pipeline.
+  if (!requiresAddressTranslation() && !opts::Instrument)
+    for (BinaryBasicBlock *BB : layout())
+      for (MCInst &Inst : *BB)
+        BC.MIB->removeAnnotation(Inst, "Offset");
+
+  assert((!isSimple() || validateCFG()) &&
+         "invalid CFG detected after post-processing");
+}
+
+void BinaryFunction::calculateMacroOpFusionStats() {
+  if (!getBinaryContext().isX86())
+    return;
+  for (BinaryBasicBlock *BB : layout()) {
+    auto II = BB->getMacroOpFusionPair();
+    if (II == BB->end())
+      continue;
+
+    // Check offset of the second instruction.
+    // FIXME: arch-specific.
+    const uint32_t Offset =
+        BC.MIB->getAnnotationWithDefault<uint32_t>(*std::next(II), "Offset", 0);
+    if (!Offset || (getAddress() + Offset) % 64)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "\nmissed macro-op fusion at address 0x"
+                      << Twine::utohexstr(getAddress() + Offset)
+                      << " in function " << *this << "; executed "
+                      << BB->getKnownExecutionCount() << " times.\n");
+    ++BC.MissedMacroFusionPairs;
+    BC.MissedMacroFusionExecCount += BB->getKnownExecutionCount();
+  }
+}
+
+void BinaryFunction::removeTagsFromProfile() {
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    if (BB->ExecutionCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      BB->ExecutionCount = 0;
+    for (BinaryBasicBlock::BinaryBranchInfo &BI : BB->branch_info()) {
+      if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+          BI.MispredictedCount != BinaryBasicBlock::COUNT_NO_PROFILE)
+        continue;
+      BI.Count = 0;
+      BI.MispredictedCount = 0;
+    }
+  }
+}
+
+void BinaryFunction::removeConditionalTailCalls() {
+  // Blocks to be appended at the end.
+  std::vector<std::unique_ptr<BinaryBasicBlock>> NewBlocks;
+
+  for (auto BBI = begin(); BBI != end(); ++BBI) {
+    BinaryBasicBlock &BB = *BBI;
+    MCInst *CTCInstr = BB.getLastNonPseudoInstr();
+    if (!CTCInstr)
+      continue;
+
+    Optional<uint64_t> TargetAddressOrNone =
+        BC.MIB->getConditionalTailCall(*CTCInstr);
+    if (!TargetAddressOrNone)
+      continue;
+
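+    // Illustrative effect of the rewrite performed below (x86; label name
+    // is an example of the "TC" temp symbols created here):
+    //   before:  jne  foo             ; conditional tail call to foo
+    //   after:   jne  .LTC0           ; plain conditional branch
+    //            ...
+    //   .LTC0:   jmp  foo             ; unconditional tail call in a new
+    //                                 ; block appended at the end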
+    // Gather all necessary information about CTC instruction before
+    // annotations are destroyed.
+    const int32_t CFIStateBeforeCTC = BB.getCFIStateAtInstr(CTCInstr);
+    uint64_t CTCTakenCount = BinaryBasicBlock::COUNT_NO_PROFILE;
+    uint64_t CTCMispredCount = BinaryBasicBlock::COUNT_NO_PROFILE;
+    if (hasValidProfile()) {
+      CTCTakenCount =
+          BC.MIB->getAnnotationWithDefault<uint64_t>(*CTCInstr,
+                                                     "CTCTakenCount");
+      CTCMispredCount =
+          BC.MIB->getAnnotationWithDefault<uint64_t>(*CTCInstr,
+                                                     "CTCMispredCount");
+    }
+
+    // Assert that the tail call does not throw.
+    assert(!BC.MIB->getEHInfo(*CTCInstr) &&
+           "found tail call with associated landing pad");
+
+    // Create a basic block with an unconditional tail call instruction using
+    // the same destination.
+    const MCSymbol *CTCTargetLabel = BC.MIB->getTargetSymbol(*CTCInstr);
+    assert(CTCTargetLabel && "symbol expected for conditional tail call");
+    MCInst TailCallInstr;
+    BC.MIB->createTailCall(TailCallInstr, CTCTargetLabel, BC.Ctx.get());
+    // Link new BBs to the original input offset of the BB where the CTC
+    // is, so we can map samples recorded in new BBs back to the original BB
+    // seen in the input binary (if using BAT).
+    std::unique_ptr<BinaryBasicBlock> TailCallBB = createBasicBlock(
+        BB.getInputOffset(), BC.Ctx->createNamedTempSymbol("TC"));
+    TailCallBB->addInstruction(TailCallInstr);
+    TailCallBB->setCFIState(CFIStateBeforeCTC);
+
+    // Add CFG edge with profile info from BB to TailCallBB.
+    BB.addSuccessor(TailCallBB.get(), CTCTakenCount, CTCMispredCount);
+
+    // Add execution count for the block.
+    TailCallBB->setExecutionCount(CTCTakenCount);
+
+    BC.MIB->convertTailCallToJmp(*CTCInstr);
+
+    BC.MIB->replaceBranchTarget(*CTCInstr, TailCallBB->getLabel(),
+                                BC.Ctx.get());
+
+    // Add basic block to the list that will be added to the end.
+    NewBlocks.emplace_back(std::move(TailCallBB));
+
+    // Swap edges as the TailCallBB corresponds to the taken branch.
+    BB.swapConditionalSuccessors();
+
+    // This branch is no longer a conditional tail call.
+    BC.MIB->unsetConditionalTailCall(*CTCInstr);
+  }
+
+  insertBasicBlocks(std::prev(end()),
+                    std::move(NewBlocks),
+                    /* UpdateLayout */ true,
+                    /* UpdateCFIState */ false);
+}
+
+uint64_t BinaryFunction::getFunctionScore() const {
+  if (FunctionScore != -1)
+    return FunctionScore;
+
+  if (!isSimple() || !hasValidProfile()) {
+    FunctionScore = 0;
+    return FunctionScore;
+  }
+
+  uint64_t TotalScore = 0ULL;
+  for (BinaryBasicBlock *BB : layout()) {
+    uint64_t BBExecCount = BB->getExecutionCount();
+    if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      continue;
+    TotalScore += BBExecCount;
+  }
+  FunctionScore = TotalScore;
+  return FunctionScore;
+}
+
+void BinaryFunction::annotateCFIState() {
+  assert(CurrentState == State::Disassembled && "unexpected function state");
+  assert(!BasicBlocks.empty() && "basic block list should not be empty");
+
+  // This is an index of the last processed CFI in FDE CFI program.
+  uint32_t State = 0;
+
+  // This is an index of RememberState CFI reflecting effective state right
+  // after execution of RestoreState CFI.
+  //
+  // It differs from State iff the CFI at (State-1)
+  // was RestoreState (modulo GNU_args_size CFIs, which are ignored).
+  //
+  // This allows us to generate shorter replay sequences when producing new
+  // CFI programs.
+  uint32_t EffectiveState = 0;
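+  // Illustrative trace of the loop below over a small FDE CFI program:
+  //   1: def_cfa_offset 16   -> State=1, EffectiveState=1
+  //   2: remember_state      -> State=2, push(1), EffectiveState=2
+  //   3: def_cfa_offset 32   -> State=3, EffectiveState=3
+  //   4: restore_state       -> State=4, EffectiveState=pop()=1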
+  // For tracking RememberState/RestoreState sequences.
+  std::stack<uint32_t> StateStack;
+
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    BB->setCFIState(EffectiveState);
+
+    for (const MCInst &Instr : *BB) {
+      const MCCFIInstruction *CFI = getCFIFor(Instr);
+      if (!CFI)
+        continue;
+
+      ++State;
+
+      switch (CFI->getOperation()) {
+      case MCCFIInstruction::OpRememberState:
+        StateStack.push(EffectiveState);
+        EffectiveState = State;
+        break;
+      case MCCFIInstruction::OpRestoreState:
+        assert(!StateStack.empty() && "corrupt CFI stack");
+        EffectiveState = StateStack.top();
+        StateStack.pop();
+        break;
+      case MCCFIInstruction::OpGnuArgsSize:
+        // OpGnuArgsSize CFIs do not affect the CFI state.
+        break;
+      default:
+        // Any other CFI updates the state.
+        EffectiveState = State;
+        break;
+      }
+    }
+  }
+
+  assert(StateStack.empty() && "corrupt CFI stack");
+}
+
+namespace {
+
+/// Our full interpretation of a DWARF CFI machine state at a given point
+struct CFISnapshot {
+  /// CFA register number and offset defining the canonical frame at this
+  /// point, or the number of a rule (CFI state) that computes it with a
+  /// DWARF expression. This number will be negative if it refers to a CFI
+  /// located in the CIE instead of the FDE.
+  uint32_t CFAReg;
+  int32_t CFAOffset;
+  int32_t CFARule;
+  /// Mapping of rules (CFI states) that define the location of each
+  /// register. If absent, no rule defining the location of such register
+  /// was ever read. This number will be negative if it refers to a CFI
+  /// located in the CIE instead of the FDE.
+  DenseMap<int32_t, int32_t> RegRule;
+
+  /// References to CIE, FDE and expanded instructions after a restore state
+  const std::vector<MCCFIInstruction> &CIE;
+  const std::vector<MCCFIInstruction> &FDE;
+  const DenseMap<int32_t, SmallVector<int32_t, 4>> &FrameRestoreEquivalents;
+
+  /// Current FDE CFI number representing the state where the snapshot is at
+  int32_t CurState;
+
+  /// Used when we don't have information about which state/rule to apply
+  /// to recover the location of either the CFA or a specific register
+  constexpr static int32_t UNKNOWN = std::numeric_limits<int32_t>::min();
+
+private:
+  /// Update our snapshot by executing a single CFI
+  void update(const MCCFIInstruction &Instr, int32_t RuleNumber) {
+    switch (Instr.getOperation()) {
+    case MCCFIInstruction::OpSameValue:
+    case MCCFIInstruction::OpRelOffset:
+    case MCCFIInstruction::OpOffset:
+    case MCCFIInstruction::OpRestore:
+    case MCCFIInstruction::OpUndefined:
+    case MCCFIInstruction::OpRegister:
+    case MCCFIInstruction::OpExpression:
+    case MCCFIInstruction::OpValExpression:
+      RegRule[Instr.getRegister()] = RuleNumber;
+      break;
+    case MCCFIInstruction::OpDefCfaRegister:
+      CFAReg = Instr.getRegister();
+      CFARule = UNKNOWN;
+      break;
+    case MCCFIInstruction::OpDefCfaOffset:
+      CFAOffset = Instr.getOffset();
+      CFARule = UNKNOWN;
+      break;
+    case MCCFIInstruction::OpDefCfa:
+      CFAReg = Instr.getRegister();
+      CFAOffset = Instr.getOffset();
+      CFARule = UNKNOWN;
+      break;
+    case MCCFIInstruction::OpDefCfaExpression:
+      CFARule = RuleNumber;
+      break;
+    case MCCFIInstruction::OpAdjustCfaOffset:
+    case MCCFIInstruction::OpWindowSave:
+    case MCCFIInstruction::OpEscape:
+    case MCCFIInstruction::OpNegateRAState:
+      llvm_unreachable("unsupported CFI opcode");
+      break;
+    case MCCFIInstruction::OpRememberState:
+    case MCCFIInstruction::OpRestoreState:
+    case MCCFIInstruction::OpGnuArgsSize:
+      // do not affect CFI state
+      break;
+    }
+  }
+
+public:
+  /// Advance state reading FDE CFI instructions up to State number
+  void advanceTo(int32_t State) {
+    for (int32_t I = CurState, E = State; I != E; ++I) {
+      const MCCFIInstruction &Instr = FDE[I];
+      if (Instr.getOperation() != MCCFIInstruction::OpRestoreState) {
+        update(Instr, I);
+        continue;
+      }
+      // If restore state instruction, fetch the equivalent CFIs that have
+      // the same effect of this restore. This is used to ensure remember-
+      // restore pairs are completely removed.
+      auto Iter = FrameRestoreEquivalents.find(I);
+      if (Iter == FrameRestoreEquivalents.end())
+        continue;
+      for (int32_t RuleNumber : Iter->second) {
+        update(FDE[RuleNumber], RuleNumber);
+      }
+    }
+
+    assert(((CFAReg != (uint32_t)UNKNOWN && CFAOffset != UNKNOWN) ||
+            CFARule != UNKNOWN) &&
+           "CIE did not define default CFA?");
+
+    CurState = State;
+  }
+
+  /// Interpret all CIE and FDE instructions up until CFI State number and
+  /// populate this snapshot
+  CFISnapshot(
+      const std::vector<MCCFIInstruction> &CIE,
+      const std::vector<MCCFIInstruction> &FDE,
+      const DenseMap<int32_t, SmallVector<int32_t, 4>> &FrameRestoreEquivalents,
+      int32_t State)
+      : CIE(CIE), FDE(FDE), FrameRestoreEquivalents(FrameRestoreEquivalents) {
+    CFAReg = UNKNOWN;
+    CFAOffset = UNKNOWN;
+    CFARule = UNKNOWN;
+    CurState = 0;
+
+    for (int32_t I = 0, E = CIE.size(); I != E; ++I) {
+      const MCCFIInstruction &Instr = CIE[I];
+      update(Instr, -I);
+    }
+
+    advanceTo(State);
+  }
+};
+
+/// A CFI snapshot with the capability of checking if incremental additions to
+/// it are redundant. This is used to ensure we do not emit two CFI
+/// instructions back-to-back that are doing the same state change, or to
+/// avoid emitting a CFI at all when the state at that point would not be
+/// modified after that CFI.
+struct CFISnapshotDiff : public CFISnapshot {
+  bool RestoredCFAReg{false};
+  bool RestoredCFAOffset{false};
+  DenseMap<int32_t, bool> RestoredRegs;
+
+  CFISnapshotDiff(const CFISnapshot &S) : CFISnapshot(S) {}
+
+  CFISnapshotDiff(
+      const std::vector<MCCFIInstruction> &CIE,
+      const std::vector<MCCFIInstruction> &FDE,
+      const DenseMap<int32_t, SmallVector<int32_t, 4>> &FrameRestoreEquivalents,
+      int32_t State)
+      : CFISnapshot(CIE, FDE, FrameRestoreEquivalents, State) {}
+
+  /// Return true if applying Instr to this state is redundant and can be
+  /// dismissed.
+  bool isRedundant(const MCCFIInstruction &Instr) {
+    switch (Instr.getOperation()) {
+    case MCCFIInstruction::OpSameValue:
+    case MCCFIInstruction::OpRelOffset:
+    case MCCFIInstruction::OpOffset:
+    case MCCFIInstruction::OpRestore:
+    case MCCFIInstruction::OpUndefined:
+    case MCCFIInstruction::OpRegister:
+    case MCCFIInstruction::OpExpression:
+    case MCCFIInstruction::OpValExpression: {
+      if (RestoredRegs[Instr.getRegister()])
+        return true;
+      RestoredRegs[Instr.getRegister()] = true;
+      const int32_t CurRegRule =
+          RegRule.find(Instr.getRegister()) != RegRule.end()
+              ? RegRule[Instr.getRegister()]
+              : UNKNOWN;
+      if (CurRegRule == UNKNOWN) {
+        if (Instr.getOperation() == MCCFIInstruction::OpRestore ||
+            Instr.getOperation() == MCCFIInstruction::OpSameValue)
+          return true;
+        return false;
+      }
+      const MCCFIInstruction &LastDef =
+/// A CFI snapshot with the capability of checking if incremental additions to
+/// it are redundant. This is used to ensure we do not emit two CFI
+/// instructions back-to-back that are doing the same state change, or to
+/// avoid emitting a CFI at all when the state at that point would not be
+/// modified after that CFI.
+struct CFISnapshotDiff : public CFISnapshot {
+  bool RestoredCFAReg{false};
+  bool RestoredCFAOffset{false};
+  DenseMap<uint32_t, bool> RestoredRegs;
+
+  CFISnapshotDiff(const CFISnapshot &S) : CFISnapshot(S) {}
+
+  CFISnapshotDiff(
+      const std::vector<MCCFIInstruction> &CIE,
+      const std::vector<MCCFIInstruction> &FDE,
+      const DenseMap<int32_t, SmallVector<int32_t, 4>> &FrameRestoreEquivalents,
+      int32_t State)
+      : CFISnapshot(CIE, FDE, FrameRestoreEquivalents, State) {}
+
+  /// Return true if applying Instr to this state is redundant and can be
+  /// dismissed.
+  bool isRedundant(const MCCFIInstruction &Instr) {
+    switch (Instr.getOperation()) {
+    case MCCFIInstruction::OpSameValue:
+    case MCCFIInstruction::OpRelOffset:
+    case MCCFIInstruction::OpOffset:
+    case MCCFIInstruction::OpRestore:
+    case MCCFIInstruction::OpUndefined:
+    case MCCFIInstruction::OpRegister:
+    case MCCFIInstruction::OpExpression:
+    case MCCFIInstruction::OpValExpression: {
+      if (RestoredRegs[Instr.getRegister()])
+        return true;
+      RestoredRegs[Instr.getRegister()] = true;
+      const int32_t CurRegRule =
+          RegRule.find(Instr.getRegister()) != RegRule.end()
+              ? RegRule[Instr.getRegister()]
+              : UNKNOWN;
+      if (CurRegRule == UNKNOWN) {
+        if (Instr.getOperation() == MCCFIInstruction::OpRestore ||
+            Instr.getOperation() == MCCFIInstruction::OpSameValue)
+          return true;
+        return false;
+      }
+      const MCCFIInstruction &LastDef =
+          CurRegRule < 0 ? CIE[-CurRegRule] : FDE[CurRegRule];
+      return LastDef == Instr;
+    }
+    case MCCFIInstruction::OpDefCfaRegister:
+      if (RestoredCFAReg)
+        return true;
+      RestoredCFAReg = true;
+      return CFAReg == Instr.getRegister();
+    case MCCFIInstruction::OpDefCfaOffset:
+      if (RestoredCFAOffset)
+        return true;
+      RestoredCFAOffset = true;
+      return CFAOffset == Instr.getOffset();
+    case MCCFIInstruction::OpDefCfa:
+      if (RestoredCFAReg && RestoredCFAOffset)
+        return true;
+      RestoredCFAReg = true;
+      RestoredCFAOffset = true;
+      return CFAReg == Instr.getRegister() && CFAOffset == Instr.getOffset();
+    case MCCFIInstruction::OpDefCfaExpression:
+      if (RestoredCFAReg && RestoredCFAOffset)
+        return true;
+      RestoredCFAReg = true;
+      RestoredCFAOffset = true;
+      return false;
+    case MCCFIInstruction::OpAdjustCfaOffset:
+    case MCCFIInstruction::OpWindowSave:
+    case MCCFIInstruction::OpEscape:
+    case MCCFIInstruction::OpNegateRAState:
+      llvm_unreachable("unsupported CFI opcode");
+      return false;
+    case MCCFIInstruction::OpRememberState:
+    case MCCFIInstruction::OpRestoreState:
+    case MCCFIInstruction::OpGnuArgsSize:
+      // do not affect CFI state
+      return true;
+    }
+    return false;
+  }
+};
+
+} // end anonymous namespace
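replayCFIInstrs below collects the CFIs to replay and then walks them newest-first, letting isRedundant suppress older updates that a newer one supersedes. A self-contained sketch of that reverse-replay dedup, with plain structs in place of CFI instructions (illustrative names, not from the patch):

```cpp
#include <cassert>
#include <map>
#include <vector>

int main() {
  struct CFI { int Reg, Rule; };
  // Two updates to register 7 (rules 1 then 2) and one to register 8.
  std::vector<CFI> Pending = {{7, 1}, {7, 2}, {8, 3}};

  std::map<int, bool> Restored; // mirrors CFISnapshotDiff::RestoredRegs
  std::vector<CFI> Emitted;
  for (auto I = Pending.rbegin(); I != Pending.rend(); ++I) {
    if (Restored[I->Reg])
      continue; // an older rule for this register is redundant
    Restored[I->Reg] = true;
    Emitted.push_back(*I);
  }
  // Only the newest rule per register survives.
  assert(Emitted.size() == 2);
  assert(Emitted[0].Rule == 3 && Emitted[1].Rule == 2);
}
```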
+bool BinaryFunction::replayCFIInstrs(int32_t FromState, int32_t ToState,
+                                     BinaryBasicBlock *InBB,
+                                     BinaryBasicBlock::iterator InsertIt) {
+  if (FromState == ToState)
+    return true;
+  assert(FromState < ToState && "can only replay CFIs forward");
+
+  CFISnapshotDiff CFIDiff(CIEFrameInstructions, FrameInstructions,
+                          FrameRestoreEquivalents, FromState);
+
+  std::vector<int32_t> NewCFIs;
+  for (int32_t CurState = FromState; CurState < ToState; ++CurState) {
+    MCCFIInstruction *Instr = &FrameInstructions[CurState];
+    if (Instr->getOperation() == MCCFIInstruction::OpRestoreState) {
+      auto Iter = FrameRestoreEquivalents.find(CurState);
+      assert(Iter != FrameRestoreEquivalents.end());
+      NewCFIs.insert(NewCFIs.end(), Iter->second.begin(), Iter->second.end());
+      // RestoreState / Remember will be filtered out later by
+      // CFISnapshotDiff, so we might as well fall-through here.
+    }
+    NewCFIs.push_back(CurState);
+    continue;
+  }
+
+  // Replay instructions while avoiding duplicates
+  for (auto I = NewCFIs.rbegin(), E = NewCFIs.rend(); I != E; ++I) {
+    if (CFIDiff.isRedundant(FrameInstructions[*I]))
+      continue;
+    InsertIt = addCFIPseudo(InBB, InsertIt, *I);
+  }
+
+  return true;
+}
+
+SmallVector<int32_t, 4>
+BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState,
+                               BinaryBasicBlock *InBB,
+                               BinaryBasicBlock::iterator &InsertIt) {
+  SmallVector<int32_t, 4> NewStates;
+
+  CFISnapshot ToCFITable(CIEFrameInstructions, FrameInstructions,
+                         FrameRestoreEquivalents, ToState);
+  CFISnapshotDiff FromCFITable(ToCFITable);
+  FromCFITable.advanceTo(FromState);
+
+  auto undoState = [&](const MCCFIInstruction &Instr) {
+    switch (Instr.getOperation()) {
+    case MCCFIInstruction::OpRememberState:
+    case MCCFIInstruction::OpRestoreState:
+      break;
+    case MCCFIInstruction::OpSameValue:
+    case MCCFIInstruction::OpRelOffset:
+    case MCCFIInstruction::OpOffset:
+    case MCCFIInstruction::OpRestore:
+    case MCCFIInstruction::OpUndefined:
+    case MCCFIInstruction::OpRegister:
+    case MCCFIInstruction::OpExpression:
+    case MCCFIInstruction::OpValExpression: {
+      if (ToCFITable.RegRule.find(Instr.getRegister()) ==
+          ToCFITable.RegRule.end()) {
+        FrameInstructions.emplace_back(
+            MCCFIInstruction::createRestore(nullptr, Instr.getRegister()));
+        if (FromCFITable.isRedundant(FrameInstructions.back())) {
+          FrameInstructions.pop_back();
+          break;
+        }
+        NewStates.push_back(FrameInstructions.size() - 1);
+        InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size() - 1);
+        ++InsertIt;
+        break;
+      }
+      const int32_t Rule = ToCFITable.RegRule[Instr.getRegister()];
+      if (Rule < 0) {
+        if (FromCFITable.isRedundant(CIEFrameInstructions[-Rule]))
+          break;
+        NewStates.push_back(FrameInstructions.size());
+        InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size());
+        ++InsertIt;
+        FrameInstructions.emplace_back(CIEFrameInstructions[-Rule]);
+        break;
+      }
+      if (FromCFITable.isRedundant(FrameInstructions[Rule]))
+        break;
+      NewStates.push_back(Rule);
+      InsertIt = addCFIPseudo(InBB, InsertIt, Rule);
+      ++InsertIt;
+      break;
+    }
+    case MCCFIInstruction::OpDefCfaRegister:
+    case MCCFIInstruction::OpDefCfaOffset:
+    case MCCFIInstruction::OpDefCfa:
+    case MCCFIInstruction::OpDefCfaExpression:
+      if (ToCFITable.CFARule == CFISnapshot::UNKNOWN) {
+        FrameInstructions.emplace_back(MCCFIInstruction::cfiDefCfa(
+            nullptr, ToCFITable.CFAReg, ToCFITable.CFAOffset));
+        if (FromCFITable.isRedundant(FrameInstructions.back())) {
+          FrameInstructions.pop_back();
+          break;
+        }
+        NewStates.push_back(FrameInstructions.size() - 1);
+        InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size() - 1);
+        ++InsertIt;
+      } else if (ToCFITable.CFARule < 0) {
+        if (FromCFITable.isRedundant(
+                CIEFrameInstructions[-ToCFITable.CFARule]))
+          break;
+        NewStates.push_back(FrameInstructions.size());
+        InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size());
+        ++InsertIt;
+        FrameInstructions.emplace_back(
+            CIEFrameInstructions[-ToCFITable.CFARule]);
+      } else if (!FromCFITable.isRedundant(
+                     FrameInstructions[ToCFITable.CFARule])) {
+        NewStates.push_back(ToCFITable.CFARule);
+        InsertIt = addCFIPseudo(InBB, InsertIt, ToCFITable.CFARule);
+        ++InsertIt;
+      }
+      break;
+    case MCCFIInstruction::OpAdjustCfaOffset:
+    case MCCFIInstruction::OpWindowSave:
+    case MCCFIInstruction::OpEscape:
+    case MCCFIInstruction::OpNegateRAState:
+      llvm_unreachable("unsupported CFI opcode");
+      break;
+    case MCCFIInstruction::OpGnuArgsSize:
+      // do not affect CFI state
+      break;
+    }
+  };
+
+  // Undo all modifications from ToState to FromState
+  for (int32_t I = ToState, E = FromState; I != E; ++I) {
+    const MCCFIInstruction &Instr = FrameInstructions[I];
+    if (Instr.getOperation() != MCCFIInstruction::OpRestoreState) {
+      undoState(Instr);
+      continue;
+    }
+    auto Iter = FrameRestoreEquivalents.find(I);
+    if (Iter == FrameRestoreEquivalents.end())
+      continue;
+    for (int32_t State : Iter->second)
+      undoState(FrameInstructions[State]);
+  }
+
+  return NewStates;
+}
+
+void BinaryFunction::normalizeCFIState() {
+  // Reordering blocks with remember-restore state instructions can be
+  // especially tricky. When rewriting the CFI, we omit remember-restore
+  // state instructions entirely. For each restore state, we build a map
+  // expanding the restore to the equivalent unwindCFIState sequence required
+  // at that point to achieve the same effect as the restore. All remember
+  // state instructions are then simply ignored.
+  std::stack<int32_t> Stack;
+  for (BinaryBasicBlock *CurBB : BasicBlocksLayout) {
+    for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
+      if (MCCFIInstruction *CFI = getCFIFor(*II)) {
+        if (CFI->getOperation() == MCCFIInstruction::OpRememberState) {
+          Stack.push(II->getOperand(0).getImm());
+          continue;
+        }
+        if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) {
+          const int32_t RememberState = Stack.top();
+          const int32_t CurState = II->getOperand(0).getImm();
+          FrameRestoreEquivalents[CurState] =
+              unwindCFIState(CurState, RememberState, CurBB, II);
+          Stack.pop();
+        }
+      }
+    }
+  }
+}
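A deliberately crude model of what normalizeCFIState buys: once every restore is expanded into an explicit equivalent sequence, the CFI stream no longer depends on a runtime remember/restore stack, so blocks can be reordered freely. The sketch below models only the bookkeeping (truncation standing in for the compensating CFIs that unwindCFIState emits), not the actual CFI semantics:

```cpp
#include <cassert>
#include <stack>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> In = {"def r1", "remember", "def r2", "restore"};
  std::stack<size_t> Remembered;
  std::vector<std::string> Out;
  for (const std::string &Op : In) {
    if (Op == "remember") {
      Remembered.push(Out.size()); // snapshot of what has been emitted
    } else if (Op == "restore") {
      Out.resize(Remembered.top()); // stand-in for explicit undo CFIs
      Remembered.pop();
    } else {
      Out.push_back(Op);
    }
  }
  // The output stream has no remember/restore left, only plain state.
  assert(Out.size() == 1 && Out[0] == "def r1");
}
```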
+bool BinaryFunction::finalizeCFIState() {
+  LLVM_DEBUG(
+      dbgs() << "Trying to fix CFI states for each BB after reordering.\n");
+  LLVM_DEBUG(dbgs() << "This is the list of CFI states for each BB of "
+                    << *this << ": ");
+
+  int32_t State = 0;
+  bool SeenCold = false;
+  const char *Sep = "";
+  (void)Sep;
+  for (BinaryBasicBlock *BB : BasicBlocksLayout) {
+    const int32_t CFIStateAtExit = BB->getCFIStateAtExit();
+
+    // Hot-cold border: check if this is the first BB to be allocated in a
+    // cold region (with a different FDE). If yes, we need to reset the CFI
+    // state.
+    if (!SeenCold && BB->isCold()) {
+      State = 0;
+      SeenCold = true;
+    }
+
+    // We need to recover the correct state if it doesn't match the expected
+    // state at the BB entry point.
+    if (BB->getCFIState() < State) {
+      // In this case, State is currently higher than what this BB expects it
+      // to be. To solve this, we need to insert CFI instructions to undo
+      // the effect of all CFIs from the BB's state to the current State.
+      auto InsertIt = BB->begin();
+      unwindCFIState(State, BB->getCFIState(), BB, InsertIt);
+    } else if (BB->getCFIState() > State) {
+      // If BB's CFI state is greater than State, it means we are behind in
+      // the state. Just emit all instructions to reach this state at the
+      // beginning of this BB. If this sequence of instructions involves
+      // remember state or restore state, bail out.
+      if (!replayCFIInstrs(State, BB->getCFIState(), BB, BB->begin()))
+        return false;
+    }
+
+    State = CFIStateAtExit;
+    LLVM_DEBUG(dbgs() << Sep << State; Sep = ", ");
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+
+  for (BinaryBasicBlock *BB : BasicBlocksLayout) {
+    for (auto II = BB->begin(); II != BB->end();) {
+      MCCFIInstruction *CFI = getCFIFor(*II);
+      if (CFI && (CFI->getOperation() == MCCFIInstruction::OpRememberState ||
+                  CFI->getOperation() == MCCFIInstruction::OpRestoreState)) {
+        II = BB->eraseInstruction(II);
+      } else {
+        ++II;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool BinaryFunction::requiresAddressTranslation() const {
+  return opts::EnableBAT || hasSDTMarker() || hasPseudoProbe();
+}
+
+uint64_t BinaryFunction::getInstructionCount() const {
+  uint64_t Count = 0;
+  for (BinaryBasicBlock *const &Block : BasicBlocksLayout) {
+    Count += Block->getNumNonPseudos();
+  }
+  return Count;
+}
+
+bool BinaryFunction::hasLayoutChanged() const {
+  return ModifiedLayout;
+}
+
+uint64_t BinaryFunction::getEditDistance() const {
+  return ComputeEditDistance<BinaryBasicBlock *>(BasicBlocksPreviousLayout,
+                                                 BasicBlocksLayout);
+}
+
+void BinaryFunction::clearDisasmState() {
+  clearList(Instructions);
+  clearList(IgnoredBranches);
+  clearList(TakenBranches);
+  clearList(InterproceduralReferences);
+
+  if (BC.HasRelocations) {
+    for (std::pair<const uint32_t, MCSymbol *> &LI : Labels) {
+      BC.UndefinedSymbols.insert(LI.second);
+    }
+    if (FunctionEndLabel) {
+      BC.UndefinedSymbols.insert(FunctionEndLabel);
+    }
+  }
+}
+
+void BinaryFunction::setTrapOnEntry() {
+  clearDisasmState();
+
+  auto addTrapAtOffset = [&](uint64_t Offset) {
+    MCInst TrapInstr;
+    BC.MIB->createTrap(TrapInstr);
+    addInstruction(Offset, std::move(TrapInstr));
+  };
+
+  addTrapAtOffset(0);
+  for (const std::pair<const uint32_t, MCSymbol *> &KV : getLabels()) {
+    if (getSecondaryEntryPointSymbol(KV.second)) {
+      addTrapAtOffset(KV.first);
+    }
+  }
+
+  TrapsOnEntry = true;
+}
+
+void BinaryFunction::setIgnored() {
+  if (opts::processAllFunctions()) {
+    // We can accept ignored functions before they've been disassembled.
+    // In that case, they would still get disassembled and emitted, but not
+    // optimized.
+    assert(CurrentState == State::Empty &&
+           "cannot ignore non-empty functions in current mode");
+    IsIgnored = true;
+    return;
+  }
+
+  clearDisasmState();
+
+  // Clear CFG state too.
+  if (hasCFG()) {
+    releaseCFG();
+
+    for (BinaryBasicBlock *BB : BasicBlocks) {
+      delete BB;
+    }
+    clearList(BasicBlocks);
+
+    for (BinaryBasicBlock *BB : DeletedBasicBlocks) {
+      delete BB;
+    }
+    clearList(DeletedBasicBlocks);
+
+    clearList(BasicBlocksLayout);
+    clearList(BasicBlocksPreviousLayout);
+  }
+
+  CurrentState = State::Empty;
+
+  IsIgnored = true;
+  IsSimple = false;
+  LLVM_DEBUG(dbgs() << "Ignoring " << getPrintName() << '\n');
+}
+
+void BinaryFunction::duplicateConstantIslands() {
+  for (BinaryBasicBlock *BB : layout()) {
+    if (!BB->isCold())
+      continue;
+
+    for (MCInst &Inst : *BB) {
+      int OpNum = 0;
+      for (MCOperand &Operand : Inst) {
+        if (!Operand.isExpr()) {
+          ++OpNum;
+          continue;
+        }
+        const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst, OpNum);
+        // Check if this is an island symbol
+        if (!Islands.Symbols.count(Symbol) &&
+            !Islands.ProxySymbols.count(Symbol))
+          continue;
+
+        // Create a cold symbol, if missing
+        auto ISym = Islands.ColdSymbols.find(Symbol);
+        MCSymbol *ColdSymbol;
+        if (ISym != Islands.ColdSymbols.end()) {
+          ColdSymbol = ISym->second;
+        } else {
+          ColdSymbol = BC.Ctx->getOrCreateSymbol(Symbol->getName() + ".cold");
+          Islands.ColdSymbols[Symbol] = ColdSymbol;
+          // Check if this is a proxy island symbol and update the owner's
+          // proxy map
+          if (Islands.ProxySymbols.count(Symbol)) {
+            BinaryFunction *Owner = Islands.ProxySymbols[Symbol];
+            auto IProxiedSym = Owner->Islands.Proxies[this].find(Symbol);
+            Owner->Islands.ColdProxies[this][IProxiedSym->second] = ColdSymbol;
+          }
+        }
+
+        // Update the instruction reference
+        Operand = MCOperand::createExpr(BC.MIB->getTargetExprFor(
+            Inst,
+            MCSymbolRefExpr::create(ColdSymbol, MCSymbolRefExpr::VK_None,
+                                    *BC.Ctx),
+            *BC.Ctx, 0));
+        ++OpNum;
+      }
+    }
+  }
+}
+
+namespace {
+
+#ifndef MAX_PATH
+#define MAX_PATH 255
+#endif
+
+std::string constructFilename(std::string Filename,
+                              std::string Annotation,
+                              std::string Suffix) {
+  std::replace(Filename.begin(), Filename.end(), '/', '-');
+  if (!Annotation.empty()) {
+    Annotation.insert(0, "-");
+  }
+  if (Filename.size() + Annotation.size() + Suffix.size() > MAX_PATH) {
+    assert(Suffix.size() + Annotation.size() <= MAX_PATH);
+    if (opts::Verbosity >= 1) {
+      errs() << "BOLT-WARNING: Filename \"" << Filename << Annotation << Suffix
+             << "\" exceeds the " << MAX_PATH << " size limit, truncating.\n";
+    }
+    Filename.resize(MAX_PATH - (Suffix.size() + Annotation.size()));
+  }
+  Filename += Annotation;
+  Filename += Suffix;
+  return Filename;
+}
+
+std::string formatEscapes(const std::string &Str) {
+  std::string Result;
+  for (unsigned I = 0; I < Str.size(); ++I) {
+    char C = Str[I];
+    switch (C) {
+    case '\n':
+      Result += "&#13;";
+      break;
+    case '"':
+      break;
+    default:
+      Result += C;
+      break;
+    }
+  }
+  return Result;
+}
+
+}
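The truncation logic in constructFilename always preserves the annotation and suffix and only cuts the base name. The same rule in a standalone, hypothetical helper (MaxLen plays the role of MAX_PATH):

```cpp
#include <algorithm>
#include <cassert>
#include <string>

// Sketch of constructFilename's truncation invariant: annotation and suffix
// are kept whole; only the base name is shortened.
std::string makeName(std::string Base, const std::string &Ann,
                     const std::string &Suffix, size_t MaxLen) {
  std::replace(Base.begin(), Base.end(), '/', '-');
  if (Base.size() + Ann.size() + Suffix.size() > MaxLen)
    Base.resize(MaxLen - (Ann.size() + Suffix.size()));
  return Base + Ann + Suffix;
}

int main() {
  const std::string Name =
      makeName("very/long/function/name", "-pass", ".dot", 16);
  assert(Name.size() <= 16);
  assert(Name.substr(Name.size() - 9) == "-pass.dot"); // tail intact
}
```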
+void BinaryFunction::dumpGraph(raw_ostream &OS) const {
+  OS << "strict digraph \"" << getPrintName() << "\" {\n";
+  uint64_t Offset = Address;
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    auto LayoutPos = std::find(BasicBlocksLayout.begin(),
+                               BasicBlocksLayout.end(), BB);
+    unsigned Layout = LayoutPos - BasicBlocksLayout.begin();
+    const char *ColdStr = BB->isCold() ? " (cold)" : "";
+    OS << format("\"%s\" [label=\"%s%s\\n(C:%lu,O:%lu,I:%u,L:%u:CFI:%u)\"]\n",
+                 BB->getName().data(),
+                 BB->getName().data(),
+                 ColdStr,
+                 (BB->ExecutionCount != BinaryBasicBlock::COUNT_NO_PROFILE
+                      ? BB->ExecutionCount
+                      : 0),
+                 BB->getOffset(),
+                 getIndex(BB),
+                 Layout,
+                 BB->getCFIState());
+    OS << format("\"%s\" [shape=box]\n", BB->getName().data());
+    if (opts::DotToolTipCode) {
+      std::string Str;
+      raw_string_ostream CS(Str);
+      Offset = BC.printInstructions(CS, BB->begin(), BB->end(), Offset, this);
+      const std::string Code = formatEscapes(CS.str());
+      OS << format("\"%s\" [tooltip=\"%s\"]\n",
+                   BB->getName().data(),
+                   Code.c_str());
+    }
+
+    // analyzeBranch is just used to get the names of the branch
+    // opcodes.
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    const bool Success = BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
+
+    const MCInst *LastInstr = BB->getLastNonPseudoInstr();
+    const bool IsJumpTable = LastInstr && BC.MIB->getJumpTable(*LastInstr);
+
+    auto BI = BB->branch_info_begin();
+    for (BinaryBasicBlock *Succ : BB->successors()) {
+      std::string Branch;
+      if (Success) {
+        if (Succ == BB->getConditionalSuccessor(true)) {
+          Branch = CondBranch ? std::string(BC.InstPrinter->getOpcodeName(
+                                    CondBranch->getOpcode()))
+                              : "TB";
+        } else if (Succ == BB->getConditionalSuccessor(false)) {
+          Branch = UncondBranch ? std::string(BC.InstPrinter->getOpcodeName(
+                                      UncondBranch->getOpcode()))
+                                : "FB";
+        } else {
+          Branch = "FT";
+        }
+      }
+      if (IsJumpTable) {
+        Branch = "JT";
+      }
+      OS << format("\"%s\" -> \"%s\" [label=\"%s",
+                   BB->getName().data(),
+                   Succ->getName().data(),
+                   Branch.c_str());
+
+      if (BB->getExecutionCount() != COUNT_NO_PROFILE &&
+          BI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) {
+        OS << "\\n(C:" << BI->Count << ",M:" << BI->MispredictedCount << ")";
+      } else if (ExecutionCount != COUNT_NO_PROFILE &&
+                 BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
+        OS << "\\n(IC:" << BI->Count << ")";
+      }
+      OS << "\"]\n";
+
+      ++BI;
+    }
+    for (BinaryBasicBlock *LP : BB->landing_pads()) {
+      OS << format("\"%s\" -> \"%s\" [constraint=false style=dashed]\n",
+                   BB->getName().data(),
+                   LP->getName().data());
+    }
+  }
+  OS << "}\n";
+}
+
+void BinaryFunction::viewGraph() const {
+  SmallString<MAX_PATH> Filename;
+  if (std::error_code EC =
+          sys::fs::createTemporaryFile("bolt-cfg", "dot", Filename)) {
+    errs() << "BOLT-ERROR: " << EC.message()
+           << ", unable to create bolt-cfg-XXXXX.dot temporary file.\n";
+    return;
+  }
+  dumpGraphToFile(std::string(Filename));
+  if (DisplayGraph(Filename)) {
+    errs() << "BOLT-ERROR: Can't display " << Filename << " with graphviz.\n";
+  }
+  if (std::error_code EC = sys::fs::remove(Filename)) {
+    errs() << "BOLT-WARNING: " << EC.message() << ", failed to remove "
+           << Filename << "\n";
+  }
+}
+
+void BinaryFunction::dumpGraphForPass(std::string Annotation) const {
+  std::string Filename = constructFilename(getPrintName(), Annotation, ".dot");
+  outs() << "BOLT-DEBUG: Dumping CFG to " << Filename << "\n";
+  dumpGraphToFile(Filename);
+}
+
+void BinaryFunction::dumpGraphToFile(std::string Filename) const {
+  std::error_code EC;
+  raw_fd_ostream of(Filename, EC, sys::fs::OF_None);
+  if (EC) {
+    if (opts::Verbosity >= 1) {
+      errs() << "BOLT-WARNING: " << EC.message() << ", unable to open "
+             << Filename << " for output.\n";
+    }
+    return;
+  }
+  dumpGraph(of);
+}
+
+bool BinaryFunction::validateCFG() const {
+  bool Valid = true;
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    Valid &= BB->validateSuccessorInvariants();
+  }
+
+  if (!Valid)
+    return Valid;
+
+  // Make sure all blocks in CFG are valid.
+  auto validateBlock = [this](const BinaryBasicBlock *BB, StringRef Desc) {
+    if (!BB->isValid()) {
+      errs() << "BOLT-ERROR: deleted " << Desc << " " << BB->getName()
+             << " detected in:\n";
+      this->dump();
+      return false;
+    }
+    return true;
+  };
+  for (const BinaryBasicBlock *BB : BasicBlocks) {
+    if (!validateBlock(BB, "block"))
+      return false;
+    for (const BinaryBasicBlock *PredBB : BB->predecessors())
+      if (!validateBlock(PredBB, "predecessor"))
+        return false;
+    for (const BinaryBasicBlock *SuccBB : BB->successors())
+      if (!validateBlock(SuccBB, "successor"))
+        return false;
+    for (const BinaryBasicBlock *LP : BB->landing_pads())
+      if (!validateBlock(LP, "landing pad"))
+        return false;
+    for (const BinaryBasicBlock *Thrower : BB->throwers())
+      if (!validateBlock(Thrower, "thrower"))
+        return false;
+  }
+
+  for (const BinaryBasicBlock *BB : BasicBlocks) {
+    std::unordered_set<const BinaryBasicBlock *> BBLandingPads;
+    for (const BinaryBasicBlock *LP : BB->landing_pads()) {
+      if (BBLandingPads.count(LP)) {
+        errs() << "BOLT-ERROR: duplicate landing pad detected in "
+               << BB->getName() << " in function " << *this << '\n';
+        return false;
+      }
+      BBLandingPads.insert(LP);
+    }
+
+    std::unordered_set<const BinaryBasicBlock *> BBThrowers;
+    for (const BinaryBasicBlock *Thrower : BB->throwers()) {
+      if (BBThrowers.count(Thrower)) {
+        errs() << "BOLT-ERROR: duplicate thrower detected in "
+               << BB->getName() << " in function " << *this << '\n';
+        return false;
+      }
+      BBThrowers.insert(Thrower);
+    }
+
+    for (const BinaryBasicBlock *LPBlock : BB->landing_pads()) {
+      if (std::find(LPBlock->throw_begin(), LPBlock->throw_end(), BB) ==
+          LPBlock->throw_end()) {
+        errs() << "BOLT-ERROR: inconsistent landing pad detected in " << *this
+               << ": " << BB->getName() << " is in LandingPads but not in "
+               << LPBlock->getName() << " Throwers\n";
+        return false;
+      }
+    }
+    for (const BinaryBasicBlock *Thrower : BB->throwers()) {
+      if (std::find(Thrower->lp_begin(), Thrower->lp_end(), BB) ==
+          Thrower->lp_end()) {
+        errs() << "BOLT-ERROR: inconsistent thrower detected in " << *this
+               << ": " << BB->getName() << " is in Throwers list but not in "
+               << Thrower->getName() << " LandingPads\n";
+        return false;
+      }
+    }
+  }
+
+  return Valid;
+}
+
+void BinaryFunction::fixBranches() {
+  auto &MIB = BC.MIB;
+  MCContext *Ctx = BC.Ctx.get();
+
+  for (unsigned I = 0, E = BasicBlocksLayout.size(); I != E; ++I) {
+    BinaryBasicBlock *BB = BasicBlocksLayout[I];
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch))
+      continue;
+
+    // We will create an unconditional branch with the correct destination if
+    // needed.
+    if (UncondBranch)
+      BB->eraseInstruction(BB->findInstruction(UncondBranch));
+
+    // Basic block that follows the current one in the final layout.
+    const BinaryBasicBlock *NextBB = nullptr;
+    if (I + 1 != E && BB->isCold() == BasicBlocksLayout[I + 1]->isCold())
+      NextBB = BasicBlocksLayout[I + 1];
+
+    if (BB->succ_size() == 1) {
+      // __builtin_unreachable() could create a conditional branch that
+      // falls through into the next function - hence the block will have
+      // only one valid successor. Since the behaviour is undefined, we
+      // replace the conditional branch with an unconditional one if
+      // required.
+      if (CondBranch)
+        BB->eraseInstruction(BB->findInstruction(CondBranch));
+      if (BB->getSuccessor() == NextBB)
+        continue;
+      BB->addBranchInstruction(BB->getSuccessor());
+    } else if (BB->succ_size() == 2) {
+      assert(CondBranch && "conditional branch expected");
+      const BinaryBasicBlock *TSuccessor = BB->getConditionalSuccessor(true);
+      const BinaryBasicBlock *FSuccessor = BB->getConditionalSuccessor(false);
+      // Check whether we support reversing this branch direction
+      const bool IsSupported =
+          !MIB->isUnsupportedBranch(CondBranch->getOpcode());
+      if (NextBB && NextBB == TSuccessor && IsSupported) {
+        std::swap(TSuccessor, FSuccessor);
+        {
+          auto L = BC.scopeLock();
+          MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(),
+                                      Ctx);
+        }
+        BB->swapConditionalSuccessors();
+      } else {
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(*CondBranch, TSuccessor->getLabel(), Ctx);
+      }
+      if (TSuccessor == FSuccessor) {
+        BB->removeDuplicateConditionalSuccessor(CondBranch);
+      }
+      if (!NextBB ||
+          ((NextBB != TSuccessor || !IsSupported) && NextBB != FSuccessor)) {
+        // If one of the branches is guaranteed to be "long" while the other
+        // could be "short", then prioritize short for "taken". This will
+        // generate a sequence 1 byte shorter on x86.
+        if (IsSupported && BC.isX86() &&
+            TSuccessor->isCold() != FSuccessor->isCold() &&
+            BB->isCold() != TSuccessor->isCold()) {
+          std::swap(TSuccessor, FSuccessor);
+          {
+            auto L = BC.scopeLock();
+            MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(),
+                                        Ctx);
+          }
+          BB->swapConditionalSuccessors();
+        }
+        BB->addBranchInstruction(FSuccessor);
+      }
+    }
+    // Cases where the number of successors is 0 (block ends with a
+    // terminator) or more than 2 (switch table) don't require branch
+    // instruction adjustments.
+  }
+  assert((!isSimple() || validateCFG()) &&
+         "Invalid CFG detected after fixing branches");
+}
+
+void BinaryFunction::propagateGnuArgsSizeInfo(
+    MCPlusBuilder::AllocatorIdTy AllocId) {
+  assert(CurrentState == State::Disassembled && "unexpected function state");
+
+  if (!hasEHRanges() || !usesGnuArgsSize())
+    return;
+
+  // The current value of DW_CFA_GNU_args_size affects all following
+  // invoke instructions until the next CFI overrides it.
+  // It is important to iterate basic blocks in the original order when
+  // assigning the value.
+  uint64_t CurrentGnuArgsSize = 0;
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    for (auto II = BB->begin(); II != BB->end();) {
+      MCInst &Instr = *II;
+      if (BC.MIB->isCFI(Instr)) {
+        MCCFIInstruction *CFI = getCFIFor(Instr);
+        if (CFI->getOperation() == MCCFIInstruction::OpGnuArgsSize) {
+          CurrentGnuArgsSize = CFI->getOffset();
+          // Delete DW_CFA_GNU_args_size instructions and only regenerate
+          // them during the final code emission. The information is embedded
+          // inside call instructions.
+          II = BB->erasePseudoInstruction(II);
+          continue;
+        }
+      } else if (BC.MIB->isInvoke(Instr)) {
+        // Add the value of GNU_args_size as an extra operand to invokes.
+        BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize, AllocId);
+      }
+      ++II;
+    }
+  }
+}
+
+void BinaryFunction::postProcessBranches() {
+  if (!isSimple())
+    return;
+  for (BinaryBasicBlock *BB : BasicBlocksLayout) {
+    auto LastInstrRI = BB->getLastNonPseudo();
+    if (BB->succ_size() == 1) {
+      if (LastInstrRI != BB->rend() &&
+          BC.MIB->isConditionalBranch(*LastInstrRI)) {
+        // __builtin_unreachable() could create a conditional branch that
+        // falls through into the next function - hence the block will have
+        // only one valid successor.
+        // Such behaviour is undefined and thus we remove the conditional
+        // branch while leaving a valid successor.
+        BB->eraseInstruction(std::prev(LastInstrRI.base()));
+        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: erasing conditional branch in "
+                          << BB->getName() << " in function " << *this
+                          << '\n');
+      }
+    } else if (BB->succ_size() == 0) {
+      // Ignore unreachable basic blocks.
+      if (BB->pred_size() == 0 || BB->isLandingPad())
+        continue;
+
+      // If it's a basic block that does not end up with a terminator, we
+      // insert a return instruction unless it's a call instruction.
+      if (LastInstrRI == BB->rend()) {
+        LLVM_DEBUG(
+            dbgs() << "BOLT-DEBUG: at least one instruction expected in BB "
+                   << BB->getName() << " in function " << *this << '\n');
+        continue;
+      }
+      if (!BC.MIB->isTerminator(*LastInstrRI) &&
+          !BC.MIB->isCall(*LastInstrRI)) {
+        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: adding return to basic block "
+                          << BB->getName() << " in function " << *this
+                          << '\n');
+        MCInst ReturnInstr;
+        BC.MIB->createReturn(ReturnInstr);
+        BB->addInstruction(ReturnInstr);
+      }
+    }
+  }
+  assert(validateCFG() && "invalid CFG");
+}
+
+MCSymbol *BinaryFunction::addEntryPointAtOffset(uint64_t Offset) {
+  assert(Offset && "cannot add primary entry point");
+  assert(CurrentState == State::Empty || CurrentState == State::Disassembled);
+
+  const uint64_t EntryPointAddress = getAddress() + Offset;
+  MCSymbol *LocalSymbol = getOrCreateLocalLabel(EntryPointAddress);
+
+  MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(LocalSymbol);
+  if (EntrySymbol)
+    return EntrySymbol;
+
+  if (BinaryData *EntryBD = BC.getBinaryDataAtAddress(EntryPointAddress)) {
+    EntrySymbol = EntryBD->getSymbol();
+  } else {
+    EntrySymbol = BC.getOrCreateGlobalSymbol(
+        EntryPointAddress, Twine("__ENTRY_") + getOneName() + "@");
+  }
+  SecondaryEntryPoints[LocalSymbol] = EntrySymbol;
+
+  BC.setSymbolToFunctionMap(EntrySymbol, this);
+
+  return EntrySymbol;
+}
+
+MCSymbol *BinaryFunction::addEntryPoint(const BinaryBasicBlock &BB) {
+  assert(CurrentState == State::CFG &&
+         "basic block can be added as an entry only in a function with CFG");
+
+  if (&BB == BasicBlocks.front())
+    return getSymbol();
+
+  MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(BB);
+  if (EntrySymbol)
+    return EntrySymbol;
+
+  EntrySymbol =
+      BC.Ctx->getOrCreateSymbol("__ENTRY_" + BB.getLabel()->getName());
+
+  SecondaryEntryPoints[BB.getLabel()] = EntrySymbol;
+
+  BC.setSymbolToFunctionMap(EntrySymbol, this);
+
+  return EntrySymbol;
+}
+
+MCSymbol *BinaryFunction::getSymbolForEntryID(uint64_t EntryID) {
+  if (EntryID == 0)
+    return getSymbol();
+
+  if (!isMultiEntry())
+    return nullptr;
+
+  uint64_t NumEntries = 0;
+  if (hasCFG()) {
+    for (BinaryBasicBlock *BB : BasicBlocks) {
+      MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB);
+      if (!EntrySymbol)
+        continue;
+      if (NumEntries == EntryID)
+        return EntrySymbol;
+      ++NumEntries;
+    }
+  } else {
+    for (std::pair<const uint32_t, MCSymbol *> &KV : Labels) {
+      MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(KV.second);
+      if (!EntrySymbol)
+        continue;
+      if (NumEntries == EntryID)
+        return EntrySymbol;
+      ++NumEntries;
+    }
+  }
+
+  return nullptr;
+}
+
+uint64_t BinaryFunction::getEntryIDForSymbol(const MCSymbol *Symbol) const {
+  if (!isMultiEntry())
+    return 0;
+
+  for (const MCSymbol *FunctionSymbol : getSymbols())
+    if (FunctionSymbol == Symbol)
+      return 0;
+
+  // Check all secondary entries available as either basic blocks or labels.
+  uint64_t NumEntries = 0;
+  for (const BinaryBasicBlock *BB : BasicBlocks) {
+    MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB);
+    if (!EntrySymbol)
+      continue;
+    if (EntrySymbol == Symbol)
+      return NumEntries;
+    ++NumEntries;
+  }
+  NumEntries = 0;
+  for (const std::pair<const uint32_t, MCSymbol *> &KV : Labels) {
+    MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(KV.second);
+    if (!EntrySymbol)
+      continue;
+    if (EntrySymbol == Symbol)
+      return NumEntries;
+    ++NumEntries;
+  }
+
+  llvm_unreachable("symbol not found");
+}
+
+bool BinaryFunction::forEachEntryPoint(EntryPointCallbackTy Callback) const {
+  bool Status = Callback(0, getSymbol());
+  if (!isMultiEntry())
+    return Status;
+
+  for (const std::pair<const uint32_t, MCSymbol *> &KV : Labels) {
+    if (!Status)
+      break;
+
+    MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(KV.second);
+    if (!EntrySymbol)
+      continue;
+
+    Status = Callback(KV.first, EntrySymbol);
+  }
+
+  return Status;
+}
+
+BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const {
+  BasicBlockOrderType DFS;
+  unsigned Index = 0;
+  std::stack<BinaryBasicBlock *> Stack;
+
+  // Push entry points to the stack in reverse order.
+  //
+  // NB: we rely on the original order of entries to match.
+  for (auto BBI = layout_rbegin(); BBI != layout_rend(); ++BBI) {
+    BinaryBasicBlock *BB = *BBI;
+    if (isEntryPoint(*BB))
+      Stack.push(BB);
+    BB->setLayoutIndex(BinaryBasicBlock::InvalidIndex);
+  }
+
+  while (!Stack.empty()) {
+    BinaryBasicBlock *BB = Stack.top();
+    Stack.pop();
+
+    if (BB->getLayoutIndex() != BinaryBasicBlock::InvalidIndex)
+      continue;
+
+    BB->setLayoutIndex(Index++);
+    DFS.push_back(BB);
+
+    for (BinaryBasicBlock *SuccBB : BB->landing_pads()) {
+      Stack.push(SuccBB);
+    }
+
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    if (BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch) && CondBranch &&
+        BB->succ_size() == 2) {
+      if (BC.MIB->getCanonicalBranchCondCode(BC.MIB->getCondCode(
+              *CondBranch)) == BC.MIB->getCondCode(*CondBranch)) {
+        Stack.push(BB->getConditionalSuccessor(true));
+        Stack.push(BB->getConditionalSuccessor(false));
+      } else {
+        Stack.push(BB->getConditionalSuccessor(false));
+        Stack.push(BB->getConditionalSuccessor(true));
+      }
+    } else {
+      for (BinaryBasicBlock *SuccBB : BB->successors()) {
+        Stack.push(SuccBB);
+      }
+    }
+  }
+
+  return DFS;
+}
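computeHash below serializes each opcode into the hash string one little-endian byte at a time. A standalone model of that serialization; note the byte stream is not self-delimiting, so distinct opcode sequences can collide and the result is a heuristic key, not an identity:

```cpp
#include <cassert>
#include <functional>
#include <string>

// Standalone model of the opcode serialization in computeHash: append each
// opcode little-endian byte by byte, then hash the resulting string.
size_t hashOpcodes(const unsigned *Opcodes, size_t N) {
  std::string S;
  for (size_t I = 0; I < N; ++I) {
    unsigned Op = Opcodes[I];
    if (Op == 0)
      S.push_back(0); // opcode 0 contributes a single zero byte
    while (Op) {
      S.push_back(static_cast<char>(Op & 0xff));
      Op >>= 8;
    }
  }
  return std::hash<std::string>{}(S);
}

int main() {
  const unsigned A[] = {0x11, 0x2233}; // bytes: 11 | 33 22
  const unsigned B[] = {0x3311, 0x22}; // bytes: 11 33 | 22
  // Different opcode sequences, same byte stream, same hash.
  assert(hashOpcodes(A, 2) == hashOpcodes(B, 2));
}
```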
+size_t BinaryFunction::computeHash(bool UseDFS,
+                                   OperandHashFuncTy OperandHashFunc) const {
+  if (size() == 0)
+    return 0;
+
+  assert(hasCFG() && "function is expected to have CFG");
+
+  const std::vector<BinaryBasicBlock *> &Order =
+      UseDFS ? dfs() : BasicBlocksLayout;
+
+  // The hash is computed by creating a string of all instruction opcodes and
+  // possibly their operands and then hashing that string with std::hash.
+  std::string HashString;
+  for (const BinaryBasicBlock *BB : Order) {
+    for (const MCInst &Inst : *BB) {
+      unsigned Opcode = Inst.getOpcode();
+
+      if (BC.MIB->isPseudo(Inst))
+        continue;
+
+      // Ignore unconditional jumps since we check CFG consistency by
+      // processing basic blocks in order and do not rely on branches to be
+      // in-sync with CFG. Note that we still use condition code of
+      // conditional jumps.
+      if (BC.MIB->isUnconditionalBranch(Inst))
+        continue;
+
+      if (Opcode == 0)
+        HashString.push_back(0);
+
+      while (Opcode) {
+        uint8_t LSB = Opcode & 0xff;
+        HashString.push_back(LSB);
+        Opcode = Opcode >> 8;
+      }
+
+      for (unsigned I = 0, E = MCPlus::getNumPrimeOperands(Inst); I != E;
+           ++I) {
+        HashString.append(OperandHashFunc(Inst.getOperand(I)));
+      }
+    }
+  }
+
+  return Hash = std::hash<std::string>{}(HashString);
+}
+
+void BinaryFunction::insertBasicBlocks(
+    BinaryBasicBlock *Start,
+    std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
+    const bool UpdateLayout,
+    const bool UpdateCFIState,
+    const bool RecomputeLandingPads) {
+  const auto StartIndex = Start ? getIndex(Start) : -1;
+  const size_t NumNewBlocks = NewBBs.size();
+
+  BasicBlocks.insert(BasicBlocks.begin() + (StartIndex + 1),
+                     NumNewBlocks,
+                     nullptr);
+
+  auto I = StartIndex + 1;
+  for (std::unique_ptr<BinaryBasicBlock> &BB : NewBBs) {
+    assert(!BasicBlocks[I]);
+    BasicBlocks[I++] = BB.release();
+  }
+
+  if (RecomputeLandingPads) {
+    recomputeLandingPads();
+  } else {
+    updateBBIndices(0);
+  }
+
+  if (UpdateLayout) {
+    updateLayout(Start, NumNewBlocks);
+  }
+
+  if (UpdateCFIState) {
+    updateCFIState(Start, NumNewBlocks);
+  }
+}
+
+BinaryFunction::iterator BinaryFunction::insertBasicBlocks(
+    BinaryFunction::iterator StartBB,
+    std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
+    const bool UpdateLayout,
+    const bool UpdateCFIState,
+    const bool RecomputeLandingPads) {
+  const unsigned StartIndex = getIndex(&*StartBB);
+  const size_t NumNewBlocks = NewBBs.size();
+
+  BasicBlocks.insert(BasicBlocks.begin() + StartIndex + 1, NumNewBlocks,
+                     nullptr);
+  auto RetIter = BasicBlocks.begin() + StartIndex + 1;
+
+  unsigned I = StartIndex + 1;
+  for (std::unique_ptr<BinaryBasicBlock> &BB : NewBBs) {
+    assert(!BasicBlocks[I]);
+    BasicBlocks[I++] = BB.release();
+  }
+
+  if (RecomputeLandingPads) {
+    recomputeLandingPads();
+  } else {
+    updateBBIndices(0);
+  }
+
+  if (UpdateLayout) {
+    updateLayout(*std::prev(RetIter), NumNewBlocks);
+  }
+
+  if (UpdateCFIState) {
+    updateCFIState(*std::prev(RetIter), NumNewBlocks);
+  }
+
+  return RetIter;
+}
+
+void BinaryFunction::updateBBIndices(const unsigned StartIndex) {
+  for (unsigned I = StartIndex; I < BasicBlocks.size(); ++I) {
+    BasicBlocks[I]->Index = I;
+  }
+}
+
+void BinaryFunction::updateCFIState(BinaryBasicBlock *Start,
+                                    const unsigned NumNewBlocks) {
+  const int32_t CFIState = Start->getCFIStateAtExit();
+  const unsigned StartIndex = getIndex(Start) + 1;
+  for (unsigned I = 0; I < NumNewBlocks; ++I) {
+    BasicBlocks[StartIndex + I]->setCFIState(CFIState);
+  }
+}
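updateCFIState above gives every newly inserted block the CFI state that is live at Start's exit. The invariant with plain integers standing in for blocks:

```cpp
#include <cassert>
#include <vector>

// Minimal model of updateCFIState: blocks spliced in after `Start` all begin
// in the CFI state that is live at Start's exit.
int main() {
  std::vector<int> StateAtEntry = {0, 2, 5}; // existing blocks' CFI states
  const int CFIStateAtExitOfStart = 2;       // Start == block at index 1
  const unsigned NumNewBlocks = 2;
  StateAtEntry.insert(StateAtEntry.begin() + 2, NumNewBlocks,
                      CFIStateAtExitOfStart);
  assert((StateAtEntry == std::vector<int>{0, 2, 2, 2, 5}));
}
```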
+void BinaryFunction::updateLayout(BinaryBasicBlock *Start,
+                                  const unsigned NumNewBlocks) {
+  // If Start is not provided, insert the new blocks at the beginning.
+  if (!Start) {
+    BasicBlocksLayout.insert(layout_begin(), BasicBlocks.begin(),
+                             BasicBlocks.begin() + NumNewBlocks);
+    updateLayoutIndices();
+    return;
+  }
+
+  // Insert new blocks in the layout immediately after Start.
+  auto Pos = std::find(layout_begin(), layout_end(), Start);
+  assert(Pos != layout_end());
+  BinaryBasicBlock **Begin = &BasicBlocks[getIndex(Start) + 1];
+  BinaryBasicBlock **End = &BasicBlocks[getIndex(Start) + NumNewBlocks + 1];
+  BasicBlocksLayout.insert(Pos + 1, Begin, End);
+  updateLayoutIndices();
+}
+
+bool BinaryFunction::checkForAmbiguousJumpTables() {
+  SmallSet<uint64_t, 4> JumpTables;
+  for (BinaryBasicBlock *&BB : BasicBlocks) {
+    for (MCInst &Inst : *BB) {
+      if (!BC.MIB->isIndirectBranch(Inst))
+        continue;
+      uint64_t JTAddress = BC.MIB->getJumpTable(Inst);
+      if (!JTAddress)
+        continue;
+      // This address can be inside another jump table, but we only consider
+      // it ambiguous when the same start address is used, not the same JT
+      // object.
+      if (!JumpTables.count(JTAddress)) {
+        JumpTables.insert(JTAddress);
+        continue;
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+void BinaryFunction::disambiguateJumpTables(
+    MCPlusBuilder::AllocatorIdTy AllocId) {
+  assert((opts::JumpTables != JTS_BASIC && isSimple()) || !BC.HasRelocations);
+  SmallPtrSet<JumpTable *, 4> JumpTables;
+  for (BinaryBasicBlock *&BB : BasicBlocks) {
+    for (MCInst &Inst : *BB) {
+      if (!BC.MIB->isIndirectBranch(Inst))
+        continue;
+      JumpTable *JT = getJumpTable(Inst);
+      if (!JT)
+        continue;
+      auto Iter = JumpTables.find(JT);
+      if (Iter == JumpTables.end()) {
+        JumpTables.insert(JT);
+        continue;
+      }
+      // This instruction is an indirect jump using a jump table, but it is
+      // using the same jump table as another jump. Try all our tricks to
+      // extract the jump table symbol and make it point to a new, duplicated
+      // JT.
+      MCPhysReg BaseReg1;
+      uint64_t Scale;
+      const MCSymbol *Target;
+      // In case we match with our first matcher, the first instruction is
+      // the one to patch.
+      MCInst *JTLoadInst = &Inst;
+      // Try a standard indirect jump matcher, scale 8
+      std::unique_ptr<MCPlusBuilder::MCInstMatcher> IndJmpMatcher =
+          BC.MIB->matchIndJmp(BC.MIB->matchReg(BaseReg1),
+                              BC.MIB->matchImm(Scale), BC.MIB->matchReg(),
+                              /*Offset=*/BC.MIB->matchSymbol(Target));
+      if (!IndJmpMatcher->match(
+              *BC.MRI, *BC.MIB,
+              MutableArrayRef<MCInst>(&*BB->begin(), &Inst + 1), -1) ||
+          BaseReg1 != BC.MIB->getNoRegister() || Scale != 8) {
+        MCPhysReg BaseReg2;
+        uint64_t Offset;
+        // Standard JT matching failed. Trying now:
+        //     movq  "jt.2397/1"(,%rax,8), %rax
+        //     jmpq  *%rax
+        std::unique_ptr<MCPlusBuilder::MCInstMatcher> LoadMatcherOwner =
+            BC.MIB->matchLoad(BC.MIB->matchReg(BaseReg1),
+                              BC.MIB->matchImm(Scale), BC.MIB->matchReg(),
+                              /*Offset=*/BC.MIB->matchSymbol(Target));
+        MCPlusBuilder::MCInstMatcher *LoadMatcher = LoadMatcherOwner.get();
+        std::unique_ptr<MCPlusBuilder::MCInstMatcher> IndJmpMatcher2 =
+            BC.MIB->matchIndJmp(std::move(LoadMatcherOwner));
+        if (!IndJmpMatcher2->match(
+                *BC.MRI, *BC.MIB,
+                MutableArrayRef<MCInst>(&*BB->begin(), &Inst + 1), -1) ||
+            BaseReg1 != BC.MIB->getNoRegister() || Scale != 8) {
+          // JT matching failed. Trying now:
+          // PIC-style matcher, scale 4:
+          //     addq    %rdx, %rsi
+          //     addq    %rdx, %rdi
+          //     leaq    DATAat0x402450(%rip), %r11
+          //     movslq  (%r11,%rdx,4), %rcx
+          //     addq    %r11, %rcx
+          //     jmpq    *%rcx # JUMPTABLE @0x402450
+          std::unique_ptr<MCPlusBuilder::MCInstMatcher> PICIndJmpMatcher =
+              BC.MIB->matchIndJmp(BC.MIB->matchAdd(
+                  BC.MIB->matchReg(BaseReg1),
+                  BC.MIB->matchLoad(BC.MIB->matchReg(BaseReg2),
+                                    BC.MIB->matchImm(Scale),
+                                    BC.MIB->matchReg(),
+                                    BC.MIB->matchImm(Offset))));
+          std::unique_ptr<MCPlusBuilder::MCInstMatcher> LEAMatcherOwner =
+              BC.MIB->matchLoadAddr(BC.MIB->matchSymbol(Target));
+          MCPlusBuilder::MCInstMatcher *LEAMatcher = LEAMatcherOwner.get();
+          std::unique_ptr<MCPlusBuilder::MCInstMatcher> PICBaseAddrMatcher =
+              BC.MIB->matchIndJmp(BC.MIB->matchAdd(std::move(LEAMatcherOwner),
+                                                   BC.MIB->matchAnyOperand()));
+          if (!PICIndJmpMatcher->match(
+                  *BC.MRI, *BC.MIB,
+                  MutableArrayRef<MCInst>(&*BB->begin(), &Inst + 1), -1) ||
+              Scale != 4 || BaseReg1 != BaseReg2 || Offset != 0 ||
+              !PICBaseAddrMatcher->match(
+                  *BC.MRI, *BC.MIB,
+                  MutableArrayRef<MCInst>(&*BB->begin(), &Inst + 1), -1)) {
+            llvm_unreachable("Failed to extract jump table base");
+            continue;
+          }
+          // Matched PIC, identify the instruction with the reference to the
+          // JT
+          JTLoadInst = LEAMatcher->CurInst;
+        } else {
+          // Matched non-PIC
+          JTLoadInst = LoadMatcher->CurInst;
+        }
+      }
+
+      uint64_t NewJumpTableID = 0;
+      const MCSymbol *NewJTLabel;
+      std::tie(NewJumpTableID, NewJTLabel) =
+          BC.duplicateJumpTable(*this, JT, Target);
+      {
+        auto L = BC.scopeLock();
+        BC.MIB->replaceMemOperandDisp(*JTLoadInst, NewJTLabel, BC.Ctx.get());
+      }
+      // We use a unique ID with the high bit set as the address for this
+      // "injected" jump table (not originally in the input binary).
+      BC.MIB->setJumpTable(Inst, NewJumpTableID, 0, AllocId);
+    }
+  }
+}
+
+bool BinaryFunction::replaceJumpTableEntryIn(BinaryBasicBlock *BB,
+                                             BinaryBasicBlock *OldDest,
+                                             BinaryBasicBlock *NewDest) {
+  MCInst *Instr = BB->getLastNonPseudoInstr();
+  if (!Instr || !BC.MIB->isIndirectBranch(*Instr))
+    return false;
+  uint64_t JTAddress = BC.MIB->getJumpTable(*Instr);
+  assert(JTAddress && "Invalid jump table address");
+  JumpTable *JT = getJumpTableContainingAddress(JTAddress);
+  assert(JT && "No jump table structure for this indirect branch");
+  bool Patched = JT->replaceDestination(JTAddress, OldDest->getLabel(),
+                                        NewDest->getLabel());
+  (void)Patched;
+  assert(Patched && "Invalid entry to be replaced in jump table");
+  return true;
+}
+
+BinaryBasicBlock *BinaryFunction::splitEdge(BinaryBasicBlock *From,
+                                            BinaryBasicBlock *To) {
+  // Create an intermediate BB
+  MCSymbol *Tmp;
+  {
+    auto L = BC.scopeLock();
+    Tmp = BC.Ctx->createNamedTempSymbol("SplitEdge");
+  }
+  // Link the new BB to the original input offset of the From BB, so we can
+  // map samples recorded in new BBs back to the original BB seen in the
+  // input binary (if using BAT)
+  std::unique_ptr<BinaryBasicBlock> NewBB =
+      createBasicBlock(From->getInputOffset(), Tmp);
+  BinaryBasicBlock *NewBBPtr = NewBB.get();
+
+  // Update "From" BB
+  auto I = From->succ_begin();
+  auto BI = From->branch_info_begin();
+  for (; I != From->succ_end(); ++I) {
+    if (*I == To)
+      break;
+    ++BI;
+  }
+  assert(I != From->succ_end() && "Invalid CFG edge in splitEdge!");
+  uint64_t OrigCount = BI->Count;
+  uint64_t OrigMispreds = BI->MispredictedCount;
+  replaceJumpTableEntryIn(From, To, NewBBPtr);
+  From->replaceSuccessor(To, NewBBPtr, OrigCount, OrigMispreds);
+
+  NewBB->addSuccessor(To, OrigCount, OrigMispreds);
+  NewBB->setExecutionCount(OrigCount);
+  NewBB->setIsCold(From->isCold());
+
+  // Update CFI and BB layout with the new intermediate BB
+  std::vector<std::unique_ptr<BinaryBasicBlock>>
+      NewBBs;
+  NewBBs.emplace_back(std::move(NewBB));
+  insertBasicBlocks(From, std::move(NewBBs), true, true,
+                    /*RecomputeLandingPads=*/false);
+  return NewBBPtr;
+}
+
+void BinaryFunction::deleteConservativeEdges() {
+  // Our goal is to aggressively remove edges from the CFG that we believe are
+  // wrong. This is used for instrumentation, where it is safe to remove
+  // fallthrough edges because we won't reorder blocks.
+  for (auto I = BasicBlocks.begin(), E = BasicBlocks.end(); I != E; ++I) {
+    BinaryBasicBlock *BB = *I;
+    if (BB->succ_size() != 1 || BB->size() == 0)
+      continue;
+
+    auto NextBB = std::next(I);
+    MCInst *Last = BB->getLastNonPseudoInstr();
+    // Fallthrough is a landing pad? Delete this edge (as long as we don't
+    // have a direct jump to it)
+    if ((*BB->succ_begin())->isLandingPad() && NextBB != E &&
+        *BB->succ_begin() == *NextBB && Last && !BC.MIB->isBranch(*Last)) {
+      BB->removeAllSuccessors();
+      continue;
+    }
+
+    // Look for suspicious calls at the end of BB where gcc may optimize it
+    // and remove the jump to the epilogue when it knows the call won't
+    // return.
+    if (!Last || !BC.MIB->isCall(*Last))
+      continue;
+
+    const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(*Last);
+    if (!CalleeSymbol)
+      continue;
+
+    StringRef CalleeName = CalleeSymbol->getName();
+    if (CalleeName != "__cxa_throw@PLT" &&
+        CalleeName != "_Unwind_Resume@PLT" &&
+        CalleeName != "__cxa_rethrow@PLT" &&
+        CalleeName != "exit@PLT" &&
+        CalleeName != "abort@PLT")
+      continue;
+
+    BB->removeAllSuccessors();
+  }
+}
+
+bool BinaryFunction::isDataMarker(const SymbolRef &Symbol,
+                                  uint64_t SymbolSize) const {
+  // For aarch64, the ABI defines mapping symbols so we identify data in the
+  // code section (see IHI0056B). $d identifies a symbol starting data
+  // contents.
+  if (BC.isAArch64() && Symbol.getType() &&
+      cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 &&
+      Symbol.getName() && cantFail(Symbol.getName()) == "$d")
+    return true;
+  return false;
+}
+
+bool BinaryFunction::isCodeMarker(const SymbolRef &Symbol,
+                                  uint64_t SymbolSize) const {
+  // For aarch64, the ABI defines mapping symbols so we identify data in the
+  // code section (see IHI0056B). $x identifies a symbol starting code or the
+  // end of a data chunk inside code.
+  if (BC.isAArch64() && Symbol.getType() &&
+      cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 &&
+      Symbol.getName() && cantFail(Symbol.getName()) == "$x")
+    return true;
+  return false;
+}
+
+bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
+                                          uint64_t SymbolSize) const {
+  // If this symbol is in a different section from the one where the
+  // function symbol is, don't consider it as valid.
+  if (!getOriginSection()->containsAddress(
+          cantFail(Symbol.getAddress(), "cannot get symbol address")))
+    return false;
+
+  // Some symbols are tolerated inside function bodies, others are not.
+  // The real function boundaries may not be known at this point.
+  if (isDataMarker(Symbol, SymbolSize) || isCodeMarker(Symbol, SymbolSize))
+    return true;
+
+  // It's okay to have a zero-sized symbol in the middle of a non-zero-sized
+  // function.
+  if (SymbolSize == 0 && containsAddress(cantFail(Symbol.getAddress())))
+    return true;
+
+  if (cantFail(Symbol.getType()) != SymbolRef::ST_Unknown)
+    return false;
+
+  if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Global)
+    return false;
+
+  return true;
+}
+
+void BinaryFunction::adjustExecutionCount(uint64_t Count) {
+  if (getKnownExecutionCount() == 0 || Count == 0)
+    return;
+
+  if (ExecutionCount < Count)
+    Count = ExecutionCount;
+
+  double AdjustmentRatio = ((double)ExecutionCount - Count) / ExecutionCount;
+  if (AdjustmentRatio < 0.0)
+    AdjustmentRatio = 0.0;
+
+  for (BinaryBasicBlock *&BB : layout())
+    BB->adjustExecutionCount(AdjustmentRatio);
+
+  ExecutionCount -= Count;
+}
+
+BinaryFunction::~BinaryFunction() {
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    delete BB;
+  }
+  for (BinaryBasicBlock *BB : DeletedBasicBlocks) {
+    delete BB;
+  }
+}
+
+void BinaryFunction::calculateLoopInfo() {
+  // Discover loops.
+  BinaryDominatorTree DomTree;
+  DomTree.recalculate(*this);
+  BLI.reset(new BinaryLoopInfo());
+  BLI->analyze(DomTree);
+
+  // Traverse discovered loops and add depth and profile information.
+  std::stack<BinaryLoop *> St;
+  for (auto I = BLI->begin(), E = BLI->end(); I != E; ++I) {
+    St.push(*I);
+    ++BLI->OuterLoops;
+  }
+
+  while (!St.empty()) {
+    BinaryLoop *L = St.top();
+    St.pop();
+    ++BLI->TotalLoops;
+    BLI->MaximumDepth = std::max(L->getLoopDepth(), BLI->MaximumDepth);
+
+    // Add nested loops in the stack.
+    for (BinaryLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+      St.push(*I);
+    }
+
+    // Skip if no valid profile is found.
+    if (!hasValidProfile()) {
+      L->EntryCount = COUNT_NO_PROFILE;
+      L->ExitCount = COUNT_NO_PROFILE;
+      L->TotalBackEdgeCount = COUNT_NO_PROFILE;
+      continue;
+    }
+
+    // Compute back edge count.
+    SmallVector<BinaryBasicBlock *, 1> Latches;
+    L->getLoopLatches(Latches);
+
+    for (BinaryBasicBlock *Latch : Latches) {
+      auto BI = Latch->branch_info_begin();
+      for (BinaryBasicBlock *Succ : Latch->successors()) {
+        if (Succ == L->getHeader()) {
+          assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+                 "profile data not found");
+          L->TotalBackEdgeCount += BI->Count;
+        }
+        ++BI;
+      }
+    }
+
+    // Compute entry count.
+    L->EntryCount =
+        L->getHeader()->getExecutionCount() - L->TotalBackEdgeCount;
+
+    // Compute exit count.
+    SmallVector<BinaryLoop::Edge, 1> ExitEdges;
+    L->getExitEdges(ExitEdges);
+    for (BinaryLoop::Edge &Exit : ExitEdges) {
+      const BinaryBasicBlock *Exiting = Exit.first;
+      const BinaryBasicBlock *ExitTarget = Exit.second;
+      auto BI = Exiting->branch_info_begin();
+      for (BinaryBasicBlock *Succ : Exiting->successors()) {
+        if (Succ == ExitTarget) {
+          assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+                 "profile data not found");
+          L->ExitCount += BI->Count;
+        }
+        ++BI;
+      }
+    }
+  }
+}
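The profile arithmetic in calculateLoopInfo rests on a single identity: every execution of the loop header arrives either from outside the loop (an entry) or over a latch edge (a back edge). Worked in isolation:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t HeaderCount = 120;   // times the header block executed
  const uint64_t BackEdgeCount = 100; // sum over all latch->header edges
  // EntryCount = header executions that did not come over a back edge.
  const uint64_t EntryCount = HeaderCount - BackEdgeCount;
  assert(EntryCount == 20);
  // Average iterations per entry, as printLoopInfo reports it later:
  const double ItersPerEntry = (double)BackEdgeCount / EntryCount;
  assert(ItersPerEntry == 5.0);
}
```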
+void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) {
+  if (!isEmitted()) {
+    assert(!isInjected() && "injected function should be emitted");
+    setOutputAddress(getAddress());
+    setOutputSize(getSize());
+    return;
+  }
+
+  const uint64_t BaseAddress = getCodeSection()->getOutputAddress();
+  ErrorOr<BinarySection &> ColdSection = getColdCodeSection();
+  const uint64_t ColdBaseAddress =
+      isSplit() ? ColdSection->getOutputAddress() : 0;
+  if (BC.HasRelocations || isInjected()) {
+    const uint64_t StartOffset = Layout.getSymbolOffset(*getSymbol());
+    const uint64_t EndOffset = Layout.getSymbolOffset(*getFunctionEndLabel());
+    setOutputAddress(BaseAddress + StartOffset);
+    setOutputSize(EndOffset - StartOffset);
+    if (hasConstantIsland()) {
+      const uint64_t DataOffset =
+          Layout.getSymbolOffset(*getFunctionConstantIslandLabel());
+      setOutputDataAddress(BaseAddress + DataOffset);
+    }
+    if (isSplit()) {
+      const MCSymbol *ColdStartSymbol = getColdSymbol();
+      assert(ColdStartSymbol && ColdStartSymbol->isDefined() &&
+             "split function should have defined cold symbol");
+      const MCSymbol *ColdEndSymbol = getFunctionColdEndLabel();
+      assert(ColdEndSymbol && ColdEndSymbol->isDefined() &&
+             "split function should have defined cold end symbol");
+      const uint64_t ColdStartOffset =
+          Layout.getSymbolOffset(*ColdStartSymbol);
+      const uint64_t ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol);
+      cold().setAddress(ColdBaseAddress + ColdStartOffset);
+      cold().setImageSize(ColdEndOffset - ColdStartOffset);
+      if (hasConstantIsland()) {
+        const uint64_t DataOffset =
+            Layout.getSymbolOffset(*getFunctionColdConstantIslandLabel());
+        setOutputColdDataAddress(ColdBaseAddress + DataOffset);
+      }
+    }
+  } else {
+    setOutputAddress(getAddress());
+    setOutputSize(Layout.getSymbolOffset(*getFunctionEndLabel()));
+  }
+
+  // Update basic block output ranges for the debug info, if we have
+  // secondary entry points in the symbol table to update, or if writing BAT.
+  if (!opts::UpdateDebugSections && !isMultiEntry() &&
+      !requiresAddressTranslation())
+    return;
+
+  // Output ranges should match the input if the body hasn't changed.
+  if (!isSimple() && !BC.HasRelocations)
+    return;
+
+  // AArch64 may have functions that only contain a constant island (no
+  // code).
+  if (layout_begin() == layout_end())
+    return;
+
+  BinaryBasicBlock *PrevBB = nullptr;
+  for (auto BBI = layout_begin(), BBE = layout_end(); BBI != BBE; ++BBI) {
+    BinaryBasicBlock *BB = *BBI;
+    assert(BB->getLabel()->isDefined() && "symbol should be defined");
+    const uint64_t BBBaseAddress =
+        BB->isCold() ? ColdBaseAddress : BaseAddress;
+    if (!BC.HasRelocations) {
+      if (BB->isCold()) {
+        assert(BBBaseAddress == cold().getAddress());
+      } else {
+        assert(BBBaseAddress == getOutputAddress());
+      }
+    }
+    const uint64_t BBOffset = Layout.getSymbolOffset(*BB->getLabel());
+    const uint64_t BBAddress = BBBaseAddress + BBOffset;
+    BB->setOutputStartAddress(BBAddress);
+
+    if (PrevBB) {
+      uint64_t PrevBBEndAddress = BBAddress;
+      if (BB->isCold() != PrevBB->isCold()) {
+        PrevBBEndAddress = getOutputAddress() + getOutputSize();
+      }
+      PrevBB->setOutputEndAddress(PrevBBEndAddress);
+    }
+    PrevBB = BB;
+
+    BB->updateOutputValues(Layout);
+  }
+  PrevBB->setOutputEndAddress(PrevBB->isCold()
+                                  ? cold().getAddress() + cold().getImageSize()
+                                  : getOutputAddress() + getOutputSize());
+}
+DebugAddressRangesVector BinaryFunction::getOutputAddressRanges() const {
+  DebugAddressRangesVector OutputRanges;
+
+  if (isFolded())
+    return OutputRanges;
+
+  if (IsFragment)
+    return OutputRanges;
+
+  OutputRanges.emplace_back(getOutputAddress(),
+                            getOutputAddress() + getOutputSize());
+  if (isSplit()) {
+    assert(isEmitted() && "split function should be emitted");
+    OutputRanges.emplace_back(cold().getAddress(),
+                              cold().getAddress() + cold().getImageSize());
+  }
+
+  if (isSimple())
+    return OutputRanges;
+
+  for (BinaryFunction *Frag : Fragments) {
+    assert(!Frag->isSimple() &&
+           "fragment of non-simple function should also be non-simple");
+    OutputRanges.emplace_back(Frag->getOutputAddress(),
+                              Frag->getOutputAddress() +
+                                  Frag->getOutputSize());
+  }
+
+  return OutputRanges;
+}
+
+uint64_t BinaryFunction::translateInputToOutputAddress(
+    uint64_t Address) const {
+  if (isFolded())
+    return 0;
+
+  // If the function hasn't changed, return the same address.
+  if (!isEmitted())
+    return Address;
+
+  if (Address < getAddress())
+    return 0;
+
+  // Check if the address is associated with an instruction that is tracked
+  // by address translation.
+  auto KV = InputOffsetToAddressMap.find(Address - getAddress());
+  if (KV != InputOffsetToAddressMap.end()) {
+    return KV->second;
+  }
+
+  // FIXME: #18950828 - we rely on relative offsets inside basic blocks to
+  // stay intact. Instead we can use pseudo instructions and/or annotations.
+  const uint64_t Offset = Address - getAddress();
+  const BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset);
+  if (!BB) {
+    // Special case for an address immediately past the end of the function.
+    if (Offset == getSize())
+      return getOutputAddress() + getOutputSize();
+
+    return 0;
+  }
+
+  return std::min(BB->getOutputAddressRange().first + Offset - BB->getOffset(),
+                  BB->getOutputAddressRange().second);
+}
+
+DebugAddressRangesVector BinaryFunction::translateInputToOutputRanges(
+    const DWARFAddressRangesVector &InputRanges) const {
+  DebugAddressRangesVector OutputRanges;
+
+  if (isFolded())
+    return OutputRanges;
+
+  // If the function hasn't changed, return the same ranges.
+  if (!isEmitted()) {
+    OutputRanges.resize(InputRanges.size());
+    std::transform(InputRanges.begin(), InputRanges.end(),
+                   OutputRanges.begin(),
+                   [](const DWARFAddressRange &Range) {
+                     return DebugAddressRange(Range.LowPC, Range.HighPC);
+                   });
+    return OutputRanges;
+  }
+
+  // Even though we will merge ranges in a post-processing pass, we attempt to
+  // merge them in the main processing loop as it improves the processing
+  // time.
+  uint64_t PrevEndAddress = 0;
+  for (const DWARFAddressRange &Range : InputRanges) {
+    if (!containsAddress(Range.LowPC)) {
+      LLVM_DEBUG(
+          dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
+                 << *this << " : [0x" << Twine::utohexstr(Range.LowPC)
+                 << ", 0x" << Twine::utohexstr(Range.HighPC) << "]\n");
+      PrevEndAddress = 0;
+      continue;
+    }
+    uint64_t InputOffset = Range.LowPC - getAddress();
+    const uint64_t InputEndOffset =
+        std::min(Range.HighPC - getAddress(), getSize());
+
+    auto BBI = std::upper_bound(BasicBlockOffsets.begin(),
+                                BasicBlockOffsets.end(),
+                                BasicBlockOffset(InputOffset, nullptr),
+                                CompareBasicBlockOffsets());
+    --BBI;
+    do {
+      const BinaryBasicBlock *BB = BBI->second;
+      if (InputOffset < BB->getOffset() ||
+          InputOffset >= BB->getEndOffset()) {
+        LLVM_DEBUG(
+            dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
+                   << *this << " : [0x" << Twine::utohexstr(Range.LowPC)
+                   << ", 0x" << Twine::utohexstr(Range.HighPC) << "]\n");
+        PrevEndAddress = 0;
+        break;
+      }
+
+      // Skip the range if the block was deleted.
+      if (const uint64_t OutputStart = BB->getOutputAddressRange().first) {
+        const uint64_t StartAddress =
+            OutputStart + InputOffset - BB->getOffset();
+        uint64_t EndAddress = BB->getOutputAddressRange().second;
+        if (InputEndOffset < BB->getEndOffset())
+          EndAddress = StartAddress + InputEndOffset - InputOffset;
+
+        if (StartAddress == PrevEndAddress) {
+          OutputRanges.back().HighPC =
+              std::max(OutputRanges.back().HighPC, EndAddress);
+        } else {
+          OutputRanges.emplace_back(StartAddress,
+                                    std::max(StartAddress, EndAddress));
+        }
+        PrevEndAddress = OutputRanges.back().HighPC;
+      }
+
+      InputOffset = BB->getEndOffset();
+      ++BBI;
+    } while (InputOffset < InputEndOffset);
+  }
+
+  // Post-processing pass to sort and merge ranges.
+  std::sort(OutputRanges.begin(), OutputRanges.end());
+  DebugAddressRangesVector MergedRanges;
+  PrevEndAddress = 0;
+  for (const DebugAddressRange &Range : OutputRanges) {
+    if (Range.LowPC <= PrevEndAddress) {
+      MergedRanges.back().HighPC =
+          std::max(MergedRanges.back().HighPC, Range.HighPC);
+    } else {
+      MergedRanges.emplace_back(Range.LowPC, Range.HighPC);
+    }
+    PrevEndAddress = MergedRanges.back().HighPC;
+  }
+
+  return MergedRanges;
+}
+
+MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) {
+  if (CurrentState == State::Disassembled) {
+    auto II = Instructions.find(Offset);
+    return (II == Instructions.end()) ? nullptr : &II->second;
+  } else if (CurrentState == State::CFG) {
+    BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset);
+    if (!BB)
+      return nullptr;
+
+    for (MCInst &Inst : *BB) {
+      constexpr uint32_t InvalidOffset =
+          std::numeric_limits<uint32_t>::max();
+      if (Offset == BC.MIB->getAnnotationWithDefault<uint32_t>(
+                        Inst, "Offset", InvalidOffset))
+        return &Inst;
+    }
+
+    return nullptr;
+  } else {
+    llvm_unreachable("invalid CFG state to use getInstructionAtOffset()");
+  }
+}
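translateInputToOutputRanges above and the location-list variant below share the same final sort-and-merge pass. A self-contained sketch of that merge over half-open [LowPC, HighPC) pairs (the real code handles the degenerate first-range case slightly differently):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

int main() {
  using Range = std::pair<uint64_t, uint64_t>; // [LowPC, HighPC)
  std::vector<Range> R = {{30, 40}, {10, 20}, {20, 25}};
  std::sort(R.begin(), R.end());

  // Coalesce any range that touches or overlaps the previous end.
  std::vector<Range> Merged;
  uint64_t PrevEnd = 0;
  for (const Range &X : R) {
    if (!Merged.empty() && X.first <= PrevEnd)
      Merged.back().second = std::max(Merged.back().second, X.second);
    else
      Merged.push_back(X);
    PrevEnd = Merged.back().second;
  }
  assert((Merged == std::vector<Range>{{10, 25}, {30, 40}}));
}
```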
+DebugLocationsVector BinaryFunction::translateInputToOutputLocationList(
+    const DebugLocationsVector &InputLL) const {
+  DebugLocationsVector OutputLL;
+
+  if (isFolded()) {
+    return OutputLL;
+  }
+
+  // If the function hasn't changed - there's nothing to update.
+  if (!isEmitted()) {
+    return InputLL;
+  }
+
+  uint64_t PrevEndAddress = 0;
+  SmallVectorImpl<uint8_t> *PrevExpr = nullptr;
+  for (const DebugLocationEntry &Entry : InputLL) {
+    const uint64_t Start = Entry.LowPC;
+    const uint64_t End = Entry.HighPC;
+    if (!containsAddress(Start)) {
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected "
+                           "for "
+                        << *this << " : [0x" << Twine::utohexstr(Start)
+                        << ", 0x" << Twine::utohexstr(End) << "]\n");
+      continue;
+    }
+    uint64_t InputOffset = Start - getAddress();
+    const uint64_t InputEndOffset = std::min(End - getAddress(), getSize());
+    auto BBI = std::upper_bound(BasicBlockOffsets.begin(),
+                                BasicBlockOffsets.end(),
+                                BasicBlockOffset(InputOffset, nullptr),
+                                CompareBasicBlockOffsets());
+    --BBI;
+    do {
+      const BinaryBasicBlock *BB = BBI->second;
+      if (InputOffset < BB->getOffset() ||
+          InputOffset >= BB->getEndOffset()) {
+        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range "
+                             "detected for "
+                          << *this << " : [0x" << Twine::utohexstr(Start)
+                          << ", 0x" << Twine::utohexstr(End) << "]\n");
+        PrevEndAddress = 0;
+        break;
+      }
+
+      // Skip the range if the block was deleted.
+      if (const uint64_t OutputStart = BB->getOutputAddressRange().first) {
+        const uint64_t StartAddress =
+            OutputStart + InputOffset - BB->getOffset();
+        uint64_t EndAddress = BB->getOutputAddressRange().second;
+        if (InputEndOffset < BB->getEndOffset())
+          EndAddress = StartAddress + InputEndOffset - InputOffset;
+
+        if (StartAddress == PrevEndAddress && Entry.Expr == *PrevExpr) {
+          OutputLL.back().HighPC =
+              std::max(OutputLL.back().HighPC, EndAddress);
+        } else {
+          OutputLL.emplace_back(DebugLocationEntry{
+              StartAddress, std::max(StartAddress, EndAddress), Entry.Expr});
+        }
+        PrevEndAddress = OutputLL.back().HighPC;
+        PrevExpr = &OutputLL.back().Expr;
+      }
+
+      ++BBI;
+      InputOffset = BB->getEndOffset();
+    } while (InputOffset < InputEndOffset);
+  }
+
+  // Sort and merge adjacent entries with identical locations.
+  std::stable_sort(OutputLL.begin(), OutputLL.end(),
+                   [](const DebugLocationEntry &A,
+                      const DebugLocationEntry &B) {
+                     return A.LowPC < B.LowPC;
+                   });
+  DebugLocationsVector MergedLL;
+  PrevEndAddress = 0;
+  PrevExpr = nullptr;
+  for (const DebugLocationEntry &Entry : OutputLL) {
+    if (Entry.LowPC <= PrevEndAddress && *PrevExpr == Entry.Expr) {
+      MergedLL.back().HighPC = std::max(Entry.HighPC, MergedLL.back().HighPC);
+    } else {
+      const uint64_t Begin = std::max(Entry.LowPC, PrevEndAddress);
+      const uint64_t End = std::max(Begin, Entry.HighPC);
+      MergedLL.emplace_back(DebugLocationEntry{Begin, End, Entry.Expr});
+    }
+    PrevEndAddress = MergedLL.back().HighPC;
+    PrevExpr = &MergedLL.back().Expr;
+  }
+
+  return MergedLL;
+}
+
+void BinaryFunction::printLoopInfo(raw_ostream &OS) const {
+  OS << "Loop Info for Function \"" << *this << "\"";
+  if (hasValidProfile()) {
+    OS << " (count: " << getExecutionCount() << ")";
+  }
+  OS << "\n";
+
+  std::stack<BinaryLoop *> St;
+  for (auto I = BLI->begin(), E = BLI->end(); I != E; ++I) {
+    St.push(*I);
+  }
+  while (!St.empty()) {
+    BinaryLoop *L = St.top();
+    St.pop();
+
+    for (BinaryLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+      St.push(*I);
+    }
+
+    if (!hasValidProfile())
+      continue;
+
+    OS << (L->getLoopDepth() > 1 ? "Nested" : "Outer")
+       << " loop header: " << L->getHeader()->getName();
"Nested" : "Outer") << " loop header: " + << L->getHeader()->getName(); + OS << "\n"; + OS << "Loop basic blocks: "; + const char *Sep = ""; + for (auto BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) { + OS << Sep << (*BI)->getName(); + Sep = ", "; + } + OS << "\n"; + if (hasValidProfile()) { + OS << "Total back edge count: " << L->TotalBackEdgeCount << "\n"; + OS << "Loop entry count: " << L->EntryCount << "\n"; + OS << "Loop exit count: " << L->ExitCount << "\n"; + if (L->EntryCount > 0) { + OS << "Average iters per entry: " + << format("%.4lf", (double)L->TotalBackEdgeCount / L->EntryCount) + << "\n"; + } + } + OS << "----\n"; + } + + OS << "Total number of loops: "<< BLI->TotalLoops << "\n"; + OS << "Number of outer loops: " << BLI->OuterLoops << "\n"; + OS << "Maximum nested loop depth: " << BLI->MaximumDepth << "\n\n"; +} + +bool BinaryFunction::isAArch64Veneer() const { + if (BasicBlocks.size() != 1) + return false; + + BinaryBasicBlock &BB = **BasicBlocks.begin(); + if (BB.size() != 3) + return false; + + for (MCInst &Inst : BB) { + if (!BC.MIB->hasAnnotation(Inst, "AArch64Veneer")) + return false; + } + + return true; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h new file mode 100644 index 000000000000..1695e16fd180 --- /dev/null +++ b/bolt/src/BinaryFunction.h @@ -0,0 +1,2591 @@ +//===--- BinaryFunction.h - Interface for machine-level function ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Interface to function in binary (machine) form. This is assembly-level +// code representation with the control flow. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_H + +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" +#include "BinaryLoop.h" +#include "BinarySection.h" +#include "DebugData.h" +#include "JumpTable.h" +#include "MCPlus.h" +#include "NameResolver.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include + +using namespace llvm::object; + +namespace llvm { + +class DWARFUnit; + +namespace bolt { + +using InputOffsetToAddressMapTy = std::unordered_map; + +/// Types of macro-fusion alignment corrections. +enum MacroFusionType { + MFT_NONE, + MFT_HOT, + MFT_ALL +}; + +enum IndirectCallPromotionType : char { + ICP_NONE, /// Don't perform ICP. + ICP_CALLS, /// Perform ICP on indirect calls. + ICP_JUMP_TABLES, /// Perform ICP on jump tables. + ICP_ALL /// Perform ICP on calls and jump tables. +}; + +/// Information on a single indirect call to a particular callee. 
+struct IndirectCallProfile {
+  MCSymbol *Symbol;
+  uint32_t Offset;
+  uint64_t Count;
+  uint64_t Mispreds;
+
+  IndirectCallProfile(MCSymbol *Symbol, uint64_t Count,
+                      uint64_t Mispreds, uint32_t Offset = 0)
+    : Symbol(Symbol), Offset(Offset), Count(Count), Mispreds(Mispreds) {}
+
+  bool operator==(const IndirectCallProfile &Other) const {
+    return Symbol == Other.Symbol &&
+           Offset == Other.Offset;
+  }
+};
+
+/// Aggregated information for an indirect call site.
+using IndirectCallSiteProfile = SmallVector<IndirectCallProfile, 4>;
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const bolt::IndirectCallSiteProfile &ICSP) {
+  std::string TempString;
+  raw_string_ostream SS(TempString);
+
+  const char *Sep = "\n ";
+  uint64_t TotalCount = 0;
+  uint64_t TotalMispreds = 0;
+  for (const IndirectCallProfile &CSP : ICSP) {
+    SS << Sep << "{ " << (CSP.Symbol ? CSP.Symbol->getName() : "<unknown>")
+       << ": " << CSP.Count << " (" << CSP.Mispreds << " misses) }";
+    Sep = ",\n ";
+    TotalCount += CSP.Count;
+    TotalMispreds += CSP.Mispreds;
+  }
+  SS.flush();
+
+  OS << TotalCount << " (" << TotalMispreds << " misses) :" << TempString;
+  return OS;
+}
+
+/// BinaryFunction is a representation of machine-level function.
+///
+/// In the input binary, an instance of BinaryFunction can represent a fragment
+/// of a function if the higher-level function was split, e.g. into hot and cold
+/// parts. The fragment containing the main entry point is called a parent
+/// or the main fragment.
+class BinaryFunction {
+public:
+  enum class State : char {
+    Empty = 0,     /// Function body is empty.
+    Disassembled,  /// Function has been disassembled.
+    CFG,           /// Control flow graph has been built.
+    CFG_Finalized, /// CFG is finalized. No optimizations allowed.
+    EmittedCFG,    /// Instructions have been emitted to output.
+    Emitted,       /// Same as above plus CFG is destroyed.
+  };
+
+  /// Types of profile the function can use. Could be a combination.
+  enum {
+    PF_NONE = 0,     /// No profile.
+    PF_LBR = 1,      /// Profile is based on last branch records.
+    PF_SAMPLE = 2,   /// Non-LBR sample-based profile.
+    PF_MEMEVENT = 4, /// Profile has mem events.
+  };
+
+  /// Struct for tracking exception handling ranges.
+  struct CallSite {
+    const MCSymbol *Start;
+    const MCSymbol *End;
+    const MCSymbol *LP;
+    uint64_t Action;
+  };
+
+  using IslandProxiesType =
+      std::map<BinaryFunction *, std::map<const MCSymbol *, MCSymbol *>>;
+
+  struct IslandInfo {
+    /// Temporary holder of offsets that are data markers (used in AArch)
+    /// It is possible to have data in code sections. To ease the identification
+    /// of data in code sections, the ABI requires the symbol table to have
+    /// symbols named "$d" identifying the start of data inside code and "$x"
+    /// identifying the end of a chunk of data inside code. DataOffsets contain
+    /// all offsets of $d symbols and CodeOffsets all offsets of $x symbols.
+    std::set<uint64_t> DataOffsets;
+    std::set<uint64_t> CodeOffsets;
+
+    /// Offsets in function that are data values in a constant island identified
+    /// after disassembling
+    std::map<uint64_t, MCSymbol *> Offsets;
+    SmallPtrSet<MCSymbol *, 4> Symbols;
+    std::map<const MCSymbol *, BinaryFunction *> ProxySymbols;
+    std::map<const MCSymbol *, MCSymbol *> ColdSymbols;
+    /// Keeps track of other functions we depend on because there is a reference
+    /// to the constant islands in them.
+    IslandProxiesType Proxies, ColdProxies;
+    std::set<BinaryFunction *> Dependency; // The other way around
+  };
+
+  static constexpr uint64_t COUNT_NO_PROFILE =
+      BinaryBasicBlock::COUNT_NO_PROFILE;
+
+  /// We have to use at least 2-byte alignment for functions because of C++ ABI.
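+  /// (In the Itanium C++ ABI, the low bit of a pointer-to-member-function is
+  /// used to distinguish virtual from non-virtual members, so function
+  /// addresses must keep that bit free.)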
+  static constexpr unsigned MinAlign = 2;
+
+  static const char TimerGroupName[];
+  static const char TimerGroupDesc[];
+
+  using BasicBlockOrderType = std::vector<BinaryBasicBlock *>;
+
+  /// Mark injected functions
+  bool IsInjected = false;
+
+  using LSDATypeTableTy = std::vector<uint64_t>;
+
+private:
+  /// Current state of the function.
+  State CurrentState{State::Empty};
+
+  /// A list of symbols associated with the function entry point.
+  ///
+  /// Multiple symbols would typically result from identical code-folding
+  /// optimization.
+  typedef SmallVector<MCSymbol *, 1> SymbolListTy;
+  SymbolListTy Symbols;
+
+  /// The list of names this function is known under. Used for fuzzy-matching
+  /// the function to its name in a profile, command line, etc.
+  std::vector<std::string> Aliases;
+
+  /// Containing section in the input file.
+  BinarySection *OriginSection = nullptr;
+
+  /// Address of the function in memory. Also could be an offset from
+  /// base address for position independent binaries.
+  uint64_t Address;
+
+  /// Original size of the function.
+  uint64_t Size;
+
+  /// Address of the function in output.
+  uint64_t OutputAddress{0};
+
+  /// Size of the function in the output file.
+  uint64_t OutputSize{0};
+
+  /// Offset in the file.
+  uint64_t FileOffset{0};
+
+  /// Maximum size this function is allowed to have.
+  uint64_t MaxSize{std::numeric_limits<uint64_t>::max()};
+
+  /// Alignment requirements for the function.
+  uint16_t Alignment{2};
+
+  /// Maximum number of bytes used for alignment of hot part of the function.
+  uint16_t MaxAlignmentBytes{0};
+
+  /// Maximum number of bytes used for alignment of cold part of the function.
+  uint16_t MaxColdAlignmentBytes{0};
+
+  const MCSymbol *PersonalityFunction{nullptr};
+  uint8_t PersonalityEncoding{dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_pcrel};
+
+  BinaryContext &BC;
+
+  std::unique_ptr<BinaryLoopInfo> BLI;
+
+  /// Set of external addresses in the code that are not a function start
+  /// and are referenced from this function.
+  std::set<uint64_t> InterproceduralReferences;
+
+  /// All labels in the function that are referenced via relocations from
+  /// data objects. Typically these are jump table destinations and computed
+  /// goto labels.
+  std::set<uint64_t> ExternallyReferencedOffsets;
+
+  /// Offsets of indirect branches with unknown destinations.
+  std::set<uint64_t> UnknownIndirectBranchOffsets;
+
+  /// A set of local and global symbols corresponding to secondary entry points.
+  /// Each additional function entry point has a corresponding entry in the map.
+  /// The key is a local symbol corresponding to a basic block and the value
+  /// is a global symbol corresponding to an external entry point.
+  std::unordered_map<const MCSymbol *, MCSymbol *> SecondaryEntryPoints;
+
+  /// False if the function is too complex to reconstruct its control
+  /// flow graph.
+  /// In relocation mode we still disassemble and re-assemble such functions.
+  bool IsSimple{true};
+
+  /// Indication that the function should be ignored for optimization purposes.
+  /// If we can skip emission of some functions, then ignored functions could
+  /// be not fully disassembled and will not be emitted.
+  bool IsIgnored{false};
+
+  /// Pseudo functions should not be disassembled or emitted.
+  bool IsPseudo{false};
+
+  /// True if the original function code has all necessary relocations to track
+  /// addresses of functions emitted to new locations. Typically set for
+  /// functions that we are not going to emit.
+  bool HasExternalRefRelocations{false};
+
+  /// True if the function has an indirect branch with unknown destination.
+  bool HasUnknownControlFlow{false};
+
+  /// The code from inside the function references one of the code locations
+  /// from the same function as data, i.e. it's possible the label is used
+  /// inside an address calculation or could be referenced from outside.
+  bool HasInternalLabelReference{false};
+
+  /// In AArch64, preserve nops to maintain code equal to input (assuming no
+  /// optimizations are done).
+  bool PreserveNops{false};
+
+  /// Indicate if this function has associated exception handling metadata.
+  bool HasEHRanges{false};
+
+  /// True if the function uses DW_CFA_GNU_args_size CFIs.
+  bool UsesGnuArgsSize{false};
+
+  /// True if the function might have a profile available externally.
+  /// Used to check if processing of the function is required under certain
+  /// conditions.
+  bool HasProfileAvailable{false};
+
+  bool HasMemoryProfile{false};
+
+  /// Execution halts whenever this function is entered.
+  bool TrapsOnEntry{false};
+
+  /// True if the function had an indirect branch with a fixed internal
+  /// destination.
+  bool HasFixedIndirectBranch{false};
+
+  /// True if the function is a fragment of another function. This means that
+  /// this function could only be entered via its parent or one of its sibling
+  /// fragments. It could be entered at any basic block. It can also return
+  /// the control to any basic block of its parent or its sibling.
+  bool IsFragment{false};
+
+  /// Indicate that the function body has SDT marker
+  bool HasSDTMarker{false};
+
+  /// Indicate that the function body has Pseudo Probe
+  bool HasPseudoProbe{BC.getUniqueSectionByName(".pseudo_probe_desc") &&
+                      BC.getUniqueSectionByName(".pseudo_probe")};
+
+  /// True if the original entry point was patched.
+  bool IsPatched{false};
+
+  /// True if the function contains jump table with entries pointing to
+  /// locations in fragments.
+  bool HasSplitJumpTable{false};
+
+  /// True if there are no control-flow edges with successors in other functions
+  /// (i.e. if tail calls have edges to function-local basic blocks).
+  /// Set to false by SCTC. Dynostats can't be reliably computed for
+  /// functions with non-canonical CFG.
+  /// This attribute is only valid when hasCFG() == true.
+  bool HasCanonicalCFG{true};
+
+  /// The address for the code for this function in codegen memory.
+  /// Used for functions that are emitted in a dedicated section with a fixed
+  /// address. E.g. for functions that are overwritten in-place.
+  uint64_t ImageAddress{0};
+
+  /// The size of the code in memory.
+  uint64_t ImageSize{0};
+
+  /// Name for the section this function code should reside in.
+  std::string CodeSectionName;
+
+  /// Name for the corresponding cold code section.
+  std::string ColdCodeSectionName;
+
+  /// Parent function fragment for split function fragments.
+  BinaryFunction *ParentFragment{nullptr};
+
+  /// Indicate if the function body was folded into another function.
+  /// Used by ICF optimization.
+  BinaryFunction *FoldedIntoFunction{nullptr};
+
+  /// All fragments for a parent function.
+  std::unordered_set<BinaryFunction *> Fragments;
+
+  /// The profile data for the number of times the function was executed.
+  uint64_t ExecutionCount{COUNT_NO_PROFILE};
+
+  /// Profile match ratio.
+  float ProfileMatchRatio{0.0f};
+
+  /// Raw branch count for this function in the profile
+  uint64_t RawBranchCount{0};
+
+  /// Indicates the type of profile the function is using.
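+  /// May combine several of the PF_* values above, e.g. (PF_LBR | PF_MEMEVENT).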
+  uint16_t ProfileFlags{PF_NONE};
+
+  /// For functions with mismatched profile we store all call profile
+  /// information at a function level (as opposed to tying it to
+  /// specific call sites).
+  IndirectCallSiteProfile AllCallSites;
+
+  /// Score of the function (estimated number of instructions executed,
+  /// according to profile data). -1 if the score has not been calculated yet.
+  mutable int64_t FunctionScore{-1};
+
+  /// Original LSDA address for the function.
+  uint64_t LSDAAddress{0};
+
+  /// Containing compilation unit for the function.
+  DWARFUnit *DwarfUnit{nullptr};
+
+  /// Last computed hash value. Note that the value could be recomputed using
+  /// different parameters by every pass.
+  mutable uint64_t Hash{0};
+
+  /// For PLT functions it contains a symbol associated with a function
+  /// reference. It is nullptr for non-PLT functions.
+  const MCSymbol *PLTSymbol{nullptr};
+
+  /// Function order for streaming into the destination binary.
+  uint32_t Index{-1U};
+
+  /// Get basic block index assuming it belongs to this function.
+  unsigned getIndex(const BinaryBasicBlock *BB) const {
+    assert(BB->getIndex() < BasicBlocks.size());
+    return BB->getIndex();
+  }
+
+  /// Return basic block that originally contained offset \p Offset
+  /// from the function start.
+  BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset);
+
+  const BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset) const {
+    return const_cast<BinaryFunction *>(this)
+        ->getBasicBlockContainingOffset(Offset);
+  }
+
+  /// Return basic block that started at offset \p Offset.
+  BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) {
+    BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset);
+    return BB && BB->getOffset() == Offset ? BB : nullptr;
+  }
+
+  /// Release memory taken by the list.
+  template <typename T> BinaryFunction &clearList(T &List) {
+    T TempList;
+    TempList.swap(List);
+    return *this;
+  }
+
+  /// Update the indices of all the basic blocks starting at StartIndex.
+  void updateBBIndices(const unsigned StartIndex);
+
+  /// Annotate each basic block entry with its current CFI state. This is
+  /// run right after the construction of CFG while basic blocks are in their
+  /// original order.
+  void annotateCFIState();
+
+  /// Associate DW_CFA_GNU_args_size info with invoke instructions
+  /// (call instructions with non-empty landing pad).
+  void propagateGnuArgsSizeInfo(MCPlusBuilder::AllocatorIdTy AllocId);
+
+  /// Synchronize branch instructions with CFG.
+  void postProcessBranches();
+
+  /// The address offset where we emitted the constant island, that is, the
+  /// chunk of data in the function code area (AArch only)
+  int64_t OutputDataOffset{0};
+  int64_t OutputColdDataOffset{0};
+
+  /// Map labels to corresponding basic blocks.
+  std::unordered_map<const MCSymbol *, BinaryBasicBlock *> LabelToBB;
+
+  using BranchListType = std::vector<std::pair<uint32_t, uint32_t>>;
+  BranchListType TakenBranches;   /// All local taken branches.
+  BranchListType IgnoredBranches; /// Branches ignored for CFG purposes.
+
+  /// Map offset in the function to a label.
+  /// Labels are used for building CFG for simple functions. For non-simple
+  /// function in relocation mode we need to emit them for relocations
+  /// referencing function internals to work (e.g. jump tables).
+  using LabelsMapType = std::map<uint32_t, MCSymbol *>;
+  LabelsMapType Labels;
+
+  /// Temporary holder of instructions before CFG is constructed.
+  /// Map offset in the function to MCInst.
+  using InstrMapType = std::map<uint32_t, MCInst>;
+  InstrMapType Instructions;
+
+  /// List of DWARF CFI instructions. Original CFI from the binary must be
+  /// sorted w.r.t. the offset at which it appears. We rely on this to replay
+  /// CFIs if needed (to fix state after reordering BBs).
+  using CFIInstrMapType = std::vector<MCCFIInstruction>;
+  using cfi_iterator = CFIInstrMapType::iterator;
+  using const_cfi_iterator = CFIInstrMapType::const_iterator;
+
+  /// We don't decode Call Frame Info encoded in DWARF program state
+  /// machine. Instead we define a "CFI State" - a frame information that
+  /// is a result of executing FDE CFI program up to a given point. The
+  /// program consists of opaque Call Frame Instructions:
+  ///
+  ///   CFI #0
+  ///   CFI #1
+  ///   ....
+  ///   CFI #N
+  ///
+  /// When we refer to "CFI State K" - it corresponds to a row in an abstract
+  /// Call Frame Info table. This row is reached right before executing CFI #K.
+  ///
+  /// At any point of execution in a function we are in any one of (N + 2)
+  /// states described in the original FDE program. We can't have more states
+  /// without intelligent processing of CFIs.
+  ///
+  /// When the final layout of basic blocks is known, and we finalize CFG,
+  /// we modify the original program to make sure the same state could be
+  /// reached even when basic blocks containing CFI instructions are executed
+  /// in a different order.
+  CFIInstrMapType FrameInstructions;
+
+  /// A map of restore state CFI instructions to their equivalent CFI
+  /// instructions that produce the same state, in order to eliminate
+  /// remember-restore CFI instructions when rewriting CFI.
+  DenseMap<int32_t, SmallVector<int32_t, 4>> FrameRestoreEquivalents;
+
+  // For tracking exception handling ranges.
+  std::vector<CallSite> CallSites;
+  std::vector<CallSite> ColdCallSites;
+
+  /// Binary blobs representing action, type, and type index tables for this
+  /// function's LSDA (exception handling).
+  ArrayRef<uint8_t> LSDAActionTable;
+  ArrayRef<uint8_t> LSDATypeIndexTable;
+
+  /// Vector of addresses of types referenced by LSDA.
+  LSDATypeTableTy LSDATypeTable;
+
+  /// Vector of addresses of entries in LSDATypeTable used for indirect
+  /// addressing.
+  LSDATypeTableTy LSDATypeAddressTable;
+
+  /// Marking for the beginning of language-specific data area for the function.
+  MCSymbol *LSDASymbol{nullptr};
+  MCSymbol *ColdLSDASymbol{nullptr};
+
+  /// Map to discover which CFIs are attached to a given instruction offset.
+  /// Maps an instruction offset into a FrameInstructions offset.
+  /// This is only relevant to the buildCFG phase and is discarded afterwards.
+  std::multimap<uint32_t, uint32_t> OffsetToCFI;
+
+  /// List of CFI instructions associated with the CIE (common to more than one
+  /// function and that apply before the entry basic block).
+  CFIInstrMapType CIEFrameInstructions;
+
+  /// All compound jump tables for this function. This duplicates what's stored
+  /// in the BinaryContext, but additionally it gives quick access for all
+  /// jump tables used by this function.
+  std::map<uint64_t, JumpTable *> JumpTables;
+
+  /// All jump table sites in the function before CFG is built.
+  std::vector<std::pair<uint64_t, uint64_t>> JTSites;
+
+  /// List of relocations in this function.
+  std::map<uint64_t, Relocation> Relocations;
+
+  /// Information on function constant islands.
+  IslandInfo Islands;
+
+  // Blocks are kept sorted in the layout order. If we need to change the
+  // layout (if BasicBlocksLayout stores a different order than BasicBlocks),
+  // the terminating instructions need to be modified.
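+  // For example (illustrative): with original order {BB0, BB1, BB2} where BB0
+  // falls through to BB1, a new layout {BB0, BB2, BB1} requires fixBranches()
+  // to insert or invert a branch in BB0 so control still reaches BB1.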
+  using BasicBlockListType = std::vector<BinaryBasicBlock *>;
+  BasicBlockListType BasicBlocks;
+  BasicBlockListType DeletedBasicBlocks;
+  BasicBlockOrderType BasicBlocksLayout;
+  /// Previous layout replaced by modifyLayout
+  BasicBlockOrderType BasicBlocksPreviousLayout;
+  bool ModifiedLayout{false};
+
+  /// BasicBlockOffsets are used during CFG construction to map from code
+  /// offsets to BinaryBasicBlocks. Any modifications made to the CFG
+  /// after initial construction are not reflected in this data structure.
+  using BasicBlockOffset = std::pair<uint64_t, BinaryBasicBlock *>;
+  struct CompareBasicBlockOffsets {
+    bool operator()(const BasicBlockOffset &A,
+                    const BasicBlockOffset &B) const {
+      return A.first < B.first;
+    }
+  };
+  std::vector<BasicBlockOffset> BasicBlockOffsets;
+
+  MCSymbol *ColdSymbol{nullptr};
+
+  /// Symbol at the end of the function.
+  mutable MCSymbol *FunctionEndLabel{nullptr};
+
+  /// Symbol at the end of the cold part of split function.
+  mutable MCSymbol *FunctionColdEndLabel{nullptr};
+
+  mutable MCSymbol *FunctionConstantIslandLabel{nullptr};
+  mutable MCSymbol *FunctionColdConstantIslandLabel{nullptr};
+
+  /// Unique number associated with the function.
+  uint64_t FunctionNumber;
+
+  /// Count the number of functions created.
+  static uint64_t Count;
+
+  /// Map offsets of special instructions to addresses in the output.
+  InputOffsetToAddressMapTy InputOffsetToAddressMap;
+
+  /// Register alternative function name.
+  void addAlternativeName(std::string NewName) {
+    Aliases.push_back(std::move(NewName));
+  }
+
+  /// Return a label at a given \p Address in the function. If the label does
+  /// not exist - create it. Assert if the \p Address does not belong to
+  /// the function. If \p CreatePastEnd is true, then return the function
+  /// end label when the \p Address points immediately past the last byte
+  /// of the function.
+  /// NOTE: the function always returns a local (temp) symbol, even if there's
+  /// a global symbol that corresponds to an entry at this address.
+  MCSymbol *getOrCreateLocalLabel(uint64_t Address, bool CreatePastEnd = false);
+
+  /// Register a data marker at a given \p Offset into the function.
+  void markDataAtOffset(uint64_t Offset) {
+    Islands.DataOffsets.emplace(Offset);
+  }
+
+  /// Register a code marker at a given \p Offset into the function.
+  void markCodeAtOffset(uint64_t Offset) {
+    Islands.CodeOffsets.emplace(Offset);
+  }
+
+  /// Register secondary entry point at a given \p Offset into the function.
+  /// Return global symbol for use by extern function references.
+  MCSymbol *addEntryPointAtOffset(uint64_t Offset);
+
+  /// Register an internal offset in a function referenced from outside.
+  void registerReferencedOffset(uint64_t Offset) {
+    ExternallyReferencedOffsets.emplace(Offset);
+  }
+
+  /// True if there are references to internals of this function from data,
+  /// e.g. from jump tables.
+  bool hasInternalReference() const {
+    return !ExternallyReferencedOffsets.empty();
+  }
+
+  /// Return an entry ID corresponding to a symbol known to belong to
+  /// the function.
+  ///
+  /// Prefer to use BinaryContext::getFunctionForSymbol(EntrySymbol, &ID)
+  /// instead of calling this function directly.
+  uint64_t getEntryIDForSymbol(const MCSymbol *EntrySymbol) const;
+
+  /// If the function represents a secondary split function fragment, set its
+  /// parent fragment to \p BF.
+  void setParentFragment(BinaryFunction &BF) {
+    assert(IsFragment && "function must be a fragment to have a parent");
+    assert((!ParentFragment || ParentFragment == &BF) &&
+           "cannot have more than one parent function");
+    ParentFragment = &BF;
+  }
+
+  /// Register a child fragment for the main fragment of a split function.
+  void addFragment(BinaryFunction &BF) {
+    Fragments.insert(&BF);
+  }
+
+  void addInstruction(uint64_t Offset, MCInst &&Instruction) {
+    Instructions.emplace(Offset, std::forward<MCInst>(Instruction));
+  }
+
+  /// Convert CFI instructions to a standard form (remove remember/restore).
+  void normalizeCFIState();
+
+  /// Analyze and process indirect branch \p Instruction before it is
+  /// added to Instructions list.
+  IndirectBranchType processIndirectBranch(MCInst &Instruction,
+                                           unsigned Size,
+                                           uint64_t Offset,
+                                           uint64_t &TargetAddress);
+
+  DenseMap<const MCInst *, SmallVector<MCInst *, 4>>
+  computeLocalUDChain(const MCInst *CurInstr);
+
+  BinaryFunction &operator=(const BinaryFunction &) = delete;
+  BinaryFunction(const BinaryFunction &) = delete;
+
+  friend class MachORewriteInstance;
+  friend class RewriteInstance;
+  friend class BinaryContext;
+  friend class DataReader;
+  friend class DataAggregator;
+
+  static std::string buildCodeSectionName(StringRef Name,
+                                          const BinaryContext &BC);
+  static std::string buildColdCodeSectionName(StringRef Name,
+                                              const BinaryContext &BC);
+
+  /// Creation should be handled by RewriteInstance or BinaryContext
+  BinaryFunction(const std::string &Name, BinarySection &Section,
+                 uint64_t Address, uint64_t Size, BinaryContext &BC)
+      : OriginSection(&Section), Address(Address), Size(Size), BC(BC),
+        CodeSectionName(buildCodeSectionName(Name, BC)),
+        ColdCodeSectionName(buildColdCodeSectionName(Name, BC)),
+        FunctionNumber(++Count) {
+    Symbols.push_back(BC.Ctx->getOrCreateSymbol(Name));
+  }
+
+  /// This constructor is used to create an injected function
+  BinaryFunction(const std::string &Name, BinaryContext &BC, bool IsSimple)
+      : Address(0), Size(0), BC(BC), IsSimple(IsSimple),
+        CodeSectionName(buildCodeSectionName(Name, BC)),
+        ColdCodeSectionName(buildColdCodeSectionName(Name, BC)),
+        FunctionNumber(++Count) {
+    Symbols.push_back(BC.Ctx->getOrCreateSymbol(Name));
+    IsInjected = true;
+  }
+
+  /// Clear state of the function that could not be disassembled or if its
+  /// disassembled state was later invalidated.
+  void clearDisasmState();
+
+  /// Release memory allocated for CFG and instructions.
+  /// We still keep basic blocks for address translation/mapping purposes.
+  void releaseCFG() {
+    for (BinaryBasicBlock *BB : BasicBlocks) {
+      BB->releaseCFG();
+    }
+    for (BinaryBasicBlock *BB : DeletedBasicBlocks) {
+      BB->releaseCFG();
+    }
+
+    clearList(CallSites);
+    clearList(ColdCallSites);
+    clearList(LSDATypeTable);
+    clearList(LSDATypeAddressTable);
+
+    clearList(LabelToBB);
+
+    if (!isMultiEntry())
+      clearList(Labels);
+
+    clearList(FrameInstructions);
+    clearList(FrameRestoreEquivalents);
+  }
+
+public:
+  BinaryFunction(BinaryFunction &&) = default;
+
+  using iterator = pointee_iterator<BasicBlockListType::iterator>;
+  using const_iterator = pointee_iterator<BasicBlockListType::const_iterator>;
+  using reverse_iterator =
+      pointee_iterator<BasicBlockListType::reverse_iterator>;
+  using const_reverse_iterator =
+      pointee_iterator<BasicBlockListType::const_reverse_iterator>;
+
+  typedef BasicBlockOrderType::iterator order_iterator;
+  typedef BasicBlockOrderType::const_iterator const_order_iterator;
+  typedef BasicBlockOrderType::reverse_iterator reverse_order_iterator;
+  typedef BasicBlockOrderType::const_reverse_iterator
+      const_reverse_order_iterator;
+
+  // CFG iterators.
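+  // The pointee iterators below let clients iterate over blocks by reference,
+  // e.g.: for (BinaryBasicBlock &BB : Function) { ... }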
+  iterator begin() { return BasicBlocks.begin(); }
+  const_iterator begin() const { return BasicBlocks.begin(); }
+  iterator end() { return BasicBlocks.end(); }
+  const_iterator end() const { return BasicBlocks.end(); }
+
+  reverse_iterator rbegin() { return BasicBlocks.rbegin(); }
+  const_reverse_iterator rbegin() const { return BasicBlocks.rbegin(); }
+  reverse_iterator rend() { return BasicBlocks.rend(); }
+  const_reverse_iterator rend() const { return BasicBlocks.rend(); }
+
+  size_t size() const { return BasicBlocks.size(); }
+  bool empty() const { return BasicBlocks.empty(); }
+  const BinaryBasicBlock &front() const { return *BasicBlocks.front(); }
+  BinaryBasicBlock &front() { return *BasicBlocks.front(); }
+  const BinaryBasicBlock &back() const { return *BasicBlocks.back(); }
+  BinaryBasicBlock &back() { return *BasicBlocks.back(); }
+  inline iterator_range<iterator> blocks() {
+    return iterator_range<iterator>(begin(), end());
+  }
+  inline iterator_range<const_iterator> blocks() const {
+    return iterator_range<const_iterator>(begin(), end());
+  }
+
+  // Iterators by pointer.
+  BasicBlockListType::iterator pbegin() { return BasicBlocks.begin(); }
+  BasicBlockListType::iterator pend() { return BasicBlocks.end(); }
+
+  order_iterator layout_begin() { return BasicBlocksLayout.begin(); }
+  const_order_iterator layout_begin() const {
+    return BasicBlocksLayout.begin();
+  }
+  order_iterator layout_end() { return BasicBlocksLayout.end(); }
+  const_order_iterator layout_end() const { return BasicBlocksLayout.end(); }
+  reverse_order_iterator layout_rbegin() { return BasicBlocksLayout.rbegin(); }
+  const_reverse_order_iterator layout_rbegin() const {
+    return BasicBlocksLayout.rbegin();
+  }
+  reverse_order_iterator layout_rend() { return BasicBlocksLayout.rend(); }
+  const_reverse_order_iterator layout_rend() const {
+    return BasicBlocksLayout.rend();
+  }
+  size_t layout_size() const { return BasicBlocksLayout.size(); }
+  bool layout_empty() const { return BasicBlocksLayout.empty(); }
+  const BinaryBasicBlock *layout_front() const {
+    return BasicBlocksLayout.front();
+  }
+  BinaryBasicBlock *layout_front() { return BasicBlocksLayout.front(); }
+  const BinaryBasicBlock *layout_back() const {
+    return BasicBlocksLayout.back();
+  }
+  BinaryBasicBlock *layout_back() { return BasicBlocksLayout.back(); }
+
+  inline iterator_range<order_iterator> layout() {
+    return iterator_range<order_iterator>(BasicBlocksLayout.begin(),
+                                          BasicBlocksLayout.end());
+  }
+
+  inline iterator_range<const_order_iterator> layout() const {
+    return iterator_range<const_order_iterator>(BasicBlocksLayout.begin(),
+                                                BasicBlocksLayout.end());
+  }
+
+  inline iterator_range<reverse_order_iterator> rlayout() {
+    return iterator_range<reverse_order_iterator>(BasicBlocksLayout.rbegin(),
+                                                  BasicBlocksLayout.rend());
+  }
+
+  inline iterator_range<const_reverse_order_iterator> rlayout() const {
+    return iterator_range<const_reverse_order_iterator>(
+        BasicBlocksLayout.rbegin(), BasicBlocksLayout.rend());
+  }
+
+  cfi_iterator cie_begin() { return CIEFrameInstructions.begin(); }
+  const_cfi_iterator cie_begin() const { return CIEFrameInstructions.begin(); }
+  cfi_iterator cie_end() { return CIEFrameInstructions.end(); }
+  const_cfi_iterator cie_end() const { return CIEFrameInstructions.end(); }
+  bool cie_empty() const { return CIEFrameInstructions.empty(); }
+
+  inline iterator_range<cfi_iterator> cie() {
+    return iterator_range<cfi_iterator>(cie_begin(), cie_end());
+  }
+  inline iterator_range<const_cfi_iterator> cie() const {
+    return iterator_range<const_cfi_iterator>(cie_begin(), cie_end());
+  }
+
+  /// Iterate over all jump tables associated with this function.
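+  /// E.g. (illustrative):
+  ///   for (const auto &KV : Function.jumpTables())
+  ///     outs() << "jump table at 0x" << Twine::utohexstr(KV.first) << "\n";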
+  iterator_range<std::map<uint64_t, JumpTable *>::const_iterator>
+  jumpTables() const {
+    return make_range(JumpTables.begin(), JumpTables.end());
+  }
+
+  /// Returns the raw binary encoding of this function.
+  ErrorOr<ArrayRef<uint8_t>> getData() const;
+
+  BinaryFunction &updateState(BinaryFunction::State State) {
+    CurrentState = State;
+    return *this;
+  }
+
+  /// Update layout of basic blocks used for output.
+  void updateBasicBlockLayout(BasicBlockOrderType &NewLayout) {
+    BasicBlocksPreviousLayout = BasicBlocksLayout;
+
+    if (NewLayout != BasicBlocksLayout) {
+      ModifiedLayout = true;
+      BasicBlocksLayout.clear();
+      BasicBlocksLayout.swap(NewLayout);
+    }
+  }
+
+  /// Recompute landing pad information for the function and all its blocks.
+  void recomputeLandingPads();
+
+  /// Return current basic block layout.
+  const BasicBlockOrderType &getLayout() const {
+    return BasicBlocksLayout;
+  }
+
+  /// Return a list of basic blocks sorted using DFS and update layout indices
+  /// using the same order. Does not modify the current layout.
+  BasicBlockOrderType dfs() const;
+
+  /// Find the loops in the CFG of the function and store information about
+  /// them.
+  void calculateLoopInfo();
+
+  /// Calculate missed macro-fusion opportunities and update BinaryContext
+  /// stats.
+  void calculateMacroOpFusionStats();
+
+  /// Returns if loop detection has been run for this function.
+  bool hasLoopInfo() const {
+    return BLI != nullptr;
+  }
+
+  const BinaryLoopInfo &getLoopInfo() {
+    return *BLI.get();
+  }
+
+  bool isLoopFree() {
+    if (!hasLoopInfo()) {
+      calculateLoopInfo();
+    }
+    return BLI->empty();
+  }
+
+  /// Print loop information about the function.
+  void printLoopInfo(raw_ostream &OS) const;
+
+  /// View CFG in graphviz program
+  void viewGraph() const;
+
+  /// Dump CFG in graphviz format
+  void dumpGraph(raw_ostream &OS) const;
+
+  /// Dump CFG in graphviz format to file.
+  void dumpGraphToFile(std::string Filename) const;
+
+  /// Dump CFG in graphviz format to a file with a filename that is derived
+  /// from the function name and Annotation strings. Useful for dumping the
+  /// CFG after an optimization pass.
+  void dumpGraphForPass(std::string Annotation = "") const;
+
+  /// Return BinaryContext for the function.
+  const BinaryContext &getBinaryContext() const {
+    return BC;
+  }
+
+  /// Return BinaryContext for the function.
+  BinaryContext &getBinaryContext() {
+    return BC;
+  }
+
+  /// Attempt to validate CFG invariants.
+  bool validateCFG() const;
+
+  BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) {
+    auto I = LabelToBB.find(Label);
+    return I == LabelToBB.end() ? nullptr : I->second;
+  }
+
+  const BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) const {
+    auto I = LabelToBB.find(Label);
+    return I == LabelToBB.end() ? nullptr : I->second;
+  }
+
+  /// Returns the basic block after the given basic block in the layout or
+  /// nullptr if the last basic block is given.
+  const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB,
+                                             bool IgnoreSplits = true) const {
+    return
+        const_cast<BinaryFunction *>(this)->getBasicBlockAfter(BB,
+                                                               IgnoreSplits);
+  }
+
+  BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB,
+                                       bool IgnoreSplits = true) {
+    for (auto I = layout_begin(), E = layout_end(); I != E; ++I) {
+      auto Next = std::next(I);
+      if (*I == BB && Next != E) {
+        return (IgnoreSplits || (*I)->isCold() == (*Next)->isCold())
+            ? *Next : nullptr;
+      }
+    }
+    return nullptr;
+  }
+
+  /// Retrieve the landing pad BB associated with invoke instruction \p Invoke
+  /// that is in \p BB. Return nullptr if none exists.
+  BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB,
+                                       const MCInst &InvokeInst) const {
+    assert(BC.MIB->isInvoke(InvokeInst) && "must be invoke instruction");
+    const Optional<MCPlus::MCLandingPad> LP = BC.MIB->getEHInfo(InvokeInst);
+    if (LP && LP->first) {
+      BinaryBasicBlock *LBB = BB.getLandingPad(LP->first);
+      assert(LBB && "Landing pad should be defined");
+      return LBB;
+    }
+    return nullptr;
+  }
+
+  /// Return instruction at a given offset in the function. Valid before
+  /// CFG is constructed or while instruction offsets are available in CFG.
+  MCInst *getInstructionAtOffset(uint64_t Offset);
+
+  const MCInst *getInstructionAtOffset(uint64_t Offset) const {
+    return const_cast<BinaryFunction *>(this)->getInstructionAtOffset(Offset);
+  }
+
+  /// Return jump table that covers a given \p Address in memory.
+  JumpTable *getJumpTableContainingAddress(uint64_t Address) {
+    auto JTI = JumpTables.upper_bound(Address);
+    if (JTI == JumpTables.begin())
+      return nullptr;
+    --JTI;
+    if (JTI->first + JTI->second->getSize() > Address)
+      return JTI->second;
+    if (JTI->second->getSize() == 0 && JTI->first == Address)
+      return JTI->second;
+    return nullptr;
+  }
+
+  const JumpTable *getJumpTableContainingAddress(uint64_t Address) const {
+    return const_cast<BinaryFunction *>(this)->
+        getJumpTableContainingAddress(Address);
+  }
+
+  /// Return the name of the function if the function has just one name.
+  /// If the function has multiple names - return one followed
+  /// by "(*#)".
+  ///
+  /// We should use getPrintName() for diagnostics and use
+  /// hasName() to match function name against a given string.
+  ///
+  /// NOTE: for disambiguating names of local symbols we use the following
+  ///       naming schemes:
+  ///           primary:     <function>/<id>
+  ///           alternative: <function>/<file>/<id2>
+  std::string getPrintName() const {
+    const size_t NumNames = Symbols.size() + Aliases.size();
+    return NumNames == 1
+        ? getOneName().str()
+        : (getOneName().str() + "(*" + std::to_string(NumNames) + ")");
+  }
+
+  /// The function may have many names. For that reason, we avoid having
+  /// getName() method as most of the time the user needs a different
+  /// interface, such as forEachName(), hasName(), hasNameRegex(), etc.
+  /// In some cases though, we need just a name uniquely identifying
+  /// the function, and that's what this method is for.
+  StringRef getOneName() const {
+    return Symbols[0]->getName();
+  }
+
+  /// Return the name of the function as getPrintName(), but also trying
+  /// to demangle it.
+  std::string getDemangledName() const;
+
+  /// Call \p Callback for every name of this function as long as the Callback
+  /// returns false. Stop if Callback returns true or all names have been used.
+  /// Return the name for which the Callback returned true if any.
+  template <typename FType>
+  Optional<StringRef> forEachName(FType Callback) const {
+    for (MCSymbol *Symbol : Symbols)
+      if (Callback(Symbol->getName()))
+        return Symbol->getName();
+
+    for (const std::string &Name : Aliases)
+      if (Callback(StringRef(Name)))
+        return StringRef(Name);
+
+    return NoneType();
+  }
+
+  /// Check if (possibly one out of many) function name matches the given
+  /// string. Use this member function instead of direct name comparison.
+  bool hasName(const std::string &FunctionName) const {
+    auto Res = forEachName([&](StringRef Name) {
+      return Name == FunctionName;
+    });
+    return Res.hasValue();
+  }
+
+  /// Check if any of function names matches the given regex.
+  Optional<StringRef> hasNameRegex(const StringRef NameRegex) const;
+
+  /// Check if any of restored function names matches the given regex.
+  /// Restored name means stripping BOLT-added suffixes like "/1",
+  Optional<StringRef> hasRestoredNameRegex(const StringRef NameRegex) const;
+
+  /// Return a vector of all possible names for the function.
+  const std::vector<StringRef> getNames() const {
+    std::vector<StringRef> AllNames;
+    forEachName([&AllNames](StringRef Name) {
+      AllNames.push_back(Name);
+      return false;
+    });
+
+    return AllNames;
+  }
+
+  /// Return a state the function is in (see BinaryFunction::State definition
+  /// for description).
+  State getState() const {
+    return CurrentState;
+  }
+
+  /// Return true if function has a control flow graph available.
+  bool hasCFG() const {
+    return getState() == State::CFG ||
+           getState() == State::CFG_Finalized ||
+           getState() == State::EmittedCFG;
+  }
+
+  /// Return true if the function state implies that it includes instructions.
+  bool hasInstructions() const {
+    return getState() == State::Disassembled ||
+           hasCFG();
+  }
+
+  bool isEmitted() const {
+    return getState() == State::EmittedCFG ||
+           getState() == State::Emitted;
+  }
+
+  /// Return the section in the input binary this function originated from or
+  /// nullptr if the function did not originate from the file.
+  BinarySection *getOriginSection() const {
+    return OriginSection;
+  }
+
+  void setOriginSection(BinarySection *Section) {
+    OriginSection = Section;
+  }
+
+  /// Return true if the function did not originate from the primary input file.
+  bool isInjected() const {
+    return IsInjected;
+  }
+
+  /// Return original address of the function (or offset from base for PIC).
+  uint64_t getAddress() const {
+    return Address;
+  }
+
+  uint64_t getOutputAddress() const {
+    return OutputAddress;
+  }
+
+  uint64_t getOutputSize() const {
+    return OutputSize;
+  }
+
+  /// Does this function have a valid streaming order index?
+  bool hasValidIndex() const {
+    return Index != -1U;
+  }
+
+  /// Get the streaming order index for this function.
+  uint32_t getIndex() const {
+    return Index;
+  }
+
+  /// Set the streaming order index for this function.
+  void setIndex(uint32_t Idx) {
+    assert(!hasValidIndex());
+    Index = Idx;
+  }
+
+  /// Get the original address for the given basic block within this function.
+  uint64_t getBasicBlockOriginalAddress(const BinaryBasicBlock *BB) const {
+    return Address + BB->getOffset();
+  }
+
+  /// Return offset of the function body in the binary file.
+  uint64_t getFileOffset() const {
+    return FileOffset;
+  }
+
+  /// Return (original) byte size of the function.
+  uint64_t getSize() const {
+    return Size;
+  }
+
+  /// Return the maximum size the body of the function could have.
+  uint64_t getMaxSize() const {
+    return MaxSize;
+  }
+
+  /// Return the number of emitted instructions for this function.
+  uint32_t getNumNonPseudos() const {
+    uint32_t N = 0;
+    for (BinaryBasicBlock *const &BB : layout()) {
+      N += BB->getNumNonPseudos();
+    }
+    return N;
+  }
+
+  /// Return MC symbol associated with the function.
+  /// All references to the function should use this symbol.
+  MCSymbol *getSymbol() {
+    return Symbols[0];
+  }
+
+  /// Return MC symbol associated with the function (const version).
+  /// All references to the function should use this symbol.
+  const MCSymbol *getSymbol() const {
+    return Symbols[0];
+  }
+
+  /// Return a list of symbols associated with the main entry of the function.
+  SymbolListTy &getSymbols() { return Symbols; }
+  const SymbolListTy &getSymbols() const { return Symbols; }
+
+  /// If a local symbol \p BBLabel corresponds to a basic block that is a
+  /// secondary entry point into the function, then return a global symbol
+  /// that represents the secondary entry point. Otherwise return nullptr.
+  MCSymbol *getSecondaryEntryPointSymbol(const MCSymbol *BBLabel) const {
+    auto I = SecondaryEntryPoints.find(BBLabel);
+    if (I == SecondaryEntryPoints.end())
+      return nullptr;
+
+    return I->second;
+  }
+
+  /// If the basic block serves as a secondary entry point to the function,
+  /// return a global symbol representing the entry. Otherwise return nullptr.
+  MCSymbol *getSecondaryEntryPointSymbol(const BinaryBasicBlock &BB) const {
+    return getSecondaryEntryPointSymbol(BB.getLabel());
+  }
+
+  /// Return true if the basic block is an entry point into the function
+  /// (either primary or secondary).
+  bool isEntryPoint(const BinaryBasicBlock &BB) const {
+    if (&BB == BasicBlocks.front())
+      return true;
+    return getSecondaryEntryPointSymbol(BB);
+  }
+
+  /// Return MC symbol corresponding to an enumerated entry for multiple-entry
+  /// functions.
+  MCSymbol *getSymbolForEntryID(uint64_t EntryNum);
+  const MCSymbol *getSymbolForEntryID(uint64_t EntryNum) const {
+    return const_cast<BinaryFunction *>(this)->getSymbolForEntryID(EntryNum);
+  }
+
+  using EntryPointCallbackTy = function_ref<bool(uint64_t, const MCSymbol *)>;
+
+  /// Invoke \p Callback function for every entry point in the function starting
+  /// with the main entry and using entries in the ascending address order.
+  /// Stop calling the function after false is returned by the callback.
+  ///
+  /// Pass an offset of the entry point in the input binary and a corresponding
+  /// global symbol to the callback function.
+  ///
+  /// Return true if all callbacks returned true, false otherwise.
+  bool forEachEntryPoint(EntryPointCallbackTy Callback) const;
+
+  MCSymbol *getColdSymbol() {
+    if (ColdSymbol)
+      return ColdSymbol;
+
+    ColdSymbol = BC.Ctx->getOrCreateSymbol(
+        NameResolver::append(getSymbol()->getName(), ".cold.0"));
+
+    return ColdSymbol;
+  }
+
+  /// Return MC symbol associated with the end of the function.
+  MCSymbol *getFunctionEndLabel() const {
+    assert(BC.Ctx && "cannot be called with empty context");
+    if (!FunctionEndLabel) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
+      FunctionEndLabel = BC.Ctx->createNamedTempSymbol("func_end");
+    }
+    return FunctionEndLabel;
+  }
+
+  /// Return MC symbol associated with the end of the cold part of the function.
+  MCSymbol *getFunctionColdEndLabel() const {
+    if (!FunctionColdEndLabel) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
+      FunctionColdEndLabel = BC.Ctx->createNamedTempSymbol("func_cold_end");
+    }
+    return FunctionColdEndLabel;
+  }
+
+  /// Return a label used to identify where the constant island was emitted
+  /// (AArch only). This is used to update the symbol table accordingly,
+  /// emitting data marker symbols as required by the ABI.
+  MCSymbol *getFunctionConstantIslandLabel() const {
+    if (!FunctionConstantIslandLabel) {
+      FunctionConstantIslandLabel =
+          BC.Ctx->createNamedTempSymbol("func_const_island");
+    }
+    return FunctionConstantIslandLabel;
+  }
+
+  MCSymbol *getFunctionColdConstantIslandLabel() const {
+    if (!FunctionColdConstantIslandLabel) {
+      FunctionColdConstantIslandLabel =
+          BC.Ctx->createNamedTempSymbol("func_cold_const_island");
+    }
+    return FunctionColdConstantIslandLabel;
+  }
+
+  /// Return true if this is a function representing a PLT entry.
+  bool isPLTFunction() const {
+    return PLTSymbol != nullptr;
+  }
+
+  /// Return PLT function reference symbol for PLT functions and nullptr for
+  /// non-PLT functions.
+  const MCSymbol *getPLTSymbol() const {
+    return PLTSymbol;
+  }
+
+  /// Set function PLT reference symbol for PLT functions.
+  void setPLTSymbol(const MCSymbol *Symbol) {
+    assert(Size == 0 && "function size should be 0 for PLT functions");
+    PLTSymbol = Symbol;
+    IsPseudo = true;
+  }
+
+  /// Update output values of the function based on the final \p Layout.
+  void updateOutputValues(const MCAsmLayout &Layout);
+
+  /// Return mapping of input to output addresses. Most users should call
+  /// translateInputToOutputAddress() for address translation.
+  InputOffsetToAddressMapTy &getInputOffsetToAddressMap() {
+    assert(isEmitted() && "cannot use address mapping before code emission");
+    return InputOffsetToAddressMap;
+  }
+
+  /// Register relocation type \p RelType at a given \p Address in the function
+  /// against \p Symbol.
+  /// Assert if the \p Address is not inside this function.
+  void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t RelType,
+                     uint64_t Addend, uint64_t Value) {
+    assert(Address >= getAddress() && Address < getAddress() + getMaxSize() &&
+           "address is outside of the function");
+    uint64_t Offset = Address - getAddress();
+    switch (RelType) {
+    case ELF::R_X86_64_8:
+    case ELF::R_X86_64_16:
+    case ELF::R_X86_64_32:
+    case ELF::R_X86_64_32S:
+    case ELF::R_X86_64_64:
+    case ELF::R_AARCH64_ABS64:
+    case ELF::R_AARCH64_LDST64_ABS_LO12_NC:
+    case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21:
+    case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
+    case ELF::R_AARCH64_TLSDESC_LD64_LO12:
+    case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12:
+    case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
+    case ELF::R_AARCH64_LD64_GOT_LO12_NC:
+    case ELF::R_AARCH64_TLSDESC_ADD_LO12:
+    case ELF::R_AARCH64_ADD_ABS_LO12_NC:
+    case ELF::R_AARCH64_LDST16_ABS_LO12_NC:
+    case ELF::R_AARCH64_LDST32_ABS_LO12_NC:
+    case ELF::R_AARCH64_LDST8_ABS_LO12_NC:
+    case ELF::R_AARCH64_LDST128_ABS_LO12_NC:
+    case ELF::R_AARCH64_ADR_GOT_PAGE:
+    case ELF::R_AARCH64_TLSDESC_ADR_PAGE21:
+    case ELF::R_AARCH64_ADR_PREL_PG_HI21:
+      Relocations[Offset] = Relocation{Offset, Symbol, RelType, Addend, Value};
+      break;
+    case ELF::R_X86_64_PC32:
+    case ELF::R_X86_64_PC8:
+    case ELF::R_X86_64_PLT32:
+    case ELF::R_X86_64_GOTPCRELX:
+    case ELF::R_X86_64_REX_GOTPCRELX:
+    case ELF::R_AARCH64_JUMP26:
+    case ELF::R_AARCH64_CALL26:
+    case ELF::R_AARCH64_TLSDESC_CALL:
+      break;
+
+    // The following relocations are ignored.
+    case ELF::R_X86_64_GOTPCREL:
+    case ELF::R_X86_64_TPOFF32:
+    case ELF::R_X86_64_GOTTPOFF:
+      return;
+    default:
+      llvm_unreachable("unexpected relocation type in code");
+    }
+  }
+
+  /// Return the name of the section this function originated from.
+  Optional<StringRef> getOriginSectionName() const {
+    if (!OriginSection)
+      return NoneType();
+    return OriginSection->getName();
+  }
+
+  /// Return internal section name for this function.
+  StringRef getCodeSectionName() const {
+    return StringRef(CodeSectionName);
+  }
+
+  /// Assign a code section name to the function.
+  void setCodeSectionName(StringRef Name) {
+    CodeSectionName = std::string(Name);
+  }
+
+  /// Get output code section.
+  ErrorOr<BinarySection &> getCodeSection() const {
+    return BC.getUniqueSectionByName(getCodeSectionName());
+  }
+
+  /// Return cold code section name for the function.
+  StringRef getColdCodeSectionName() const {
+    return StringRef(ColdCodeSectionName);
+  }
+
+  /// Assign a section name for the cold part of the function.
+  void setColdCodeSectionName(StringRef Name) {
+    ColdCodeSectionName = std::string(Name);
+  }
+
+  /// Get output code section for cold code of this function.
+  ErrorOr<BinarySection &> getColdCodeSection() const {
+    return BC.getUniqueSectionByName(getColdCodeSectionName());
+  }
+
+  /// Return true if the function will halt execution on entry.
+  bool trapsOnEntry() const {
+    return TrapsOnEntry;
+  }
+
+  /// Make the function always trap on entry. Other than the trap instruction,
+  /// the function body will be empty.
+  void setTrapOnEntry();
+
+  /// Return true if the function could be correctly processed.
+  bool isSimple() const {
+    return IsSimple;
+  }
+
+  /// Return true if the function should be ignored for optimization purposes.
+  bool isIgnored() const {
+    return IsIgnored;
+  }
+
+  /// Return true if the function should not be disassembled, emitted, or
+  /// otherwise processed.
+  bool isPseudo() const {
+    return IsPseudo;
+  }
+
+  /// Return true if the function contains a jump table with entries pointing
+  /// to split fragments.
+  bool hasSplitJumpTable() const {
+    return HasSplitJumpTable;
+  }
+
+  /// Return true if all CFG edges have local successors.
+  bool hasCanonicalCFG() const {
+    return HasCanonicalCFG;
+  }
+
+  /// Return true if the original function code has all necessary relocations
+  /// to track addresses of functions emitted to new locations.
+  bool hasExternalRefRelocations() const {
+    return HasExternalRefRelocations;
+  }
+
+  /// Return true if the function has instruction(s) with unknown control flow.
+  bool hasUnknownControlFlow() const {
+    return HasUnknownControlFlow;
+  }
+
+  /// Return true if the function body is non-contiguous.
+  bool isSplit() const {
+    return isSimple() &&
+           layout_size() &&
+           layout_front()->isCold() != layout_back()->isCold();
+  }
+
+  /// Return true if the function has exception handling tables.
+  bool hasEHRanges() const {
+    return HasEHRanges;
+  }
+
+  /// Return true if the function uses DW_CFA_GNU_args_size CFIs.
+  bool usesGnuArgsSize() const {
+    return UsesGnuArgsSize;
+  }
+
+  /// Return true if the function has more than one entry point.
+  bool isMultiEntry() const {
+    return !SecondaryEntryPoints.empty();
+  }
+
+  /// Return true if the function might have a profile available externally,
+  /// but not yet populated into the function.
+  bool hasProfileAvailable() const {
+    return HasProfileAvailable;
+  }
+
+  bool hasMemoryProfile() const {
+    return HasMemoryProfile;
+  }
+
+  /// Return true if the body of the function was merged into another function.
+  bool isFolded() const {
+    return FoldedIntoFunction != nullptr;
+  }
+
+  /// If this function was folded, return the function it was folded into.
+  BinaryFunction *getFoldedIntoFunction() const {
+    return FoldedIntoFunction;
+  }
+
+  /// Return true if the function uses jump tables.
+  bool hasJumpTables() const {
+    return !JumpTables.empty();
+  }
+
+  /// Return true if the function has SDT marker
+  bool hasSDTMarker() const { return HasSDTMarker; }
+
+  /// Return true if the function has Pseudo Probe
+  bool hasPseudoProbe() const { return HasPseudoProbe; }
+
+  /// Return true if the original entry point was patched.
+  bool isPatched() const {
+    return IsPatched;
+  }
+
+  const JumpTable *getJumpTable(const MCInst &Inst) const {
+    const uint64_t Address = BC.MIB->getJumpTable(Inst);
+    return getJumpTableContainingAddress(Address);
+  }
+
+  JumpTable *getJumpTable(const MCInst &Inst) {
+    const uint64_t Address = BC.MIB->getJumpTable(Inst);
+    return getJumpTableContainingAddress(Address);
+  }
+
+  const MCSymbol *getPersonalityFunction() const {
+    return PersonalityFunction;
+  }
+
+  uint8_t getPersonalityEncoding() const {
+    return PersonalityEncoding;
+  }
+
+  const std::vector<CallSite> &getCallSites() const {
+    return CallSites;
+  }
+
+  const std::vector<CallSite> &getColdCallSites() const {
+    return ColdCallSites;
+  }
+
+  const ArrayRef<uint8_t> getLSDAActionTable() const {
+    return LSDAActionTable;
+  }
+
+  const LSDATypeTableTy &getLSDATypeTable() const {
+    return LSDATypeTable;
+  }
+
+  const LSDATypeTableTy &getLSDATypeAddressTable() const {
+    return LSDATypeAddressTable;
+  }
+
+  const ArrayRef<uint8_t> getLSDATypeIndexTable() const {
+    return LSDATypeIndexTable;
+  }
+
+  const LabelsMapType &getLabels() const {
+    return Labels;
+  }
+
+  IslandInfo &getIslandInfo() {
+    return Islands;
+  }
+
+  const IslandInfo &getIslandInfo() const {
+    return Islands;
+  }
+
+  /// Return true if the function has CFI instructions
+  bool hasCFI() const {
+    return !FrameInstructions.empty() || !CIEFrameInstructions.empty();
+  }
+
+  /// Return unique number associated with the function.
+  uint64_t getFunctionNumber() const {
+    return FunctionNumber;
+  }
+
+  /// Return true if the given address \p PC is inside the function body.
+  bool containsAddress(uint64_t PC, bool UseMaxSize = false) const {
+    if (UseMaxSize)
+      return Address <= PC && PC < Address + MaxSize;
+    return Address <= PC && PC < Address + Size;
+  }
+
+  /// Create a basic block at a given \p Offset in the
+  /// function.
+  /// If \p DeriveAlignment is true, set the alignment of the block based
+  /// on the alignment of the existing offset.
+  /// The new block is not inserted into the CFG. The client must
+  /// use insertBasicBlocks to add any new blocks to the CFG.
+  std::unique_ptr<BinaryBasicBlock>
+  createBasicBlock(uint64_t Offset,
+                   MCSymbol *Label = nullptr,
+                   bool DeriveAlignment = false) {
+    assert(BC.Ctx && "cannot be called with empty context");
+    if (!Label) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
+      Label = BC.Ctx->createNamedTempSymbol("BB");
+    }
+    auto BB = std::unique_ptr<BinaryBasicBlock>(
+        new BinaryBasicBlock(this, Label, Offset));
+
+    if (DeriveAlignment) {
+      uint64_t DerivedAlignment = Offset & (1 + ~Offset);
+      BB->setAlignment(std::min(DerivedAlignment, uint64_t(32)));
+    }
+
+    LabelToBB.emplace(Label, BB.get());
+
+    return BB;
+  }
+
+  /// Create a basic block at a given \p Offset in the
+  /// function and append it to the end of list of blocks.
+  /// If \p DeriveAlignment is true, set the alignment of the block based
+  /// on the alignment of the existing offset.
+  ///
+  /// Returns NULL if basic block already exists at the \p Offset.
+  BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label = nullptr,
+                                  bool DeriveAlignment = false) {
+    assert((CurrentState == State::CFG || !getBasicBlockAtOffset(Offset)) &&
+           "basic block already exists in pre-CFG state");
+
+    if (!Label) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
+      Label = BC.Ctx->createNamedTempSymbol("BB");
+    }
+    std::unique_ptr<BinaryBasicBlock> BBPtr =
+        createBasicBlock(Offset, Label, DeriveAlignment);
+    BasicBlocks.emplace_back(BBPtr.release());
+
+    BinaryBasicBlock *BB = BasicBlocks.back();
+    BB->setIndex(BasicBlocks.size() - 1);
+
+    if (CurrentState == State::Disassembled) {
+      BasicBlockOffsets.emplace_back(Offset, BB);
+    } else if (CurrentState == State::CFG) {
+      BB->setLayoutIndex(layout_size());
+      BasicBlocksLayout.emplace_back(BB);
+    }
+
+    assert(CurrentState == State::CFG ||
+           (std::is_sorted(BasicBlockOffsets.begin(),
+                           BasicBlockOffsets.end(),
+                           CompareBasicBlockOffsets()) &&
+            std::is_sorted(begin(), end())));
+
+    return BB;
+  }
+
+  /// Add basic block \p BB as an entry point to the function. Return global
+  /// symbol associated with the entry.
+  MCSymbol *addEntryPoint(const BinaryBasicBlock &BB);
+
+  /// Mark all blocks that are unreachable from a root (entry point
+  /// or landing pad) as invalid.
+  void markUnreachableBlocks();
+
+  /// Rebuilds BBs layout, ignoring dead BBs. Returns the number of removed
+  /// BBs and the removed number of bytes of code.
+  std::pair<unsigned, uint64_t> eraseInvalidBBs();
+
+  /// Get the relative order between two basic blocks in the original
+  /// layout. The result is > 0 if B occurs before A and < 0 if B
+  /// occurs after A. If A and B are the same block, the result is 0.
+  signed getOriginalLayoutRelativeOrder(const BinaryBasicBlock *A,
+                                        const BinaryBasicBlock *B) const {
+    return getIndex(A) - getIndex(B);
+  }
+
+  /// Return basic block range that originally contained offset \p Offset
+  /// from the function start to the function end.
+  iterator_range<iterator> getBasicBlockRangeFromOffsetToEnd(uint64_t Offset) {
+    BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset);
+    return BB
+        ? iterator_range<iterator>(BasicBlocks.begin() + getIndex(BB), end())
+        : iterator_range<iterator>(end(), end());
+  }
+
+  /// Insert the BBs contained in NewBBs into the basic blocks for this
+  /// function. Update the associated state of all blocks as needed, i.e.
+  /// BB offsets and BB indices. The new BBs are inserted after Start.
+  /// This operation could affect fallthrough branches for Start.
+  ///
+  void insertBasicBlocks(
+      BinaryBasicBlock *Start,
+      std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
+      const bool UpdateLayout = true,
+      const bool UpdateCFIState = true,
+      const bool RecomputeLandingPads = true);
+
+  iterator insertBasicBlocks(
+      iterator StartBB,
+      std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
+      const bool UpdateLayout = true,
+      const bool UpdateCFIState = true,
+      const bool RecomputeLandingPads = true);
+
+  /// Update the basic block layout for this function. The BBs from
+  /// [Start->Index, Start->Index + NumNewBlocks) are inserted into the
+  /// layout after the BB indicated by Start.
+  void updateLayout(BinaryBasicBlock *Start, const unsigned NumNewBlocks);
+
+  /// Make sure basic blocks' indices match the current layout.
+  void updateLayoutIndices() const {
+    unsigned Index = 0;
+    for (BinaryBasicBlock *BB : layout()) {
+      BB->setLayoutIndex(Index++);
+    }
+  }
+
+  /// Recompute the CFI state for NumNewBlocks following Start after inserting
+  /// new blocks into the CFG. This must be called after updateLayout.
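+  /// Typical sequence (illustrative): call insertBasicBlocks() with
+  /// UpdateLayout and UpdateCFIState set to false, then updateLayout(), then
+  /// updateCFIState().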
+  void updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks);
+
+  /// Return true if we detected ambiguous jump tables in this function, which
+  /// happens when one JT is used by more than one indirect jump. This
+  /// precludes us from splitting edges for this JT unless we duplicate the JT
+  /// (see disambiguateJumpTables).
+  bool checkForAmbiguousJumpTables();
+
+  /// Detect when two distinct indirect jumps are using the same jump table and
+  /// duplicate it, allocating a separate JT for each indirect branch. This is
+  /// necessary for code transformations on the CFG that change an edge induced
+  /// by an indirect branch, e.g. instrumentation or shrink wrapping. However,
+  /// this is only possible if we are not updating jump tables in place, but
+  /// are writing them to a new location (moving them).
+  void disambiguateJumpTables(MCPlusBuilder::AllocatorIdTy AllocId);
+
+  /// Change \p OldDest to \p NewDest in the jump table used at the end of
+  /// \p BB. Returns false if \p OldDest cannot be found as a valid target
+  /// and no replacement took place.
+  bool replaceJumpTableEntryIn(BinaryBasicBlock *BB,
+                               BinaryBasicBlock *OldDest,
+                               BinaryBasicBlock *NewDest);
+
+  /// Split the CFG edge by inserting an intermediate basic block.
+  /// Returns a pointer to this new intermediate basic block. BB "From" will be
+  /// updated to jump to the intermediate block, which in turn will have an
+  /// unconditional branch to BB "To".
+  /// The caller needs to call fixBranches() manually; this function only
+  /// creates the correct CFG edges.
+  BinaryBasicBlock *splitEdge(BinaryBasicBlock *From, BinaryBasicBlock *To);
+
+  /// We may have built an overly conservative CFG for functions with calls
+  /// to functions that the compiler knows will never return. In this case,
+  /// clear all successors from these blocks.
+  void deleteConservativeEdges();
+
+  /// Determine the direction of the branch based on the current layout.
+  /// The caller is responsible for updating basic block indices prior to using
+  /// this function (e.g. by calling BinaryFunction::updateLayoutIndices()).
+  static bool isForwardBranch(const BinaryBasicBlock *From,
+                              const BinaryBasicBlock *To) {
+    assert(From->getFunction() == To->getFunction() &&
+           "basic blocks should be in the same function");
+    return To->getLayoutIndex() > From->getLayoutIndex();
+  }
+
+  /// Determine the direction of the call to callee symbol relative to the
+  /// start of this function.
+  /// Note: this doesn't take function splitting into account.
+  bool isForwardCall(const MCSymbol *CalleeSymbol) const;
+
+  /// Dump function information to debug output. If \p PrintInstructions
+  /// is true, include instruction disassembly.
+  void dump(bool PrintInstructions = true) const;
+
+  /// Print function information to the \p OS stream.
+  void print(raw_ostream &OS, std::string Annotation = "",
+             bool PrintInstructions = true) const;
+
+  /// Print all relocations between \p Offset and \p Offset + \p Size in
+  /// this function.
+  void printRelocations(raw_ostream &OS, uint64_t Offset, uint64_t Size) const;
+
+  /// Return true if the function has a profile, even if the profile does not
+  /// match the CFG 100%.
+  bool hasProfile() const {
+    return ExecutionCount != COUNT_NO_PROFILE;
+  }
+
+  /// Return true if the function profile is present and accurate.
+  bool hasValidProfile() const {
+    return ExecutionCount != COUNT_NO_PROFILE &&
+           ProfileMatchRatio == 1.0f;
+  }
+
+  /// Mark this function as having a valid profile.
+ void markProfiled(uint16_t Flags) { + if (ExecutionCount == COUNT_NO_PROFILE) + ExecutionCount = 0; + ProfileFlags = Flags; + ProfileMatchRatio = 1.0f; + } + + /// Return flags describing a profile for this function. + uint16_t getProfileFlags() const { + return ProfileFlags; + } + + void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) { + assert(!Instructions.empty()); + + // Fix CFI instructions skipping NOPs. We need to fix this because changing + // CFI state after a NOP, besides being wrong and inaccurate, makes it + // harder for us to recover this information, since we can create empty BBs + // with NOPs and then reorder it away. + // We fix this by moving the CFI instruction just before any NOPs. + auto I = Instructions.lower_bound(Offset); + if (Offset == getSize()) { + assert(I == Instructions.end() && "unexpected iterator value"); + // Sometimes compiler issues restore_state after all instructions + // in the function (even after nop). + --I; + Offset = I->first; + } + assert(I->first == Offset && "CFI pointing to unknown instruction"); + if (I == Instructions.begin()) { + CIEFrameInstructions.emplace_back(std::forward(Inst)); + return; + } + + --I; + while (I != Instructions.begin() && BC.MIB->isNoop(I->second)) { + Offset = I->first; + --I; + } + OffsetToCFI.emplace(Offset, FrameInstructions.size()); + FrameInstructions.emplace_back(std::forward(Inst)); + return; + } + + BinaryBasicBlock::iterator addCFIInstruction(BinaryBasicBlock *BB, + BinaryBasicBlock::iterator Pos, + MCCFIInstruction &&Inst) { + size_t Idx = FrameInstructions.size(); + FrameInstructions.emplace_back(std::forward(Inst)); + return addCFIPseudo(BB, Pos, Idx); + } + + /// Insert a CFI pseudo instruction in a basic block. This pseudo instruction + /// is a placeholder that refers to a real MCCFIInstruction object kept by + /// this function that will be emitted at that position. + BinaryBasicBlock::iterator addCFIPseudo(BinaryBasicBlock *BB, + BinaryBasicBlock::iterator Pos, + uint32_t Offset) { + MCInst CFIPseudo; + BC.MIB->createCFI(CFIPseudo, Offset); + return BB->insertPseudoInstr(Pos, CFIPseudo); + } + + /// Retrieve the MCCFIInstruction object associated with a CFI pseudo. 
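+  ///
+  /// For example (illustrative): the pseudo's first operand is an index into
+  /// FrameInstructions, so a caller can recover the original CFI directive:
+  /// \code
+  ///   if (const MCCFIInstruction *CFI = BF.getCFIFor(Inst))
+  ///     processCFI(*CFI); // processCFI() is a hypothetical callback
+  /// \endcode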
+ MCCFIInstruction* getCFIFor(const MCInst &Instr) { + if (!BC.MIB->isCFI(Instr)) + return nullptr; + uint32_t Offset = Instr.getOperand(0).getImm(); + assert(Offset < FrameInstructions.size() && "Invalid CFI offset"); + return &FrameInstructions[Offset]; + } + + const MCCFIInstruction* getCFIFor(const MCInst &Instr) const { + if (!BC.MIB->isCFI(Instr)) + return nullptr; + uint32_t Offset = Instr.getOperand(0).getImm(); + assert(Offset < FrameInstructions.size() && "Invalid CFI offset"); + return &FrameInstructions[Offset]; + } + + BinaryFunction &setFileOffset(uint64_t Offset) { + FileOffset = Offset; + return *this; + } + + BinaryFunction &setSize(uint64_t S) { + Size = S; + return *this; + } + + BinaryFunction &setMaxSize(uint64_t Size) { + MaxSize = Size; + return *this; + } + + BinaryFunction &setOutputAddress(uint64_t Address) { + OutputAddress = Address; + return *this; + } + + BinaryFunction &setOutputSize(uint64_t Size) { + OutputSize = Size; + return *this; + } + + BinaryFunction &setSimple(bool Simple) { + IsSimple = Simple; + return *this; + } + + void setPseudo(bool Pseudo) { + IsPseudo = Pseudo; + } + + BinaryFunction &setUsesGnuArgsSize(bool Uses = true) { + UsesGnuArgsSize = Uses; + return *this; + } + + BinaryFunction &setHasProfileAvailable(bool V = true) { + HasProfileAvailable = V; + return *this; + } + + /// Mark function that should not be emitted. + void setIgnored(); + + void setIsPatched(bool V) { + IsPatched = V; + } + + void setHasSplitJumpTable(bool V) { + HasSplitJumpTable = V; + } + + void setHasCanonicalCFG(bool V) { + HasCanonicalCFG = V; + } + + void setFolded(BinaryFunction *BF) { + FoldedIntoFunction = BF; + } + + BinaryFunction &setPersonalityFunction(uint64_t Addr) { + assert(!PersonalityFunction && "can't set personality function twice"); + PersonalityFunction = BC.getOrCreateGlobalSymbol(Addr, "FUNCat"); + return *this; + } + + BinaryFunction &setPersonalityEncoding(uint8_t Encoding) { + PersonalityEncoding = Encoding; + return *this; + } + + BinaryFunction &setAlignment(uint16_t Align) { + Alignment = Align; + return *this; + } + + uint16_t getAlignment() const { + return Alignment; + } + + BinaryFunction &setMaxAlignmentBytes(uint16_t MaxAlignBytes) { + MaxAlignmentBytes = MaxAlignBytes; + return *this; + } + + uint16_t getMaxAlignmentBytes() const { + return MaxAlignmentBytes; + } + + BinaryFunction &setMaxColdAlignmentBytes(uint16_t MaxAlignBytes) { + MaxColdAlignmentBytes = MaxAlignBytes; + return *this; + } + + uint16_t getMaxColdAlignmentBytes() const { + return MaxColdAlignmentBytes; + } + + BinaryFunction &setImageAddress(uint64_t Address) { + ImageAddress = Address; + return *this; + } + + /// Return the address of this function' image in memory. + uint64_t getImageAddress() const { + return ImageAddress; + } + + BinaryFunction &setImageSize(uint64_t Size) { + ImageSize = Size; + return *this; + } + + /// Return the size of this function' image in memory. + uint64_t getImageSize() const { + return ImageSize; + } + + /// Return true if the function is a secondary fragment of another function. + bool isFragment() const { + return IsFragment; + } + + /// Return parent function fragment if this function is a secondary (child) + /// fragment of another function. + BinaryFunction *getParentFragment() const { + return ParentFragment; + } + + /// If the function is a nested child fragment of another function, return its + /// topmost parent fragment. 
+ const BinaryFunction *getTopmostFragment() const { + const BinaryFunction *BF = this; + while (BF->getParentFragment()) + BF = BF->getParentFragment(); + + return BF; + } + + /// Set the profile data for the number of times the function was called. + BinaryFunction &setExecutionCount(uint64_t Count) { + ExecutionCount = Count; + return *this; + } + + /// Adjust execution count for the function by a given \p Count. The value + /// \p Count will be subtracted from the current function count. + /// + /// The function will proportionally adjust execution count for all + /// basic blocks and edges in the control flow graph. + void adjustExecutionCount(uint64_t Count); + + /// Set LSDA address for the function. + BinaryFunction &setLSDAAddress(uint64_t Address) { + LSDAAddress = Address; + return *this; + } + + /// Set LSDA symbol for the function. + BinaryFunction &setLSDASymbol(MCSymbol *Symbol) { + LSDASymbol = Symbol; + return *this; + } + + /// Return the profile information about the number of times + /// the function was executed. + /// + /// Return COUNT_NO_PROFILE if there's no profile info. + uint64_t getExecutionCount() const { + return ExecutionCount; + } + + /// Return the raw profile information about the number of branch + /// executions corresponding to this function. + uint64_t getRawBranchCount() const { return RawBranchCount; } + + /// Return the execution count for functions with known profile. + /// Return 0 if the function has no profile. + uint64_t getKnownExecutionCount() const { + return ExecutionCount == COUNT_NO_PROFILE ? 0 : ExecutionCount; + } + + /// Return original LSDA address for the function or NULL. + uint64_t getLSDAAddress() const { + return LSDAAddress; + } + + /// Return symbol pointing to function's LSDA. + MCSymbol *getLSDASymbol() { + if (LSDASymbol) + return LSDASymbol; + if (CallSites.empty()) + return nullptr; + + LSDASymbol = + BC.Ctx->getOrCreateSymbol(Twine("GCC_except_table") + + Twine::utohexstr(getFunctionNumber())); + + return LSDASymbol; + } + + /// Return symbol pointing to function's LSDA for the cold part. + MCSymbol *getColdLSDASymbol() { + if (ColdLSDASymbol) + return ColdLSDASymbol; + if (ColdCallSites.empty()) + return nullptr; + + ColdLSDASymbol = + BC.Ctx->getOrCreateSymbol(Twine("GCC_cold_except_table") + + Twine::utohexstr(getFunctionNumber())); + + return ColdLSDASymbol; + } + + /// True if the symbol is a mapping symbol used in AArch64 to delimit + /// data inside code section. + bool isDataMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; + bool isCodeMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; + + void setOutputDataAddress(uint64_t Address) { + OutputDataOffset = Address; + } + + uint64_t getOutputDataAddress() const { + return OutputDataOffset; + } + + void setOutputColdDataAddress(uint64_t Address) { + OutputColdDataOffset = Address; + } + + uint64_t getOutputColdDataAddress() const { + return OutputColdDataOffset; + } + + /// If \p Address represents an access to a constant island managed by this + /// function, return a symbol so code can safely refer to it. Otherwise, + /// return nullptr. First return value is the symbol for reference in the + /// hot code area while the second return value is the symbol for reference + /// in the cold code area, as when the function is split the islands are + /// duplicated. 
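+  ///
+  /// Illustrative use (the address is hypothetical):
+  /// \code
+  ///   if (MCSymbol *IslandSym = BF.getOrCreateIslandAccess(0x400123))
+  ///     emitReferenceTo(IslandSym); // emitReferenceTo() is hypothetical
+  /// \endcode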
+ MCSymbol *getOrCreateIslandAccess(uint64_t Address) { + MCSymbol *Symbol; + if (!isInConstantIsland(Address)) + return nullptr; + + // Register our island at global namespace + Symbol = BC.getOrCreateGlobalSymbol(Address, "ISLANDat"); + + // Internal bookkeeping + const uint64_t Offset = Address - getAddress(); + assert((!Islands.Offsets.count(Offset) || + Islands.Offsets[Offset] == Symbol) && + "Inconsistent island symbol management"); + if (!Islands.Offsets.count(Offset)) { + Islands.Offsets[Offset] = Symbol; + Islands.Symbols.insert(Symbol); + } + return Symbol; + } + + /// Called by an external function which wishes to emit references to constant + /// island symbols of this function. We create a proxy for it, so we emit + /// separate symbols when emitting our constant island on behalf of this other + /// function. + MCSymbol * + getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction &Referrer) { + MCSymbol *Symbol = getOrCreateIslandAccess(Address); + if (!Symbol) + return nullptr; + + MCSymbol *Proxy; + if (!Islands.Proxies[&Referrer].count(Symbol)) { + Proxy = + BC.Ctx->getOrCreateSymbol(Symbol->getName() + + ".proxy.for." + Referrer.getPrintName()); + Islands.Proxies[&Referrer][Symbol] = Proxy; + Islands.Proxies[&Referrer][Proxy] = Symbol; + } + Proxy = Islands.Proxies[&Referrer][Symbol]; + return Proxy; + } + + /// Detects whether \p Address is inside a data region in this function + /// (constant islands). + bool isInConstantIsland(uint64_t Address) const { + if (Address < getAddress()) + return false; + + uint64_t Offset = Address - getAddress(); + + if (Offset >= getMaxSize()) + return false; + + auto DataIter = Islands.DataOffsets.upper_bound(Offset); + if (DataIter == Islands.DataOffsets.begin()) + return false; + DataIter = std::prev(DataIter); + + auto CodeIter = Islands.CodeOffsets.upper_bound(Offset); + if (CodeIter == Islands.CodeOffsets.begin()) + return true; + + return *std::prev(CodeIter) <= *DataIter; + } + + uint64_t + estimateConstantIslandSize(const BinaryFunction *OnBehalfOf = nullptr) const { + uint64_t Size = 0; + for (auto DataIter = Islands.DataOffsets.begin(); + DataIter != Islands.DataOffsets.end(); + ++DataIter) { + auto NextData = std::next(DataIter); + auto CodeIter = Islands.CodeOffsets.lower_bound(*DataIter); + if (CodeIter == Islands.CodeOffsets.end() && + NextData == Islands.DataOffsets.end()) { + Size += getMaxSize() - *DataIter; + continue; + } + + uint64_t NextMarker; + if (CodeIter == Islands.CodeOffsets.end()) + NextMarker = *NextData; + else if (NextData == Islands.DataOffsets.end()) + NextMarker = *CodeIter; + else + NextMarker = (*CodeIter > *NextData) ? *NextData : *CodeIter; + + Size += NextMarker - *DataIter; + } + + if (!OnBehalfOf) { + for (BinaryFunction *ExternalFunc : Islands.Dependency) + Size += ExternalFunc->estimateConstantIslandSize(this); + } + return Size; + } + + bool hasConstantIsland() const { + return !Islands.DataOffsets.empty(); + } + + /// Return true iff the symbol could be seen inside this function otherwise + /// it is probably another function. + bool isSymbolValidInScope(const SymbolRef &Symbol, uint64_t SymbolSize) const; + + /// Disassemble function from raw data. + /// If successful, this function will populate the list of instructions + /// for this function together with offsets from the function start + /// in the input. It will also populate Labels with destinations for + /// local branches, and TakenBranches with [from, to] info. 
+  ///
+  /// The function should be properly initialized before this method is
+  /// called, i.e. the function address and size should be set.
+  ///
+  /// Returns true on successful disassembly, and updates the current
+  /// state to State::Disassembled.
+  ///
+  /// Returns false if disassembly failed.
+  bool disassemble();
+
+  /// Scan the function for references to other functions. In relocation mode,
+  /// add relocations for external references.
+  ///
+  /// Return true on success.
+  bool scanExternalRefs();
+
+  /// Return the size of a data object located at \p Offset in the function.
+  /// Return 0 if there is no data object at \p Offset.
+  size_t getSizeOfDataInCodeAt(uint64_t Offset) const;
+
+  /// Verify that starting at \p Offset the function contents are filled with
+  /// zero-value bytes.
+  bool isZeroPaddingAt(uint64_t Offset) const;
+
+  /// Check that entry points have an associated instruction at their
+  /// offsets after disassembly.
+  void postProcessEntryPoints();
+
+  /// Post-processing for jump tables after disassembly. Since their
+  /// boundaries are not known until all call sites are seen, we need this
+  /// extra pass to perform any final adjustments.
+  void postProcessJumpTables();
+
+  /// Builds a list of basic blocks with successor and predecessor info.
+  ///
+  /// The function should be in the Disassembled state prior to the call.
+  ///
+  /// Returns true on success and updates the current function state to
+  /// State::CFG. Returns false if the CFG cannot be built.
+  bool buildCFG(MCPlusBuilder::AllocatorIdTy);
+
+  /// Perform post-processing of the CFG.
+  void postProcessCFG();
+
+  /// Verify that any assumptions we've made about indirect branches were
+  /// correct and also make any necessary changes to unknown indirect branches.
+  ///
+  /// Catch-22: we need to know indirect branch targets to build the CFG, and
+  /// in order to determine the targets of indirect branches we need to know
+  /// the CFG.
+  ///
+  /// As such, the process of decoding indirect branches is broken into 2
+  /// steps: first we make our best guess about a branch without knowing the
+  /// CFG, and later, after we have the CFG for the function, we verify our
+  /// earlier assumptions and also do our best at processing unknown indirect
+  /// branches.
+  ///
+  /// Return true upon successful processing, or false if the control flow
+  /// cannot be statically evaluated for any given indirect branch.
+  bool postProcessIndirectBranches(MCPlusBuilder::AllocatorIdTy AllocId);
+
+  /// Return all call site profile info for this function.
+  IndirectCallSiteProfile &getAllCallSites() {
+    return AllCallSites;
+  }
+
+  const IndirectCallSiteProfile &getAllCallSites() const {
+    return AllCallSites;
+  }
+
+  /// Walks the list of basic blocks filling in missing information about
+  /// edge frequency for fall-throughs.
+  ///
+  /// Assumes the CFG has been built and edge frequency for taken branches
+  /// has been filled with LBR data.
+  void inferFallThroughCounts();
+
+  /// Clear the execution profile of the function.
+  void clearProfile();
+
+  /// Converts conditional tail calls to unconditional tail calls. We do this
+  /// to handle conditional tail calls correctly and to give a chance to the
+  /// simplify conditional tail call pass to decide whether to re-optimize them
+  /// using profile information.
+  void removeConditionalTailCalls();
+
+  // Convert COUNT_NO_PROFILE to 0.
+  void removeTagsFromProfile();
+
+  /// Computes a function hotness score: the sum of the products of BB
+  /// frequency and size.
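+  ///
+  /// For example (made-up numbers): two blocks of 16 and 32 bytes executed
+  /// 100 and 10 times respectively give a score of 16*100 + 32*10 = 1920.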
+  uint64_t getFunctionScore() const;
+
+  /// Return true if the layout has been changed by basic block reordering,
+  /// false otherwise.
+  bool hasLayoutChanged() const;
+
+  /// Get the edit distance of the new layout with respect to the previous
+  /// layout after basic block reordering.
+  uint64_t getEditDistance() const;
+
+  /// Get the number of instructions within this function.
+  uint64_t getInstructionCount() const;
+
+  const CFIInstrMapType &getFDEProgram() const {
+    return FrameInstructions;
+  }
+
+  void moveRememberRestorePair(BinaryBasicBlock *BB);
+
+  bool replayCFIInstrs(int32_t FromState, int32_t ToState,
+                       BinaryBasicBlock *InBB,
+                       BinaryBasicBlock::iterator InsertIt);
+
+  /// unwindCFIState is used to unwind from a higher to a lower state number
+  /// without using remember-restore instructions. We do that by keeping track
+  /// of what values have been changed from state A to B and emitting
+  /// instructions that undo this change.
+  SmallVector<int32_t, 4> unwindCFIState(int32_t FromState, int32_t ToState,
+                                         BinaryBasicBlock *InBB,
+                                         BinaryBasicBlock::iterator &InsertIt);
+
+  /// After reordering, this function checks the state of CFI and fixes it if
+  /// it is corrupted. If it is unable to fix it, it returns false.
+  bool finalizeCFIState();
+
+  /// Return true if this function needs an address-translation table after
+  /// its code emission.
+  bool requiresAddressTranslation() const;
+
+  /// Adjust branch instructions to match the CFG.
+  ///
+  /// When it comes to internal branches, the CFG represents "the ultimate
+  /// source of truth". Transformations on functions and blocks have to update
+  /// the CFG and fixBranches() would make sure the correct branch instructions
+  /// are inserted at the end of basic blocks.
+  ///
+  /// We do require a conditional branch at the end of the basic block if
+  /// the block has 2 successors, as the CFG currently lacks conditional
+  /// code support (it will probably stay that way). We only use this
+  /// branch instruction for its conditional code; the destination is
+  /// determined by the CFG - the first successor represents the true/taken
+  /// branch, while the second successor represents the false/fall-through
+  /// branch.
+  ///
+  /// When we reverse the branch condition, the CFG is updated accordingly.
+  void fixBranches();
+
+  /// Mark function as finalized. No further optimizations are permitted.
+  void setFinalized() {
+    CurrentState = State::CFG_Finalized;
+  }
+
+  void setEmitted(bool KeepCFG = false) {
+    CurrentState = State::EmittedCFG;
+    if (!KeepCFG) {
+      releaseCFG();
+      CurrentState = State::Emitted;
+    }
+  }
+
+  /// Process LSDA information for the function.
+  void parseLSDA(ArrayRef<uint8_t> LSDAData, uint64_t LSDAAddress);
+
+  /// Update exception handling ranges for the function.
+  void updateEHRanges();
+
+  /// Traverse cold basic blocks and replace references to constants in islands
+  /// with a proxy symbol for the duplicated constant island that is going to
+  /// be emitted in the cold region.
+  void duplicateConstantIslands();
+
+  /// Merge profile data of this function into those of the given
+  /// function. The functions should have been proven identical with
+  /// isIdenticalWith.
+  void mergeProfileDataInto(BinaryFunction &BF) const;
+
+  /// Returns the last computed hash value of the function.
+  size_t getHash() const {
+    return Hash;
+  }
+
+  using OperandHashFuncTy =
+      function_ref<std::string(const MCOperand &)>;
+
+  /// Compute the hash value of the function based on its contents.
+  ///
+  /// If \p UseDFS is set, process basic blocks in DFS order.
Otherwise, use + /// the existing layout order. + /// + /// By default, instruction operands are ignored while calculating the hash. + /// The caller can change this via passing \p OperandHashFunc function. + /// The return result of this function will be mixed with internal hash. + size_t computeHash(bool UseDFS = false, + OperandHashFuncTy OperandHashFunc = + [](const MCOperand&) { return std::string(); }) const; + + void setDWARFUnit(DWARFUnit *Unit) { + DwarfUnit = Unit; + } + + /// Return DWARF compile unit for this function. + DWARFUnit *getDWARFUnit() const { + return DwarfUnit; + } + + /// Return line info table for this function. + const DWARFDebugLine::LineTable *getDWARFLineTable() const { + return getDWARFUnit() ? BC.DwCtx->getLineTableForUnit(getDWARFUnit()) + : nullptr; + } + + /// Finalize profile for the function. + void postProcessProfile(); + + /// Returns an estimate of the function's hot part after splitting. + /// This is a very rough estimate, as with C++ exceptions there are + /// blocks we don't move, and it makes no attempt at estimating the size + /// of the added/removed branch instructions. + /// Note that this size is optimistic and the actual size may increase + /// after relaxation. + size_t estimateHotSize(const bool UseSplitSize = true) const { + size_t Estimate = 0; + if (UseSplitSize && isSplit()) { + for (const BinaryBasicBlock *BB : BasicBlocksLayout) { + if (!BB->isCold()) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + } else { + for (const BinaryBasicBlock *BB : BasicBlocksLayout) { + if (BB->getKnownExecutionCount() != 0) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + } + return Estimate; + } + + size_t estimateColdSize() const { + if (!isSplit()) + return estimateSize(); + size_t Estimate = 0; + for (const BinaryBasicBlock *BB : BasicBlocksLayout) { + if (BB->isCold()) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + return Estimate; + } + + size_t estimateSize() const { + size_t Estimate = 0; + for (const BinaryBasicBlock *BB : BasicBlocksLayout) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + return Estimate; + } + + /// Return output address ranges for a function. + DebugAddressRangesVector getOutputAddressRanges() const; + + /// Given an address corresponding to an instruction in the input binary, + /// return an address of this instruction in output binary. + /// + /// Return 0 if no matching address could be found or the instruction was + /// removed. + uint64_t translateInputToOutputAddress(uint64_t Address) const; + + /// Take address ranges corresponding to the input binary and translate + /// them to address ranges in the output binary. + DebugAddressRangesVector translateInputToOutputRanges( + const DWARFAddressRangesVector &InputRanges) const; + + /// Similar to translateInputToOutputRanges() but operates on location lists + /// and moves associated data to output location lists. + DebugLocationsVector translateInputToOutputLocationList( + const DebugLocationsVector &InputLL) const; + + /// Return true if the function is an AArch64 linker inserted veneer + bool isAArch64Veneer() const; + + virtual ~BinaryFunction(); + + /// Info for fragmented functions. 
+  class FragmentInfo {
+  private:
+    uint64_t Address{0};
+    uint64_t ImageAddress{0};
+    uint64_t ImageSize{0};
+    uint64_t FileOffset{0};
+  public:
+    uint64_t getAddress() const { return Address; }
+    uint64_t getImageAddress() const { return ImageAddress; }
+    uint64_t getImageSize() const { return ImageSize; }
+    uint64_t getFileOffset() const { return FileOffset; }
+
+    void setAddress(uint64_t VAddress) { Address = VAddress; }
+    void setImageAddress(uint64_t Address) { ImageAddress = Address; }
+    void setImageSize(uint64_t Size) { ImageSize = Size; }
+    void setFileOffset(uint64_t Offset) { FileOffset = Offset; }
+  };
+
+  /// Cold fragment of the function.
+  FragmentInfo ColdFragment;
+
+  FragmentInfo &cold() { return ColdFragment; }
+
+  const FragmentInfo &cold() const { return ColdFragment; }
+
+  /// Mark child fragments as ignored.
+  void ignoreFragments() {
+    for (BinaryFunction *Fragment : Fragments)
+      Fragment->setIgnored();
+  }
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const BinaryFunction &Function) {
+  OS << Function.getPrintName();
+  return OS;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const BinaryFunction::State State) {
+  switch (State) {
+  case BinaryFunction::State::Empty:         OS << "empty"; break;
+  case BinaryFunction::State::Disassembled:  OS << "disassembled"; break;
+  case BinaryFunction::State::CFG:           OS << "CFG constructed"; break;
+  case BinaryFunction::State::CFG_Finalized: OS << "CFG finalized"; break;
+  case BinaryFunction::State::EmittedCFG:    OS << "emitted with CFG"; break;
+  case BinaryFunction::State::Emitted:       OS << "emitted"; break;
+  }
+
+  return OS;
+}
+
+} // namespace bolt
+
+
+// GraphTraits specializations for function basic block graphs (CFGs)
+template <> struct GraphTraits<bolt::BinaryFunction *> :
+    public GraphTraits<bolt::BinaryBasicBlock *> {
+  static NodeRef getEntryNode(bolt::BinaryFunction *F) {
+    return *F->layout_begin();
+  }
+
+  using nodes_iterator = pointer_iterator<bolt::BinaryFunction::iterator>;
+
+  static nodes_iterator nodes_begin(bolt::BinaryFunction *F) {
+    llvm_unreachable("Not implemented");
+    return nodes_iterator(F->begin());
+  }
+  static nodes_iterator nodes_end(bolt::BinaryFunction *F) {
+    llvm_unreachable("Not implemented");
+    return nodes_iterator(F->end());
+  }
+  static size_t size(bolt::BinaryFunction *F) {
+    return F->size();
+  }
+};
+
+template <> struct GraphTraits<const bolt::BinaryFunction *> :
+    public GraphTraits<const bolt::BinaryBasicBlock *> {
+  static NodeRef getEntryNode(const bolt::BinaryFunction *F) {
+    return *F->layout_begin();
+  }
+
+  using nodes_iterator =
+      pointer_iterator<bolt::BinaryFunction::const_iterator>;
+
+  static nodes_iterator nodes_begin(const bolt::BinaryFunction *F) {
+    llvm_unreachable("Not implemented");
+    return nodes_iterator(F->begin());
+  }
+  static nodes_iterator nodes_end(const bolt::BinaryFunction *F) {
+    llvm_unreachable("Not implemented");
+    return nodes_iterator(F->end());
+  }
+  static size_t size(const bolt::BinaryFunction *F) {
+    return F->size();
+  }
+};
+
+template <> struct GraphTraits<Inverse<bolt::BinaryFunction *>> :
+    public GraphTraits<Inverse<bolt::BinaryBasicBlock *>> {
+  static NodeRef getEntryNode(Inverse<bolt::BinaryFunction *> G) {
+    return *G.Graph->layout_begin();
+  }
+};
+
+template <> struct GraphTraits<Inverse<const bolt::BinaryFunction *>> :
+    public GraphTraits<Inverse<const bolt::BinaryBasicBlock *>> {
+  static NodeRef getEntryNode(Inverse<const bolt::BinaryFunction *> G) {
+    return *G.Graph->layout_begin();
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinaryFunctionProfile.cpp b/bolt/src/BinaryFunctionProfile.cpp
new file mode 100644
index 000000000000..d7aebe485026
--- /dev/null
+++ b/bolt/src/BinaryFunctionProfile.cpp
@@ -0,0 +1,372 @@
+//===--- BinaryFunctionProfile.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "Passes/MCF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-prof" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt IndirectCallPromotion; +extern cl::opt JumpTables; + +static cl::opt +DoMCF("mcf", + cl::desc("solve a min cost flow problem on the CFG to fix edge counts " + "(default=disable)"), + cl::init(MCF_DISABLE), + cl::values( + clEnumValN(MCF_DISABLE, "none", + "disable MCF"), + clEnumValN(MCF_LINEAR, "linear", + "cost function is inversely proportional to edge count"), + clEnumValN(MCF_QUADRATIC, "quadratic", + "cost function is inversely proportional to edge count squared"), + clEnumValN(MCF_LOG, "log", + "cost function is inversely proportional to log of edge count"), + clEnumValN(MCF_BLAMEFTS, "blamefts", + "tune cost to blame fall-through edges for surplus flow")), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +FixFuncCounts("fix-func-counts", + cl::desc("adjust function counts based on basic blocks execution count"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +FixBlockCounts("fix-block-counts", + cl::desc("adjust block counts based on outgoing branch counts"), + cl::init(true), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +InferFallThroughs("infer-fall-throughs", + cl::desc("infer execution count for fall-through blocks"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +void BinaryFunction::postProcessProfile() { + if (!hasValidProfile()) { + clearProfile(); + return; + } + + if (!(getProfileFlags() & PF_LBR)) { + // Check if MCF post-processing was requested. + if (opts::DoMCF != MCF_DISABLE) { + removeTagsFromProfile(); + solveMCF(*this, opts::DoMCF); + } + return; + } + + // If we have at least some branch data for the function indicate that it + // was executed. + if (opts::FixFuncCounts && ExecutionCount == 0) { + ExecutionCount = 1; + } + + // Compute preliminary execution count for each basic block. + for (BinaryBasicBlock *BB : BasicBlocks) { + if ((!BB->isEntryPoint() && !BB->isLandingPad()) || + BB->ExecutionCount == BinaryBasicBlock::COUNT_NO_PROFILE) + BB->ExecutionCount = 0; + } + for (BinaryBasicBlock *BB : BasicBlocks) { + auto SuccBIIter = BB->branch_info_begin(); + for (BinaryBasicBlock *Succ : BB->successors()) { + // All incoming edges to the primary entry have been accounted for, thus + // we skip the update here. + if (SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + Succ != BasicBlocks.front()) + Succ->setExecutionCount(Succ->getExecutionCount() + SuccBIIter->Count); + ++SuccBIIter; + } + } + + if (opts::FixBlockCounts) { + for (BinaryBasicBlock *BB : BasicBlocks) { + // Make sure that execution count of a block is at least the branch count + // of an incoming/outgoing jump. 
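+      // For example (illustrative counts): if a block's preliminary count is
+      // 5 but one of its outgoing edges was recorded as taken 20 times, both
+      // this block and the successor are raised to at least 20 below.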
+ auto SuccBIIter = BB->branch_info_begin(); + for (BinaryBasicBlock *Succ : BB->successors()) { + uint64_t Count = SuccBIIter->Count; + if (Count != BinaryBasicBlock::COUNT_NO_PROFILE && Count > 0) { + Succ->setExecutionCount(std::max(Succ->getExecutionCount(), Count)); + BB->setExecutionCount(std::max(BB->getExecutionCount(), Count)); + } + ++SuccBIIter; + } + // Make sure that execution count of a block is at least the number of + // function calls from the block. + for (MCInst &Inst : *BB) { + // Ignore non-call instruction + if (!BC.MIA->isCall(Inst)) + continue; + + auto CountAnnt = BC.MIB->tryGetAnnotationAs(Inst, "Count"); + if (CountAnnt) { + BB->setExecutionCount(std::max(BB->getExecutionCount(), *CountAnnt)); + } + } + } + } + + if (opts::InferFallThroughs) + inferFallThroughCounts(); + + // Check if MCF post-processing was requested. + if (opts::DoMCF != MCF_DISABLE) { + removeTagsFromProfile(); + solveMCF(*this, opts::DoMCF); + } + + // Update profile information for jump tables based on CFG branch data. + for (BinaryBasicBlock *BB : BasicBlocks) { + const MCInst *LastInstr = BB->getLastNonPseudoInstr(); + if (!LastInstr) + continue; + const uint64_t JTAddress = BC.MIB->getJumpTable(*LastInstr); + if (!JTAddress) + continue; + JumpTable *JT = getJumpTableContainingAddress(JTAddress); + if (!JT) + continue; + + uint64_t TotalBranchCount = 0; + for (const BinaryBasicBlock::BinaryBranchInfo &BranchInfo : + BB->branch_info()) { + TotalBranchCount += BranchInfo.Count; + } + JT->Count += TotalBranchCount; + + if (opts::IndirectCallPromotion < ICP_JUMP_TABLES && + opts::JumpTables < JTS_AGGRESSIVE) + continue; + + if (JT->Counts.empty()) + JT->Counts.resize(JT->Entries.size()); + auto EI = JT->Entries.begin(); + uint64_t Delta = (JTAddress - JT->getAddress()) / JT->EntrySize; + EI += Delta; + while (EI != JT->Entries.end()) { + const BinaryBasicBlock *TargetBB = getBasicBlockForLabel(*EI); + if (TargetBB) { + const BinaryBasicBlock::BinaryBranchInfo &BranchInfo = + BB->getBranchInfo(*TargetBB); + assert(Delta < JT->Counts.size()); + JT->Counts[Delta].Count += BranchInfo.Count; + JT->Counts[Delta].Mispreds += BranchInfo.MispredictedCount; + } + ++Delta; + ++EI; + // A label marks the start of another jump table. + if (JT->Labels.count(Delta * JT->EntrySize)) + break; + } + } +} + +void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { + // No reason to merge invalid or empty profiles into BF. + if (!hasValidProfile()) + return; + + // Update function execution count. + if (getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) { + BF.setExecutionCount(BF.getKnownExecutionCount() + getExecutionCount()); + } + + // Since we are merging a valid profile, the new profile should be valid too. + // It has either already been valid, or it has been cleaned up. + BF.ProfileMatchRatio = 1.0f; + + // Update basic block and edge counts. + auto BBMergeI = BF.begin(); + for (BinaryBasicBlock *BB : BasicBlocks) { + BinaryBasicBlock *BBMerge = &*BBMergeI; + assert(getIndex(BB) == BF.getIndex(BBMerge)); + + // Update basic block count. + if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) { + BBMerge->setExecutionCount( + BBMerge->getKnownExecutionCount() + BB->getExecutionCount()); + } + + // Update edge count for successors of this basic block. 
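+    // The two functions have identical CFGs, so this block's branch info and
+    // the corresponding merged block's branch info can be walked in lockstep,
+    // adding up counts entry by entry.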
+ auto BBMergeSI = BBMerge->succ_begin(); + auto BIMergeI = BBMerge->branch_info_begin(); + auto BII = BB->branch_info_begin(); + for (const BinaryBasicBlock *BBSucc : BB->successors()) { + (void)BBSucc; + assert(getIndex(BBSucc) == BF.getIndex(*BBMergeSI)); + + // At this point no branch count should be set to COUNT_NO_PROFILE. + assert(BII->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "unexpected unknown branch profile"); + assert(BIMergeI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "unexpected unknown branch profile"); + + BIMergeI->Count += BII->Count; + + // When we merge inferred and real fall-through branch data, the merged + // data is considered inferred. + if (BII->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED && + BIMergeI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) { + BIMergeI->MispredictedCount += BII->MispredictedCount; + } else { + BIMergeI->MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; + } + + ++BBMergeSI; + ++BII; + ++BIMergeI; + } + assert(BBMergeSI == BBMerge->succ_end()); + + ++BBMergeI; + } + assert(BBMergeI == BF.end()); + + // Merge jump tables profile info. + auto JTMergeI = BF.JumpTables.begin(); + for (const auto &JTEntry : JumpTables) { + if (JTMergeI->second->Counts.empty()) + JTMergeI->second->Counts.resize(JTEntry.second->Counts.size()); + auto CountMergeI = JTMergeI->second->Counts.begin(); + for (const JumpTable::JumpInfo &JI : JTEntry.second->Counts) { + CountMergeI->Count += JI.Count; + CountMergeI->Mispreds += JI.Mispreds; + ++CountMergeI; + } + assert(CountMergeI == JTMergeI->second->Counts.end()); + + ++JTMergeI; + } + assert(JTMergeI == BF.JumpTables.end()); +} + +void BinaryFunction::inferFallThroughCounts() { + // Work on a basic block at a time, propagating frequency information + // forwards. + // It is important to walk in the layout order. + for (BinaryBasicBlock *BB : BasicBlocks) { + const uint64_t BBExecCount = BB->getExecutionCount(); + + // Propagate this information to successors, filling in fall-through edges + // with frequency information + if (BB->succ_size() == 0) + continue; + + // Calculate frequency of outgoing branches from this node according to + // LBR data. + uint64_t ReportedBranches = 0; + for (const BinaryBasicBlock::BinaryBranchInfo &SuccBI : BB->branch_info()) { + if (SuccBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) + ReportedBranches += SuccBI.Count; + } + + // Get taken count of conditional tail call if the block ends with one. + uint64_t CTCTakenCount = 0; + const MCInst *CTCInstr = BB->getLastNonPseudoInstr(); + if (CTCInstr && BC.MIB->getConditionalTailCall(*CTCInstr)) { + CTCTakenCount = + BC.MIB->getAnnotationWithDefault(*CTCInstr, "CTCTakenCount"); + } + + // Calculate frequency of throws from this node according to LBR data + // for branching into associated landing pads. Since it is possible + // for a landing pad to be associated with more than one basic blocks, + // we may overestimate the frequency of throws for such blocks. + uint64_t ReportedThrows = 0; + for (const BinaryBasicBlock *LP : BB->landing_pads()) { + ReportedThrows += LP->getExecutionCount(); + } + + const uint64_t TotalReportedJumps = + ReportedBranches + CTCTakenCount + ReportedThrows; + + // Infer the frequency of the fall-through edge, representing not taking the + // branch. 
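+    // For example (made-up counts): a block executed 100 times whose taken
+    // branches, conditional tail call, and throws together account for 60
+    // executions gets an inferred fall-through count of 100 - 60 = 40.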
+ uint64_t Inferred = 0; + if (BBExecCount > TotalReportedJumps) + Inferred = BBExecCount - TotalReportedJumps; + + LLVM_DEBUG( + if (BBExecCount < TotalReportedJumps) + dbgs() + << "Fall-through inference is slightly inconsistent. " + "exec frequency is less than the outgoing edges frequency (" + << BBExecCount << " < " << ReportedBranches + << ") for BB at offset 0x" + << Twine::utohexstr(getAddress() + BB->getOffset()) << '\n'; + ); + + if (BB->succ_size() <= 2) { + // Skip if the last instruction is an unconditional jump. + const MCInst *LastInstr = BB->getLastNonPseudoInstr(); + if (LastInstr && + (BC.MIB->isUnconditionalBranch(*LastInstr) || + BC.MIB->isIndirectBranch(*LastInstr))) + continue; + // If there is an FT it will be the last successor. + auto &SuccBI = *BB->branch_info_rbegin(); + auto &Succ = *BB->succ_rbegin(); + if (SuccBI.Count == 0) { + SuccBI.Count = Inferred; + SuccBI.MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; + Succ->ExecutionCount += Inferred; + } + } + } + + return; +} + +void BinaryFunction::clearProfile() { + // Keep function execution profile the same. Only clear basic block and edge + // counts. + for (BinaryBasicBlock *BB : BasicBlocks) { + BB->ExecutionCount = 0; + for (BinaryBasicBlock::BinaryBranchInfo &BI : BB->branch_info()) { + BI.Count = 0; + BI.MispredictedCount = 0; + } + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/BinaryLoop.h b/bolt/src/BinaryLoop.h new file mode 100644 index 000000000000..a09d75ed8fda --- /dev/null +++ b/bolt/src/BinaryLoop.h @@ -0,0 +1,93 @@ +//===--- BinaryLoop.h - Interface for machine-level loop ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the BinaryLoop class, which represents a loop in the +// CFG of a binary function, and the BinaryLoopInfo class, which stores +// information about all the loops of a binary function. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_LOOP_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_LOOP_H + +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfoImpl.h" +#include "llvm/Support/GenericDomTreeConstruction.h" + +namespace llvm { +namespace bolt { + +class BinaryBasicBlock; + +using BinaryDomTreeNode = DomTreeNodeBase; +using BinaryDominatorTree = DomTreeBase; + +class BinaryLoop : public LoopBase { +public: + BinaryLoop() : LoopBase() { } + + // The total count of all the back edges of this loop. + uint64_t TotalBackEdgeCount{0}; + + // The times the loop is entered from outside. + uint64_t EntryCount{0}; + + // The times the loop is exited. + uint64_t ExitCount{0}; + + // Most of the public interface is provided by LoopBase. + +protected: + friend class LoopInfoBase; + explicit BinaryLoop(BinaryBasicBlock *BB) : + LoopBase(BB) { } +}; + +class BinaryLoopInfo : public LoopInfoBase { +public: + BinaryLoopInfo() { } + + unsigned OuterLoops{0}; + unsigned TotalLoops{0}; + unsigned MaximumDepth{0}; + + // Most of the public interface is provided by LoopInfoBase. +}; + +} // namespace bolt +} // namespace llvm + +namespace llvm { + +// BinaryDominatorTree GraphTraits specializations. 
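+//
+// These specializations let generic graph algorithms walk the dominator
+// tree. A sketch of the traversal they enable (visit() is hypothetical):
+//
+//   bolt::BinaryDominatorTree DT;
+//   for (auto *Node : depth_first(&DT))
+//     visit(Node->getBlock());
+//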
+template <>
+struct GraphTraits<bolt::BinaryDomTreeNode *>
+    : public DomTreeGraphTraitsBase<bolt::BinaryDomTreeNode,
+                                    bolt::BinaryDomTreeNode::iterator> {};
+
+template <>
+struct GraphTraits<const bolt::BinaryDomTreeNode *>
+    : public DomTreeGraphTraitsBase<const bolt::BinaryDomTreeNode,
+                                    bolt::BinaryDomTreeNode::const_iterator> {};
+
+template <> struct GraphTraits<bolt::BinaryDominatorTree *>
+    : public GraphTraits<bolt::BinaryDomTreeNode *> {
+  static NodeRef getEntryNode(bolt::BinaryDominatorTree *DT) {
+    return DT->getRootNode();
+  }
+
+  static nodes_iterator nodes_begin(bolt::BinaryDominatorTree *N) {
+    return df_begin(getEntryNode(N));
+  }
+
+  static nodes_iterator nodes_end(bolt::BinaryDominatorTree *N) {
+    return df_end(getEntryNode(N));
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp
new file mode 100644
index 000000000000..07d307279481
--- /dev/null
+++ b/bolt/src/BinaryPassManager.cpp
@@ -0,0 +1,539 @@
+//===--- BinaryPassManager.cpp - Binary-level analysis/optimization passes ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryPassManager.h"
+#include "Passes/Aligner.h"
+#include "Passes/AllocCombiner.h"
+#include "Passes/FrameOptimizer.h"
+#include "Passes/IdenticalCodeFolding.h"
+#include "Passes/IndirectCallPromotion.h"
+#include "Passes/Inliner.h"
+#include "Passes/Instrumentation.h"
+#include "Passes/JTFootprintReduction.h"
+#include "Passes/LongJmp.h"
+#include "Passes/LoopInversionPass.h"
+#include "Passes/PLTCall.h"
+#include "Passes/PatchEntries.h"
+#include "Passes/RegReAssign.h"
+#include "Passes/ReorderData.h"
+#include "Passes/ReorderFunctions.h"
+#include "Passes/RetpolineInsertion.h"
+#include "Passes/SplitFunctions.h"
+#include "Passes/StokeInfo.h"
+#include "Passes/ValidateInternalCalls.h"
+#include "Passes/VeneerElimination.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <numeric>
+
+using namespace llvm;
+
+namespace opts {
+
+extern cl::OptionCategory BoltOptCategory;
+extern cl::OptionCategory BoltCategory;
+
+extern cl::opt<bool> Instrument;
+
+extern cl::opt<unsigned> Verbosity;
+extern cl::opt<bool> PrintAll;
+extern cl::opt<bool> PrintDynoStats;
+extern cl::opt<bool> DumpDotAll;
+extern cl::opt<bolt::PLTCall::OptType> PLT;
+
+static cl::opt<bool>
+DynoStatsAll("dyno-stats-all",
+  cl::desc("print dyno stats after each stage"),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+EliminateUnreachable("eliminate-unreachable",
+  cl::desc("eliminate unreachable code"),
+  cl::init(true),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+ICF("icf",
+  cl::desc("fold functions with identical code"),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+static cl::opt<bool>
+JTFootprintReductionFlag("jt-footprint-reduction",
+  cl::desc("make jump tables size smaller at the cost of using more "
+           "instructions at jump sites"),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+static cl::opt<bool>
+PrintJTFootprintReduction("print-after-jt-footprint-reduction",
+  cl::desc("print function after jt-footprint-reduction pass"),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+NeverPrint("never-print",
+  cl::desc("never print"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::ReallyHidden,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+PrintAfterBranchFixup("print-after-branch-fixup",
+  cl::desc("print function after fixing local branches"),
+  cl::Hidden,
cl::cat(BoltOptCategory)); + +static cl::opt +PrintAfterLowering("print-after-lowering", + cl::desc("print function after instruction lowering"), + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintFOP("print-fop", + cl::desc("print functions after frame optimizer pass"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +cl::opt +PrintFinalized("print-finalized", + cl::desc("print function after CFG is finalized"), + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintLongJmp("print-longjmp", + cl::desc("print functions after longjmp pass"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintICF("print-icf", + cl::desc("print functions after ICF optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintICP("print-icp", + cl::desc("print functions after indirect call promotion"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintRegReAssign("print-regreassign", + cl::desc("print functions after regreassign pass"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintInline("print-inline", + cl::desc("print functions after inlining optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintOptimizeBodyless("print-optimize-bodyless", + cl::desc("print functions after bodyless optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintPLT("print-plt", + cl::desc("print functions after PLT optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintPeepholes("print-peepholes", + cl::desc("print functions after peephole optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +cl::opt +PrintReordered("print-reordered", + cl::desc("print functions after layout optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintReorderedFunctions("print-reordered-functions", + cl::desc("print functions after clustering"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintSCTC("print-sctc", + cl::desc("print functions after conditional tail call simplification"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintSimplifyROLoads("print-simplify-rodata-loads", + cl::desc("print functions after simplification of RO data loads"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintSplit("print-split", + cl::desc("print functions after code splitting"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintUCE("print-uce", + cl::desc("print functions after unreachable code elimination"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintProfileStats("print-profile-stats", + cl::desc("print profile quality/bias analysis"), + cl::ZeroOrMore, + cl::init(false), + cl::cat(BoltCategory)); + +static cl::opt +SimplifyConditionalTailCalls("simplify-conditional-tail-calls", + cl::desc("simplify conditional tail calls by removing unnecessary jumps"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +SimplifyRODataLoads("simplify-rodata-loads", + cl::desc("simplify loads from read-only sections by replacing the memory " + "operand with the constant found in the corresponding section"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt 
+RegReAssign("reg-reassign", + cl::desc("reassign registers so as to avoid using REX prefixes in hot code"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +StringOps("inline-memcpy", + cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::list +SpecializeMemcpy1("memcpy1-spec", + cl::desc("list of functions with call sites for which to specialize memcpy() " + "for size 1"), + cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +StripRepRet("strip-rep-ret", + cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +llvm::cl::opt +TimeOpts("time-opts", + cl::desc("print time spent in each optimization"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt +VerifyCFG("verify-cfg", + cl::desc("verify the CFG after every pass"), + cl::init(false), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt +Stoke("stoke", + cl::desc("turn on the stoke analysis"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt +PrintStoke("print-stoke", + cl::desc("print functions after stoke analysis"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt + PrintVeneerElimination("print-veneer-elimination", + cl::desc("print functions after veneer elimination pass"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt + PrintRetpolineInsertion("print-retpoline-insertion", + cl::desc("print functions after retpoline insertion pass"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +using namespace opts; + +const char BinaryFunctionPassManager::TimerGroupName[] = + "passman"; +const char BinaryFunctionPassManager::TimerGroupDesc[] = + "Binary Function Pass Manager"; + +void BinaryFunctionPassManager::runPasses() { + auto &BFs = BC.getBinaryFunctions(); + for (size_t PassIdx = 0; PassIdx < Passes.size(); PassIdx++) { + const std::pair> + &OptPassPair = Passes[PassIdx]; + if (!OptPassPair.first) + continue; + + const std::unique_ptr &Pass = OptPassPair.second; + std::string PassIdName = + formatv("{0:2}_{1}", PassIdx, Pass->getName()).str(); + + if (opts::Verbosity > 0) { + outs() << "BOLT-INFO: Starting pass: " << Pass->getName() << "\n"; + } + + NamedRegionTimer T(Pass->getName(), Pass->getName(), TimerGroupName, + TimerGroupDesc, TimeOpts); + + callWithDynoStats( + [this,&Pass] { + Pass->runOnFunctions(BC); + }, + BFs, + Pass->getName(), + opts::DynoStatsAll + ); + + if (opts::VerifyCFG && + !std::accumulate( + BFs.begin(), BFs.end(), + true, + [](const bool Valid, + const std::pair &It) { + return Valid && It.second.validateCFG(); + })) { + errs() << "BOLT-ERROR: Invalid CFG detected after pass " + << Pass->getName() << "\n"; + exit(1); + } + + if (opts::Verbosity > 0) { + outs() << "BOLT-INFO: Finished pass: " << Pass->getName() << "\n"; + } + + if (!opts::PrintAll && !opts::DumpDotAll && !Pass->printPass()) + continue; + + const std::string Message = std::string("after ") + Pass->getName(); + + for (auto &It : BFs) { + auto &Function = It.second; + + if (!Pass->shouldPrint(Function)) + continue; + + Function.print(outs(), Message, true); + + if (opts::DumpDotAll) + 
Function.dumpGraphForPass(PassIdName); + } + } +} + +void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { + BinaryFunctionPassManager Manager(BC); + + const DynoStats InitialDynoStats = getDynoStats(BC.getBinaryFunctions()); + + if (opts::Instrument) { + Manager.registerPass(std::make_unique(NeverPrint)); + } + + // Here we manage dependencies/order manually, since passes are run in the + // order they're registered. + + // Run this pass first to use stats for the original functions. + Manager.registerPass(std::make_unique(NeverPrint)); + + if (opts::PrintProfileStats) + Manager.registerPass(std::make_unique(NeverPrint)); + + Manager.registerPass(std::make_unique(NeverPrint)); + + Manager.registerPass(std::make_unique(NeverPrint), + opts::StripRepRet); + + Manager.registerPass(std::make_unique(PrintICF), + opts::ICF); + + if (BC.isAArch64()) + Manager.registerPass( + std::make_unique(PrintVeneerElimination)); + + Manager.registerPass( + std::make_unique(NeverPrint, opts::SpecializeMemcpy1), + !opts::SpecializeMemcpy1.empty()); + + Manager.registerPass(std::make_unique(NeverPrint), + opts::StringOps); + + Manager.registerPass(std::make_unique(PrintICP)); + + Manager.registerPass( + std::make_unique(PrintJTFootprintReduction), + opts::JTFootprintReductionFlag); + + Manager.registerPass( + std::make_unique(PrintSimplifyROLoads), + opts::SimplifyRODataLoads); + + Manager.registerPass(std::make_unique(PrintRegReAssign), + opts::RegReAssign); + + Manager.registerPass(std::make_unique(PrintInline)); + + Manager.registerPass(std::make_unique(PrintICF), + opts::ICF); + + Manager.registerPass(std::make_unique(PrintPLT)); + + Manager.registerPass(std::make_unique(PrintReordered)); + + Manager.registerPass( + std::make_unique(PrintUCE), + opts::EliminateUnreachable); + + Manager.registerPass(std::make_unique(PrintSplit)); + + Manager.registerPass(std::make_unique()); + + // This pass syncs local branches with CFG. If any of the following + // passes breaks the sync - they either need to re-run the pass or + // fix branches consistency internally. + Manager.registerPass(std::make_unique(PrintAfterBranchFixup)); + + // This pass should come close to last since it uses the estimated hot + // size of a function to determine the order. It should definitely + // also happen after any changes to the call graph are made, e.g. inlining. + Manager.registerPass( + std::make_unique(PrintReorderedFunctions)); + + // Print final dyno stats right while CFG and instruction analysis are intact. + Manager.registerPass( + std::make_unique( + InitialDynoStats, "after all optimizations before SCTC and FOP"), + opts::PrintDynoStats | opts::DynoStatsAll); + + // Add the StokeInfo pass, which extract functions for stoke optimization and + // get the liveness information for them + Manager.registerPass(std::make_unique(PrintStoke), opts::Stoke); + + // This pass introduces conditional jumps into external functions. + // Between extending CFG to support this and isolating this pass we chose + // the latter. Thus this pass will do double jump removal and unreachable + // code elimination if necessary and won't rely on peepholes/UCE for these + // optimizations. + // More generally this pass should be the last optimization pass that + // modifies branches/control flow. This pass is run after function + // reordering so that it can tell whether calls are forward/backward + // accurately. 
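+  // For example, a conditional tail call whose target lands earlier in the
+  // final layout can only be recognized as a backward branch once the
+  // function order is fixed.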
+  Manager.registerPass(
+      std::make_unique<SimplifyConditionalTailCalls>(PrintSCTC),
+      opts::SimplifyConditionalTailCalls);
+
+  Manager.registerPass(std::make_unique<Peepholes>(PrintPeepholes));
+
+  Manager.registerPass(std::make_unique<AlignerPass>());
+
+  // Perform reordering on data contained in one or more sections using
+  // memory profiling data.
+  Manager.registerPass(std::make_unique<ReorderData>());
+
+  // This pass should always run last.*
+  Manager.registerPass(std::make_unique<FinalizeFunctions>(PrintFinalized));
+
+  // FrameOptimizer has an implicit dependency on FinalizeFunctions.
+  // FrameOptimizer moves values around and needs to update CFIs. To do this,
+  // it must read CFI, interpret it and rewrite it, so CFIs need to be
+  // correctly placed according to the final layout.
+  Manager.registerPass(std::make_unique<FrameOptimizerPass>(PrintFOP));
+
+  Manager.registerPass(std::make_unique<AllocCombinerPass>(PrintFOP));
+
+  Manager.registerPass(
+      std::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));
+
+  // Assign each function an output section.
+  Manager.registerPass(std::make_unique<AssignSections>());
+
+  // Patch original function entries.
+  if (BC.HasRelocations)
+    Manager.registerPass(std::make_unique<PatchEntries>());
+
+  // Tighten branches according to offset differences between branch and
+  // targets. No extra instructions after this pass, otherwise we may have
+  // relocations out of range and crash during linking.
+  if (BC.isAArch64())
+    Manager.registerPass(std::make_unique<LongJmpPass>(PrintLongJmp));
+
+  // This pass turns tail calls into jumps which makes them invisible to
+  // function reordering. It's unsafe to use any CFG or instruction analysis
+  // after this point.
+  Manager.registerPass(
+      std::make_unique<InstructionLowering>(PrintAfterLowering));
+
+  // In non-relocation mode, mark functions that do not fit into their original
+  // space as non-simple if we have to (e.g. for correct debug info update).
+  // NOTE: this pass depends on finalized code.
+  if (!BC.HasRelocations)
+    Manager.registerPass(std::make_unique<CheckLargeFunctions>(NeverPrint));
+
+  Manager.registerPass(std::make_unique<LowerAnnotations>(NeverPrint));
+
+  Manager.runPasses();
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/BinaryPassManager.h b/bolt/src/BinaryPassManager.h
new file mode 100644
index 000000000000..511ddaad637a
--- /dev/null
+++ b/bolt/src/BinaryPassManager.h
@@ -0,0 +1,60 @@
+//===--- BinaryPassManager.h - Binary-level analysis/optimization passes --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A very simple binary-level analysis/optimization passes system.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H
+
+#include "Passes/BinaryPasses.h"
+#include <memory>
+#include <vector>
+
+namespace llvm {
+namespace bolt {
+class BinaryContext;
+
+/// Simple class for managing analyses and optimizations on BinaryFunctions.
+class BinaryFunctionPassManager {
+private:
+  BinaryContext &BC;
+  std::vector<std::pair<const bool, std::unique_ptr<BinaryFunctionPass>>>
+      Passes;
+
+public:
+  static const char TimerGroupName[];
+  static const char TimerGroupDesc[];
+
+  BinaryFunctionPassManager(BinaryContext &BC)
+    : BC(BC) {}
+
+  /// Adds a pass to this manager based on the value of its corresponding
+  /// command-line option.
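+  /// For illustration (editor's sketch, taken from runAllPasses() above), a
+  /// typical conditional registration pairs a pass with its flag:
+  ///   Manager.registerPass(std::make_unique<StripRepRet>(NeverPrint),
+  ///                        opts::StripRepRet);
+  /// Passes registered with Run == false are kept in the list but skipped
+  /// by runPasses().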
+  void registerPass(std::unique_ptr<BinaryFunctionPass> Pass,
+                    const bool Run) {
+    Passes.emplace_back(Run, std::move(Pass));
+  }
+
+  /// Adds an unconditionally run pass to this manager.
+  void registerPass(std::unique_ptr<BinaryFunctionPass> Pass) {
+    Passes.emplace_back(true, std::move(Pass));
+  }
+
+  /// Run all registered passes in the order they were added.
+  void runPasses();
+
+  /// Runs all enabled implemented passes on all functions.
+  static void runAllPasses(BinaryContext &BC);
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinarySection.cpp b/bolt/src/BinarySection.cpp
new file mode 100644
index 000000000000..4b7d632d6962
--- /dev/null
+++ b/bolt/src/BinarySection.cpp
@@ -0,0 +1,319 @@
+//===--- BinarySection.cpp - Interface for object file section -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinarySection.h"
+#include "BinaryContext.h"
+#include "Utils.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+extern cl::opt<bool> PrintRelocations;
+extern cl::opt<bool> HotData;
+}
+
+bool BinarySection::isELF() const {
+  return BC.isELF();
+}
+
+bool BinarySection::isMachO() const {
+  return BC.isMachO();
+}
+
+uint64_t
+BinarySection::hash(const BinaryData &BD,
+                    std::map<const BinaryData *, uint64_t> &Cache) const {
+  auto Itr = Cache.find(&BD);
+  if (Itr != Cache.end())
+    return Itr->second;
+
+  Cache[&BD] = 0;
+
+  uint64_t Offset = BD.getAddress() - getAddress();
+  const uint64_t EndOffset = BD.getEndAddress() - getAddress();
+  auto Begin = Relocations.lower_bound(Relocation{Offset, 0, 0, 0, 0});
+  auto End = Relocations.upper_bound(Relocation{EndOffset, 0, 0, 0, 0});
+  const StringRef Contents = getContents();
+
+  hash_code Hash = hash_combine(hash_value(BD.getSize()),
+                                hash_value(BD.getSectionName()));
+
+  while (Begin != End) {
+    const Relocation &Rel = *Begin++;
+    Hash = hash_combine(
+        Hash,
+        hash_value(Contents.substr(Offset, Begin->Offset - Offset)));
+    if (BinaryData *RelBD = BC.getBinaryDataByName(Rel.Symbol->getName())) {
+      Hash = hash_combine(Hash, hash(*RelBD, Cache));
+    }
+    Offset = Rel.Offset + Rel.getSize();
+  }
+
+  Hash = hash_combine(
+      Hash,
+      hash_value(Contents.substr(Offset, EndOffset - Offset)));
+
+  Cache[&BD] = Hash;
+
+  return Hash;
+}
+
+void BinarySection::emitAsData(MCStreamer &Streamer, StringRef NewName) const {
+  StringRef SectionName = !NewName.empty() ? NewName : getName();
+  StringRef SectionContents = getContents();
+  MCSectionELF *ELFSection =
+      BC.Ctx->getELFSection(SectionName, getELFType(), getELFFlags());
+
+  Streamer.SwitchSection(ELFSection);
+  Streamer.emitValueToAlignment(getAlignment());
+
+  if (BC.HasRelocations && opts::HotData && isReordered())
+    Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("__hot_data_start"));
+
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitting "
+                    << (isAllocatable() ? "" : "non-")
+                    << "allocatable data section " << SectionName << '\n');
"" : "non-") + << "allocatable data section " << SectionName << '\n'); + + if (!hasRelocations()) { + Streamer.emitBytes(SectionContents); + } else { + uint64_t SectionOffset = 0; + for (const Relocation &Relocation : relocations()) { + assert(Relocation.Offset < SectionContents.size() && "overflow detected"); + // Skip undefined symbols. + if (BC.UndefinedSymbols.count(Relocation.Symbol)) + continue; + if (SectionOffset < Relocation.Offset) { + Streamer.emitBytes( + SectionContents.substr(SectionOffset, + Relocation.Offset - SectionOffset)); + SectionOffset = Relocation.Offset; + } + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitting relocation for symbol " + << (Relocation.Symbol ? Relocation.Symbol->getName() + : StringRef("")) + << " at offset 0x" + << Twine::utohexstr(Relocation.Offset) << " with size " + << Relocation::getSizeForType(Relocation.Type) << '\n'); + size_t RelocationSize = Relocation.emit(&Streamer); + SectionOffset += RelocationSize; + } + assert(SectionOffset <= SectionContents.size() && "overflow error"); + if (SectionOffset < SectionContents.size()) { + Streamer.emitBytes(SectionContents.substr(SectionOffset)); + } + } + + if (BC.HasRelocations && opts::HotData && isReordered()) + Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("__hot_data_end")); +} + +void BinarySection::flushPendingRelocations(raw_pwrite_stream &OS, + SymbolResolverFuncTy Resolver) { + if (PendingRelocations.empty() && Patches.empty()) + return; + + const uint64_t SectionAddress = getAddress(); + + // We apply relocations to original section contents. For allocatable sections + // this means using their input file offsets, since the output file offset + // could change (e.g. for new instance of .text). For non-allocatable + // sections, the output offset should always be a valid one. + const uint64_t SectionFileOffset = isAllocatable() ? 
+  LLVM_DEBUG(
+      dbgs() << "BOLT-DEBUG: flushing pending relocations for section "
+             << getName() << '\n'
+             << "  address: 0x" << Twine::utohexstr(SectionAddress) << '\n'
+             << "  offset: 0x" << Twine::utohexstr(SectionFileOffset) << '\n');
+
+  for (BinaryPatch &Patch : Patches) {
+    OS.pwrite(Patch.Bytes.data(),
+              Patch.Bytes.size(),
+              SectionFileOffset + Patch.Offset);
+  }
+
+  for (Relocation &Reloc : PendingRelocations) {
+    uint64_t Value = Reloc.Addend;
+    if (Reloc.Symbol)
+      Value += Resolver(Reloc.Symbol);
+    switch (Reloc.Type) {
+    default:
+      LLVM_DEBUG(dbgs() << Reloc.Type << '\n';);
+      llvm_unreachable("unhandled relocation type");
+    case ELF::R_X86_64_64:
+    case ELF::R_X86_64_32: {
+      OS.pwrite(reinterpret_cast<const char *>(&Value),
+                Relocation::getSizeForType(Reloc.Type),
+                SectionFileOffset + Reloc.Offset);
+      break;
+    }
+    case ELF::R_X86_64_PC32: {
+      Value -= SectionAddress + Reloc.Offset;
+      OS.pwrite(reinterpret_cast<const char *>(&Value),
+                Relocation::getSizeForType(Reloc.Type),
+                SectionFileOffset + Reloc.Offset);
+      break;
+    }
+    }
+    LLVM_DEBUG(
+        dbgs() << "BOLT-DEBUG: writing value 0x" << Twine::utohexstr(Value)
+               << " of size " << Relocation::getSizeForType(Reloc.Type)
+               << " at section offset 0x" << Twine::utohexstr(Reloc.Offset)
+               << " address 0x"
+               << Twine::utohexstr(SectionAddress + Reloc.Offset)
+               << " file offset 0x"
+               << Twine::utohexstr(SectionFileOffset + Reloc.Offset) << '\n';);
+  }
+
+  clearList(PendingRelocations);
+}
+
+BinarySection::~BinarySection() {
+  if (isReordered()) {
+    delete[] getData();
+    return;
+  }
+
+  if (!isAllocatable() &&
+      (!hasSectionRef() ||
+       OutputContents.data() != getContents(Section).data())) {
+    delete[] getOutputData();
+  }
+}
+
+void BinarySection::clearRelocations() {
+  clearList(Relocations);
+}
+
+void BinarySection::print(raw_ostream &OS) const {
+  OS << getName() << ", "
+     << "0x" << Twine::utohexstr(getAddress()) << ", "
+     << getSize()
+     << " (0x" << Twine::utohexstr(getOutputAddress()) << ", "
+     << getOutputSize() << ")"
+     << ", data = " << getData()
+     << ", output data = " << getOutputData();
+
+  if (isAllocatable())
+    OS << " (allocatable)";
+
+  if (isVirtual())
+    OS << " (virtual)";
+
+  if (isTLS())
+    OS << " (tls)";
+
+  if (opts::PrintRelocations) {
+    for (const Relocation &R : relocations())
+      OS << "\n  " << R;
+  }
+}
+
+BinarySection::RelocationSetType
+BinarySection::reorderRelocations(bool Inplace) const {
+  assert(PendingRelocations.empty() &&
+         "reordering pending relocations not supported");
+  RelocationSetType NewRelocations;
+  for (const Relocation &Rel : relocations()) {
+    uint64_t RelAddr = Rel.Offset + getAddress();
+    BinaryData *BD = BC.getBinaryDataContainingAddress(RelAddr);
+    BD = BD->getAtomicRoot();
+    assert(BD);
+
+    if ((!BD->isMoved() && !Inplace) || BD->isJumpTable())
+      continue;
+
+    Relocation NewRel(Rel);
+    uint64_t RelOffset = RelAddr - BD->getAddress();
+    NewRel.Offset = BD->getOutputOffset() + RelOffset;
+    assert(NewRel.Offset < getSize());
+    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: moving " << Rel << " -> " << NewRel
+                      << "\n");
+    auto Res = NewRelocations.emplace(std::move(NewRel));
+    (void)Res;
+    assert(Res.second && "Can't overwrite existing relocation");
+  }
+  return NewRelocations;
+}
+void BinarySection::reorderContents(const std::vector<BinaryData *> &Order,
+                                    bool Inplace) {
+  IsReordered = true;
+
+  Relocations = reorderRelocations(Inplace);
+
+  std::string Str;
+  raw_string_ostream OS(Str);
+  const char *Src = Contents.data();
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: reorderContents for " << Name << "\n");
+  for (BinaryData *BD : Order) {
+    assert((BD->isMoved() || !Inplace) && !BD->isJumpTable());
+    assert(BD->isAtomic() && BD->isMoveable());
+    const uint64_t SrcOffset = BD->getAddress() - getAddress();
+    assert(SrcOffset < Contents.size());
+    assert(SrcOffset == BD->getOffset());
+    while (OS.tell() < BD->getOutputOffset()) {
+      OS.write((unsigned char)0);
+    }
+    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: " << BD->getName() << " @ " << OS.tell()
+                      << "\n");
+    OS.write(&Src[SrcOffset], BD->getOutputSize());
+  }
+  if (Relocations.empty()) {
+    // If there are no existing relocations, tack a phony one at the end
+    // of the reordered segment to force LLVM to recognize and map this
+    // section.
+    MCSymbol *ZeroSym = BC.registerNameAtAddress("Zero", 0, 0, 0);
+    addRelocation(OS.tell(), ZeroSym, ELF::R_X86_64_64, 0xdeadbeef);
+
+    uint64_t Zero = 0;
+    OS.write(reinterpret_cast<const char *>(&Zero), sizeof(Zero));
+  }
+  auto *NewData = reinterpret_cast<char *>(copyByteArray(OS.str()));
+  Contents = OutputContents = StringRef(NewData, OS.str().size());
+  OutputSize = Contents.size();
+}
+
+std::string BinarySection::encodeELFNote(StringRef NameStr, StringRef DescStr,
+                                         uint32_t Type) {
+  std::string Str;
+  raw_string_ostream OS(Str);
+  const uint32_t NameSz = NameStr.size() + 1;
+  const uint32_t DescSz = DescStr.size();
+  // An ELF note starts with three 32-bit words: name size (including the
+  // terminating zero), description size, and the note type.
+  OS.write(reinterpret_cast<const char *>(&NameSz), 4);
+  OS.write(reinterpret_cast<const char *>(&DescSz), 4);
+  OS.write(reinterpret_cast<const char *>(&Type), 4);
+  OS << NameStr << '\0';
+  // Both the name and the description are padded to 4-byte alignment.
+  for (uint64_t I = NameSz; I < alignTo(NameSz, 4); ++I) {
+    OS << '\0';
+  }
+  OS << DescStr;
+  for (uint64_t I = DescStr.size(); I < alignTo(DescStr.size(), 4); ++I) {
+    OS << '\0';
+  }
+  return OS.str();
+}
diff --git a/bolt/src/BinarySection.h b/bolt/src/BinarySection.h
new file mode 100644
index 000000000000..acb4ba438331
--- /dev/null
+++ b/bolt/src/BinarySection.h
@@ -0,0 +1,545 @@
+//===--- BinarySection.h - Interface for object file section --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_SECTION_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_SECTION_H
+
+#include "DebugData.h"
+#include "Relocation.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <memory>
+#include <set>
+
+namespace llvm {
+class MCStreamer;
+class MCSymbol;
+
+using namespace object;
+
+namespace bolt {
+
+class BinaryContext;
+class BinaryData;
+
+/// A class to manage binary sections that also manages related relocations.
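+/// (Editor's note: relocation offsets kept by this class are relative to the
+/// original section start; because they are stored in a set ordered by
+/// offset, hash() in BinarySection.cpp above can walk a BinaryData's range
+/// with lower_bound/upper_bound, and reorderRelocations() rebases them when
+/// contents are reordered.)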
+class BinarySection {
+  friend class BinaryContext;
+
+  BinaryContext &BC;           // Owning BinaryContext
+  std::string Name;            // Section name
+  const SectionRef Section;    // SectionRef (may be null)
+  StringRef Contents;          // Input section contents
+  const uint64_t Address;      // Address of section in input binary (may be 0)
+  const uint64_t Size;         // Input section size
+  uint64_t InputFileOffset{0}; // Offset in the input binary
+  unsigned Alignment;          // alignment in bytes (must be > 0)
+  unsigned ELFType;            // ELF section type
+  unsigned ELFFlags;           // ELF section flags
+
+  // Relocations associated with this section. Relocation offsets are
+  // relative to the original section address and size.
+  using RelocationSetType = std::set<Relocation, std::less<>>;
+  RelocationSetType Relocations;
+
+  // Dynamic relocations associated with this section. Relocation offsets are
+  // from the original section address.
+  RelocationSetType DynamicRelocations;
+
+  // Pending relocations for this section.
+  std::vector<Relocation> PendingRelocations;
+
+  struct BinaryPatch {
+    uint64_t Offset;
+    SmallString<8> Bytes;
+
+    BinaryPatch(uint64_t Offset, const SmallVectorImpl<char> &Bytes)
+      : Offset(Offset), Bytes(Bytes.begin(), Bytes.end()) {}
+  };
+  std::vector<BinaryPatch> Patches;
+  /// Patcher used to apply simple changes to sections of the input binary.
+  std::unique_ptr<BinaryPatcher> Patcher;
+
+  // Output info
+  bool IsFinalized{false};         // Has this section had output information
+                                   // finalized?
+  std::string OutputName;          // Output section name (if the section has
+                                   // been renamed)
+  uint64_t OutputAddress{0};       // Section address for the rewritten binary.
+  uint64_t OutputSize{0};          // Section size in the rewritten binary.
+  uint64_t OutputFileOffset{0};    // File offset in the rewritten binary file.
+  StringRef OutputContents;        // Rewritten section contents.
+  unsigned SectionID{-1u};         // Unique ID used for address mapping.
+                                   // Set by ExecutableFileMemoryManager.
+  uint32_t Index{0};               // Section index in the output file.
+  mutable bool IsReordered{false}; // Have the contents been reordered?
+  bool IsAnonymous{false};         // True if the name should not be included
+                                   // in the output file.
+
+  uint64_t hash(const BinaryData &BD,
+                std::map<const BinaryData *, uint64_t> &Cache) const;
+
+  // non-copyable
+  BinarySection(const BinarySection &) = delete;
+  BinarySection(BinarySection &&) = delete;
+  BinarySection &operator=(const BinarySection &) = delete;
+  BinarySection &operator=(BinarySection &&) = delete;
+
+  static StringRef getName(SectionRef Section) {
+    return cantFail(Section.getName());
+  }
+  static StringRef getContents(SectionRef Section) {
+    if (Section.getObject()->isELF() &&
+        ELFSectionRef(Section).getType() == ELF::SHT_NOBITS)
+      return StringRef();
+
+    Expected<StringRef> ContentsOrErr = Section.getContents();
+    if (!ContentsOrErr) {
+      Error E = ContentsOrErr.takeError();
+      errs() << "BOLT-ERROR: cannot get section contents for "
+             << getName(Section) << ": " << E << ".\n";
+      exit(1);
+    }
+    return *ContentsOrErr;
+  }
+
+  /// Get the set of relocations referring to data in this section that
+  /// has been reordered. The relocation offsets will be modified to
+  /// reflect the new data locations.
+  RelocationSetType reorderRelocations(bool Inplace) const;
+
+  /// Set output info for this section.
+  void update(uint8_t *NewData,
+              uint64_t NewSize,
+              unsigned NewAlignment,
+              unsigned NewELFType,
+              unsigned NewELFFlags) {
+    assert(NewAlignment > 0 && "section alignment must be > 0");
+    Alignment = NewAlignment;
+    ELFType = NewELFType;
+    ELFFlags = NewELFFlags;
+    OutputSize = NewSize;
+    OutputContents = StringRef(reinterpret_cast<char *>(NewData),
+                               NewData ? NewSize : 0);
+    IsFinalized = true;
+  }
+public:
+  /// Copy a section.
+  explicit BinarySection(BinaryContext &BC,
+                         StringRef Name,
+                         const BinarySection &Section)
+    : BC(BC),
+      Name(Name),
+      Section(Section.getSectionRef()),
+      Contents(Section.getContents()),
+      Address(Section.getAddress()),
+      Size(Section.getSize()),
+      Alignment(Section.getAlignment()),
+      ELFType(Section.getELFType()),
+      ELFFlags(Section.getELFFlags()),
+      Relocations(Section.Relocations),
+      PendingRelocations(Section.PendingRelocations),
+      OutputName(Name) {
+  }
+
+  BinarySection(BinaryContext &BC,
+                SectionRef Section)
+    : BC(BC),
+      Name(getName(Section)),
+      Section(Section),
+      Contents(getContents(Section)),
+      Address(Section.getAddress()),
+      Size(Section.getSize()),
+      Alignment(Section.getAlignment()),
+      OutputName(Name) {
+    if (isELF()) {
+      ELFType = ELFSectionRef(Section).getType();
+      ELFFlags = ELFSectionRef(Section).getFlags();
+      InputFileOffset = ELFSectionRef(Section).getOffset();
+    } else if (isMachO()) {
+      auto *O = cast<MachOObjectFile>(Section.getObject());
+      InputFileOffset =
+          O->is64Bit() ? O->getSection64(Section.getRawDataRefImpl()).offset
+                       : O->getSection(Section.getRawDataRefImpl()).offset;
+    }
+  }
+
+  // TODO: pass Data as StringRef/ArrayRef? use StringRef::copy method.
+  BinarySection(BinaryContext &BC,
+                StringRef Name,
+                uint8_t *Data,
+                uint64_t Size,
+                unsigned Alignment,
+                unsigned ELFType,
+                unsigned ELFFlags)
+    : BC(BC),
+      Name(Name),
+      Contents(reinterpret_cast<const char *>(Data), Data ? Size : 0),
+      Address(0),
+      Size(Size),
+      Alignment(Alignment),
+      ELFType(ELFType),
+      ELFFlags(ELFFlags),
+      IsFinalized(true),
+      OutputName(Name),
+      OutputSize(Size),
+      OutputContents(Contents) {
+    assert(Alignment > 0 && "section alignment must be > 0");
+  }
+
+  ~BinarySection();
+
+  /// Helper function to generate the proper ELF flags from section properties.
+  static unsigned getFlags(bool IsReadOnly = true,
+                           bool IsText = false,
+                           bool IsAllocatable = false) {
+    unsigned Flags = 0;
+    if (IsAllocatable)
+      Flags |= ELF::SHF_ALLOC;
+    if (!IsReadOnly)
+      Flags |= ELF::SHF_WRITE;
+    if (IsText)
+      Flags |= ELF::SHF_EXECINSTR;
+    return Flags;
+  }
+
+  operator bool() const {
+    return ELFType != ELF::SHT_NULL;
+  }
+
+  bool operator==(const BinarySection &Other) const {
+    return (Name == Other.Name &&
+            Address == Other.Address &&
+            Size == Other.Size &&
+            getData() == Other.getData() &&
+            Alignment == Other.Alignment &&
+            ELFType == Other.ELFType &&
+            ELFFlags == Other.ELFFlags);
+  }
+
+  bool operator!=(const BinarySection &Other) const {
+    return !operator==(Other);
+  }
+
+  // Order sections by their immutable properties.
+  bool operator<(const BinarySection &Other) const {
+    return (getAddress() < Other.getAddress() ||
+            (getAddress() == Other.getAddress() &&
+             (getSize() < Other.getSize() ||
+              (getSize() == Other.getSize() &&
+               getName() < Other.getName()))));
+  }
+
+  ///
+  /// Basic property access.
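+  /// (Editor's illustration: getFlags(/*IsReadOnly=*/false, /*IsText=*/false,
+  /// /*IsAllocatable=*/true) above yields SHF_ALLOC | SHF_WRITE, i.e. the
+  /// flags of a writable, allocatable data section.)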
+  ///
+  BinaryContext &getBinaryContext() { return BC; }
+  bool isELF() const;
+  bool isMachO() const;
+  StringRef getName() const { return Name; }
+  uint64_t getAddress() const { return Address; }
+  uint64_t getEndAddress() const { return Address + Size; }
+  uint64_t getSize() const { return Size; }
+  uint64_t getInputFileOffset() const { return InputFileOffset; }
+  uint64_t getAlignment() const { return Alignment; }
+  bool isText() const {
+    if (isELF())
+      return (ELFFlags & ELF::SHF_EXECINSTR);
+    return getSectionRef().isText();
+  }
+  bool isData() const {
+    if (isELF())
+      return (ELFType == ELF::SHT_PROGBITS &&
+              (ELFFlags & (ELF::SHF_ALLOC | ELF::SHF_WRITE)));
+    return getSectionRef().isData();
+  }
+  bool isBSS() const {
+    return (ELFType == ELF::SHT_NOBITS &&
+            (ELFFlags & (ELF::SHF_ALLOC | ELF::SHF_WRITE)));
+  }
+  bool isTLS() const {
+    return (ELFFlags & ELF::SHF_TLS);
+  }
+  bool isTBSS() const {
+    return isBSS() && isTLS();
+  }
+  bool isVirtual() const { return ELFType == ELF::SHT_NOBITS; }
+  bool isRela() const { return ELFType == ELF::SHT_RELA; }
+  bool isReadOnly() const {
+    return ((ELFFlags & ELF::SHF_ALLOC) &&
+            !(ELFFlags & ELF::SHF_WRITE) &&
+            ELFType == ELF::SHT_PROGBITS);
+  }
+  bool isAllocatable() const {
+    if (isELF()) {
+      return (ELFFlags & ELF::SHF_ALLOC) && !isTBSS();
+    } else {
+      // On non-ELF, assume all sections are allocatable.
+      return true;
+    }
+  }
+  bool isReordered() const { return IsReordered; }
+  bool isAnonymous() const { return IsAnonymous; }
+  unsigned getELFType() const { return ELFType; }
+  unsigned getELFFlags() const { return ELFFlags; }
+
+  uint8_t *getData() {
+    return reinterpret_cast<uint8_t *>(
+        const_cast<char *>(getContents().data()));
+  }
+  const uint8_t *getData() const {
+    return reinterpret_cast<const uint8_t *>(getContents().data());
+  }
+  StringRef getContents() const { return Contents; }
+  void clearContents() { Contents = {}; }
+  bool hasSectionRef() const { return Section != SectionRef(); }
+  SectionRef getSectionRef() const { return Section; }
+
+  /// Does this section contain the given \p Address?
+  /// Note: this is in terms of the original mapped binary addresses.
+  bool containsAddress(uint64_t Address) const {
+    return (getAddress() <= Address && Address < getEndAddress()) ||
+           (getSize() == 0 && getAddress() == Address);
+  }
+
+  /// Does this section contain the range [\p Address, \p Address + \p Size)?
+  /// Note: this is in terms of the original mapped binary addresses.
+  bool containsRange(uint64_t Address, uint64_t Size) const {
+    return containsAddress(Address) && Address + Size <= getEndAddress();
+  }
+
+  /// Iterate over all non-pending relocations for this section.
+  iterator_range<RelocationSetType::iterator> relocations() {
+    return make_range(Relocations.begin(), Relocations.end());
+  }
+
+  /// Iterate over all non-pending relocations for this section.
+  iterator_range<RelocationSetType::const_iterator> relocations() const {
+    return make_range(Relocations.begin(), Relocations.end());
+  }
+
+  /// Does this section have any non-pending relocations?
+  bool hasRelocations() const {
+    return !Relocations.empty();
+  }
+
+  /// Does this section have any pending relocations?
+  bool hasPendingRelocations() const {
+    return !PendingRelocations.empty();
+  }
+
+  /// Remove the non-pending relocation at the given \p Offset.
+  bool removeRelocationAt(uint64_t Offset) {
+    auto Itr = Relocations.find(Offset);
+    if (Itr != Relocations.end()) {
+      Relocations.erase(Itr);
+      return true;
+    }
+    return false;
+  }
+
+  void clearRelocations();
+
+  /// Add a new relocation at the given \p Offset.
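+  /// For illustration (editor's sketch, hypothetical values): a 32-bit
+  /// PC-relative fixup against Sym at offset 0x10 with addend -4 would be
+  /// registered as
+  ///   Section.addRelocation(0x10, Sym, ELF::R_X86_64_PC32, uint64_t(-4));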
+  void addRelocation(uint64_t Offset,
+                     MCSymbol *Symbol,
+                     uint64_t Type,
+                     uint64_t Addend,
+                     uint64_t Value = 0,
+                     bool Pending = false) {
+    assert(Offset < getSize() && "offset not within section bounds");
+    if (!Pending) {
+      Relocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value});
+    } else {
+      PendingRelocations.emplace_back(
+          Relocation{Offset, Symbol, Type, Addend, Value});
+    }
+  }
+
+  /// Add a dynamic relocation at the given \p Offset.
+  void addDynamicRelocation(uint64_t Offset,
+                            MCSymbol *Symbol,
+                            uint64_t Type,
+                            uint64_t Addend,
+                            uint64_t Value = 0) {
+    assert(Offset < getSize() && "offset not within section bounds");
+    DynamicRelocations.emplace(
+        Relocation{Offset, Symbol, Type, Addend, Value});
+  }
+
+  /// Add a relocation against the original contents of this section.
+  void addPendingRelocation(const Relocation &Rel) {
+    PendingRelocations.push_back(Rel);
+  }
+
+  /// Add a patch to the input contents of this section.
+  void addPatch(uint64_t Offset, const SmallVectorImpl<char> &Bytes) {
+    Patches.emplace_back(BinaryPatch(Offset, Bytes));
+  }
+
+  /// Register a patcher for this section.
+  void registerPatcher(std::unique_ptr<BinaryPatcher> BPatcher) {
+    Patcher = std::move(BPatcher);
+  }
+
+  /// Returns the patcher.
+  BinaryPatcher *getPatcher() { return Patcher.get(); }
+
+  /// Lookup the relocation (if any) at the given \p Offset.
+  const Relocation *getRelocationAt(uint64_t Offset) const {
+    auto Itr = Relocations.find(Offset);
+    return Itr != Relocations.end() ? &*Itr : nullptr;
+  }
+
+  /// Lookup the dynamic relocation (if any) at the given \p Offset.
+  const Relocation *getDynamicRelocationAt(uint64_t Offset) const {
+    Relocation Key{Offset, 0, 0, 0, 0};
+    auto Itr = DynamicRelocations.find(Key);
+    return Itr != DynamicRelocations.end() ? &*Itr : nullptr;
+  }
+
+  uint64_t hash(const BinaryData &BD) const {
+    std::map<const BinaryData *, uint64_t> Cache;
+    return hash(BD, Cache);
+  }
+
+  ///
+  /// Property accessors related to output data.
+  ///
+
+  bool isFinalized() const { return IsFinalized; }
+  void setIsFinalized() { IsFinalized = true; }
+  StringRef getOutputName() const { return OutputName; }
+  uint64_t getOutputSize() const { return OutputSize; }
+  uint8_t *getOutputData() {
+    return reinterpret_cast<uint8_t *>(
+        const_cast<char *>(getOutputContents().data()));
+  }
+  const uint8_t *getOutputData() const {
+    return reinterpret_cast<const uint8_t *>(getOutputContents().data());
+  }
+  StringRef getOutputContents() const { return OutputContents; }
+  uint64_t getAllocAddress() const {
+    return reinterpret_cast<uint64_t>(getOutputData());
+  }
+  uint64_t getOutputAddress() const { return OutputAddress; }
+  uint64_t getOutputFileOffset() const { return OutputFileOffset; }
+  unsigned getSectionID() const {
+    assert(hasValidSectionID() && "trying to use uninitialized section id");
+    return SectionID;
+  }
+  bool hasValidSectionID() const {
+    return SectionID != -1u;
+  }
+  uint32_t getIndex() const {
+    return Index;
+  }
+
+  // mutation
+  void setOutputAddress(uint64_t Address) {
+    OutputAddress = Address;
+  }
+  void setOutputFileOffset(uint64_t Offset) {
+    OutputFileOffset = Offset;
+  }
+  void setSectionID(unsigned ID) {
+    assert(!hasValidSectionID() && "trying to set section id twice");
+    SectionID = ID;
+  }
+  void setIndex(uint32_t I) {
+    Index = I;
+  }
+  void setOutputName(StringRef Name) {
+    OutputName = std::string(Name);
+  }
+  void setAnonymous(bool Flag) {
+    IsAnonymous = Flag;
+  }
+
+  /// Emit the section as data, possibly with relocations. Use name \p NewName
+  /// for the section during emission if non-empty.
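+  /// (Editor's illustration with a hypothetical name:
+  /// Section.emitAsData(Streamer, ".renamed.data") would emit the original
+  /// contents under the new name, with any known relocations emitted
+  /// symbolically as shown in BinarySection.cpp above.)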
+  void emitAsData(MCStreamer &Streamer, StringRef NewName = StringRef()) const;
+
+  using SymbolResolverFuncTy = llvm::function_ref<uint64_t(const MCSymbol *)>;
+
+  /// Flush all pending relocations to patch original contents of sections
+  /// that were not emitted via MCStreamer.
+  void flushPendingRelocations(raw_pwrite_stream &OS,
+                               SymbolResolverFuncTy Resolver);
+
+  /// Reorder the contents of this section according to \p Order. If
+  /// \p Inplace is true, the entire contents of the section are reordered,
+  /// otherwise the new contents contain only the reordered data.
+  void reorderContents(const std::vector<BinaryData *> &Order, bool Inplace);
+
+  void print(raw_ostream &OS) const;
+
+  /// Write the contents of an ELF note section given the name of the producer,
+  /// a number identifying the type of note and the contents of the note in
+  /// \p DescStr.
+  static std::string encodeELFNote(StringRef NameStr, StringRef DescStr,
+                                   uint32_t Type);
+
+  /// Codes for ELF notes written by producer 'BOLT'.
+  enum {
+    NT_BOLT_BAT = 1,
+    NT_BOLT_INSTRUMENTATION_TABLES = 2
+  };
+};
+
+inline uint8_t *copyByteArray(const uint8_t *Data, uint64_t Size) {
+  auto *Array = new uint8_t[Size];
+  memcpy(Array, Data, Size);
+  return Array;
+}
+
+inline uint8_t *copyByteArray(StringRef Buffer) {
+  return copyByteArray(reinterpret_cast<const uint8_t *>(Buffer.data()),
+                       Buffer.size());
+}
+
+inline uint8_t *copyByteArray(ArrayRef<char> Buffer) {
+  return copyByteArray(reinterpret_cast<const uint8_t *>(Buffer.data()),
+                       Buffer.size());
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, const BinarySection &Section) {
+  Section.print(OS);
+  return OS;
+}
+
+struct SDTMarkerInfo {
+  uint64_t PC;
+  uint64_t Base;
+  uint64_t Semaphore;
+  StringRef Provider;
+  StringRef Name;
+  StringRef Args;
+
+  /// The offset of PC within the note section
+  unsigned PCOffset;
+};
+
+/// Linux Kernel special sections point to a specific instruction in many
+/// cases. Unlike SDTMarkerInfo, these markers can come from different
+/// sections.
+struct LKInstructionMarkerInfo {
+  uint64_t SectionOffset;
+  int32_t PCRelativeOffset;
+  bool IsPCRelative;
+  StringRef SectionName;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BoltAddressTranslation.cpp b/bolt/src/BoltAddressTranslation.cpp
new file mode 100644
index 000000000000..698a414f1d31
--- /dev/null
+++ b/bolt/src/BoltAddressTranslation.cpp
@@ -0,0 +1,295 @@
+//===--- BoltAddressTranslation.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+#include "BoltAddressTranslation.h"
+#include "BinaryFunction.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Errc.h"
+
+#define DEBUG_TYPE "bolt-bat"
+
+namespace llvm {
+namespace bolt {
+
+const char *BoltAddressTranslation::SECTION_NAME = ".note.bolt_bat";
+
+void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
+                                               const BinaryBasicBlock &BB,
+                                               uint64_t FuncAddress) {
+  const uint64_t BBOutputOffset =
+      BB.getOutputAddressRange().first - FuncAddress;
+  const uint32_t BBInputOffset = BB.getInputOffset();
+
+  assert(BBInputOffset != BinaryBasicBlock::INVALID_OFFSET &&
+         "Every output BB must track back to an input BB for profile "
+         "collection in bolted binaries");
+
+  LLVM_DEBUG(dbgs() << "BB " << BB.getName() << "\n");
+  LLVM_DEBUG(dbgs() << "  Key: " << Twine::utohexstr(BBOutputOffset)
+                    << " Val: " << Twine::utohexstr(BBInputOffset) << "\n");
+  // In case of conflicts (same Key mapping to different Vals), the last
+  // update takes precedence. Of course it is not ideal to have conflicts and
+  // those happen when we have an empty BB that either contained only
+  // NOPs or a jump to the next block (successor). Either way, the successor
+  // and this deleted block will both share the same output address (the same
+  // key), and we need to map back. We choose here to privilege the successor
+  // by allowing it to overwrite the previously inserted key in the map.
+  Map[BBOutputOffset] = BBInputOffset;
+
+  for (const auto &IOPair : BB.getOffsetTranslationTable()) {
+    const uint64_t OutputOffset = IOPair.first + BBOutputOffset;
+    const uint32_t InputOffset = IOPair.second;
+
+    // Is this the first instruction in the BB? No need to duplicate the entry.
+    if (OutputOffset == BBOutputOffset)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "  Key: " << Twine::utohexstr(OutputOffset)
+                      << " Val: " << Twine::utohexstr(InputOffset)
+                      << " (branch)\n");
+    Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset,
+                                             InputOffset | BRANCHENTRY));
+  }
+}
+
+void BoltAddressTranslation::write(raw_ostream &OS) {
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");
+  for (auto &BFI : BC.getBinaryFunctions()) {
+    BinaryFunction &Function = BFI.second;
+    // We don't need a translation table if the body of the function hasn't
+    // changed.
+    if (!BC.HasRelocations && !Function.isSimple())
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
+    LLVM_DEBUG(dbgs() << " Address reference: 0x"
+                      << Twine::utohexstr(Function.getOutputAddress()) << "\n");
+    MapTy Map;
+    const bool IsSplit = Function.isSplit();
+    for (BinaryBasicBlock *&BB : Function.layout()) {
+      if (IsSplit && BB->isCold())
+        break;
+      writeEntriesForBB(Map, *BB, Function.getOutputAddress());
+    }
+    Maps.insert(std::pair<uint64_t, MapTy>(Function.getOutputAddress(), Map));
+
+    if (!IsSplit)
+      continue;
+
+    // Cold map
+    Map.clear();
+    LLVM_DEBUG(dbgs() << " Cold part\n");
+    for (BinaryBasicBlock *&BB : Function.layout()) {
+      if (!BB->isCold())
+        continue;
+      writeEntriesForBB(Map, *BB, Function.cold().getAddress());
+    }
+    Maps.insert(std::pair<uint64_t, MapTy>(Function.cold().getAddress(), Map));
+    ColdPartSource.insert(std::pair<uint64_t, uint64_t>(
+        Function.cold().getAddress(), Function.getOutputAddress()));
+  }
+
+  const uint32_t NumFuncs = Maps.size();
+  OS.write(reinterpret_cast<const char *>(&NumFuncs), 4);
+  LLVM_DEBUG(dbgs() << "Writing " << NumFuncs << " functions for BAT.\n");
+  for (auto &MapEntry : Maps) {
+    const uint64_t Address = MapEntry.first;
+    MapTy &Map = MapEntry.second;
+    const uint32_t NumEntries = Map.size();
+    LLVM_DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
+                      << Twine::utohexstr(Address) << ".\n");
+    OS.write(reinterpret_cast<const char *>(&Address), 8);
+    OS.write(reinterpret_cast<const char *>(&NumEntries), 4);
+    for (std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
+      OS.write(reinterpret_cast<const char *>(&KeyVal.first), 4);
+      OS.write(reinterpret_cast<const char *>(&KeyVal.second), 4);
+    }
+  }
+  const uint32_t NumColdEntries = ColdPartSource.size();
+  LLVM_DEBUG(dbgs() << "Writing " << NumColdEntries
+                    << " cold part mappings.\n");
+  OS.write(reinterpret_cast<const char *>(&NumColdEntries), 4);
+  for (std::pair<const uint64_t, uint64_t> &ColdEntry : ColdPartSource) {
+    OS.write(reinterpret_cast<const char *>(&ColdEntry.first), 8);
+    OS.write(reinterpret_cast<const char *>(&ColdEntry.second), 8);
+    LLVM_DEBUG(dbgs() << " " << Twine::utohexstr(ColdEntry.first) << " -> "
+                      << Twine::utohexstr(ColdEntry.second) << "\n");
+  }
+
+  outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
+  outs() << "BOLT-INFO: Wrote " << NumColdEntries
+         << " BAT cold-to-hot entries\n";
+}
+
+std::error_code BoltAddressTranslation::parse(StringRef Buf) {
+  DataExtractor DE = DataExtractor(Buf, true, 8);
+  uint64_t Offset = 0;
+  if (Buf.size() < 12)
+    return make_error_code(llvm::errc::io_error);
+
+  const uint32_t NameSz = DE.getU32(&Offset);
+  const uint32_t DescSz = DE.getU32(&Offset);
+  const uint32_t Type = DE.getU32(&Offset);
+
+  if (Type != BinarySection::NT_BOLT_BAT ||
+      Buf.size() - Offset < alignTo(NameSz, 4) + DescSz)
+    return make_error_code(llvm::errc::io_error);
+
+  StringRef Name = Buf.slice(Offset, Offset + NameSz);
+  Offset = alignTo(Offset + NameSz, 4);
+  if (Name.substr(0, 4) != "BOLT")
+    return make_error_code(llvm::errc::io_error);
+
+  if (Buf.size() - Offset < 4)
+    return make_error_code(llvm::errc::io_error);
+
+  const uint32_t NumFunctions = DE.getU32(&Offset);
+  LLVM_DEBUG(dbgs() << "Parsing " << NumFunctions << " functions\n");
+  for (uint32_t I = 0; I < NumFunctions; ++I) {
+    if (Buf.size() - Offset < 12)
+      return make_error_code(llvm::errc::io_error);
+
+    const uint64_t Address = DE.getU64(&Offset);
+    const uint32_t NumEntries = DE.getU32(&Offset);
+    MapTy Map;
+
+    LLVM_DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
+                      << Twine::utohexstr(Address) << "\n");
+    if (Buf.size() - Offset < 8 * NumEntries)
+      return make_error_code(llvm::errc::io_error);
+    for (uint32_t J = 0; J < NumEntries; ++J) {
+      const uint32_t OutputAddr = DE.getU32(&Offset);
+      const uint32_t InputAddr = DE.getU32(&Offset);
+      Map.insert(std::pair<uint32_t, uint32_t>(OutputAddr, InputAddr));
+      LLVM_DEBUG(dbgs() << Twine::utohexstr(OutputAddr) << " -> "
+                        << Twine::utohexstr(InputAddr) << "\n");
+    }
+    Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
+  }
+
+  if (Buf.size() - Offset < 4)
+    return make_error_code(llvm::errc::io_error);
+
+  const uint32_t NumColdEntries = DE.getU32(&Offset);
+  LLVM_DEBUG(dbgs() << "Parsing " << NumColdEntries << " cold part mappings\n");
+  for (uint32_t I = 0; I < NumColdEntries; ++I) {
+    if (Buf.size() - Offset < 16)
+      return make_error_code(llvm::errc::io_error);
+    // These entries are written as 8-byte values, so read them as 64-bit
+    // integers to avoid truncation.
+    const uint64_t ColdAddress = DE.getU64(&Offset);
+    const uint64_t HotAddress = DE.getU64(&Offset);
+    ColdPartSource.insert(
+        std::pair<uint64_t, uint64_t>(ColdAddress, HotAddress));
+    LLVM_DEBUG(dbgs() << Twine::utohexstr(ColdAddress) << " -> "
+                      << Twine::utohexstr(HotAddress) << "\n");
+  }
+  outs() << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
+  outs() << "BOLT-INFO: Parsed " << NumColdEntries
+         << " BAT cold-to-hot entries\n";
+
+  return std::error_code();
+}
+
+uint64_t BoltAddressTranslation::translate(const BinaryFunction &Func,
+                                           uint64_t Offset,
+                                           bool IsBranchSrc) const {
+  auto Iter = Maps.find(Func.getAddress());
+  if (Iter == Maps.end())
+    return Offset;
+
+  const MapTy &Map = Iter->second;
+  auto KeyVal = Map.upper_bound(Offset);
+  if (KeyVal == Map.begin())
+    return Offset;
+
+  --KeyVal;
+
+  const uint32_t Val = KeyVal->second & ~BRANCHENTRY;
+  // Branch source addresses are translated to the first instruction of the
+  // source BB to avoid accounting for modifications BOLT may have made in the
+  // BB regarding deletion/addition of instructions.
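+  // Editor's worked example (hypothetical numbers): with a map of
+  // { 0x0 -> 0x0, 0x10 -> 0x40 } for Func, translating Offset == 0x18 finds
+  // KeyVal == (0x10, 0x40), so a non-branch address becomes
+  // 0x18 - 0x10 + 0x40 = 0x48, while a branch source collapses to 0x40.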
+  if (IsBranchSrc)
+    return Val;
+  return Offset - KeyVal->first + Val;
+}
+
+Optional<BoltAddressTranslation::FallthroughListTy>
+BoltAddressTranslation::getFallthroughsInTrace(
+    const BinaryFunction &Func, uint64_t From, uint64_t To) const {
+  SmallVector<std::pair<uint64_t, uint64_t>, 16> Res;
+
+  // Filter out the trivial case.
+  if (From >= To)
+    return Res;
+
+  From -= Func.getAddress();
+  To -= Func.getAddress();
+
+  auto Iter = Maps.find(Func.getAddress());
+  if (Iter == Maps.end()) {
+    return NoneType();
+  }
+
+  const MapTy &Map = Iter->second;
+  auto FromIter = Map.upper_bound(From);
+  if (FromIter == Map.begin())
+    return Res;
+  // Skip instruction entries; to create fall-throughs we are only interested
+  // in BB boundaries.
+  do {
+    if (FromIter == Map.begin())
+      return Res;
+    --FromIter;
+  } while (FromIter->second & BRANCHENTRY);
+
+  auto ToIter = Map.upper_bound(To);
+  if (ToIter == Map.begin())
+    return Res;
+  --ToIter;
+  if (FromIter->first >= ToIter->first)
+    return Res;
+
+  for (auto Iter = FromIter; Iter != ToIter; ) {
+    const uint32_t Src = Iter->first;
+    if (Iter->second & BRANCHENTRY) {
+      ++Iter;
+      continue;
+    }
+
+    ++Iter;
+    while (Iter->second & BRANCHENTRY && Iter != ToIter) {
+      ++Iter;
+    }
+    if (Iter->second & BRANCHENTRY)
+      break;
+    Res.emplace_back(Src, Iter->first);
+  }
+
+  return Res;
+}
+
+uint64_t BoltAddressTranslation::fetchParentAddress(uint64_t Address) const {
+  auto Iter = ColdPartSource.find(Address);
+  if (Iter == ColdPartSource.end())
+    return 0;
+  return Iter->second;
+}
+
+bool BoltAddressTranslation::enabledFor(
+    llvm::object::ELFObjectFileBase *InputFile) const {
+  for (const SectionRef &Section : InputFile->sections()) {
+    Expected<StringRef> SectionNameOrErr = Section.getName();
+    if (Error E = SectionNameOrErr.takeError())
+      continue;
+
+    if (SectionNameOrErr.get() == SECTION_NAME)
+      return true;
+  }
+  return false;
+}
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/BoltAddressTranslation.h b/bolt/src/BoltAddressTranslation.h
new file mode 100644
index 000000000000..c0b553158eea
--- /dev/null
+++ b/bolt/src/BoltAddressTranslation.h
@@ -0,0 +1,135 @@
+//===--- BoltAddressTranslation.h -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
+#define LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <map>
+#include <system_error>
+
+namespace llvm {
+class raw_ostream;
+
+namespace object {
+class ELFObjectFileBase;
+} // namespace object
+
+namespace bolt {
+class BinaryBasicBlock;
+class BinaryContext;
+class BinaryFunction;
+
+/// The map of output addresses to input ones to be used when translating
+/// samples collected in a binary that was already processed by BOLT. We do not
+/// support reoptimizing a binary already processed by BOLT, but we do support
+/// collecting samples in a binary processed by BOLT. We then translate samples
+/// back to addresses from the input (original) binary, one that can be
+/// optimized. The goal is to avoid special deployments of non-bolted binaries
+/// just for the purposes of data collection.
+///
+/// The in-memory representation of the map is as follows. Each function has
+/// its own map.
+/// A function is identified by its output address. This is the key to
+/// retrieve a translation map. The translation map is a collection of ordered
+/// keys identifying the start of a region (relative to the function start) in
+/// the output address space (addresses in the binary processed by BOLT).
+///
+/// A translation then happens when perf2bolt needs to convert sample addresses
+/// in the output address space back to input addresses, valid to run BOLT in
+/// the original input binary. To convert, perf2bolt first needs to fetch the
+/// translation map for a sample recorded in a given function. It then finds
+/// the largest key that is still smaller than or equal to the recorded
+/// address. It then converts this address to use the value of this key.
+///
+/// Example translation map for function foo:
+///   KEY                            VALUE                     BB?
+///   Output offset1 (first BB)      Original input offset1    Y
+///   ...
+///   Output offsetN (last branch)   Original input offsetN    N
+///
+/// The information on whether a given entry is a BB start or an instruction
+/// that changes control flow is encoded in the last (highest) bit of VALUE.
+///
+/// Notes:
+/// Instructions that will never appear in LBR because they do not cause
+/// control flow change are omitted from this map. Basic block locations are
+/// recorded because they can be a target of a jump (To address in the LBR)
+/// and also to recreate the BB layout of this function. We use the BB layout
+/// map to recreate fall-through jumps in the profile, given an LBR trace.
+class BoltAddressTranslation {
+public:
+  // In-memory representation of the address translation table
+  using MapTy = std::map<uint32_t, uint32_t>;
+
+  // List of taken fall-throughs
+  using FallthroughListTy = SmallVector<std::pair<uint64_t, uint64_t>, 16>;
+
+  /// Name of the ELF section where the table will be serialized to in the
+  /// output binary
+  static const char *SECTION_NAME;
+
+  BoltAddressTranslation(BinaryContext &BC) : BC(BC) {}
+
+  /// Write the serialized address translation tables for each reordered
+  /// function
+  void write(raw_ostream &OS);
+
+  /// Read the serialized address translation tables and load them internally
+  /// in memory. Return a parse error if failed.
+  std::error_code parse(StringRef Buf);
+
+  /// If the maps are loaded in memory, perform the lookup to translate LBR
+  /// addresses in \p Func.
+  uint64_t translate(const BinaryFunction &Func, uint64_t Offset,
+                     bool IsBranchSrc) const;
+
+  /// Use the map keys containing basic block addresses to infer fall-throughs
+  /// taken in the path started at FirstLBR.To and ending at SecondLBR.From.
+  /// Return NoneType if the trace is invalid or the list of fall-throughs
+  /// otherwise.
+  Optional<FallthroughListTy>
+  getFallthroughsInTrace(const BinaryFunction &Func, uint64_t From,
+                         uint64_t To) const;
+
+  /// If available, fetch the address of the hot part linked to the cold part
+  /// at \p Address. Return 0 otherwise.
+  uint64_t fetchParentAddress(uint64_t Address) const;
+
+  /// True if the input binary has a translation table we can use to convert
+  /// addresses when aggregating profile
+  bool enabledFor(llvm::object::ELFObjectFileBase *InputFile) const;
+
+private:
+  /// Helper to update \p Map by inserting one or more BAT entries reflecting
+  /// \p BB for the function located at \p FuncAddress. At least one entry
+  /// will be emitted for the start of the BB. More entries may be emitted to
+  /// cover the location of calls or any instruction that may change control
+  /// flow.
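+  /// (Editor's illustration: an entry for a control-flow-changing instruction
+  /// at output offset 0x20 that maps back to input offset 0x14 is stored as
+  /// Map[0x20] = 0x14 | BRANCHENTRY, i.e. with the top bit of VALUE set, as
+  /// described in the class comment above.)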
+  void writeEntriesForBB(MapTy &Map, const BinaryBasicBlock &BB,
+                         uint64_t FuncAddress);
+
+  BinaryContext &BC;
+
+  std::map<uint64_t, MapTy> Maps;
+
+  /// Links outlined cold blocks to their original function
+  std::map<uint64_t, uint64_t> ColdPartSource;
+
+  /// Identifies the address of a control-flow-changing instruction in a
+  /// translation map entry
+  const static uint32_t BRANCHENTRY = 0x80000000;
+};
+} // namespace bolt
+
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BoltDiff.cpp b/bolt/src/BoltDiff.cpp
new file mode 100644
index 000000000000..b80c7f2be618
--- /dev/null
+++ b/bolt/src/BoltDiff.cpp
@@ -0,0 +1,717 @@
+//===--- BoltDiff.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RewriteInstance methods related to comparing one instance to another, used
+// by the boltdiff tool to print a report.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Passes/IdenticalCodeFolding.h"
+#include "ProfileReaderBase.h"
+#include "RewriteInstance.h"
+#include "llvm/Support/CommandLine.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "boltdiff"
+
+using namespace llvm;
+using namespace object;
+using namespace bolt;
+
+namespace opts {
+extern cl::OptionCategory BoltDiffCategory;
+extern cl::opt<bool> NeverPrint;
+extern cl::opt<bool> ICF;
+
+static cl::opt<bool>
+IgnoreLTOSuffix("ignore-lto-suffix",
+  cl::desc("ignore lto_priv or const suffixes when matching functions"),
+  cl::init(true),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+PrintUnmapped("print-unmapped",
+  cl::desc("print functions of binary 2 that were not matched to any "
+           "function in binary 1"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+PrintProfiledUnmapped("print-profiled-unmapped",
+  cl::desc("print functions that have profile in binary 1 but do not "
+           "in binary 2"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+PrintDiffCFG("print-diff-cfg",
+  cl::desc("print the CFG of important functions that changed in "
+           "binary 2"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+PrintDiffBBs("print-diff-bbs",
+  cl::desc("print the basic blocks shown in top differences"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+MatchByHash("match-by-hash",
+  cl::desc("match functions in binary 2 to binary 1 if they have the same "
+           "hash of a function in binary 1"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+IgnoreUnchanged("ignore-unchanged",
+  cl::desc("do not diff functions whose contents have not been changed from "
+           "one binary to another"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<unsigned>
+DisplayCount("display-count",
+  cl::desc("number of functions to display when printing the top largest "
+           "differences in function activity"),
+  cl::init(10),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+NormalizeByBin1("normalize-by-bin1",
+  cl::desc("show execution count of functions in binary 2 as a ratio of the "
+           "total samples in binary 1 - make sure both profiles have equal "
+           "collection time and sampling rate for this to make sense"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+} // end namespace opts
+
+namespace llvm {
+namespace bolt {
+
+namespace {
+
+/// Helper used to print colored numbers.
+void printColoredPercentage(double Perc) {
+  if (outs().has_colors() && Perc > 0.0)
+    outs().changeColor(raw_ostream::RED);
+  else if (outs().has_colors() && Perc < 0.0)
+    outs().changeColor(raw_ostream::GREEN);
+  else if (outs().has_colors())
+    outs().changeColor(raw_ostream::YELLOW);
+  outs() << format("%.2f", Perc) << "%";
+  if (outs().has_colors())
+    outs().resetColor();
+}
+
+void setLightColor() {
+  if (opts::PrintDiffBBs && outs().has_colors())
+    outs().changeColor(raw_ostream::CYAN);
+}
+
+void setTitleColor() {
+  if (outs().has_colors())
+    outs().changeColor(raw_ostream::WHITE, /*Bold=*/true);
+}
+
+void setRegularColor() {
+  if (outs().has_colors())
+    outs().resetColor();
+}
+
+} // end anonymous namespace
+
+/// Perform the comparison between two binaries with profiling information.
+class RewriteInstanceDiff {
+  typedef std::tuple<const BinaryBasicBlock *, const BinaryBasicBlock *, double>
+      EdgeTy;
+
+  RewriteInstance &RI1;
+  RewriteInstance &RI2;
+
+  // The map of functions keyed by functions in binary 2, providing its
+  // corresponding function in binary 1
+  std::map<const BinaryFunction *, const BinaryFunction *> FuncMap;
+
+  // The map of basic block correspondence, analogous to FuncMap for BBs,
+  // sorted by score difference
+  std::map<const BinaryBasicBlock *, const BinaryBasicBlock *> BBMap;
+
+  // The map of edge correspondence
+  std::map<double, std::pair<EdgeTy, EdgeTy>> EdgeMap;
+
+  // Maps all known basic blocks back to their parent function
+  std::map<const BinaryBasicBlock *, const BinaryFunction *> BBToFuncMap;
+
+  // Accounting which functions were matched
+  std::set<const BinaryFunction *> Bin1MappedFuncs;
+  std::set<const BinaryFunction *> Bin2MappedFuncs;
+
+  // Structures for our 3 matching strategies: by name, by hash and by LTO
+  // name, from the strongest to the weakest bind between two functions
+  StringMap<const BinaryFunction *> NameLookup;
+  DenseMap<size_t, const BinaryFunction *> HashLookup;
+  StringMap<const BinaryFunction *> LTONameLookup1;
+  StringMap<const BinaryFunction *> LTONameLookup2;
+
+  // Score maps used to order and find the hottest functions
+  std::multimap<double, const BinaryFunction *> LargestBin1;
+  std::multimap<double, const BinaryFunction *> LargestBin2;
+
+  // Map multiple functions in the same LTO bucket to a single parent function
+  // representing all functions sharing the same prefix
+  std::map<const BinaryFunction *, const BinaryFunction *> LTOMap1;
+  std::map<const BinaryFunction *, const BinaryFunction *> LTOMap2;
+  std::map<const BinaryFunction *, double> LTOAggregatedScore1;
+  std::map<const BinaryFunction *, double> LTOAggregatedScore2;
+
+  // Map scores in bin2 and 1 keyed by a binary 2 function - post-matching
+  DenseMap<const BinaryFunction *, std::pair<double, double>> ScoreMap;
+
+  double getNormalizedScore(const BinaryFunction &Function,
+                            const RewriteInstance &Ctx) {
+    if (!opts::NormalizeByBin1)
+      return static_cast<double>(Function.getFunctionScore()) /
+             Ctx.getTotalScore();
+    return static_cast<double>(Function.getFunctionScore()) /
+           RI1.getTotalScore();
+  }
+
+  double getNormalizedScore(const BinaryBasicBlock &BB,
+                            const RewriteInstance &Ctx) {
+    if (!opts::NormalizeByBin1)
+      return static_cast<double>(BB.getKnownExecutionCount()) /
+             Ctx.getTotalScore();
+    return static_cast<double>(BB.getKnownExecutionCount()) /
+           RI1.getTotalScore();
+  }
+
+  double getNormalizedScore(BinaryBasicBlock::branch_info_iterator BIIter,
+                            const RewriteInstance &Ctx) {
+    double Score =
+        BIIter->Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : BIIter->Count;
+    if (!opts::NormalizeByBin1)
+      return Score / Ctx.getTotalScore();
+    return Score / RI1.getTotalScore();
+  }
+
+  /// Initialize data structures used for function lookup in binary 1, used
+  /// later when matching functions in binary 2 to corresponding functions
+  /// in binary 1.
+  void buildLookupMaps() {
+    for (const auto &BFI : RI1.BC->getBinaryFunctions()) {
+      StringRef LTOName;
+      const BinaryFunction &Function = BFI.second;
+      const double Score = getNormalizedScore(Function, RI1);
+      LargestBin1.insert(std::make_pair<>(Score, &Function));
+      for (const StringRef Name : Function.getNames()) {
+        if (Optional<StringRef> OptionalLTOName = getLTOCommonName(Name))
+          LTOName = *OptionalLTOName;
+        NameLookup[Name] = &Function;
+      }
+      if (opts::MatchByHash && Function.hasCFG())
+        HashLookup[Function.computeHash(/*UseDFS=*/true)] = &Function;
+      if (opts::IgnoreLTOSuffix && !LTOName.empty()) {
+        if (!LTONameLookup1.count(LTOName))
+          LTONameLookup1[LTOName] = &Function;
+        LTOMap1[&Function] = LTONameLookup1[LTOName];
+      }
+    }
+
+    // Compute LTONameLookup2 and LargestBin2
+    for (const auto &BFI : RI2.BC->getBinaryFunctions()) {
+      StringRef LTOName;
+      const BinaryFunction &Function = BFI.second;
+      const double Score = getNormalizedScore(Function, RI2);
+      LargestBin2.insert(std::make_pair<>(Score, &Function));
+      for (const StringRef Name : Function.getNames()) {
+        if (Optional<StringRef> OptionalLTOName = getLTOCommonName(Name))
+          LTOName = *OptionalLTOName;
+      }
+      if (opts::IgnoreLTOSuffix && !LTOName.empty()) {
+        if (!LTONameLookup2.count(LTOName))
+          LTONameLookup2[LTOName] = &Function;
+        LTOMap2[&Function] = LTONameLookup2[LTOName];
+      }
+    }
+  }
+
+  /// Match functions in binary 2 with functions in binary 1.
+  void matchFunctions() {
+    outs() << "BOLT-DIFF: Mapping functions in Binary2 to Binary1\n";
+    uint64_t BothHaveProfile = 0ull;
+    std::set<const BinaryFunction *> Bin1ProfiledMapped;
+
+    for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) {
+      const BinaryFunction &Function2 = BFI2.second;
+      StringRef LTOName;
+      bool Match = false;
+      for (const StringRef Name : Function2.getNames()) {
+        auto Iter = NameLookup.find(Name);
+        if (Optional<StringRef> OptionalLTOName = getLTOCommonName(Name))
+          LTOName = *OptionalLTOName;
+        if (Iter == NameLookup.end())
+          continue;
+        FuncMap.insert(std::make_pair<>(&Function2, Iter->second));
+        Bin1MappedFuncs.insert(Iter->second);
+        Bin2MappedFuncs.insert(&Function2);
+        if (Function2.hasValidProfile() && Iter->second->hasValidProfile()) {
+          ++BothHaveProfile;
+          Bin1ProfiledMapped.insert(Iter->second);
+        }
+        Match = true;
+        break;
+      }
+      if (Match || !Function2.hasCFG())
+        continue;
+      auto Iter = HashLookup.find(Function2.computeHash(/*UseDFS=*/true));
+      if (Iter != HashLookup.end()) {
+        FuncMap.insert(std::make_pair<>(&Function2, Iter->second));
+        Bin1MappedFuncs.insert(Iter->second);
+        Bin2MappedFuncs.insert(&Function2);
+        if (Function2.hasValidProfile() && Iter->second->hasValidProfile()) {
+          ++BothHaveProfile;
+          Bin1ProfiledMapped.insert(Iter->second);
+        }
+        continue;
+      }
+      if (LTOName.empty())
+        continue;
+      auto LTOIter = LTONameLookup1.find(LTOName);
+      if (LTOIter != LTONameLookup1.end()) {
+        FuncMap.insert(std::make_pair<>(&Function2, LTOIter->second));
+        Bin1MappedFuncs.insert(LTOIter->second);
+        Bin2MappedFuncs.insert(&Function2);
+        if (Function2.hasValidProfile() && LTOIter->second->hasValidProfile()) {
+          ++BothHaveProfile;
+          Bin1ProfiledMapped.insert(LTOIter->second);
+        }
+      }
+    }
+    PrintProgramStats PPS(opts::NeverPrint);
+    outs() << "* BOLT-DIFF: Starting print program stats pass for binary 1\n";
1\n"; + PPS.runOnFunctions(*RI1.BC); + outs() << "* BOLT-DIFF: Starting print program stats pass for binary 2\n"; + PPS.runOnFunctions(*RI2.BC); + outs() << "=====\n"; + outs() << "Inputs share " << BothHaveProfile + << " functions with valid profile.\n"; + if (opts::PrintProfiledUnmapped) { + outs() << "\nFunctions in profile 1 that are missing in the profile 2:\n"; + std::vector Unmapped; + for (const auto &BFI : RI1.BC->getBinaryFunctions()) { + const BinaryFunction &Function = BFI.second; + if (!Function.hasValidProfile() || Bin1ProfiledMapped.count(&Function)) + continue; + Unmapped.emplace_back(&Function); + } + std::sort(Unmapped.begin(), Unmapped.end(), + [&](const BinaryFunction *A, const BinaryFunction *B) { + return A->getFunctionScore() > B->getFunctionScore(); + }); + for (const BinaryFunction *Function : Unmapped) { + outs() << Function->getPrintName() << " : "; + outs() << Function->getFunctionScore() << "\n"; + } + outs() << "=====\n"; + } + } + + /// Check if opcodes in BB1 match those in BB2 + bool compareBBs(const BinaryBasicBlock &BB1, + const BinaryBasicBlock &BB2) const { + auto Iter1 = BB1.begin(); + auto Iter2 = BB2.begin(); + if ((Iter1 == BB1.end() && Iter2 != BB2.end()) || + (Iter1 != BB1.end() && Iter2 == BB2.end())) + return false; + + while (Iter1 != BB1.end()) { + if (Iter2 == BB2.end() || + Iter1->getOpcode() != Iter2->getOpcode()) + return false; + + ++Iter1; + ++Iter2; + } + + if (Iter2 != BB2.end()) + return false; + return true; + } + + /// For a function in binary 2 that matched one in binary 1, now match each + /// individual basic block in it to its corresponding blocks in binary 1. + /// Also match each edge in binary 2 to the corresponding ones in binary 1. + void matchBasicBlocks() { + for (const auto &MapEntry : FuncMap) { + const BinaryFunction *const &Func1 = MapEntry.second; + const BinaryFunction *const &Func2 = MapEntry.first; + + auto Iter1 = Func1->layout_begin(); + auto Iter2 = Func2->layout_begin(); + + bool Match = true; + std::map Map; + std::map> EMap; + while (Iter1 != Func1->layout_end()) { + if (Iter2 == Func2->layout_end()) { + Match = false; + break; + } + if (!compareBBs(**Iter1, **Iter2)) { + Match = false; + break; + } + Map.insert(std::make_pair<>(*Iter2, *Iter1)); + + auto SuccIter1 = (*Iter1)->succ_begin(); + auto SuccIter2 = (*Iter2)->succ_begin(); + auto BIIter1 = (*Iter1)->branch_info_begin(); + auto BIIter2 = (*Iter2)->branch_info_begin(); + while (SuccIter1 != (*Iter1)->succ_end()) { + if (SuccIter2 == (*Iter2)->succ_end()) { + Match = false; + break; + } + const double ScoreEdge1 = getNormalizedScore(BIIter1, RI1); + const double ScoreEdge2 = getNormalizedScore(BIIter2, RI2); + EMap.insert(std::make_pair<>( + std::abs(ScoreEdge2 - ScoreEdge1), + std::make_pair<>( + std::make_tuple<>(*Iter2, *SuccIter2, ScoreEdge2), + std::make_tuple<>(*Iter1, *SuccIter1, ScoreEdge1)))); + + ++SuccIter1; + ++SuccIter2; + ++BIIter1; + ++BIIter2; + } + if (SuccIter2 != (*Iter2)->succ_end()) + Match = false; + if (!Match) + break; + + BBToFuncMap[*Iter1] = Func1; + BBToFuncMap[*Iter2] = Func2; + ++Iter1; + ++Iter2; + } + if (!Match || Iter2 != Func2->layout_end()) + continue; + + BBMap.insert(Map.begin(), Map.end()); + EdgeMap.insert(EMap.begin(), EMap.end()); + } + } + + /// Print the largest differences in basic block performance from binary 1 + /// to binary 2 + void reportHottestBBDiffs() { + std::map LargestDiffs; + for (const auto &MapEntry : BBMap) { + const BinaryBasicBlock *BB2 = MapEntry.first; + const BinaryBasicBlock *BB1 = 
MapEntry.second; + LargestDiffs.insert( + std::make_pair<>(std::abs(getNormalizedScore(*BB2, RI2) - + getNormalizedScore(*BB1, RI1)), + BB2)); + } + + unsigned Printed = 0; + setTitleColor(); + outs() + << "\nTop " << opts::DisplayCount + << " largest differences in basic block performance bin 2 -> bin 1:\n"; + outs() << "=========================================================\n"; + setRegularColor(); + outs() << " * Functions with different contents do not appear here\n\n"; + for (auto I = LargestDiffs.rbegin(), E = LargestDiffs.rend(); I != E; ++I) { + const BinaryBasicBlock *BB2 = I->second; + const double Score2 = getNormalizedScore(*BB2, RI2); + const double Score1 = getNormalizedScore(*BBMap[BB2], RI1); + outs() << "BB " << BB2->getName() << " from " + << BBToFuncMap[BB2]->getDemangledName() + << "\n\tScore bin1 = " << format("%.4f", Score1 * 100.0) + << "%\n\tScore bin2 = " << format("%.4f", Score2 * 100.0); + outs() << "%\t(Difference: "; + printColoredPercentage((Score2 - Score1) * 100.0); + outs() << ")\n"; + if (opts::PrintDiffBBs) { + setLightColor(); + BB2->dump(); + setRegularColor(); + } + if (Printed++ == opts::DisplayCount) + break; + } + } + + /// Print the largest differences in edge counts from one binary to another + void reportHottestEdgeDiffs() { + unsigned Printed = 0; + setTitleColor(); + outs() + << "\nTop " << opts::DisplayCount + << " largest differences in edge hotness bin 2 -> bin 1:\n"; + outs() << "=========================================================\n"; + setRegularColor(); + outs() << " * Functions with different contents do not appear here\n"; + for (auto I = EdgeMap.rbegin(), E = EdgeMap.rend(); I != E; ++I) { + std::tuple + &Edge2 = I->second.first; + std::tuple + &Edge1 = I->second.second; + const double Score2 = std::get<2>(Edge2); + const double Score1 = std::get<2>(Edge1); + outs() << "Edge (" << std::get<0>(Edge2)->getName() << " -> " + << std::get<1>(Edge2)->getName() << ") in " + << BBToFuncMap[std::get<0>(Edge2)]->getDemangledName() + << "\n\tScore bin1 = " << format("%.4f", Score1 * 100.0) + << "%\n\tScore bin2 = " << format("%.4f", Score2 * 100.0); + outs() << "%\t(Difference: "; + printColoredPercentage((Score2 - Score1) * 100.0); + outs() << ")\n"; + if (opts::PrintDiffBBs) { + setLightColor(); + std::get<0>(Edge2)->dump(); + std::get<1>(Edge2)->dump(); + setRegularColor(); + } + if (Printed++ == opts::DisplayCount) + break; + } + } + + /// For LTO functions sharing the same prefix (for example, func1.lto_priv.1 + /// and func1.lto_priv.2 share the func1.lto_priv prefix), compute aggregated + /// scores for them. This is used to avoid reporting all LTO functions as + /// having a large difference in performance because hotness shifted from + /// LTO variant 1 to variant 2, even though they represent the same function. 
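+ ///
+ /// A worked example (hypothetical numbers, not taken from a real profile):
+ /// suppose binary 1 spends 0.30 of its score in func1.lto_priv.1 and 0.05
+ /// in func1.lto_priv.2, while binary 2 spends 0.06 and 0.29 respectively.
+ /// Comparing the variants pairwise would report two large diffs; folding
+ /// each variant into its canonical LTOMap entry, as the function below
+ /// does, yields 0.35 vs. 0.35 and no spurious difference:
+ ///
+ /// \code
+ ///   LTOAggregatedScore1[LTOMap1[&F]] += getNormalizedScore(F, RI1);
+ ///   LTOAggregatedScore2[LTOMap2[&F]] += getNormalizedScore(F, RI2);
+ /// \endcode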
+ void computeAggregatedLTOScore() { + for (const auto &BFI : RI1.BC->getBinaryFunctions()) { + const BinaryFunction &Function = BFI.second; + double Score = getNormalizedScore(Function, RI1); + auto Iter = LTOMap1.find(&Function); + if (Iter == LTOMap1.end()) + continue; + LTOAggregatedScore1[Iter->second] += Score; + } + + double UnmappedScore = 0; + for (const auto &BFI : RI2.BC->getBinaryFunctions()) { + const BinaryFunction &Function = BFI.second; + bool Matched = FuncMap.find(&Function) != FuncMap.end(); + double Score = getNormalizedScore(Function, RI2); + auto Iter = LTOMap2.find(&Function); + if (Iter == LTOMap2.end()) { + if (!Matched) + UnmappedScore += Score; + continue; + } + LTOAggregatedScore2[Iter->second] += Score; + if (FuncMap.find(Iter->second) == FuncMap.end()) + UnmappedScore += Score; + } + int64_t Unmapped = + RI2.BC->getBinaryFunctions().size() - Bin2MappedFuncs.size(); + outs() << "BOLT-DIFF: " << Unmapped + << " functions in Binary2 have no correspondence to any other " + "function in Binary1.\n"; + + // Print the hotness score of functions in binary 2 that were not matched + // to any function in binary 1 + outs() << "BOLT-DIFF: These unmapped functions in Binary2 represent " + << format("%.2f", UnmappedScore * 100.0) << "% of execution.\n"; + } + + /// Print the largest hotness differences from binary 2 to binary 1 + void reportHottestFuncDiffs() { + std::multimap LargestDiffs; + for (const auto &MapEntry : FuncMap) { + const BinaryFunction *const &Func1 = MapEntry.second; + const BinaryFunction *const &Func2 = MapEntry.first; + double Score1 = getNormalizedScore(*Func1, RI1); + auto Iter1 = LTOMap1.find(Func1); + if (Iter1 != LTOMap1.end()) { + Score1 = LTOAggregatedScore1[Iter1->second]; + } + double Score2 = getNormalizedScore(*Func2, RI2); + auto Iter2 = LTOMap2.find(Func2); + if (Iter2 != LTOMap2.end()) { + Score2 = LTOAggregatedScore2[Iter2->second]; + } + if (Score1 == 0.0 || Score2 == 0.0) + continue; + LargestDiffs.insert( + std::make_pair<>(std::abs(Score1 - Score2), MapEntry)); + ScoreMap[Func2] = std::make_pair<>(Score1, Score2); + } + + unsigned Printed = 0; + setTitleColor(); + outs() << "\nTop " << opts::DisplayCount + << " largest differences in performance bin 2 -> bin 1:\n"; + outs() << "=========================================================\n"; + setRegularColor(); + for (auto I = LargestDiffs.rbegin(), E = LargestDiffs.rend(); I != E; ++I) { + const std::pair + &MapEntry = I->second; + if (opts::IgnoreUnchanged && + MapEntry.second->computeHash(/*UseDFS=*/true) == + MapEntry.first->computeHash(/*UseDFS=*/true)) + continue; + const std::pair &Scores = ScoreMap[MapEntry.first]; + outs() << "Function " << MapEntry.first->getDemangledName(); + if (MapEntry.first->getDemangledName() != + MapEntry.second->getDemangledName()) + outs() << "\nmatched " << MapEntry.second->getDemangledName(); + outs() << "\n\tScore bin1 = " << format("%.2f", Scores.first * 100.0) + << "%\n\tScore bin2 = " << format("%.2f", Scores.second * 100.0) + << "%\t(Difference: "; + printColoredPercentage((Scores.second - Scores.first) * 100.0); + outs() << ")"; + if (MapEntry.second->computeHash(/*UseDFS=*/true) != + MapEntry.first->computeHash(/*UseDFS=*/true)) { + outs() << "\t[Functions have different contents]"; + if (opts::PrintDiffCFG) { + outs() << "\n *** CFG for function in binary 1:\n"; + setLightColor(); + MapEntry.second->dump(); + setRegularColor(); + outs() << "\n *** CFG for function in binary 2:\n"; + setLightColor(); + MapEntry.first->dump(); + 
setRegularColor(); + } + } + outs() << "\n"; + if (Printed++ == opts::DisplayCount) + break; + } + } + + /// Print hottest functions from each binary + void reportHottestFuncs() { + unsigned Printed = 0; + setTitleColor(); + outs() << "\nTop " << opts::DisplayCount + << " hottest functions in binary 2:\n"; + outs() << "=====================================\n"; + setRegularColor(); + for (auto I = LargestBin2.rbegin(), E = LargestBin2.rend(); I != E; ++I) { + const std::pair<const double, const BinaryFunction *> &MapEntry = *I; + outs() << "Function " << MapEntry.second->getDemangledName() << "\n"; + auto Iter = ScoreMap.find(MapEntry.second); + if (Iter != ScoreMap.end()) { + outs() << "\tScore bin1 = " + << format("%.2f", Iter->second.first * 100.0) << "%\n"; + } + outs() << "\tScore bin2 = " << format("%.2f", MapEntry.first * 100.0) + << "%\n"; + if (Printed++ == opts::DisplayCount) + break; + } + + Printed = 0; + setTitleColor(); + outs() << "\nTop " << opts::DisplayCount + << " hottest functions in binary 1:\n"; + outs() << "=====================================\n"; + setRegularColor(); + for (auto I = LargestBin1.rbegin(), E = LargestBin1.rend(); I != E; ++I) { + const std::pair<const double, const BinaryFunction *> &MapEntry = *I; + outs() << "Function " << MapEntry.second->getDemangledName() + << "\n\tScore bin1 = " << format("%.2f", MapEntry.first * 100.0) + << "%\n"; + if (Printed++ == opts::DisplayCount) + break; + } + } + + /// Print functions in binary 2 that did not match anything in binary 1. + /// Unfortunately, in an LTO build, even a small change can lead to several + /// LTO variants being unmapped, corresponding to local functions that never + /// appear in one of the binaries because they were previously inlined. + void reportUnmapped() { + outs() << "List of functions from binary 2 that were not matched with any " + << "function in binary 1:\n"; + for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) { + const BinaryFunction &Function2 = BFI2.second; + if (Bin2MappedFuncs.count(&Function2)) + continue; + outs() << Function2.getPrintName() << "\n"; + } + } + +public: + /// Main entry point: coordinate all tasks necessary to compare two binaries + void compareAndReport() { + buildLookupMaps(); + matchFunctions(); + if (opts::IgnoreLTOSuffix) + computeAggregatedLTOScore(); + matchBasicBlocks(); + reportHottestFuncDiffs(); + reportHottestBBDiffs(); + reportHottestEdgeDiffs(); + reportHottestFuncs(); + if (!opts::PrintUnmapped) + return; + reportUnmapped(); + } + + RewriteInstanceDiff(RewriteInstance &RI1, RewriteInstance &RI2) + : RI1(RI1), RI2(RI2) { + compareAndReport(); + } + +}; + +} // end namespace bolt +} // end namespace llvm + +void RewriteInstance::compare(RewriteInstance &RI2) { + outs() << "BOLT-DIFF: ======== Binary1 vs. 
Binary2 ========\n"; + outs() << "Trace for binary 1 has " << this->getTotalScore() + << " instructions executed.\n"; + outs() << "Trace for binary 2 has " << RI2.getTotalScore() + << " instructions executed.\n"; + if (opts::NormalizeByBin1) { + double Diff2to1 = static_cast(RI2.getTotalScore() - this->getTotalScore()) / + this->getTotalScore(); + outs() << "Binary2 change in score with respect to Binary1: "; + printColoredPercentage(Diff2to1 * 100.0); + outs() << "\n"; + } + + if (!this->getTotalScore() || !RI2.getTotalScore()) { + outs() << "BOLT-DIFF: Both binaries must have recorded activity in known " + "functions.\n"; + return; + } + + // Pre-pass ICF + if (opts::ICF) { + IdenticalCodeFolding ICF(opts::NeverPrint); + outs() << "BOLT-DIFF: Starting ICF pass for binary 1"; + ICF.runOnFunctions(*BC); + outs() << "BOLT-DIFF: Starting ICF pass for binary 2"; + ICF.runOnFunctions(*RI2.BC); + } + + RewriteInstanceDiff RID(*this, RI2); +} diff --git a/bolt/src/CMakeLists.txt b/bolt/src/CMakeLists.txt new file mode 100644 index 000000000000..cc630b3bd1d5 --- /dev/null +++ b/bolt/src/CMakeLists.txt @@ -0,0 +1,123 @@ +add_subdirectory(merge-fdata) +add_subdirectory(Passes) +add_subdirectory(RuntimeLibs) +add_subdirectory(Target) + +# Get the current git revision for BOLT. +function(get_version ofn) + find_program(git_executable NAMES git git.exe git.cmd) + if (git_executable) + execute_process(COMMAND ${git_executable} rev-parse HEAD + WORKING_DIRECTORY ${LLVM_MAIN_SRC_DIR} + TIMEOUT 5 + RESULT_VARIABLE git_result + OUTPUT_VARIABLE git_output) + if( git_result EQUAL 0 ) + string(STRIP "${git_output}" git_ref_id) + set(BOLT_REVISION "${git_ref_id}") + endif() + endif() + + # If we can't find a revision, set it to "". + if (NOT BOLT_REVISION) + set(BOLT_REVISION "") + endif() + + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + COMMAND echo '"${BOLT_REVISION}"' > ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + COMMENT "Generating bogus ${ofn}..." + ) + + set(VERSION_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} PARENT_SCOPE) + + # `make clean' must remove all those generated files: + set_property(DIRECTORY APPEND + PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${ofn}) + set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${ofn} PROPERTIES + GENERATED 1) +endfunction() + +# Creates a public target for generating the revision file. 
+function(add_public_gen_version_target target) + add_custom_target(${target} DEPENDS ${VERSION_OUTPUT}) + set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} ${target} PARENT_SCOPE) +endfunction() + +get_version(BoltRevision.inc) +add_public_gen_version_target(GenBoltRevision) + +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + BOLTPasses + BOLTRuntimeLibs + CodeGen + Core + DebugInfoDWARF + MC + MCDisassembler + MCParser + Object + Orcjit + Support + ) + +string(FIND "${LLVM_TARGETS_TO_BUILD}" "AArch64" POSITION) +if (NOT ${POSITION} EQUAL -1) + list(APPEND LLVM_LINK_COMPONENTS BOLTTargetAArch64) + set(BOLT_AArch64 On) +endif() + +string(FIND "${LLVM_TARGETS_TO_BUILD}" "X86" POSITION) +if (NOT ${POSITION} EQUAL -1) + list(APPEND LLVM_LINK_COMPONENTS BOLTTargetX86) + set(BOLT_X64 On) +endif() + +add_llvm_tool(llvm-bolt + llvm-bolt.cpp + BinaryBasicBlock.cpp + BinaryContext.cpp + BinaryData.cpp + BinaryEmitter.cpp + BinaryFunction.cpp + BinaryFunctionProfile.cpp + BinaryPassManager.cpp + BinarySection.cpp + BoltAddressTranslation.cpp + BoltDiff.cpp + CacheMetrics.cpp + DataAggregator.cpp + DataReader.cpp + DebugData.cpp + DWARFRewriter.cpp + DynoStats.cpp + Exceptions.cpp + ExecutableFileMemoryManager.cpp + Heatmap.cpp + JumpTable.cpp + MachORewriteInstance.cpp + MCPlusBuilder.cpp + ParallelUtilities.cpp + ProfileReaderBase.cpp + Relocation.cpp + RewriteInstance.cpp + Utils.cpp + YAMLProfileReader.cpp + YAMLProfileWriter.cpp + + DEPENDS + intrinsics_gen + bolt_rt + ) + +if (DEFINED BOLT_AArch64) + target_compile_definitions(llvm-bolt PRIVATE AARCH64_AVAILABLE) +endif() + +if (DEFINED BOLT_X64) + target_compile_definitions(llvm-bolt PRIVATE X86_AVAILABLE) +endif() + +add_llvm_tool_symlink(perf2bolt llvm-bolt) +add_llvm_tool_symlink(llvm-boltdiff llvm-bolt) +add_llvm_tool_symlink(llvm-bolt-heatmap llvm-bolt) diff --git a/bolt/src/CacheMetrics.cpp b/bolt/src/CacheMetrics.cpp new file mode 100644 index 000000000000..3bb1cf520085 --- /dev/null +++ b/bolt/src/CacheMetrics.cpp @@ -0,0 +1,316 @@ +//===------ CacheMetrics.cpp - Calculate metrics for instruction cache ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functions to show metrics of cache lines +// +//===----------------------------------------------------------------------===// + +#include "CacheMetrics.h" +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "llvm/Support/CommandLine.h" +#include + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt ForwardWeight; +extern cl::opt BackwardWeight; +extern cl::opt ForwardDistance; +extern cl::opt BackwardDistance; +extern cl::opt ITLBPageSize; +extern cl::opt ITLBEntries; + +} + +namespace { + +/// Initialize and return a position map for binary basic blocks +void extractBasicBlockInfo( + const std::vector &BinaryFunctions, + std::unordered_map &BBAddr, + std::unordered_map &BBSize) { + + for (BinaryFunction *BF : BinaryFunctions) { + const BinaryContext &BC = BF->getBinaryContext(); + for (BinaryBasicBlock *BB : BF->layout()) { + if (BF->isSimple() || BC.HasRelocations) { + // Use addresses/sizes as in the output binary + BBAddr[BB] = BB->getOutputAddressRange().first; + BBSize[BB] = BB->getOutputSize(); + } else { + // Output ranges should match the input if the body hasn't changed + BBAddr[BB] = BB->getInputAddressRange().first + BF->getAddress(); + BBSize[BB] = BB->getOriginalSize(); + } + } + } +} + +/// Calculate TSP metric, which quantifies the number of fallthrough jumps in +/// the ordering of basic blocks +double calcTSPScore( + const std::vector &BinaryFunctions, + const std::unordered_map &BBAddr, + const std::unordered_map &BBSize) { + + double Score = 0; + for (BinaryFunction *BF : BinaryFunctions) { + if (!BF->hasProfile()) + continue; + for (BinaryBasicBlock *SrcBB : BF->layout()) { + auto BI = SrcBB->branch_info_begin(); + for (BinaryBasicBlock *DstBB : SrcBB->successors()) { + if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB)) + Score += BI->Count; + ++BI; + } + } + } + return Score; +} + +/// Calculate Ext-TSP metric, which quantifies the expected number of i-cache +/// misses for a given ordering of basic blocks +double calcExtTSPScore( + const std::vector &BinaryFunctions, + const std::unordered_map &BBAddr, + const std::unordered_map &BBSize) { + + double Score = 0.0; + for (BinaryFunction *BF : BinaryFunctions) { + if (!BF->hasProfile()) + continue; + for (BinaryBasicBlock *SrcBB : BF->layout()) { + auto BI = SrcBB->branch_info_begin(); + for (BinaryBasicBlock *DstBB : SrcBB->successors()) { + if (DstBB != SrcBB) { + Score += CacheMetrics::extTSPScore(BBAddr.at(SrcBB), + BBSize.at(SrcBB), + BBAddr.at(DstBB), + BI->Count); + } + ++BI; + } + } + } + return Score; +} + +using Predecessors = std::vector>; + +/// Build a simplified version of the call graph: For every function, keep +/// its callers and the frequencies of the calls +std::unordered_map +extractFunctionCalls(const std::vector &BinaryFunctions) { + std::unordered_map Calls; + + for (BinaryFunction *SrcFunction : BinaryFunctions) { + const BinaryContext &BC = SrcFunction->getBinaryContext(); + for (BinaryBasicBlock *BB : SrcFunction->layout()) { + // Find call instructions and extract target symbols from each one + for (MCInst &Inst : *BB) { + if (!BC.MIB->isCall(Inst)) + continue; + + // Call info + const MCSymbol* DstSym = BC.MIB->getTargetSymbol(Inst); + uint64_t Count = 
BB->getKnownExecutionCount(); + // Ignore calls w/o information + if (DstSym == nullptr || Count == 0) + continue; + + const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym); + // Ignore recursive calls + if (DstFunction == nullptr || + DstFunction->layout_empty() || + DstFunction == SrcFunction) + continue; + + // Record the call + Calls[DstFunction].emplace_back(SrcFunction, Count); + } + } + } + return Calls; +} + +/// Compute the expected hit ratio of the i-TLB cache (optimized by the +/// HFSortPlus algorithm). Given an assignment of functions to i-TLB pages, we +/// divide all function calls into two categories: +/// - 'short' ones that have a caller-callee distance less than a page; +/// - 'long' ones where the distance exceeds a page. +/// The short calls are likely to result in an i-TLB cache hit. For the long +/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how +/// often the page is accessed). Assuming that functions are sent to the i-TLB +/// cache in a random order, the probability that a page is present in the +/// cache is proportional to the number of samples corresponding to the +/// functions on the page. The following procedure detects short and long +/// calls, and estimates the expected number of cache misses for the long ones. +double expectedCacheHitRatio( + const std::vector<BinaryFunction *> &BinaryFunctions, + const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr, + const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) { + + const double PageSize = opts::ITLBPageSize; + const uint64_t CacheEntries = opts::ITLBEntries; + std::unordered_map<const BinaryFunction *, Predecessors> Calls = + extractFunctionCalls(BinaryFunctions); + // Compute 'hotness' of the functions + double TotalSamples = 0; + std::unordered_map<const BinaryFunction *, double> FunctionSamples; + for (BinaryFunction *BF : BinaryFunctions) { + double Samples = 0; + for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) { + Samples += Pair.second; + } + Samples = std::max(Samples, (double)BF->getKnownExecutionCount()); + FunctionSamples[BF] = Samples; + TotalSamples += Samples; + } + + // Compute 'hotness' of the pages + std::unordered_map<double, double> PageSamples; + for (BinaryFunction *BF : BinaryFunctions) { + if (BF->layout_empty()) + continue; + double Page = BBAddr.at(BF->layout_front()) / PageSize; + PageSamples[Page] += FunctionSamples.at(BF); + } + + // Compute the expected number of misses for every function + double Misses = 0; + for (BinaryFunction *BF : BinaryFunctions) { + // Skip the function if it has no samples + if (BF->layout_empty() || FunctionSamples.at(BF) == 0.0) + continue; + double Samples = FunctionSamples.at(BF); + double Page = BBAddr.at(BF->layout_front()) / PageSize; + // The probability that the page is not present in the cache + double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries); + + // Process all callers of the function + for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) { + BinaryFunction *SrcFunction = Pair.first; + double SrcPage = BBAddr.at(SrcFunction->layout_front()) / PageSize; + // Is this a 'long' or a 'short' call? 
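+ // For intuition, with hypothetical numbers: if TotalSamples = 1000,
+ // PageSamples[Page] = 100, and CacheEntries = 16, then
+ //   MissProb = (1 - 100/1000)^16 ~= 0.185,
+ // so a long call taken 50 times adds about 50 * 0.185 ~= 9.3 expected
+ // misses below.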
+ if (Page != SrcPage) { + // This is a miss + Misses += MissProb * Pair.second; + } + Samples -= Pair.second; + } + assert(Samples >= 0.0 && "Function samples computed incorrectly"); + // The remaining samples likely come from the jitted code + Misses += Samples * MissProb; + } + + return 100.0 * (1.0 - Misses / TotalSamples); +} + +} // end namespace anonymous + +double CacheMetrics::extTSPScore(uint64_t SrcAddr, + uint64_t SrcSize, + uint64_t DstAddr, + uint64_t Count) { + assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE); + + // Fallthrough + if (SrcAddr + SrcSize == DstAddr) { + // Assume that FallthroughWeight = 1.0 after normalization + return static_cast(Count); + } + // Forward + if (SrcAddr + SrcSize < DstAddr) { + const uint64_t Dist = DstAddr - (SrcAddr + SrcSize); + if (Dist <= opts::ForwardDistance) { + double Prob = 1.0 - static_cast(Dist) / opts::ForwardDistance; + return opts::ForwardWeight * Prob * Count; + } + return 0; + } + // Backward + const uint64_t Dist = SrcAddr + SrcSize - DstAddr; + if (Dist <= opts::BackwardDistance) { + double Prob = 1.0 - static_cast(Dist) / opts::BackwardDistance; + return opts::BackwardWeight * Prob * Count; + } + return 0; +} + +void CacheMetrics::printAll(const std::vector &BFs) { + // Stats related to hot-cold code splitting + size_t NumFunctions = 0; + size_t NumProfiledFunctions = 0; + size_t NumHotFunctions = 0; + size_t NumBlocks = 0; + size_t NumHotBlocks = 0; + + size_t TotalCodeMinAddr = std::numeric_limits::max(); + size_t TotalCodeMaxAddr = 0; + size_t HotCodeMinAddr = std::numeric_limits::max(); + size_t HotCodeMaxAddr = 0; + + for (BinaryFunction *BF : BFs) { + NumFunctions++; + if (BF->hasProfile()) + NumProfiledFunctions++; + if (BF->hasValidIndex()) + NumHotFunctions++; + for (BinaryBasicBlock *BB : BF->layout()) { + NumBlocks++; + size_t BBAddrMin = BB->getOutputAddressRange().first; + size_t BBAddrMax = BB->getOutputAddressRange().second; + TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin); + TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax); + if (BF->hasValidIndex() && !BB->isCold()) { + NumHotBlocks++; + HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin); + HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax); + } + } + } + + outs() << format(" There are %zu functions;", NumFunctions) + << format(" %zu (%.2lf%%) are in the hot section,", + NumHotFunctions, 100.0 * NumHotFunctions / NumFunctions) + << format(" %zu (%.2lf%%) have profile\n", + NumProfiledFunctions, 100.0 * NumProfiledFunctions / NumFunctions); + outs() << format(" There are %zu basic blocks;", NumBlocks) + << format(" %zu (%.2lf%%) are in the hot section\n", + NumHotBlocks, 100.0 * NumHotBlocks / NumBlocks); + + assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses"); + size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr; + size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr; + + size_t HugePage2MB = 2 << 20; + outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, %.2lf huge pages)\n", + 100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize, + double(HotCodeSize) / HugePage2MB); + + // Stats related to expected cache performance + std::unordered_map BBAddr; + std::unordered_map BBSize; + extractBasicBlockInfo(BFs, BBAddr, BBSize); + + outs() << " Expected i-TLB cache hit ratio: " + << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize)); + + outs() << " TSP score: " + << format("%.0lf\n", calcTSPScore(BFs, BBAddr, BBSize)); + + outs() << " ExtTSP score: " + << 
format("%.0lf\n", calcExtTSPScore(BFs, BBAddr, BBSize)); +} diff --git a/bolt/src/CacheMetrics.h b/bolt/src/CacheMetrics.h new file mode 100644 index 000000000000..3e797b890bdc --- /dev/null +++ b/bolt/src/CacheMetrics.h @@ -0,0 +1,42 @@ +//===- CacheMetrics.h - Interface for instruction cache evaluation --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functions to show metrics of cache lines +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_CACHEMETRICS_H +#define LLVM_TOOLS_LLVM_BOLT_CACHEMETRICS_H + +#include +#include + +namespace llvm { +namespace bolt { +class BinaryFunction; +namespace CacheMetrics { + +/// Calculate various metrics related to instruction cache performance. +void printAll(const std::vector &BinaryFunctions); + +/// Calculate Extended-TSP metric, which quantifies the expected number of +/// i-cache misses for a given pair of basic blocks. The parameters are: +/// - SrcAddr is the address of the source block; +/// - SrcSize is the size of the source block; +/// - DstAddr is the address of the destination block; +/// - Count is the number of jumps between the pair of blocks. +double extTSPScore(uint64_t SrcAddr, + uint64_t SrcSize, + uint64_t DstAddr, + uint64_t Count); + +} // namespace CacheMetrics +} // namespace bolt +} // namespace llvm + +#endif //LLVM_CACHEMETRICS_H diff --git a/bolt/src/DWARFRewriter.cpp b/bolt/src/DWARFRewriter.cpp new file mode 100644 index 000000000000..bd00da372842 --- /dev/null +++ b/bolt/src/DWARFRewriter.cpp @@ -0,0 +1,1304 @@ +//===--- DWARFRewriter.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DWARFRewriter.h" +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "DebugData.h" +#include "ParallelUtilities.h" +#include "Utils.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ThreadPool.h" +#include "llvm/Support/ToolOutputFile.h" +#include +#include +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt" + +LLVM_ATTRIBUTE_UNUSED +static void printDie(const DWARFDie &DIE) { + DIDumpOptions DumpOpts; + DumpOpts.ShowForm = true; + DumpOpts.Verbose = true; + DumpOpts.ChildRecurseDepth = 0; + DumpOpts.ShowChildren = 0; + DIE.dump(dbgs(), 0, DumpOpts); +} + +using namespace llvm; +using namespace llvm::support::endian; +using namespace object; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltCategory; +extern cl::opt<unsigned> Verbosity; + +static cl::opt<bool> +KeepARanges("keep-aranges", + cl::desc("keep or generate .debug_aranges section if .gdb_index is written"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +static cl::opt<bool> +DeterministicDebugInfo("deterministic-debuginfo", + cl::desc("disables parallel execution of tasks that may produce " + "nondeterministic debug info"), + cl::init(true), + cl::cat(BoltCategory)); + +static cl::opt<std::string> + DwoOutputPath("dwo-output-path", + cl::desc("Path to where .dwo files will be written out to."), + cl::init(""), cl::cat(BoltCategory)); + +static cl::opt<bool> + DebugSkeletonCu("debug-skeleton-cu", + cl::desc("Prints out offsets for abbrev and debug_info of " + "Skeleton CUs that get patched."), + cl::ZeroOrMore, cl::Hidden, cl::init(false), + cl::cat(BoltCategory)); +} // namespace opts + +/// Returns the DWO name to be used. Handles the case where the user specifies +/// an output DWO directory and there are duplicate names. Assumes the DWO ID +/// is unique. 
+static std::string +getDWOName(llvm::DWARFUnit &CU, + std::unordered_map<std::string, uint32_t> *NameToIndexMap, + std::unordered_map<uint64_t, std::string> &DWOIdToName) { + llvm::Optional<uint64_t> DWOId = CU.getDWOId(); + assert(DWOId && "DWO ID not found."); + (void)DWOId; + auto NameIter = DWOIdToName.find(*DWOId); + if (NameIter != DWOIdToName.end()) + return NameIter->second; + + std::string DWOName = dwarf::toString( + CU.getUnitDIE().find({dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + assert(!DWOName.empty() && + "DW_AT_dwo_name/DW_AT_GNU_dwo_name does not exist."); + if (NameToIndexMap && !opts::DwoOutputPath.empty()) { + auto Iter = NameToIndexMap->find(DWOName); + if (Iter == NameToIndexMap->end()) { + Iter = NameToIndexMap->insert({DWOName, 0}).first; + } + DWOName.append(std::to_string(Iter->second)); + ++Iter->second; + } + DWOName.append(".dwo"); + DWOIdToName[*DWOId] = DWOName; + return DWOName; +} + +static bool isHighPcFormEightBytes(dwarf::Form DwarfForm) { + return DwarfForm == dwarf::DW_FORM_addr || DwarfForm == dwarf::DW_FORM_data8; +} + +void DWARFRewriter::updateDebugInfo() { + ErrorOr<BinarySection &> DebugAbbrev = + BC.getUniqueSectionByName(".debug_abbrev"); + ErrorOr<BinarySection &> DebugInfo = BC.getUniqueSectionByName(".debug_info"); + + if (!DebugAbbrev || !DebugInfo) + return; + + DebugAbbrev->registerPatcher(std::make_unique<DebugAbbrevPatcher>()); + auto *AbbrevPatcher = + static_cast<DebugAbbrevPatcher *>(DebugAbbrev->getPatcher()); + + DebugInfo->registerPatcher(std::make_unique<SimpleBinaryPatcher>()); + auto *DebugInfoPatcher = + static_cast<SimpleBinaryPatcher *>(DebugInfo->getPatcher()); + + ARangesSectionWriter = std::make_unique<DebugARangesSectionWriter>(); + RangesSectionWriter = std::make_unique<DebugRangesSectionWriter>(); + StrWriter = std::make_unique<DebugStrWriter>(&BC); + + AddrWriter = std::make_unique<DebugAddrWriter>(&BC); + DebugLoclistWriter::setAddressWriter(AddrWriter.get()); + + uint64_t NumCUs = BC.DwCtx->getNumCompileUnits(); + if ((opts::NoThreads || opts::DeterministicDebugInfo) && + BC.getNumDWOCUs() == 0) { + // Use single entry for efficiency when running single-threaded + NumCUs = 1; + } + + LocListWritersByCU.reserve(NumCUs); + + for (size_t CUIndex = 0; CUIndex < NumCUs; ++CUIndex) { + LocListWritersByCU[CUIndex] = std::make_unique<DebugLocWriter>(&BC); + } + // Unordered maps to handle name collision if output DWO directory is + // specified. 
+ std::unordered_map<std::string, uint32_t> NameToIndexMap; + std::unordered_map<uint64_t, std::string> DWOIdToName; + + auto updateDWONameCompDir = [&](DWARFUnit &Unit) -> void { + const DWARFDie &DIE = Unit.getUnitDIE(); + uint64_t AttrOffset = 0; + Optional<DWARFFormValue> ValDwoName = + DIE.find(dwarf::DW_AT_GNU_dwo_name, &AttrOffset); + (void)ValDwoName; + assert(ValDwoName && "Skeleton CU doesn't have dwo_name."); + + std::string ObjectName = getDWOName(Unit, &NameToIndexMap, DWOIdToName); + uint32_t NewOffset = StrWriter->addString(ObjectName.c_str()); + DebugInfoPatcher->addLE32Patch(AttrOffset, NewOffset); + + Optional<DWARFFormValue> ValCompDir = + DIE.find(dwarf::DW_AT_comp_dir, &AttrOffset); + (void)ValCompDir; + assert(ValCompDir && "DW_AT_comp_dir is not in Skeleton CU."); + if (!opts::DwoOutputPath.empty()) { + uint32_t NewOffset = StrWriter->addString(opts::DwoOutputPath.c_str()); + DebugInfoPatcher->addLE32Patch(AttrOffset, NewOffset); + } + }; + + uint32_t AbbrevOffsetModifier = 0; + // Case 1) Range_base found: patch .debug_info + // Case 2) Range_base not found, but Ranges will be used: patch + // .debug_info/.debug_abbrev + auto updateRangeBase = [&](DWARFUnit &Unit, uint64_t RangeBase, + bool WasRangeBaseUsed) -> void { + uint64_t AttrOffset = 0; + DWARFDie DIE = Unit.getUnitDIE(); + Optional<DWARFFormValue> ValRangeBase = + DIE.find(dwarf::DW_AT_GNU_ranges_base, &AttrOffset); + bool NeedToPatch = ValRangeBase.hasValue(); + uint32_t PrevAbbrevOffsetModifier = AbbrevOffsetModifier; + // Case where Skeleton CU doesn't have DW_AT_GNU_ranges_base + if (!NeedToPatch && WasRangeBaseUsed) { + const DWARFAbbreviationDeclaration *AbbreviationDecl = + DIE.getAbbreviationDeclarationPtr(); + if (Optional<DWARFFormValue> ValLowPC = + DIE.find(dwarf::DW_AT_low_pc, &AttrOffset)) { + + Optional<DWARFFormValue> ValHighPC = DIE.find(dwarf::DW_AT_high_pc); + uint32_t NumBytesToFill = 7; + + AbbrevPatcher->addAttributePatch(AbbreviationDecl, dwarf::DW_AT_low_pc, + dwarf::DW_AT_GNU_ranges_base, + dwarf::DW_FORM_indirect); + // BOLT converts DW_AT_low_pc/DW_AT_high_pc to DW_AT_low_pc/DW_AT_ranges. + // DW_AT_high_pc can be 4 or 8 bytes. If it's 8 bytes, we need to use the + // first 4 bytes. + if (ValHighPC && isHighPcFormEightBytes(ValHighPC->getForm())) { + NumBytesToFill += 4; + } + LLVM_DEBUG(if (opts::DebugSkeletonCu) dbgs() + << "AttrOffset: " << Twine::utohexstr(AttrOffset) << "\n" + << "Die Offset: " << Twine::utohexstr(DIE.getOffset()) + << "\n" + << "AbbrDecl offset: " + << Twine::utohexstr(Unit.getAbbrOffset()) << "\n" + << "Unit Offset: " << Twine::utohexstr(Unit.getOffset()) + << "\n\n";); + DebugInfoPatcher->addUDataPatch(AttrOffset, dwarf::DW_FORM_udata, 1); + DebugInfoPatcher->addUDataPatch(AttrOffset + 1, RangeBase, + NumBytesToFill); + + // 1 Byte for DW_AT_GNU_ranges_base (since it's 2 bytes vs DW_AT_low_pc) + AbbrevOffsetModifier += 1; + } else { + errs() << "BOLT-WARNING: Skeleton CU at 0x" + << Twine::utohexstr(DIE.getOffset()) + << " doesn't have DW_AT_GNU_ranges_base, or " + "DW_AT_low_pc to convert\n"; + return; + } + } + if (NeedToPatch) + DebugInfoPatcher->addLE32Patch(AttrOffset, + static_cast<uint32_t>(RangeBase)); + + // DWARF4 + // unit_length - 4 bytes + // version - 2 bytes + // So + 6 to patch debug_abbrev_offset + if (PrevAbbrevOffsetModifier) + DebugInfoPatcher->addLE32Patch( + Unit.getOffset() + 6, static_cast<uint32_t>(Unit.getAbbrOffset()) + + PrevAbbrevOffsetModifier); + }; + + auto processUnitDIE = [&](size_t CUIndex, DWARFUnit *Unit) { + uint64_t RangeBase = RangesSectionWriter->getSectionOffset(); + updateUnitDebugInfo(CUIndex, *Unit, *DebugInfoPatcher, *AbbrevPatcher); + if (llvm::Optional<uint64_t> DWOId = Unit->getDWOId()) { + Optional<DWARFUnit *> CU = BC.getDWOCU(*DWOId); + if (CU) { + updateDWONameCompDir(*Unit); + // Assuming one unique DWO ID per binary, i.e. two or more CUs don't + // share the same DWO ID. + assert(LocListWritersByCU.count(*DWOId) == 0 && + "LocList writer for DWO unit already exists."); + LocListWritersByCU[*DWOId] = + std::make_unique<DebugLoclistWriter>(&BC, *DWOId); + SimpleBinaryPatcher *DwoDebugInfoPatcher = + getBinaryDWODebugInfoPatcher(*DWOId); + DwoDebugInfoPatcher->setRangeBase(RangeBase); + updateUnitDebugInfo(*DWOId, *(*CU), *DwoDebugInfoPatcher, + *getBinaryDWOAbbrevPatcher(*DWOId)); + static_cast<DebugLoclistWriter *>(LocListWritersByCU[*DWOId].get()) + ->finalizePatches(); + updateRangeBase(*Unit, RangeBase, + DwoDebugInfoPatcher->getWasRangBasedUsed()); + } + } + }; + + if (opts::NoThreads || opts::DeterministicDebugInfo) { + for (std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units()) { + processUnitDIE(0, CU.get()); + } + } else { + // Update unit debug info in parallel + ThreadPool &ThreadPool = ParallelUtilities::getThreadPool(); + size_t CUIndex = 0; + for (std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units()) { + ThreadPool.async(processUnitDIE, CUIndex, CU.get()); + CUIndex++; + } + + ThreadPool.wait(); + } + + flushPendingRanges(*DebugInfoPatcher); + + finalizeDebugSections(*DebugInfoPatcher); + + writeOutDWOFiles(DWOIdToName); + + updateGdbIndexSection(); +} + +void DWARFRewriter::updateUnitDebugInfo(uint64_t CUIndex, DWARFUnit &Unit, + SimpleBinaryPatcher &DebugInfoPatcher, + DebugAbbrevPatcher &AbbrevPatcher) { + // Cache debug ranges so that the offset for identical ranges could be reused. 
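+ // A minimal sketch of the caching idea (addRangesCached and
+ // appendToRangesSection are illustrative names, not the actual
+ // DebugRangesSectionWriter API):
+ //
+ //   uint64_t addRangesCached(const DebugAddressRangesVector &R) {
+ //     auto It = CachedRanges.find(R);
+ //     if (It != CachedRanges.end())
+ //       return It->second;               // identical list: reuse offset
+ //     return CachedRanges[R] = appendToRangesSection(R);
+ //   }
+ //
+ // For example, two inlined copies of the same callee that cover one
+ // address interval then share a single .debug_ranges entry.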
+ std::map CachedRanges; + + auto &DebugLocWriter = *LocListWritersByCU[CUIndex].get(); + + uint64_t DIEOffset = Unit.getOffset() + Unit.getHeaderSize(); + uint64_t NextCUOffset = Unit.getNextUnitOffset(); + DWARFDebugInfoEntry Die; + DWARFDataExtractor DebugInfoData = Unit.getDebugInfoExtractor(); + uint32_t Depth = 0; + + while ( + Die.extractFast(Unit, &DIEOffset, DebugInfoData, NextCUOffset, Depth)) { + if (const DWARFAbbreviationDeclaration *AbbrDecl = + Die.getAbbreviationDeclarationPtr()) { + if (AbbrDecl->hasChildren()) + ++Depth; + } else { + // NULL entry. + if (Depth > 0) + --Depth; + if (Depth == 0) + break; + } + + DWARFDie DIE(&Unit, &Die); + + switch (DIE.getTag()) { + case dwarf::DW_TAG_compile_unit: { + auto ModuleRangesOrError = DIE.getAddressRanges(); + if (!ModuleRangesOrError) { + consumeError(ModuleRangesOrError.takeError()); + break; + } + DWARFAddressRangesVector ModuleRanges = *ModuleRangesOrError; + DebugAddressRangesVector OutputRanges = + BC.translateModuleAddressRanges(ModuleRanges); + const uint64_t RangesSectionOffset = + RangesSectionWriter->addRanges(OutputRanges); + ARangesSectionWriter->addCURanges(Unit.getOffset(), + std::move(OutputRanges)); + updateDWARFObjectAddressRanges(DIE, RangesSectionOffset, DebugInfoPatcher, + AbbrevPatcher); + break; + } + case dwarf::DW_TAG_subprogram: { + // Get function address either from ranges or [LowPC, HighPC) pair. + bool UsesRanges = false; + uint64_t Address; + uint64_t SectionIndex, HighPC; + if (!DIE.getLowAndHighPC(Address, HighPC, SectionIndex)) { + Expected RangesOrError = + DIE.getAddressRanges(); + if (!RangesOrError) { + consumeError(RangesOrError.takeError()); + break; + } + DWARFAddressRangesVector Ranges = *RangesOrError; + // Not a function definition. + if (Ranges.empty()) + break; + + Address = Ranges.front().LowPC; + UsesRanges = true; + } + + // Clear cached ranges as the new function will have its own set. + CachedRanges.clear(); + + DebugAddressRangesVector FunctionRanges; + if (const BinaryFunction *Function = + BC.getBinaryFunctionAtAddress(Address)) + FunctionRanges = Function->getOutputAddressRanges(); + // Update ranges. + if (UsesRanges) { + updateDWARFObjectAddressRanges( + DIE, RangesSectionWriter->addRanges(FunctionRanges), + DebugInfoPatcher, AbbrevPatcher); + } else { + // Delay conversion of [LowPC, HighPC) into DW_AT_ranges if possible. + const DWARFAbbreviationDeclaration *Abbrev = + DIE.getAbbreviationDeclarationPtr(); + assert(Abbrev && "abbrev expected"); + + // Create a critical section. + static std::shared_timed_mutex CriticalSectionMutex; + std::unique_lock Lock(CriticalSectionMutex); + + if (FunctionRanges.size() > 1) { + convertPending(Abbrev, DebugInfoPatcher, AbbrevPatcher); + // Exit critical section early. + Lock.unlock(); + convertToRanges(DIE, FunctionRanges, DebugInfoPatcher); + } else if (ConvertedRangesAbbrevs.find(Abbrev) != + ConvertedRangesAbbrevs.end()) { + // Exit critical section early. 
+ Lock.unlock(); + convertToRanges(DIE, FunctionRanges, DebugInfoPatcher); + } else { + if (FunctionRanges.empty()) + FunctionRanges.emplace_back(DebugAddressRange()); + addToPendingRanges(Abbrev, DIE, FunctionRanges, Unit.getDWOId()); + } + } + break; + } + case dwarf::DW_TAG_lexical_block: + case dwarf::DW_TAG_inlined_subroutine: + case dwarf::DW_TAG_try_block: + case dwarf::DW_TAG_catch_block: { + uint64_t RangesSectionOffset = + RangesSectionWriter->getEmptyRangesOffset(); + Expected RangesOrError = DIE.getAddressRanges(); + const BinaryFunction *Function = RangesOrError && !RangesOrError->empty() + ? BC.getBinaryFunctionContainingAddress(RangesOrError->front().LowPC) + : nullptr; + if (Function) { + DebugAddressRangesVector OutputRanges = + Function->translateInputToOutputRanges(*RangesOrError); + LLVM_DEBUG(if (OutputRanges.empty() != RangesOrError->empty()) { + dbgs() << "BOLT-DEBUG: problem with DIE at 0x" + << Twine::utohexstr(DIE.getOffset()) << " in CU at 0x" + << Twine::utohexstr(Unit.getOffset()) << '\n'; + }); + RangesSectionOffset = RangesSectionWriter->addRanges( + std::move(OutputRanges), CachedRanges); + } else if (!RangesOrError) { + consumeError(RangesOrError.takeError()); + } + updateDWARFObjectAddressRanges(DIE, RangesSectionOffset, DebugInfoPatcher, + AbbrevPatcher); + break; + } + default: { + // Handle any tag that can have DW_AT_location attribute. + DWARFFormValue Value; + uint64_t AttrOffset; + if (Optional V = + DIE.find(dwarf::DW_AT_location, &AttrOffset)) { + Value = *V; + if (Value.isFormClass(DWARFFormValue::FC_Constant) || + Value.isFormClass(DWARFFormValue::FC_SectionOffset)) { + uint64_t Offset = Value.isFormClass(DWARFFormValue::FC_Constant) + ? Value.getAsUnsignedConstant().getValue() + : Value.getAsSectionOffset().getValue(); + DebugLocationsVector InputLL; + + Optional SectionAddress = + Unit.getBaseAddress(); + uint64_t BaseAddress = 0; + if (SectionAddress) + BaseAddress = SectionAddress->Address; + + Error E = Unit.getLocationTable().visitLocationList( + &Offset, [&](const DWARFLocationEntry &Entry) { + switch (Entry.Kind) { + default: + llvm_unreachable("Unsupported DWARFLocationEntry Kind."); + case dwarf::DW_LLE_end_of_list: + return false; + case dwarf::DW_LLE_base_address: + assert(Entry.SectionIndex == SectionedAddress::UndefSection && + "absolute address expected"); + BaseAddress = Entry.Value0; + break; + case dwarf::DW_LLE_offset_pair: + assert( + (Entry.SectionIndex == SectionedAddress::UndefSection && + !Unit.isDWOUnit()) && + "absolute address expected"); + InputLL.emplace_back(DebugLocationEntry{ + BaseAddress + Entry.Value0, BaseAddress + Entry.Value1, + Entry.Loc}); + break; + case dwarf::DW_LLE_startx_length: + assert(Unit.isDWOUnit() && + "None DWO Unit with DW_LLE_startx_length encoding."); + Optional EntryAddress = + Unit.getAddrOffsetSectionItem(Entry.Value0); + assert(EntryAddress && "Address does not exist."); + InputLL.emplace_back(DebugLocationEntry{ + EntryAddress->Address, + EntryAddress->Address + Entry.Value1, Entry.Loc}); + break; + } + return true; + }); + + uint64_t OutputLocListOffset = DebugLocWriter::EmptyListTag; + if (E || InputLL.empty()) { + errs() << "BOLT-WARNING: empty location list detected at 0x" + << Twine::utohexstr(Offset) << " for DIE at 0x" + << Twine::utohexstr(DIE.getOffset()) << " in CU at 0x" + << Twine::utohexstr(Unit.getOffset()) << '\n'; + } else { + const uint64_t Address = InputLL.front().LowPC; + if (const BinaryFunction *Function = + BC.getBinaryFunctionContainingAddress(Address)) { + 
const DebugLocationsVector OutputLL = Function + ->translateInputToOutputLocationList(InputLL); + LLVM_DEBUG(if (OutputLL.empty()) { + dbgs() << "BOLT-DEBUG: location list translated to an empty " + "one at 0x" + << Twine::utohexstr(DIE.getOffset()) << " in CU at 0x" + << Twine::utohexstr(Unit.getOffset()) << '\n'; + }); + OutputLocListOffset = DebugLocWriter.addList(OutputLL); + } + } + + if (OutputLocListOffset != DebugLocWriter::EmptyListTag) { + std::lock_guard<std::mutex> Lock(LocListDebugInfoPatchesMutex); + if (Unit.isDWOUnit()) { + // Not sure if better approach is to hide all of this away in a + // class. Also re-using LocListDebugInfoPatchType. Wasting some + // space for DWOID/CUIndex. + DwoLocListDebugInfoPatches[CUIndex].push_back( + {AttrOffset, CUIndex, OutputLocListOffset}); + } else { + LocListDebugInfoPatches.push_back( + {AttrOffset, CUIndex, OutputLocListOffset}); + } + } else { + std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex); + DebugInfoPatcher.addLE32Patch(AttrOffset, + DebugLocWriter::EmptyListOffset); + } + } else { + assert((Value.isFormClass(DWARFFormValue::FC_Exprloc) || + Value.isFormClass(DWARFFormValue::FC_Block)) && + "unexpected DW_AT_location form"); + if (Unit.isDWOUnit()) { + ArrayRef<uint8_t> Expr = *Value.getAsBlock(); + DataExtractor Data( + StringRef((const char *)Expr.data(), Expr.size()), + Unit.getContext().isLittleEndian(), 0); + DWARFExpression LocExpr(Data, Unit.getAddressByteSize(), + Unit.getFormParams().Format); + for (auto &Expr : LocExpr) { + if (Expr.getCode() != dwarf::DW_OP_GNU_addr_index) + continue; + uint64_t Index = Expr.getRawOperand(0); + Optional<object::SectionedAddress> EntryAddress = + Unit.getAddrOffsetSectionItem(Index); + assert(EntryAddress && "Address is not found."); + assert(Index <= std::numeric_limits<uint32_t>::max() && + "Invalid Operand Index."); + AddrWriter->addIndexAddress(EntryAddress->Address, + static_cast<uint32_t>(Index), + *Unit.getDWOId()); + } + } + } + } else if (Optional<DWARFFormValue> V = + DIE.find(dwarf::DW_AT_low_pc, &AttrOffset)) { + Value = *V; + const Optional<uint64_t> Result = Value.getAsAddress(); + if (Result.hasValue()) { + const uint64_t Address = Result.getValue(); + uint64_t NewAddress = 0; + if (const BinaryFunction *Function = + BC.getBinaryFunctionContainingAddress(Address)) { + NewAddress = Function->translateInputToOutputAddress(Address); + LLVM_DEBUG(dbgs() + << "BOLT-DEBUG: Fixing low_pc 0x" + << Twine::utohexstr(Address) << " for DIE with tag " + << DIE.getTag() << " to 0x" + << Twine::utohexstr(NewAddress) << '\n'); + } + + dwarf::Form Form = Value.getForm(); + assert(Form != dwarf::DW_FORM_LLVM_addrx_offset && + "DW_FORM_LLVM_addrx_offset is not supported"); + std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex); + if (Form == dwarf::DW_FORM_GNU_addr_index) { + assert(Unit.isDWOUnit() && + "DW_FORM_GNU_addr_index in non-DWO unit."); + uint64_t Index = Value.getRawUValue(); + // If there is no new address, store the old address. + // Re-using Index to make the implementation easier. + // DW_FORM_GNU_addr_index is a variable length encoding, so we either + // have to create indices of the same sizes, or use the same index. + AddrWriter->addIndexAddress(NewAddress ? 
NewAddress : Address, + Index, *Unit.getDWOId()); + } else { + DebugInfoPatcher.addLE64Patch(AttrOffset, NewAddress); + } + } else if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: unexpected form value for attribute at 0x" + << Twine::utohexstr(AttrOffset); + } + } + } + } + } + + if (DIEOffset > NextCUOffset) { + errs() << "BOLT-WARNING: corrupt DWARF detected at 0x" + << Twine::utohexstr(Unit.getOffset()) << '\n'; + } +} + +void DWARFRewriter::updateDWARFObjectAddressRanges( + const DWARFDie DIE, uint64_t DebugRangesOffset, + SimpleBinaryPatcher &DebugInfoPatcher, DebugAbbrevPatcher &AbbrevPatcher) { + + // Some objects don't have an associated DIE and cannot be updated (such as + // compiler-generated functions). + if (!DIE) { + return; + } + + const DWARFAbbreviationDeclaration *AbbreviationDecl = + DIE.getAbbreviationDeclarationPtr(); + if (!AbbreviationDecl) { + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: object's DIE doesn't have an abbreviation: " + << "skipping update. DIE at offset 0x" + << Twine::utohexstr(DIE.getOffset()) << '\n'; + } + return; + } + + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges)) { + // Case 1: The object was already non-contiguous and had DW_AT_ranges. + // In this case we simply need to update the value of DW_AT_ranges. + uint64_t AttrOffset = -1U; + DIE.find(dwarf::DW_AT_ranges, &AttrOffset); + assert(AttrOffset != -1U && "failed to locate DWARF attribute"); + + std::lock_guard Lock(DebugInfoPatcherMutex); + DebugInfoPatcher.addLE32Patch( + AttrOffset, DebugRangesOffset - DebugInfoPatcher.getRangeBase()); + } else { + // Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc emitted back + // to back. We replace the attributes with DW_AT_ranges and DW_AT_low_pc. + // The low_pc attribute is required for DW_TAG_compile_units to set a base + // address. + // + // Since DW_AT_ranges takes 4-byte DW_FROM_sec_offset value, we have to fill + // in up to 12-bytes left after removal of low/high pc field from + // .debug_info. + // + // To fill in the gap we use a variable length DW_FORM_udata encoding for + // DW_AT_low_pc. We exploit the fact that the encoding can take an arbitrary + // large size. + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) && + AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc)) { + convertToRanges(AbbreviationDecl, AbbrevPatcher); + convertToRanges(DIE, DebugRangesOffset, DebugInfoPatcher); + } else { + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" + << Twine::utohexstr(DIE.getOffset()) << '\n'; + } + } + } +} + +void DWARFRewriter::updateLineTableOffsets() { + const MCSection *LineSection = + BC.Ctx->getObjectFileInfo()->getDwarfLineSection(); + auto CurrentFragment = LineSection->begin(); + uint64_t CurrentOffset = 0; + uint64_t Offset = 0; + + ErrorOr DbgInfoSection = + BC.getUniqueSectionByName(".debug_info"); + ErrorOr TypeInfoSection = + BC.getUniqueSectionByName(".debug_types"); + assert(((BC.DwCtx->getNumTypeUnits() > 0 && TypeInfoSection) || + BC.DwCtx->getNumTypeUnits() == 0) && + "Was not able to retrieve Debug Types section."); + + // There is no direct connection between CU and TU, but same offsets, + // encoded in DW_AT_stmt_list, into .debug_line get modified. + // We take advantage of that to map original CU line table offsets to new + // ones. 
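+  // A hypothetical illustration of the mapping built below: if a CU's
+  // DW_AT_stmt_list held 0x1200 in the input and its line table starts at
+  // 0x1480 in the output, we record DebugLineOffsetMap[0x1200] = 0x1480; a
+  // type unit whose DW_AT_stmt_list is also 0x1200 is later patched to
+  // 0x1480 without walking the fragments again.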
+ std::unordered_map<uint64_t, uint64_t> DebugLineOffsetMap; + + auto GetStatementListValue = [](DWARFUnit *Unit) { + Optional<DWARFFormValue> StmtList = + Unit->getUnitDIE().find(dwarf::DW_AT_stmt_list); + Optional<uint64_t> Offset = dwarf::toSectionOffset(StmtList); + assert(Offset && "Was not able to retrieve value of DW_AT_stmt_list."); + return *Offset; + }; + + for (const std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units()) { + const unsigned CUID = CU->getOffset(); + MCSymbol *Label = BC.Ctx->getMCDwarfLineTable(CUID).getLabel(); + if (!Label) + continue; + + const uint64_t LTOffset = + BC.DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_stmt_list); + if (!LTOffset) + continue; + + // Line tables are stored in MCContext in ascending order of offset in the + // output file, thus we can compute all tables' offsets by passing through + // each fragment at most once, continuing from the last CU's beginning + // instead of from the first fragment. + MCFragment *Fragment = Label->getFragment(); + while (&*CurrentFragment != Fragment) { + switch (CurrentFragment->getKind()) { + case MCFragment::FT_Dwarf: + Offset += cast<MCDwarfLineAddrFragment>(*CurrentFragment) + .getContents().size() - CurrentOffset; + break; + case MCFragment::FT_Data: + Offset += cast<MCDataFragment>(*CurrentFragment) + .getContents().size() - CurrentOffset; + break; + default: + llvm_unreachable(".debug_line section shouldn't contain other types " + "of fragments."); + } + ++CurrentFragment; + CurrentOffset = 0; + } + + Offset += Label->getOffset() - CurrentOffset; + CurrentOffset = Label->getOffset(); + + DebugLineOffsetMap[GetStatementListValue(CU.get())] = Offset; + assert(DbgInfoSection && ".debug_info section must exist"); + DbgInfoSection->addRelocation(LTOffset, + nullptr, + ELF::R_X86_64_32, + Offset, + 0, + /*Pending=*/true); + + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUID + << " has line table at " << Offset << "\n"); + } + + for (const std::unique_ptr<DWARFUnit> &TU : BC.DwCtx->types_section_units()) { + DWARFUnit *Unit = TU.get(); + const uint64_t LTOffset = + BC.DwCtx->getAttrFieldOffsetForUnit(Unit, dwarf::DW_AT_stmt_list); + if (!LTOffset) + continue; + auto Iter = DebugLineOffsetMap.find(GetStatementListValue(Unit)); + assert(Iter != DebugLineOffsetMap.end() && + "Type Unit Updated Line Number Entry does not exist."); + TypeInfoSection->addRelocation(LTOffset, nullptr, ELF::R_X86_64_32, + Iter->second, 0, /*Pending=*/true); + } + + // Set .debug_info as finalized so it won't be skipped over when + // we process sections while writing out the new binary. This ensures + // that the pending relocations will be processed and not ignored. + if (DbgInfoSection) + DbgInfoSection->setIsFinalized(); + + if (TypeInfoSection) + TypeInfoSection->setIsFinalized(); +} + +void DWARFRewriter::finalizeDebugSections( + SimpleBinaryPatcher &DebugInfoPatcher) { + // Skip .debug_aranges if we are re-generating .gdb_index. 
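+  // The decision below, spelled out:
+  //
+  //   KeepARanges | input has .gdb_index | emit .debug_aranges?
+  //   ------------+----------------------+---------------------
+  //   true        | either               | yes
+  //   false       | no                   | yes
+  //   false       | yes                  | no (the index carries the ranges)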
+ if (opts::KeepARanges || !BC.getGdbIndexSection()) { + SmallVector ARangesBuffer; + raw_svector_ostream OS(ARangesBuffer); + + auto MAB = std::unique_ptr<MCAsmBackend>(BC.TheTarget->createMCAsmBackend( + *BC.STI, *BC.MRI, MCTargetOptions())); + + ARangesSectionWriter->writeARangesSection(OS); + const StringRef &ARangesContents = OS.str(); + + BC.registerOrUpdateNoteSection(".debug_aranges", + copyByteArray(ARangesContents), + ARangesContents.size()); + } + + if (StrWriter->isInitialized()) { + RewriteInstance::addToDebugSectionsToOverwrite(".debug_str"); + std::unique_ptr DebugStrSectionContents = + StrWriter->finalize(); + BC.registerOrUpdateNoteSection(".debug_str", + copyByteArray(*DebugStrSectionContents), + DebugStrSectionContents->size()); + } + + if (AddrWriter->isInitialized()) { + AddressSectionBuffer AddressSectionContents = AddrWriter->finalize(); + BC.registerOrUpdateNoteSection(".debug_addr", + copyByteArray(AddressSectionContents), + AddressSectionContents.size()); + for (auto &CU : BC.DwCtx->compile_units()) { + DWARFDie DIE = CU->getUnitDIE(); + uint64_t AttrOffset = 0; + if (Optional<DWARFFormValue> Val = + DIE.find(dwarf::DW_AT_GNU_addr_base, &AttrOffset)) { + uint64_t Offset = AddrWriter->getOffset(*CU->getDWOId()); + DebugInfoPatcher.addLE32Patch(AttrOffset, static_cast<uint32_t>(Offset)); + } + } + } + + std::unique_ptr RangesSectionContents = + RangesSectionWriter->finalize(); + BC.registerOrUpdateNoteSection(".debug_ranges", + copyByteArray(*RangesSectionContents), + RangesSectionContents->size()); + + std::unique_ptr LocationListSectionContents = + makeFinalLocListsSection(DebugInfoPatcher); + BC.registerOrUpdateNoteSection(".debug_loc", + copyByteArray(*LocationListSectionContents), + LocationListSectionContents->size()); +} + +void DWARFRewriter::writeOutDWOFiles( + std::unordered_map<uint64_t, std::string> &DWOIdToName) { + std::string DebugData = ""; + auto applyPatch = [&](BinaryPatcher *Patcher, StringRef Data, + uint32_t Offset) -> StringRef { + DebugData = Data.str(); + Patcher->patchBinary(DebugData, Offset); + return StringRef(DebugData.c_str(), DebugData.size()); + }; + + using DWOSectionContribution = + const DWARFUnitIndex::Entry::SectionContribution; + auto getSliceData = [&](const DWARFUnitIndex::Entry *DWOEntry, + StringRef OutData, DWARFSectionKind Sec, + uint32_t &DWPOffset) -> StringRef { + if (DWOEntry) { + DWOSectionContribution *DWOContribution = DWOEntry->getContribution(Sec); + DWPOffset = DWOContribution->Offset; + OutData = OutData.substr(DWPOffset, DWOContribution->Length); + } + return OutData; + }; + + // Setup DWP code once. + DWARFContext *DWOCtx = BC.getDWOContext(); + const DWARFUnitIndex *CUIndex = nullptr; + bool IsDWP = false; + if (DWOCtx) { + CUIndex = &DWOCtx->getCUIndex(); + IsDWP = !CUIndex->getRows().empty(); + } + + for (const std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units()) { + Optional<uint64_t> DWOId = CU->getDWOId(); + if (!DWOId) + continue; + + Optional<DWARFUnit *> DWOCU = BC.getDWOCU(*DWOId); + if (!DWOCU) + continue; + + const object::ObjectFile *File = + (*DWOCU)->getContext().getDWARFObj().getFile(); + std::string CompDir = opts::DwoOutputPath.empty() + ? CU->getCompilationDir() + : opts::DwoOutputPath.c_str(); + std::string ObjectName = getDWOName(*CU.get(), nullptr, DWOIdToName); + auto FullPath = CompDir.append("/").append(ObjectName); + + std::error_code EC; + std::unique_ptr<ToolOutputFile> TempOut = + std::make_unique<ToolOutputFile>(FullPath, EC, sys::fs::OF_None); + + std::unique_ptr<BinaryContext> TmpBC = BinaryContext::createBinaryContext( + File, false, + DWARFContext::create(*File, nullptr, "", WithColor::defaultErrorHandler, + WithColor::defaultWarningHandler, + /*UsesRelocs=*/false)); + std::unique_ptr<MCStreamer> Streamer = TmpBC->createStreamer(TempOut->os()); + const MCObjectFileInfo &MCOFI = *Streamer->getContext().getObjectFileInfo(); + + const DWARFUnitIndex::Entry *DWOEntry = nullptr; + if (IsDWP) + DWOEntry = CUIndex->getFromHash(*DWOId); + + const StringMap<MCSection *> KnownSections = { + {".debug_info.dwo", MCOFI.getDwarfInfoDWOSection()}, + {".debug_types.dwo", MCOFI.getDwarfTypesDWOSection()}, + {".debug_str.dwo", MCOFI.getDwarfStrDWOSection()}, + {".debug_str_offsets.dwo", MCOFI.getDwarfStrOffDWOSection()}, + {".debug_abbrev.dwo", MCOFI.getDwarfAbbrevDWOSection()}, + {".debug_loc.dwo", MCOFI.getDwarfLocDWOSection()}, + {".debug_line.dwo", MCOFI.getDwarfLineDWOSection()}}; + + for (const SectionRef &Section : File->sections()) { + Expected<StringRef> SectionName = Section.getName(); + assert(SectionName && "Invalid section name."); + auto SectionIter = KnownSections.find(*SectionName); + if (SectionIter == KnownSections.end()) + continue; + Streamer->SwitchSection(SectionIter->second); + Expected<StringRef> Contents = Section.getContents(); + assert(Contents && "Invalid contents."); + StringRef OutData(*Contents); + std::unique_ptr Data; + uint32_t DWPOffset = 0; + + if (SectionName->equals(".debug_info.dwo")) { + OutData = getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_INFO, DWPOffset); + SimpleBinaryPatcher *Patcher = getBinaryDWODebugInfoPatcher(*DWOId); + OutData = applyPatch(Patcher, OutData, DWPOffset); + } else if (SectionName->equals(".debug_types.dwo")) { + OutData = getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_EXT_TYPES, DWPOffset); + } else if (SectionName->equals(".debug_str.dwo")) { + OutData = (*DWOCU)->getStringSection(); + } else if (SectionName->equals(".debug_str_offsets.dwo")) { + OutData = + getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_STR_OFFSETS, DWPOffset); + } else if (SectionName->equals(".debug_abbrev.dwo")) { + OutData = getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_ABBREV, DWPOffset); + DebugAbbrevPatcher *Patcher = getBinaryDWOAbbrevPatcher(*DWOId); + OutData = applyPatch(Patcher, OutData, DWPOffset); + } else if (SectionName->equals(".debug_loc.dwo")) { + DebugLocWriter *LocWriter = LocListWritersByCU[*DWOId].get(); + Data = LocWriter->finalize(); + // Create the StringRef explicitly here; otherwise the implicit + // conversion would treat a null byte as the end of the string. + OutData = StringRef(reinterpret_cast<const char *>(Data->data()), + Data->size()); + } else if (SectionName->equals(".debug_line.dwo")) { + OutData = getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_LINE, DWPOffset); + } else { + errs() << "BOLT-WARNING: Unsupported Debug section: " << *SectionName + << "\n"; + } + + Streamer->emitBytes(OutData); + } + Streamer->Finish(); + TempOut->keep(); + } +} + +void DWARFRewriter::updateGdbIndexSection() { + if (!BC.getGdbIndexSection()) + return; + + // See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html for + // .gdb_index section format. 
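+  // Header layout assumed by the parsing below (all fields little-endian;
+  // the struct is illustrative only, the code reads raw offsets instead):
+  //
+  //   struct GdbIndexHeader {          // 24 bytes total
+  //     uint32_t Version;              // only versions 7 and 8 are handled
+  //     uint32_t CUListOffset;         // CU list entries: 16 bytes each
+  //     uint32_t CUTypesOffset;
+  //     uint32_t AddressTableOffset;   // entries: u64 LowPC, u64 HighPC,
+  //     uint32_t SymbolTableOffset;    //          u32 CU index (20 bytes)
+  //     uint32_t ConstantPoolOffset;
+  //   };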
+
+  StringRef GdbIndexContents = BC.getGdbIndexSection()->getContents();
+
+  const char *Data = GdbIndexContents.data();
+
+  // Parse the header.
+  const uint32_t Version = read32le(Data);
+  if (Version != 7 && Version != 8) {
+    errs() << "BOLT-ERROR: can only process .gdb_index versions 7 and 8\n";
+    exit(1);
+  }
+
+  // Some .gdb_index generators use file offsets while others use section
+  // offsets. Hence we can only rely on offsets relative to each other,
+  // and ignore their absolute values.
+  const uint32_t CUListOffset = read32le(Data + 4);
+  const uint32_t CUTypesOffset = read32le(Data + 8);
+  const uint32_t AddressTableOffset = read32le(Data + 12);
+  const uint32_t SymbolTableOffset = read32le(Data + 16);
+  const uint32_t ConstantPoolOffset = read32le(Data + 20);
+  Data += 24;
+
+  // Map CU offsets to indices and verify the existing index table.
+  std::map<uint64_t, uint32_t> OffsetToIndexMap;
+  const uint32_t CUListSize = CUTypesOffset - CUListOffset;
+  const unsigned NumCUs = BC.DwCtx->getNumCompileUnits();
+  if (CUListSize != NumCUs * 16) {
+    errs() << "BOLT-ERROR: .gdb_index: CU count mismatch\n";
+    exit(1);
+  }
+  for (unsigned Index = 0; Index < NumCUs; ++Index, Data += 16) {
+    const DWARFUnit *CU = BC.DwCtx->getUnitAtIndex(Index);
+    const uint64_t Offset = read64le(Data);
+    if (CU->getOffset() != Offset) {
+      errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n";
+      exit(1);
+    }
+
+    OffsetToIndexMap[Offset] = Index;
+  }
+
+  // Ignore the old address table.
+  const uint32_t OldAddressTableSize = SymbolTableOffset - AddressTableOffset;
+  // Move Data to the beginning of the symbol table.
+  Data += SymbolTableOffset - CUTypesOffset;
+
+  // Calculate the size of the new address table.
+  uint32_t NewAddressTableSize = 0;
+  for (const auto &CURangesPair : ARangesSectionWriter->getCUAddressRanges()) {
+    const SmallVector<DebugAddressRange, 2> &Ranges = CURangesPair.second;
+    NewAddressTableSize += Ranges.size() * 20;
+  }
+
+  // Difference between old and new table (and section) sizes.
+  // Could be negative.
+  int32_t Delta = NewAddressTableSize - OldAddressTableSize;
+
+  size_t NewGdbIndexSize = GdbIndexContents.size() + Delta;
+
+  // Freed by ExecutableFileMemoryManager.
+  auto *NewGdbIndexContents = new uint8_t[NewGdbIndexSize];
+  uint8_t *Buffer = NewGdbIndexContents;
+
+  write32le(Buffer, Version);
+  write32le(Buffer + 4, CUListOffset);
+  write32le(Buffer + 8, CUTypesOffset);
+  write32le(Buffer + 12, AddressTableOffset);
+  write32le(Buffer + 16, SymbolTableOffset + Delta);
+  write32le(Buffer + 20, ConstantPoolOffset + Delta);
+  Buffer += 24;
+
+  // Copy over the CU list and the types CU list.
+  memcpy(Buffer, GdbIndexContents.data() + 24,
+         AddressTableOffset - CUListOffset);
+  Buffer += AddressTableOffset - CUListOffset;
+
+  // Generate the new address table.
+  for (const std::pair<const uint64_t, DebugAddressRangesVector> &CURangesPair :
+       ARangesSectionWriter->getCUAddressRanges()) {
+    const uint32_t CUIndex = OffsetToIndexMap[CURangesPair.first];
+    const DebugAddressRangesVector &Ranges = CURangesPair.second;
+    for (const DebugAddressRange &Range : Ranges) {
+      write64le(Buffer, Range.LowPC);
+      write64le(Buffer + 8, Range.HighPC);
+      write32le(Buffer + 16, CUIndex);
+      Buffer += 20;
+    }
+  }
+
+  const size_t TrailingSize =
+      GdbIndexContents.data() + GdbIndexContents.size() - Data;
+  assert(Buffer + TrailingSize == NewGdbIndexContents + NewGdbIndexSize &&
+         "size calculation error");
+
+  // Copy over the rest of the original data.
+  memcpy(Buffer, Data, TrailingSize);
+
+  // Register the new section.
+  BC.registerOrUpdateNoteSection(".gdb_index",
+                                 NewGdbIndexContents,
+                                 NewGdbIndexSize);
+}
+
+void DWARFRewriter::convertToRanges(const DWARFAbbreviationDeclaration *Abbrev,
+                                    DebugAbbrevPatcher &AbbrevPatcher) {
+  dwarf::Form HighPCForm = Abbrev->findAttribute(dwarf::DW_AT_high_pc)->Form;
+  dwarf::Form LowPCForm = Abbrev->findAttribute(dwarf::DW_AT_low_pc)->Form;
+
+  std::lock_guard<std::mutex> Lock(AbbrevPatcherMutex);
+  // DW_FORM_GNU_addr_index is already a variable-length encoding, so there is
+  // nothing to do in that case. If HighPCForm is 8 bytes, low_pc needs to be
+  // changed to a variable-length encoding to consume the extra bytes from
+  // high_pc, since DW_FORM_sec_offset is only 4 bytes for DWARF32.
+  if (LowPCForm != dwarf::DW_FORM_GNU_addr_index &&
+      isHighPcFormEightBytes(HighPCForm))
+    AbbrevPatcher.addAttributePatch(Abbrev, dwarf::DW_AT_low_pc,
+                                    dwarf::DW_AT_low_pc,
+                                    dwarf::DW_FORM_indirect);
+
+  AbbrevPatcher.addAttributePatch(Abbrev, dwarf::DW_AT_high_pc,
+                                  dwarf::DW_AT_ranges,
+                                  dwarf::DW_FORM_sec_offset);
+}
+
+void DWARFRewriter::convertToRanges(DWARFDie DIE,
+                                    const DebugAddressRangesVector &Ranges,
+                                    SimpleBinaryPatcher &DebugInfoPatcher) {
+  uint64_t RangesSectionOffset;
+  if (Ranges.empty()) {
+    RangesSectionOffset = RangesSectionWriter->getEmptyRangesOffset();
+  } else {
+    RangesSectionOffset = RangesSectionWriter->addRanges(Ranges);
+  }
+
+  convertToRanges(DIE, RangesSectionOffset, DebugInfoPatcher);
+}
+
+void DWARFRewriter::convertPending(const DWARFAbbreviationDeclaration *Abbrev,
+                                   SimpleBinaryPatcher &DebugInfoPatcher,
+                                   DebugAbbrevPatcher &AbbrevPatcher) {
+  if (ConvertedRangesAbbrevs.count(Abbrev))
+    return;
+
+  convertToRanges(Abbrev, AbbrevPatcher);
+
+  auto I = PendingRanges.find(Abbrev);
+  if (I != PendingRanges.end()) {
+    for (std::pair<DWARFDieWrapper, DebugAddressRange> &Pair : I->second) {
+      convertToRanges(Pair.first, {Pair.second}, DebugInfoPatcher);
+    }
+    PendingRanges.erase(I);
+  }
+
+  ConvertedRangesAbbrevs.emplace(Abbrev);
+}
+
+void DWARFRewriter::addToPendingRanges(
+    const DWARFAbbreviationDeclaration *Abbrev, DWARFDie DIE,
+    DebugAddressRangesVector &FunctionRanges, Optional<uint64_t> DWOId) {
+  Optional<DWARFFormValue> LowPcValue = DIE.find(dwarf::DW_AT_low_pc);
+  Optional<DWARFFormValue> HighPcValue = DIE.find(dwarf::DW_AT_high_pc);
+  if (LowPcValue &&
+      LowPcValue->getForm() == dwarf::Form::DW_FORM_GNU_addr_index) {
+    assert(DWOId && "Invalid DWO ID.");
+    (void)DWOId;
+    assert(HighPcValue && "Low PC exists, but not High PC.");
+    (void)HighPcValue;
+    uint64_t IndexL = LowPcValue->getRawUValue();
+    uint64_t IndexH = HighPcValue->getRawUValue();
+    for (auto Address : FunctionRanges) {
+      AddrWriter->addIndexAddress(Address.LowPC, IndexL, *DWOId);
+      // DWARF spec, Section 2.17.2:
+      // If the value of the DW_AT_high_pc is of class address, it is the
+      // relocated address of the first location past the last instruction
+      // associated with the entity; if it is of class constant, the value is
+      // an unsigned integer offset which, when added to the low PC, gives the
+      // address of the first location past the last instruction associated
+      // with the entity.
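+      // Hence, when high_pc is of the constant class, only low_pc is a real
+      // address, and only IndexL needs a .debug_addr entry.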
+      if (!HighPcValue->isFormClass(DWARFFormValue::FC_Constant))
+        AddrWriter->addIndexAddress(Address.HighPC, IndexH, *DWOId);
+    }
+  }
+  PendingRanges[Abbrev].emplace_back(
+      std::make_pair(DWARFDieWrapper(DIE), FunctionRanges.front()));
+}
+
+std::unique_ptr<DebugBufferVector>
+DWARFRewriter::makeFinalLocListsSection(SimpleBinaryPatcher &DebugInfoPatcher) {
+  auto LocBuffer = std::make_unique<DebugBufferVector>();
+  auto LocStream = std::make_unique<raw_svector_ostream>(*LocBuffer);
+  auto Writer =
+      std::unique_ptr<MCObjectWriter>(BC.createObjectWriter(*LocStream));
+
+  uint64_t SectionOffset = 0;
+
+  // Add an empty list as the first entry.
+  const char Zeroes[16] = {0};
+  *LocStream << StringRef(Zeroes, 16);
+  SectionOffset += 2 * 8;
+
+  std::unordered_map<uint64_t, uint64_t> SectionOffsetByCU(
+      LocListWritersByCU.size());
+
+  for (std::pair<const uint64_t, std::unique_ptr<DebugLocWriter>> &Loc :
+       LocListWritersByCU) {
+    uint64_t CUIndex = Loc.first;
+    DebugLocWriter *LocWriter = Loc.second.get();
+    if (llvm::isa<DebugLoclistWriter>(*LocWriter))
+      continue;
+    SectionOffsetByCU[CUIndex] = SectionOffset;
+    std::unique_ptr<DebugBufferVector> CurrCULocationLists =
+        LocWriter->finalize();
+    *LocStream << *CurrCULocationLists;
+    SectionOffset += CurrCULocationLists->size();
+  }
+
+  for (std::pair<const uint64_t, VectorLocListDebugInfoPatchType> &Iter :
+       DwoLocListDebugInfoPatches) {
+    uint64_t DWOId = Iter.first;
+    SimpleBinaryPatcher *Patcher = getBinaryDWODebugInfoPatcher(DWOId);
+    for (LocListDebugInfoPatchType &Patch : Iter.second) {
+      Patcher->addLE32Patch(Patch.DebugInfoOffset,
+                            SectionOffsetByCU[Patch.CUIndex] +
+                                Patch.CUWriterOffset);
+    }
+  }
+
+  for (LocListDebugInfoPatchType &Patch : LocListDebugInfoPatches) {
+    DebugInfoPatcher.addLE32Patch(Patch.DebugInfoOffset,
+                                  SectionOffsetByCU[Patch.CUIndex] +
+                                      Patch.CUWriterOffset);
+  }
+
+  return LocBuffer;
+}
+
+void DWARFRewriter::flushPendingRanges(SimpleBinaryPatcher &DebugInfoPatcher) {
+  for (std::pair<const DWARFAbbreviationDeclaration *const,
+                 std::vector<std::pair<DWARFDieWrapper, DebugAddressRange>>>
+           &I : PendingRanges) {
+    for (std::pair<DWARFDieWrapper, DebugAddressRange> &RangePair : I.second) {
+      patchLowHigh(RangePair.first, RangePair.second, DebugInfoPatcher);
+    }
+  }
+  clearList(PendingRanges);
+}
+
+namespace {
+
+void getRangeAttrData(
+    DWARFDie DIE,
+    uint64_t &LowPCOffset, uint64_t &HighPCOffset,
+    DWARFFormValue &LowPCFormValue, DWARFFormValue &HighPCFormValue) {
+  LowPCOffset = -1U;
+  HighPCOffset = -1U;
+  LowPCFormValue = *DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset);
+  HighPCFormValue = *DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset);
+
+  if ((LowPCFormValue.getForm() != dwarf::DW_FORM_addr &&
+       LowPCFormValue.getForm() != dwarf::DW_FORM_GNU_addr_index) ||
+      (HighPCFormValue.getForm() != dwarf::DW_FORM_addr &&
+       HighPCFormValue.getForm() != dwarf::DW_FORM_data8 &&
+       HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) {
+    errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE "
+           << "at offset 0x" << Twine::utohexstr(DIE.getOffset()) << "\n";
+    return;
+  }
+  if ((LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) &&
+      LowPCFormValue.getForm() != dwarf::DW_FORM_GNU_addr_index) {
+    errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. "
" + << "Cannot update DIE at offset 0x" + << Twine::utohexstr(DIE.getOffset()) << '\n'; + return; + } +} + +} + +void DWARFRewriter::patchLowHigh(DWARFDie DIE, DebugAddressRange Range, + SimpleBinaryPatcher &DebugInfoPatcher) { + uint64_t LowPCOffset, HighPCOffset; + DWARFFormValue LowPCFormValue, HighPCFormValue; + getRangeAttrData( + DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue); + auto *TempDebugPatcher = &DebugInfoPatcher; + if (LowPCFormValue.getForm() == dwarf::DW_FORM_GNU_addr_index) { + DWARFUnit *Unit = DIE.getDwarfUnit(); + assert(Unit->isDWOUnit() && "DW_FORM_GNU_addr_index not part of DWO."); + uint32_t AddressIndex = + AddrWriter->getIndexFromAddress(Range.LowPC, *Unit->getDWOId()); + TempDebugPatcher = getBinaryDWODebugInfoPatcher(*Unit->getDWOId()); + TempDebugPatcher->addUDataPatch(LowPCOffset, AddressIndex, + std::abs(int(HighPCOffset - LowPCOffset))); + // TODO: In DWARF5 support ULEB128 for high_pc + } else { + TempDebugPatcher->addLE64Patch(LowPCOffset, Range.LowPC); + } + + if (isHighPcFormEightBytes(HighPCFormValue.getForm())) { + TempDebugPatcher->addLE64Patch(HighPCOffset, Range.HighPC - Range.LowPC); + } else { + TempDebugPatcher->addLE32Patch(HighPCOffset, Range.HighPC - Range.LowPC); + } +} + +void DWARFRewriter::convertToRanges(DWARFDie DIE, uint64_t RangesSectionOffset, + SimpleBinaryPatcher &DebugInfoPatcher) { + uint64_t LowPCOffset, HighPCOffset; + DWARFFormValue LowPCFormValue, HighPCFormValue; + getRangeAttrData(DIE, LowPCOffset, HighPCOffset, LowPCFormValue, + HighPCFormValue); + + unsigned LowPCSize = 0; + assert(DIE.getDwarfUnit()->getAddressByteSize() == 8); + if (isHighPcFormEightBytes(HighPCFormValue.getForm())) { + LowPCSize = 12; + } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) { + LowPCSize = 8; + } else { + llvm_unreachable("unexpected form"); + } + + std::lock_guard Lock(DebugInfoPatcherMutex); + uint32_t BaseOffset = 0; + if (LowPCFormValue.getForm() == dwarf::DW_FORM_GNU_addr_index) { + // Add Indexer is already variable length encoding. + DebugInfoPatcher.addUDataPatch(LowPCOffset, 0, + std::abs(int(HighPCOffset - LowPCOffset)) + + LowPCSize - 8); + // Ranges are relative to DW_AT_GNU_ranges_base. + BaseOffset = DebugInfoPatcher.getRangeBase(); + } else if (LowPCSize == 12) { + // Creatively encoding dwarf::DW_FORM_addr in to 4 bytes. + // Write an indirect 0 value for DW_AT_low_pc so that we can fill + // 12 bytes of space. + // The Abbrev wa already changed. + DebugInfoPatcher.addUDataPatch(LowPCOffset, dwarf::DW_FORM_addr, 4); + DebugInfoPatcher.addLE64Patch(LowPCOffset + 4, 0); + } else { + DebugInfoPatcher.addLE64Patch(LowPCOffset, 0); + } + DebugInfoPatcher.addLE32Patch(HighPCOffset + LowPCSize - 8, + RangesSectionOffset - BaseOffset); +} diff --git a/bolt/src/DWARFRewriter.h b/bolt/src/DWARFRewriter.h new file mode 100644 index 000000000000..d7f7904f443d --- /dev/null +++ b/bolt/src/DWARFRewriter.h @@ -0,0 +1,229 @@ +//===--- DWARFRewriter.h --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
+#define LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
+
+#include "DebugData.h"
+#include "RewriteInstance.h"
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+namespace llvm {
+
+namespace bolt {
+
+class BinaryContext;
+
+class DWARFRewriter {
+  DWARFRewriter() = delete;
+
+  BinaryContext &BC;
+
+  std::mutex DebugInfoPatcherMutex;
+
+  std::mutex AbbrevPatcherMutex;
+
+  /// Stores and serializes information that will be put into the
+  /// .debug_ranges DWARF section.
+  std::unique_ptr<DebugRangesSectionWriter> RangesSectionWriter;
+
+  /// Stores and serializes information that will be put into the
+  /// .debug_aranges DWARF section.
+  std::unique_ptr<DebugARangesSectionWriter> ARangesSectionWriter;
+
+  /// Stores and serializes information that will be put into the
+  /// .debug_addr DWARF section.
+  std::unique_ptr<DebugAddrWriter> AddrWriter;
+
+  /// Stores and serializes information that will be put into the
+  /// .debug_str DWARF section.
+  /// Does not do de-duplication.
+  std::unique_ptr<DebugStrWriter> StrWriter;
+
+  using LocWriters =
+      std::unordered_map<uint64_t, std::unique_ptr<DebugLocWriter>>;
+  /// Use a separate location list writer for each compilation unit.
+  LocWriters LocListWritersByCU;
+
+  using DebugAbbrevDWOPatchers =
+      std::unordered_map<uint64_t, std::unique_ptr<DebugAbbrevPatcher>>;
+  /// Binary patchers for DWO Abbrev sections.
+  DebugAbbrevDWOPatchers BinaryDWOAbbrevPatchers;
+
+  using DebugInfoDWOPatchers =
+      std::unordered_map<uint64_t, std::unique_ptr<SimpleBinaryPatcher>>;
+  /// Binary patchers for DWO debug_info sections.
+  DebugInfoDWOPatchers BinaryDWODebugInfoPatchers;
+
+  struct LocListDebugInfoPatchType {
+    uint64_t DebugInfoOffset;
+    size_t CUIndex;
+    uint64_t CUWriterOffset;
+  };
+  using VectorLocListDebugInfoPatchType =
+      std::vector<LocListDebugInfoPatchType>;
+  /// The list of debug info patches to be made once individual
+  /// location list writers have been filled.
+  VectorLocListDebugInfoPatchType LocListDebugInfoPatches;
+
+  std::unordered_map<uint64_t, VectorLocListDebugInfoPatchType>
+      DwoLocListDebugInfoPatches;
+
+  std::mutex LocListDebugInfoPatchesMutex;
+
+  /// Update debug info for all DIEs in \p Unit.
+  void updateUnitDebugInfo(uint64_t CUIndex, DWARFUnit &Unit,
+                           SimpleBinaryPatcher &DebugInfoPatcher,
+                           DebugAbbrevPatcher &AbbrevPatcher);
+
+  /// Patches the binary for an object's address ranges to be updated.
+  /// The object can be anything that has associated address ranges via
+  /// either DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical
+  /// blocks, etc).
+  /// \p DebugRangesOffset is the offset in .debug_ranges of the object's
+  /// new address ranges in the output binary.
+  /// \p Unit Compile unit the object belongs to.
+  /// \p DIE is the object's DIE in the input binary.
+  void updateDWARFObjectAddressRanges(const DWARFDie DIE,
+                                      uint64_t DebugRangesOffset,
+                                      SimpleBinaryPatcher &DebugInfoPatcher,
+                                      DebugAbbrevPatcher &AbbrevPatcher);
+
+  std::unique_ptr<DebugBufferVector>
+  makeFinalLocListsSection(SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Generate new contents for the .debug_ranges and .debug_aranges sections.
+  void finalizeDebugSections(SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Patches the binary for DWARF address ranges (e.g. in functions and
+  /// lexical blocks) to be updated.
+  void updateDebugAddressRanges();
+
+  /// Rewrite .gdb_index section if present.
+  void updateGdbIndexSection();
+
+  /// Output .dwo files.
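+  /// For split-DWARF (debug fission) inputs, an updated .dwo file is written
+  /// for each CU, either next to the original (in the CU's compilation
+  /// directory) or under the directory given by opts::DwoOutputPath when set.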
+  void writeOutDWOFiles(std::unordered_map<uint64_t, std::string> &DWOIdToName);
+
+  /// Abbreviations that were converted to use DW_AT_ranges.
+  std::set<const DWARFAbbreviationDeclaration *> ConvertedRangesAbbrevs;
+
+  /// DWARFDie contains a pointer to a DIE and hence gets invalidated once the
+  /// embedded DIE is destroyed. This wrapper class stores a DIE internally and
+  /// could be cast to a DWARFDie that is valid even after the initial DIE is
+  /// destroyed.
+  struct DWARFDieWrapper {
+    DWARFUnit *Unit;
+    DWARFDebugInfoEntry DIE;
+
+    DWARFDieWrapper(DWARFUnit *Unit, DWARFDebugInfoEntry DIE) :
+      Unit(Unit),
+      DIE(DIE) {}
+
+    DWARFDieWrapper(DWARFDie &Die) :
+      Unit(Die.getDwarfUnit()),
+      DIE(*Die.getDebugInfoEntry()) {}
+
+    operator DWARFDie() {
+      return DWARFDie(Unit, &DIE);
+    }
+  };
+
+  /// DIEs with abbrevs that were not converted to DW_AT_ranges.
+  /// We only update those when all DIEs have been processed to guarantee that
+  /// the abbrev (which is shared) is intact.
+  using PendingRangesType = std::unordered_map<
+      const DWARFAbbreviationDeclaration *,
+      std::vector<std::pair<DWARFDieWrapper, DebugAddressRange>>>;
+
+  PendingRangesType PendingRanges;
+
+  /// Convert \p Abbrev from using a simple DW_AT_(low|high)_pc range to
+  /// DW_AT_ranges.
+  void convertToRanges(const DWARFAbbreviationDeclaration *Abbrev,
+                       DebugAbbrevPatcher &AbbrevPatcher);
+
+  /// Update \p DIE that was using DW_AT_(low|high)_pc with DW_AT_ranges offset.
+  void convertToRanges(DWARFDie DIE, uint64_t RangesSectionOffset,
+                       SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Same as above, but takes a vector of \p Ranges as a parameter.
+  void convertToRanges(DWARFDie DIE, const DebugAddressRangesVector &Ranges,
+                       SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Patch DW_AT_(low|high)_pc values for the \p DIE based on \p Range.
+  void patchLowHigh(DWARFDie DIE, DebugAddressRange Range,
+                    SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Convert pending ranges associated with the given \p Abbrev.
+  void convertPending(const DWARFAbbreviationDeclaration *Abbrev,
+                      SimpleBinaryPatcher &DebugInfoPatcher,
+                      DebugAbbrevPatcher &AbbrevPatcher);
+
+  /// Adds to pending ranges.
+  /// For debug fission, also adds entries to .debug_addr to handle the case
+  /// where some entries are not converted to ranges and are left as
+  /// DW_AT_low_pc/DW_AT_high_pc.
+  void addToPendingRanges(const DWARFAbbreviationDeclaration *Abbrev,
+                          DWARFDie DIE, DebugAddressRangesVector &Ranges,
+                          Optional<uint64_t> DWOId);
+
+  /// Once all DIEs were seen, update DW_AT_(low|high)_pc values.
+  void flushPendingRanges(SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Helper function for creating and returning DWO DebugInfo and Abbrev
+  /// patchers.
+  template <typename T, typename Patcher>
+  Patcher *getBinaryDWOPatcherHelper(T &BinaryPatchers, uint64_t DwoId) {
+    auto Iter = BinaryPatchers.find(DwoId);
+    if (Iter == BinaryPatchers.end()) {
+      // Using make_pair instead of {} to work around a bug in older versions
+      // of the library. https://timsong-cpp.github.io/lwg-issues/2354
+      Iter = BinaryPatchers
+                 .insert(std::make_pair(DwoId, std::make_unique<Patcher>()))
+                 .first;
+    }
+
+    return static_cast<Patcher *>(Iter->second.get());
+  }
+
+public:
+  DWARFRewriter(BinaryContext &BC) : BC(BC) {}
+
+  /// Main function for updating the DWARF debug info.
+  void updateDebugInfo();
+
+  /// Computes output .debug_line line table offsets for each compile unit,
+  /// and updates stmt_list for a corresponding compile unit.
+  void updateLineTableOffsets();
+
+  /// Returns a DWO Debug Info Patcher for the given DWO ID.
+  /// Creates a new instance if one does not already exist.
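+  /// Illustrative use (AttrOffset and NewValue are placeholders):
+  ///   SimpleBinaryPatcher *Patcher = getBinaryDWODebugInfoPatcher(*DWOId);
+  ///   Patcher->addLE32Patch(AttrOffset, NewValue);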
+  SimpleBinaryPatcher *getBinaryDWODebugInfoPatcher(uint64_t DwoId) {
+    return getBinaryDWOPatcherHelper<DebugInfoDWOPatchers, SimpleBinaryPatcher>(
+        BinaryDWODebugInfoPatchers, DwoId);
+  }
+
+  /// Returns a DWO Abbrev Patcher for the given DWO ID.
+  /// Creates a new instance if one does not already exist.
+  DebugAbbrevPatcher *getBinaryDWOAbbrevPatcher(uint64_t DwoId) {
+    return getBinaryDWOPatcherHelper<DebugAbbrevDWOPatchers, DebugAbbrevPatcher>(
+        BinaryDWOAbbrevPatchers, DwoId);
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp
new file mode 100644
index 000000000000..d560ce11ebec
--- /dev/null
+++ b/bolt/src/DataAggregator.cpp
@@ -0,0 +1,2284 @@
+//===-- DataAggregator.cpp - Perf data aggregator ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions reads profile data written by perf record,
+// aggregates it, and then writes it back to an output file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryContext.h"
+#include "BinaryFunction.h"
+#include "BoltAddressTranslation.h"
+#include "DataAggregator.h"
+#include "Heatmap.h"
+#include "Utils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/Timer.h"
+#include <map>
+#include <unordered_map>
+
+#include <unistd.h>
+
+#define DEBUG_TYPE "aggregator"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory AggregatorCategory;
+extern bool HeatmapMode;
+extern bool LinuxKernelMode;
+extern cl::SubCommand HeatmapCommand;
+extern cl::opt<bool> AggregateOnly;
+extern cl::opt<std::string> OutputFilename;
+
+static cl::opt<bool>
+BasicAggregation("nl",
+  cl::desc("aggregate basic samples (without LBR info)"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(AggregatorCategory));
+
+static cl::opt<bool>
+FilterMemProfile("filter-mem-profile",
+  cl::desc("if processing a memory profile, filter out stack or heap accesses "
+           "that won't be useful for BOLT to reduce profile file size"),
+  cl::init(true),
+  cl::cat(AggregatorCategory));
+
+static cl::opt<unsigned long long>
+FilterPID("pid",
+  cl::desc("only use samples from process with specified PID"),
+  cl::init(0),
+  cl::Optional,
+  cl::cat(AggregatorCategory));
+
+static cl::opt<unsigned>
+HeatmapBlock("block-size",
+  cl::desc("size of a heat map block in bytes (default 64)"),
+  cl::init(64),
+  cl::sub(HeatmapCommand));
+
+static cl::opt<std::string>
+HeatmapFile("o",
+  cl::init("-"),
+  cl::desc("heatmap output file (default stdout)"),
+  cl::Optional,
+  cl::sub(HeatmapCommand));
+
+static cl::opt<unsigned long long>
+HeatmapMaxAddress("max-address",
+  cl::init(0xffffffff),
+  cl::desc("maximum address considered valid for heatmap (default 4GB)"),
+  cl::Optional,
+  cl::sub(HeatmapCommand));
+
+static cl::opt<unsigned long long>
+HeatmapMinAddress("min-address",
+  cl::init(0x0),
+  cl::desc("minimum address considered valid for heatmap (default 0)"),
+  cl::Optional,
+  cl::sub(HeatmapCommand));
+
+static cl::opt<bool>
+IgnoreBuildID("ignore-build-id",
+  cl::desc("continue even if build-ids in input binary and perf.data mismatch"),
+  cl::init(false),
+  cl::cat(AggregatorCategory));
+
+static cl::opt<bool>
+IgnoreInterruptLBR("ignore-interrupt-lbr",
cl::desc("ignore kernel interrupt LBR that happens asynchronously"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +static cl::opt +MaxSamples("max-samples", + cl::init(-1ULL), + cl::desc("maximum number of samples to read from LBR profile"), + cl::Optional, + cl::Hidden, + cl::cat(AggregatorCategory)); + +static cl::opt +ReadPreAggregated("pa", + cl::desc("skip perf and read data from a pre-aggregated file format"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +static cl::opt +TimeAggregator("time-aggr", + cl::desc("time BOLT aggregator"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +static cl::opt +UseEventPC("use-event-pc", + cl::desc("use event PC in combination with LBR sampling"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +static cl::opt +WriteAutoFDOData("autofdo", + cl::desc("generate autofdo textual data instead of bolt data"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +} + +namespace { + +const char TimerGroupName[] = "aggregator"; +const char TimerGroupDesc[] = "Aggregator"; + +} + +constexpr uint64_t DataAggregator::KernelBaseAddr; + +DataAggregator::~DataAggregator() { + deleteTempFiles(); +} + +namespace { +void deleteTempFile(const std::string &FileName) { + if (std::error_code Errc = sys::fs::remove(FileName.c_str())) { + errs() << "PERF2BOLT: failed to delete temporary file " + << FileName << " with error " << Errc.message() << "\n"; + } +} +} + +void DataAggregator::deleteTempFiles() { + for (std::string &FileName : TempFiles) { + deleteTempFile(FileName); + } + TempFiles.clear(); +} + +void DataAggregator::findPerfExecutable() { + Optional PerfExecutable = + sys::Process::FindInEnvPath("PATH", "perf"); + if (!PerfExecutable) { + outs() << "PERF2BOLT: No perf executable found!\n"; + exit(1); + } + PerfPath = *PerfExecutable; +} + +void DataAggregator::start() { + outs() << "PERF2BOLT: Starting data aggregation job for " << Filename + << "\n"; + + // Don't launch perf for pre-aggregated files + if (opts::ReadPreAggregated) + return; + + findPerfExecutable(); + + if (opts::BasicAggregation) { + launchPerfProcess("events without LBR", + MainEventsPPI, + "script -F pid,event,ip", + /*Wait = */false); + } else { + launchPerfProcess("branch events", + MainEventsPPI, + "script -F pid,ip,brstack", + /*Wait = */false); + } + + // Note: we launch script for mem events regardless of the option, as the + // command fails fairly fast if mem events were not collected. 
+ launchPerfProcess("mem events", + MemEventsPPI, + "script -F pid,event,addr,ip", + /*Wait = */false); + + launchPerfProcess("process events", + MMapEventsPPI, + "script --show-mmap-events", + /*Wait = */false); + + launchPerfProcess("task events", + TaskEventsPPI, + "script --show-task-events", + /*Wait = */false); +} + +void DataAggregator::abort() { + if (opts::ReadPreAggregated) + return; + + std::string Error; + + // Kill subprocesses in case they are not finished + sys::Wait(TaskEventsPPI.PI, 1, false, &Error); + sys::Wait(MMapEventsPPI.PI, 1, false, &Error); + sys::Wait(MainEventsPPI.PI, 1, false, &Error); + sys::Wait(MemEventsPPI.PI, 1, false, &Error); + + deleteTempFiles(); + + exit(1); +} + +void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, + const char *ArgsString, bool Wait) { + SmallVector Argv; + + outs() << "PERF2BOLT: spawning perf job to read " << Name << '\n'; + Argv.push_back(PerfPath.data()); + + char *WritableArgsString = strdup(ArgsString); + char *Str = WritableArgsString; + do { + Argv.push_back(Str); + while (*Str && *Str != ' ') + ++Str; + if (!*Str) + break; + *Str++ = 0; + } while (true); + + Argv.push_back("-f"); + Argv.push_back("-i"); + Argv.push_back(Filename.c_str()); + + if (std::error_code Errc = + sys::fs::createTemporaryFile("perf.script", "out", PPI.StdoutPath)) { + errs() << "PERF2BOLT: failed to create temporary file " + << PPI.StdoutPath << " with error " << Errc.message() + << "\n"; + exit(1); + } + TempFiles.push_back(PPI.StdoutPath.data()); + + if (std::error_code Errc = + sys::fs::createTemporaryFile("perf.script", "err", PPI.StderrPath)) { + errs() << "PERF2BOLT: failed to create temporary file " + << PPI.StderrPath << " with error " << Errc.message() << "\n"; + exit(1); + } + TempFiles.push_back(PPI.StderrPath.data()); + + Optional Redirects[] = { + llvm::None, // Stdin + StringRef(PPI.StdoutPath.data()), // Stdout + StringRef(PPI.StderrPath.data())}; // Stderr + + LLVM_DEBUG({ + dbgs() << "Launching perf: "; + for (StringRef Arg : Argv) + dbgs() << Arg << " "; + dbgs() << " 1> " << PPI.StdoutPath.data() << " 2> " << PPI.StderrPath.data() + << "\n"; + }); + + if (Wait) { + PPI.PI.ReturnCode = sys::ExecuteAndWait(PerfPath.data(), Argv, + /*envp*/ llvm::None, Redirects); + } else { + PPI.PI = sys::ExecuteNoWait(PerfPath.data(), Argv, /*envp*/ llvm::None, + Redirects); + } + + free(WritableArgsString); +} + +void DataAggregator::processFileBuildID(StringRef FileBuildID) { + PerfProcessInfo BuildIDProcessInfo; + launchPerfProcess("buildid list", + BuildIDProcessInfo, + "buildid-list", + /*Wait = */true); + + if (BuildIDProcessInfo.PI.ReturnCode != 0) { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(BuildIDProcessInfo.StderrPath.data()); + StringRef ErrBuf = (*MB)->getBuffer(); + + errs() << "PERF-ERROR: return code " << BuildIDProcessInfo.PI.ReturnCode + << '\n'; + errs() << ErrBuf; + return; + } + + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(BuildIDProcessInfo.StdoutPath.data()); + if (std::error_code EC = MB.getError()) { + errs() << "Cannot open " << BuildIDProcessInfo.StdoutPath.data() << ": " + << EC.message() << "\n"; + return; + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + if (ParsingBuf.empty()) { + errs() << "PERF2BOLT-WARNING: build-id will not be checked because perf " + "data was recorded without it\n"; + return; + } + + Col = 0; + Line = 1; + Optional FileName = getFileNameForBuildID(FileBuildID); + if (!FileName) { + errs() << "PERF2BOLT-ERROR: failed to match build-id from 
perf output. " + "This indicates the input binary supplied for data aggregation " + "is not the same recorded by perf when collecting profiling " + "data, or there were no samples recorded for the binary. " + "Use -ignore-build-id option to override.\n"; + if (!opts::IgnoreBuildID) { + abort(); + } + } else if (*FileName != llvm::sys::path::filename(BC->getFilename())) { + errs() << "PERF2BOLT-WARNING: build-id matched a different file name\n"; + BuildIDBinaryName = std::string(*FileName); + } else { + outs() << "PERF2BOLT: matched build-id and file name\n"; + } + + return; +} + +bool DataAggregator::checkPerfDataMagic(StringRef FileName) { + if (opts::ReadPreAggregated) + return true; + + int FD; + if (sys::fs::openFileForRead(FileName, FD)) { + return false; + } + + char Buf[7] = {0, 0, 0, 0, 0, 0, 0}; + + if (::read(FD, Buf, 7) == -1) { + ::close(FD); + return false; + } + ::close(FD); + + if (strncmp(Buf, "PERFILE", 7) == 0) + return true; + return false; +} + +void DataAggregator::parsePreAggregated() { + std::string Error; + + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = MB.getError()) { + errs() << "PERF2BOLT-ERROR: cannot open " << Filename << ": " + << EC.message() << "\n"; + exit(1); + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + if (parsePreAggregatedLBRSamples()) { + errs() << "PERF2BOLT: failed to parse samples\n"; + exit(1); + } +} + +std::error_code DataAggregator::writeAutoFDOData(StringRef OutputFilename) { + outs() << "PERF2BOLT: writing data for autofdo tools...\n"; + NamedRegionTimer T("writeAutoFDO", "Processing branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + std::error_code EC; + raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::OF_None); + if (EC) + return EC; + + // Format: + // number of unique traces + // from_1-to_1:count_1 + // from_2-to_2:count_2 + // ...... + // from_n-to_n:count_n + // number of unique sample addresses + // addr_1:count_1 + // addr_2:count_2 + // ...... + // addr_n:count_n + // number of unique LBR entries + // src_1->dst_1:count_1 + // src_2->dst_2:count_2 + // ...... 
+ // src_n->dst_n:count_n + + const uint64_t FirstAllocAddress = this->BC->FirstAllocAddress; + + // AutoFDO addresses are relative to the first allocated loadable program + // segment + auto filterAddress = [&FirstAllocAddress](uint64_t Address) -> uint64_t { + if (Address < FirstAllocAddress) + return 0; + return Address - FirstAllocAddress; + }; + + OutFile << FallthroughLBRs.size() << "\n"; + for (const auto &AggrLBR : FallthroughLBRs) { + const Trace &Trace = AggrLBR.first; + const FTInfo &Info = AggrLBR.second; + OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "-" + << Twine::utohexstr(filterAddress(Trace.To)) << ":" + << (Info.InternCount + Info.ExternCount) << "\n"; + } + + OutFile << BasicSamples.size() << "\n"; + for (const auto &Sample : BasicSamples) { + uint64_t PC = Sample.first; + uint64_t HitCount = Sample.second; + OutFile << Twine::utohexstr(filterAddress(PC)) << ":" << HitCount << "\n"; + } + + OutFile << BranchLBRs.size() << "\n"; + for (const auto &AggrLBR : BranchLBRs) { + const Trace &Trace = AggrLBR.first; + const BranchInfo &Info = AggrLBR.second; + OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "->" + << Twine::utohexstr(filterAddress(Trace.To)) << ":" + << Info.TakenCount << "\n"; + } + + outs() << "PERF2BOLT: wrote " << FallthroughLBRs.size() << " unique traces, " + << BasicSamples.size() << " sample addresses and " << BranchLBRs.size() + << " unique branches to " << OutputFilename << "\n"; + + return std::error_code(); +} + +void DataAggregator::filterBinaryMMapInfo() { + if (opts::FilterPID) { + auto MMapInfoIter = BinaryMMapInfo.find(opts::FilterPID); + if (MMapInfoIter != BinaryMMapInfo.end()) { + MMapInfo MMap = MMapInfoIter->second; + BinaryMMapInfo.clear(); + BinaryMMapInfo.insert(std::make_pair(MMap.PID, MMap)); + } else { + if (errs().has_colors()) + errs().changeColor(raw_ostream::RED); + errs() << "PERF2BOLT-ERROR: could not find a profile matching PID \"" + << opts::FilterPID << "\"" << " for binary \"" + << BC->getFilename() <<"\"."; + assert(!BinaryMMapInfo.empty() && "No memory map for matching binary"); + errs() << " Profile for the following process is available:\n"; + for (std::pair &MMI : BinaryMMapInfo) { + outs() << " " << MMI.second.PID + << (MMI.second.Forked ? 
" (forked)\n" : "\n"); + } + if (errs().has_colors()) + errs().resetColor(); + + exit(1); + } + } +} + +Error DataAggregator::preprocessProfile(BinaryContext &BC) { + this->BC = &BC; + + if (opts::ReadPreAggregated) { + parsePreAggregated(); + return Error::success(); + } + + if (Optional FileBuildID = BC.getFileBuildID()) { + outs() << "BOLT-INFO: binary build-id is: " << *FileBuildID << "\n"; + processFileBuildID(*FileBuildID); + } else { + errs() << "BOLT-WARNING: build-id will not be checked because we could " + "not read one from input binary\n"; + } + + auto prepareToParse = [&](StringRef Name, PerfProcessInfo &Process) { + std::string Error; + outs() << "PERF2BOLT: waiting for perf " << Name + << " collection to finish...\n"; + sys::ProcessInfo PI = sys::Wait(Process.PI, 0, true, &Error); + + if (!Error.empty()) { + errs() << "PERF-ERROR: " << PerfPath << ": " << Error << "\n"; + deleteTempFiles(); + exit(1); + } + + if (PI.ReturnCode != 0) { + ErrorOr> ErrorMB = + MemoryBuffer::getFileOrSTDIN(Process.StderrPath.data()); + StringRef ErrBuf = (*ErrorMB)->getBuffer(); + + errs() << "PERF-ERROR: return code " << PI.ReturnCode << "\n"; + errs() << ErrBuf; + deleteTempFiles(); + exit(1); + } + + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(Process.StdoutPath.data()); + if (std::error_code EC = MB.getError()) { + errs() << "Cannot open " << Process.StdoutPath.data() << ": " + << EC.message() << "\n"; + deleteTempFiles(); + exit(1); + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + }; + + if (opts::LinuxKernelMode) { + // Current MMap parsing logic does not work with linux kernel. + // MMap entries for linux kernel uses PERF_RECORD_MMAP + // format instead of typical PERF_RECORD_MMAP2 format. + // Since linux kernel address mapping is absolute (same as + // in the ELF file), we avoid parsing MMap in linux kernel mode. + // While generating optimized linux kernel binary, we may need + // to parse MMap entries. + + // In linux kernel mode, we analyze and optimize + // all linux kernel binary instructions, irrespective + // of whether they are due to system calls or due to + // interrupts. Therefore, we cannot ignore interrupt + // in Linux kernel mode. 
+ opts::IgnoreInterruptLBR = false; + } else { + prepareToParse("mmap events", MMapEventsPPI); + if (parseMMapEvents()) { + errs() << "PERF2BOLT: failed to parse mmap events\n"; + } + } + + prepareToParse("task events", TaskEventsPPI); + if (parseTaskEvents()) { + errs() << "PERF2BOLT: failed to parse task events\n"; + } + + filterBinaryMMapInfo(); + prepareToParse("events", MainEventsPPI); + + if (opts::HeatmapMode) { + if (std::error_code EC = printLBRHeatMap()) { + errs() << "ERROR: failed to print heat map: " << EC.message() << '\n'; + exit(1); + } + exit(0); + } + + if ((!opts::BasicAggregation && parseBranchEvents()) || + (opts::BasicAggregation && parseBasicEvents())) { + errs() << "PERF2BOLT: failed to parse samples\n"; + } + + // We can finish early if the goal is just to generate data for autofdo + if (opts::WriteAutoFDOData) { + if (std::error_code EC = writeAutoFDOData(opts::OutputFilename)) { + errs() << "Error writing autofdo data to file: " << EC.message() << "\n"; + } + deleteTempFiles(); + exit(0); + } + + // Special handling for memory events + std::string Error; + sys::ProcessInfo PI = sys::Wait(MemEventsPPI.PI, 0, true, &Error); + if (PI.ReturnCode != 0) { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(MemEventsPPI.StderrPath.data()); + StringRef ErrBuf = (*MB)->getBuffer(); + + deleteTempFiles(); + + Regex NoData("Samples for '.*' event do not have ADDR attribute set. " + "Cannot print 'addr' field."); + if (!NoData.match(ErrBuf)) { + errs() << "PERF-ERROR: return code " << PI.ReturnCode << "\n"; + errs() << ErrBuf; + exit(1); + } + return Error::success(); + } + + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(MemEventsPPI.StdoutPath.data()); + if (std::error_code EC = MB.getError()) { + errs() << "Cannot open " << MemEventsPPI.StdoutPath.data() << ": " + << EC.message() << "\n"; + deleteTempFiles(); + exit(1); + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + if (const std::error_code EC = parseMemEvents()) { + errs() << "PERF2BOLT: failed to parse memory events: " + << EC.message() << '\n'; + } + + deleteTempFiles(); + + return Error::success(); +} + +Error DataAggregator::readProfile(BinaryContext &BC) { + processProfile(BC); + + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + convertBranchData(Function); + } + + if (opts::AggregateOnly) { + if (std::error_code EC = writeAggregatedFile(opts::OutputFilename)) { + report_error("cannot create output data file", EC); + } + } + + return Error::success(); +} + +bool DataAggregator::mayHaveProfileData(const BinaryFunction &Function) { + return Function.hasProfileAvailable(); +} + +void DataAggregator::processProfile(BinaryContext &BC) { + if (opts::ReadPreAggregated) + processPreAggregated(); + else if (opts::BasicAggregation) + processBasicEvents(); + else + processBranchEvents(); + + processMemEvents(); + + // Mark all functions with registered events as having a valid profile. + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &BF = BFI.second; + if (getBranchData(BF)) { + const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE + : BinaryFunction::PF_LBR; + BF.markProfiled(Flags); + } + } + + // Release intermediate storage. 
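+  // (By this point the raw aggregates below have been fully converted into
+  // per-function profile data, so they are no longer needed.)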
+ clear(BranchLBRs); + clear(FallthroughLBRs); + clear(AggregatedLBRs); + clear(BasicSamples); + clear(MemSamples); +} + +BinaryFunction * +DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) const { + if (!BC->containsAddress(Address)) + return nullptr; + + return BC->getBinaryFunctionContainingAddress(Address, /*CheckPastEnd=*/false, + /*UseMaxSize=*/true); +} + +StringRef DataAggregator::getLocationName(BinaryFunction &Func, + uint64_t Count) { + if (!BAT) + return Func.getOneName(); + + const BinaryFunction *OrigFunc = &Func; + if (const uint64_t HotAddr = BAT->fetchParentAddress(Func.getAddress())) { + NumColdSamples += Count; + BinaryFunction *HotFunc = getBinaryFunctionContainingAddress(HotAddr); + if (HotFunc) + OrigFunc = HotFunc; + } + // If it is a local function, prefer the name containing the file name where + // the local function was declared + for (StringRef AlternativeName : OrigFunc->getNames()) { + size_t FileNameIdx = AlternativeName.find('/'); + // Confirm the alternative name has the pattern Symbol/FileName/1 before + // using it + if (FileNameIdx == StringRef::npos || + AlternativeName.find('/', FileNameIdx + 1) == StringRef::npos) + continue; + return AlternativeName; + } + return OrigFunc->getOneName(); +} + +bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address, + uint64_t Count) { + auto I = NamesToSamples.find(Func.getOneName()); + if (I == NamesToSamples.end()) { + bool Success; + StringRef LocName = getLocationName(Func, Count); + std::tie(I, Success) = NamesToSamples.insert(std::make_pair( + Func.getOneName(), + FuncSampleData(LocName, FuncSampleData::ContainerTy()))); + } + + Address -= Func.getAddress(); + if (BAT) + Address = BAT->translate(Func, Address, /*IsBranchSrc=*/false); + + I->second.bumpCount(Address, Count); + return true; +} + +bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From, + uint64_t To, uint64_t Count, + uint64_t Mispreds) { + FuncBranchData *AggrData = getBranchData(Func); + if (!AggrData) { + AggrData = &NamesToBranches[Func.getOneName()]; + AggrData->Name = getLocationName(Func, Count); + setBranchData(Func, AggrData); + } + + From -= Func.getAddress(); + To -= Func.getAddress(); + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: bumpBranchCount: " << Func.getPrintName() + << " @ " << Twine::utohexstr(From) << " -> " + << Func.getPrintName() << " @ " << Twine::utohexstr(To) + << '\n'); + if (BAT) { + From = BAT->translate(Func, From, /*IsBranchSrc=*/true); + To = BAT->translate(Func, To, /*IsBranchSrc=*/false); + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: BAT translation on bumpBranchCount: " + << Func.getPrintName() << " @ " << Twine::utohexstr(From) + << " -> " << Func.getPrintName() << " @ " + << Twine::utohexstr(To) << '\n'); + } + + AggrData->bumpBranchCount(From, To, Count, Mispreds); + return true; +} + +bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, + BinaryFunction *ToFunc, uint64_t From, + uint64_t To, uint64_t Count, + uint64_t Mispreds) { + FuncBranchData *FromAggrData = nullptr; + FuncBranchData *ToAggrData = nullptr; + StringRef SrcFunc; + StringRef DstFunc; + if (FromFunc) { + SrcFunc = getLocationName(*FromFunc, Count); + FromAggrData = getBranchData(*FromFunc); + if (!FromAggrData) { + FromAggrData = &NamesToBranches[FromFunc->getOneName()]; + FromAggrData->Name = SrcFunc; + setBranchData(*FromFunc, FromAggrData); + } + From -= FromFunc->getAddress(); + if (BAT) + From = BAT->translate(*FromFunc, From, /*IsBranchSrc=*/true); + + recordExit(*FromFunc, From, Mispreds, Count); + 
} + if (ToFunc) { + DstFunc = getLocationName(*ToFunc, 0); + ToAggrData = getBranchData(*ToFunc); + if (!ToAggrData) { + ToAggrData = &NamesToBranches[ToFunc->getOneName()]; + ToAggrData->Name = DstFunc; + setBranchData(*ToFunc, ToAggrData); + } + To -= ToFunc->getAddress(); + if (BAT) + To = BAT->translate(*ToFunc, To, /*IsBranchSrc=*/false); + + recordEntry(*ToFunc, To, Mispreds, Count); + } + + if (FromAggrData) + FromAggrData->bumpCallCount(From, Location(!DstFunc.empty(), DstFunc, To), + Count, Mispreds); + if (ToAggrData) + ToAggrData->bumpEntryCount(Location(!SrcFunc.empty(), SrcFunc, From), To, + Count, Mispreds); + return true; +} + +bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, + uint64_t Mispreds) { + BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From); + BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To); + if (!FromFunc && !ToFunc) + return false; + + if (FromFunc == ToFunc) { + recordBranch(*FromFunc, From - FromFunc->getAddress(), + To - FromFunc->getAddress(), Count, Mispreds); + return doIntraBranch(*FromFunc, From, To, Count, Mispreds); + } + + return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds); +} + +bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, + uint64_t Count) { + BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(First.To); + BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(Second.From); + if (!FromFunc || !ToFunc) { + LLVM_DEBUG( + dbgs() << "Out of range trace starting in " << FromFunc->getPrintName() + << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress()) + << " and ending in " << ToFunc->getPrintName() << " @ " + << ToFunc->getPrintName() << " @ " + << Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n'); + NumLongRangeTraces += Count; + return false; + } + if (FromFunc != ToFunc) { + NumInvalidTraces += Count; + LLVM_DEBUG( + dbgs() << "Invalid trace starting in " << FromFunc->getPrintName() + << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress()) + << " and ending in " << ToFunc->getPrintName() << " @ " + << ToFunc->getPrintName() << " @ " + << Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n'); + return false; + } + + Optional FTs = + BAT ? BAT->getFallthroughsInTrace(*FromFunc, First.To, Second.From) + : getFallthroughsInTrace(*FromFunc, First, Second, Count); + if (!FTs) { + LLVM_DEBUG( + dbgs() << "Invalid trace starting in " << FromFunc->getPrintName() + << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress()) + << " and ending in " << ToFunc->getPrintName() << " @ " + << ToFunc->getPrintName() << " @ " + << Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n'); + NumInvalidTraces += Count; + return false; + } + + LLVM_DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for " + << FromFunc->getPrintName() << ":" + << Twine::utohexstr(First.To) << " to " + << Twine::utohexstr(Second.From) << ".\n"); + for (const std::pair &Pair : *FTs) { + doIntraBranch(*FromFunc, Pair.first + FromFunc->getAddress(), + Pair.second + FromFunc->getAddress(), Count, false); + } + + return true; +} + +bool DataAggregator::recordTrace( + BinaryFunction &BF, + const LBREntry &FirstLBR, + const LBREntry &SecondLBR, + uint64_t Count, + SmallVector, 16> *Branches) const { + BinaryContext &BC = BF.getBinaryContext(); + + if (!BF.isSimple()) + return false; + + assert(BF.hasCFG() && "can only record traces in CFG state"); + + // Offsets of the trace within this function. 
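+  // A trace is the fall-through region executed between two taken branches:
+  // it starts at the destination of the first LBR entry (FirstLBR.To) and
+  // ends at the source of the next one (SecondLBR.From).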
+ const uint64_t From = FirstLBR.To - BF.getAddress(); + const uint64_t To = SecondLBR.From - BF.getAddress(); + + if (From > To) + return false; + + BinaryBasicBlock *FromBB = BF.getBasicBlockContainingOffset(From); + BinaryBasicBlock *ToBB = BF.getBasicBlockContainingOffset(To); + + if (!FromBB || !ToBB) + return false; + + // Adjust FromBB if the first LBR is a return from the last instruction in + // the previous block (that instruction should be a call). + if (From == FromBB->getOffset() && !BF.containsAddress(FirstLBR.From) && + !FromBB->isEntryPoint() && !FromBB->isLandingPad()) { + BinaryBasicBlock *PrevBB = BF.BasicBlocksLayout[FromBB->getIndex() - 1]; + if (PrevBB->getSuccessor(FromBB->getLabel())) { + const MCInst *Instr = PrevBB->getLastNonPseudoInstr(); + if (Instr && BC.MIB->isCall(*Instr)) { + FromBB = PrevBB; + } else { + LLVM_DEBUG(dbgs() << "invalid incoming LBR (no call): " << FirstLBR + << '\n'); + } + } else { + LLVM_DEBUG(dbgs() << "invalid incoming LBR: " << FirstLBR << '\n'); + } + } + + // Fill out information for fall-through edges. The From and To could be + // within the same basic block, e.g. when two call instructions are in the + // same block. In this case we skip the processing. + if (FromBB == ToBB) { + return true; + } + + // Process blocks in the original layout order. + BinaryBasicBlock *BB = BF.BasicBlocksLayout[FromBB->getIndex()]; + assert(BB == FromBB && "index mismatch"); + while (BB != ToBB) { + BinaryBasicBlock *NextBB = BF.BasicBlocksLayout[BB->getIndex() + 1]; + assert((NextBB && NextBB->getOffset() > BB->getOffset()) && "bad layout"); + + // Check for bad LBRs. + if (!BB->getSuccessor(NextBB->getLabel())) { + LLVM_DEBUG(dbgs() << "no fall-through for the trace:\n" + << " " << FirstLBR << '\n' + << " " << SecondLBR << '\n'); + return false; + } + + // Record fall-through jumps + BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(*NextBB); + BI.Count += Count; + + if (Branches) { + const MCInst *Instr = BB->getLastNonPseudoInstr(); + uint64_t Offset = 0; + if (Instr) { + Offset = BC.MIB->getAnnotationWithDefault(*Instr, "Offset"); + } else { + Offset = BB->getOffset(); + } + Branches->emplace_back(Offset, NextBB->getOffset()); + } + + BB = NextBB; + } + + return true; +} + +Optional, 16>> +DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, + const LBREntry &FirstLBR, + const LBREntry &SecondLBR, + uint64_t Count) const { + SmallVector, 16> Res; + + if (!recordTrace(BF, FirstLBR, SecondLBR, Count, &Res)) + return NoneType(); + + return Res; +} + +bool DataAggregator::recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, + uint64_t Count) const { + if (To > BF.getSize()) + return false; + + if (!BF.hasProfile()) + BF.ExecutionCount = 0; + + BinaryBasicBlock *EntryBB = nullptr; + if (To == 0) { + BF.ExecutionCount += Count; + if (!BF.empty()) + EntryBB = &BF.front(); + } else if (BinaryBasicBlock *BB = BF.getBasicBlockAtOffset(To)) { + if (BB->isEntryPoint()) + EntryBB = BB; + } + + if (EntryBB) + EntryBB->setExecutionCount(EntryBB->getKnownExecutionCount() + Count); + + return true; +} + +bool DataAggregator::recordExit(BinaryFunction &BF, uint64_t From, + bool Mispred, uint64_t Count) const { + if (!BF.isSimple() || From > BF.getSize()) + return false; + + if (!BF.hasProfile()) + BF.ExecutionCount = 0; + + return true; +} + +ErrorOr DataAggregator::parseLBREntry() { + LBREntry Res; + ErrorOr FromStrRes = parseString('/'); + if (std::error_code EC = FromStrRes.getError()) + return EC; + StringRef OffsetStr = FromStrRes.get(); 
+ if (OffsetStr.getAsInteger(0, Res.From)) { + reportError("expected hexadecimal number with From address"); + Diag << "Found: " << OffsetStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + + ErrorOr ToStrRes = parseString('/'); + if (std::error_code EC = ToStrRes.getError()) + return EC; + OffsetStr = ToStrRes.get(); + if (OffsetStr.getAsInteger(0, Res.To)) { + reportError("expected hexadecimal number with To address"); + Diag << "Found: " << OffsetStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + + ErrorOr MispredStrRes = parseString('/'); + if (std::error_code EC = MispredStrRes.getError()) + return EC; + StringRef MispredStr = MispredStrRes.get(); + if (MispredStr.size() != 1 || + (MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) { + reportError("expected single char for mispred bit"); + Diag << "Found: " << MispredStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + Res.Mispred = MispredStr[0] == 'M'; + + static bool MispredWarning = true;; + if (MispredStr[0] == '-' && MispredWarning) { + errs() << "PERF2BOLT-WARNING: misprediction bit is missing in profile\n"; + MispredWarning = false; + } + + ErrorOr Rest = parseString(FieldSeparator, true); + if (std::error_code EC = Rest.getError()) + return EC; + if (Rest.get().size() < 5) { + reportError("expected rest of LBR entry"); + Diag << "Found: " << Rest.get() << "\n"; + return make_error_code(llvm::errc::io_error); + } + return Res; +} + +bool DataAggregator::checkAndConsumeFS() { + if (ParsingBuf[0] != FieldSeparator) { + return false; + } + ParsingBuf = ParsingBuf.drop_front(1); + Col += 1; + return true; +} + +void DataAggregator::consumeRestOfLine() { + size_t LineEnd = ParsingBuf.find_first_of('\n'); + if (LineEnd == StringRef::npos) { + ParsingBuf = StringRef(); + Col = 0; + Line += 1; + return; + } + ParsingBuf = ParsingBuf.drop_front(LineEnd + 1); + Col = 0; + Line += 1; +} + +ErrorOr DataAggregator::parseBranchSample() { + PerfBranchSample Res; + + while (checkAndConsumeFS()) {} + + ErrorOr PIDRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = PIDRes.getError()) + return EC; + auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); + if (!opts::LinuxKernelMode && MMapInfoIter == BinaryMMapInfo.end()) { + consumeRestOfLine(); + return make_error_code(errc::no_such_process); + } + + while (checkAndConsumeFS()) {} + + ErrorOr PCRes = parseHexField(FieldSeparator, true); + if (std::error_code EC = PCRes.getError()) + return EC; + Res.PC = PCRes.get(); + + if (checkAndConsumeNewLine()) + return Res; + + while (!checkAndConsumeNewLine()) { + checkAndConsumeFS(); + + ErrorOr LBRRes = parseLBREntry(); + if (std::error_code EC = LBRRes.getError()) + return EC; + LBREntry LBR = LBRRes.get(); + if (ignoreKernelInterrupt(LBR)) + continue; + if (!BC->HasFixedLoadAddress) + adjustLBR(LBR, MMapInfoIter->second); + Res.LBR.push_back(LBR); + } + + return Res; +} + +ErrorOr DataAggregator::parseBasicSample() { + while (checkAndConsumeFS()) {} + + ErrorOr PIDRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = PIDRes.getError()) + return EC; + + auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); + if (MMapInfoIter == BinaryMMapInfo.end()) { + consumeRestOfLine(); + return PerfBasicSample{StringRef(), 0}; + } + + while (checkAndConsumeFS()) {} + + ErrorOr Event = parseString(FieldSeparator); + if (std::error_code EC = Event.getError()) + return EC; + + while (checkAndConsumeFS()) {} + + ErrorOr AddrRes = parseHexField(FieldSeparator, true); + if 
(std::error_code EC = AddrRes.getError()) { + return EC; + } + + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + uint64_t Address = *AddrRes; + if (!BC->HasFixedLoadAddress) + adjustAddress(Address, MMapInfoIter->second); + + return PerfBasicSample{Event.get(), Address}; +} + +ErrorOr DataAggregator::parseMemSample() { + PerfMemSample Res{0,0}; + + while (checkAndConsumeFS()) {} + + ErrorOr PIDRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = PIDRes.getError()) + return EC; + + auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); + if (MMapInfoIter == BinaryMMapInfo.end()) { + consumeRestOfLine(); + return Res; + } + + while (checkAndConsumeFS()) {} + + ErrorOr Event = parseString(FieldSeparator); + if (std::error_code EC = Event.getError()) + return EC; + if (Event.get().find("mem-loads") == StringRef::npos) { + consumeRestOfLine(); + return Res; + } + + while (checkAndConsumeFS()) {} + + ErrorOr AddrRes = parseHexField(FieldSeparator); + if (std::error_code EC = AddrRes.getError()) { + return EC; + } + + while (checkAndConsumeFS()) {} + + ErrorOr PCRes = parseHexField(FieldSeparator, true); + if (std::error_code EC = PCRes.getError()) { + consumeRestOfLine(); + return EC; + } + + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + uint64_t Address = *AddrRes; + if (!BC->HasFixedLoadAddress) + adjustAddress(Address, MMapInfoIter->second); + + return PerfMemSample{PCRes.get(), Address}; +} + +ErrorOr DataAggregator::parseLocationOrOffset() { + auto parseOffset = [this]() -> ErrorOr { + ErrorOr Res = parseHexField(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + return Location(Res.get()); + }; + + size_t Sep = ParsingBuf.find_first_of(" \n"); + if (Sep == StringRef::npos) + return parseOffset(); + StringRef LookAhead = ParsingBuf.substr(0, Sep); + if (LookAhead.find_first_of(":") == StringRef::npos) + return parseOffset(); + + ErrorOr BuildID = parseString(':'); + if (std::error_code EC = BuildID.getError()) + return EC; + ErrorOr Offset = parseHexField(FieldSeparator); + if (std::error_code EC = Offset.getError()) + return EC; + return Location(true, BuildID.get(), Offset.get()); +} + +ErrorOr +DataAggregator::parseAggregatedLBREntry() { + while (checkAndConsumeFS()) {} + + ErrorOr TypeOrErr = parseString(FieldSeparator); + if (std::error_code EC = TypeOrErr.getError()) + return EC; + auto Type = AggregatedLBREntry::BRANCH; + if (TypeOrErr.get() == "B") { + Type = AggregatedLBREntry::BRANCH; + } else if (TypeOrErr.get() == "F") { + Type = AggregatedLBREntry::FT; + } else if (TypeOrErr.get() == "f") { + Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN; + } else { + reportError("expected B, F or f"); + return make_error_code(llvm::errc::io_error); + } + + while (checkAndConsumeFS()) {} + ErrorOr From = parseLocationOrOffset(); + if (std::error_code EC = From.getError()) + return EC; + + while (checkAndConsumeFS()) {} + ErrorOr To = parseLocationOrOffset(); + if (std::error_code EC = To.getError()) + return EC; + + while (checkAndConsumeFS()) {} + ErrorOr Frequency = + parseNumberField(FieldSeparator, Type != AggregatedLBREntry::BRANCH); + if (std::error_code EC = Frequency.getError()) + return EC; + + uint64_t Mispreds = 0; + if (Type == AggregatedLBREntry::BRANCH) { + while (checkAndConsumeFS()) {} + ErrorOr MispredsOrErr = parseNumberField(FieldSeparator, true); + if (std::error_code EC = 
MispredsOrErr.getError()) + return EC; + Mispreds = static_cast(MispredsOrErr.get()); + } + + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return AggregatedLBREntry{From.get(), To.get(), + static_cast(Frequency.get()), Mispreds, + Type}; +} + +bool DataAggregator::hasData() { + if (ParsingBuf.size() == 0) + return false; + + return true; +} + +bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const { + return opts::IgnoreInterruptLBR && + (LBR.From >= KernelBaseAddr || LBR.To >= KernelBaseAddr); +} + +std::error_code DataAggregator::printLBRHeatMap() { + outs() << "PERF2BOLT: parse branch events...\n"; + NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + if (opts::LinuxKernelMode) { + opts::HeatmapMaxAddress = 0xffffffffffffffff; + opts::HeatmapMinAddress = KernelBaseAddr; + } + Heatmap HM(opts::HeatmapBlock, opts::HeatmapMinAddress, + opts::HeatmapMaxAddress); + uint64_t NumTotalSamples = 0; + + while (hasData()) { + ErrorOr SampleRes = parseBranchSample(); + if (std::error_code EC = SampleRes.getError()) { + if (EC == errc::no_such_process) + continue; + return EC; + } + + PerfBranchSample &Sample = SampleRes.get(); + + // LBRs are stored in reverse execution order. NextLBR refers to the next + // executed branch record. + const LBREntry *NextLBR = nullptr; + for (const LBREntry &LBR : Sample.LBR) { + if (NextLBR) { + // Record fall-through trace. + const uint64_t TraceFrom = LBR.To; + const uint64_t TraceTo = NextLBR->From; + ++FallthroughLBRs[Trace(TraceFrom, TraceTo)].InternCount; + } + NextLBR = &LBR; + } + if (!Sample.LBR.empty()) { + HM.registerAddress(Sample.LBR.front().To); + HM.registerAddress(Sample.LBR.back().From); + } + NumTotalSamples += Sample.LBR.size(); + } + + if (!NumTotalSamples) { + errs() << "HEATMAP-ERROR: no LBR traces detected in profile. 
" + "Cannot build heatmap.\n"; + exit(1); + } + + outs() << "HEATMAP: read " << NumTotalSamples << " LBR samples\n"; + outs() << "HEATMAP: " << FallthroughLBRs.size() << " unique traces\n"; + + outs() << "HEATMAP: building heat map...\n"; + + for (const auto &LBR : FallthroughLBRs) { + const Trace &Trace = LBR.first; + const FTInfo &Info = LBR.second; + HM.registerAddressRange(Trace.From, Trace.To, Info.InternCount); + } + + if (HM.getNumInvalidRanges()) + outs() << "HEATMAP: invalid traces: " << HM.getNumInvalidRanges() << '\n'; + + if (!HM.size()) { + errs() << "HEATMAP-ERROR: no valid traces registered\n"; + exit(1); + } + + HM.print(opts::HeatmapFile); + if (opts::HeatmapFile == "-") { + HM.printCDF(opts::HeatmapFile); + } else { + HM.printCDF(opts::HeatmapFile + ".csv"); + } + + return std::error_code(); +} + +std::error_code DataAggregator::parseBranchEvents() { + outs() << "PERF2BOLT: parse branch events...\n"; + NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + uint64_t NumTotalSamples = 0; + uint64_t NumEntries = 0; + uint64_t NumSamples = 0; + uint64_t NumSamplesNoLBR = 0; + uint64_t NumTraces = 0; + bool NeedsSkylakeFix = false; + + while (hasData() && NumTotalSamples < opts::MaxSamples) { + ++NumTotalSamples; + + ErrorOr SampleRes = parseBranchSample(); + if (std::error_code EC = SampleRes.getError()) { + if (EC == errc::no_such_process) + continue; + return EC; + } + ++NumSamples; + + PerfBranchSample &Sample = SampleRes.get(); + if (opts::WriteAutoFDOData) + ++BasicSamples[Sample.PC]; + + if (Sample.LBR.empty()) { + ++NumSamplesNoLBR; + continue; + } + + NumEntries += Sample.LBR.size(); + if (BAT && Sample.LBR.size() == 32 && !NeedsSkylakeFix) { + errs() << "PERF2BOLT-WARNING: using Intel Skylake bug workaround\n"; + NeedsSkylakeFix = true; + } + + // LBRs are stored in reverse execution order. NextPC refers to the next + // recorded executed PC. + uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0; + uint32_t NumEntry = 0; + for (const LBREntry &LBR : Sample.LBR) { + ++NumEntry; + // Hardware bug workaround: Intel Skylake (which has 32 LBR entries) + // sometimes record entry 32 as an exact copy of entry 31. This will cause + // us to likely record an invalid trace and generate a stale function for + // BAT mode (non BAT disassembles the function and is able to ignore this + // trace at aggregation time). Drop first 2 entries (last two, in + // chronological order) + if (NeedsSkylakeFix && NumEntry <= 2) + continue; + if (NextPC) { + // Record fall-through trace. + const uint64_t TraceFrom = LBR.To; + const uint64_t TraceTo = NextPC; + const BinaryFunction *TraceBF = + getBinaryFunctionContainingAddress(TraceFrom); + if (TraceBF && TraceBF->containsAddress(TraceTo)) { + FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; + if (TraceBF->containsAddress(LBR.From)) { + ++Info.InternCount; + } else { + ++Info.ExternCount; + } + } else { + if (TraceBF && getBinaryFunctionContainingAddress(TraceTo)) { + LLVM_DEBUG(dbgs() + << "Invalid trace starting in " + << TraceBF->getPrintName() << " @ " + << Twine::utohexstr(TraceFrom - TraceBF->getAddress()) + << " and ending @ " << Twine::utohexstr(TraceTo) + << '\n'); + ++NumInvalidTraces; + } else { + LLVM_DEBUG(dbgs() + << "Out of range trace starting in " + << (TraceBF ? TraceBF->getPrintName() : "None") << " @ " + << Twine::utohexstr( + TraceFrom - (TraceBF ? 
TraceBF->getAddress() : 0)) + << " and ending in " + << (getBinaryFunctionContainingAddress(TraceTo) + ? getBinaryFunctionContainingAddress(TraceTo) + ->getPrintName() + : "None") + << " @ " + << Twine::utohexstr( + TraceTo - + (getBinaryFunctionContainingAddress(TraceTo) + ? getBinaryFunctionContainingAddress(TraceTo) + ->getAddress() + : 0)) + << '\n'); + ++NumLongRangeTraces; + } + } + ++NumTraces; + } + NextPC = LBR.From; + + uint64_t From = LBR.From; + if (!getBinaryFunctionContainingAddress(From)) + From = 0; + uint64_t To = LBR.To; + if (!getBinaryFunctionContainingAddress(To)) + To = 0; + if (!From && !To) + continue; + BranchInfo &Info = BranchLBRs[Trace(From, To)]; + ++Info.TakenCount; + Info.MispredCount += LBR.Mispred; + } + } + + for (const auto &LBR : BranchLBRs) { + const Trace &Trace = LBR.first; + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.From)) + BF->setHasProfileAvailable(); + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.To)) + BF->setHasProfileAvailable(); + } + + auto printColored = [](raw_ostream &OS, float Percent, float T1, float T2) { + OS << " ("; + if (OS.has_colors()) { + if (Percent > T2) { + OS.changeColor(raw_ostream::RED); + } else if (Percent > T1) { + OS.changeColor(raw_ostream::YELLOW); + } else { + OS.changeColor(raw_ostream::GREEN); + } + } + OS << format("%.1f%%", Percent); + if (OS.has_colors()) + OS.resetColor(); + OS << ")"; + }; + + outs() << "PERF2BOLT: read " << NumSamples << " samples and " + << NumEntries << " LBR entries\n"; + if (NumTotalSamples) { + if (NumSamples && NumSamplesNoLBR == NumSamples) { + // Note: we don't know if perf2bolt is being used to parse memory samples + // at this point. In this case, it is OK to parse zero LBRs. + errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack " + "LBR. Record profile with perf record -j any or run perf2bolt " + "in no-LBR mode with -nl (the performance improvement in -nl " + "mode may be limited)\n"; + } else { + const uint64_t IgnoredSamples = NumTotalSamples - NumSamples; + const float PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples; + outs() << "PERF2BOLT: " << IgnoredSamples << " samples"; + printColored(outs(), PercentIgnored, 20, 50); + outs() << " were ignored\n"; + if (PercentIgnored > 50.0f) { + errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples " + "were attributed to the input binary\n"; + } + } + } + outs() << "PERF2BOLT: traces mismatching disassembled function contents: " + << NumInvalidTraces; + float Perc = 0.0f; + if (NumTraces > 0) { + Perc = NumInvalidTraces * 100.0f / NumTraces; + printColored(outs(), Perc, 5, 10); + } + outs() << "\n"; + if (Perc > 10.0f) { + outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. 
The generated data may be ineffective for improving " + "performance.\n\n"; + } + + outs() << "PERF2BOLT: out of range traces involving unknown regions: " + << NumLongRangeTraces; + if (NumTraces > 0) { + outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); + } + outs() << "\n"; + + if (NumColdSamples > 0) { + const float ColdSamples = NumColdSamples * 100.0f / NumTotalSamples; + outs() << "PERF2BOLT: " << NumColdSamples + << format(" (%.1f%%)", ColdSamples) + << " samples recorded in cold regions of split functions.\n"; + if (ColdSamples > 5.0f) { + outs() + << "WARNING: The BOLT-processed binary where samples were collected " + "likely used bad data or your service observed a large shift in " + "profile. You may want to audit this.\n"; + } + } + + return std::error_code(); +} + +void DataAggregator::processBranchEvents() { + outs() << "PERF2BOLT: processing branch events...\n"; + NamedRegionTimer T("processBranch", "Processing branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + for (const auto &AggrLBR : FallthroughLBRs) { + const Trace &Loc = AggrLBR.first; + const FTInfo &Info = AggrLBR.second; + LBREntry First{Loc.From, Loc.From, false}; + LBREntry Second{Loc.To, Loc.To, false}; + if (Info.InternCount) { + doTrace(First, Second, Info.InternCount); + } + if (Info.ExternCount) { + First.From = 0; + doTrace(First, Second, Info.ExternCount); + } + } + + for (const auto &AggrLBR : BranchLBRs) { + const Trace &Loc = AggrLBR.first; + const BranchInfo &Info = AggrLBR.second; + doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); + } +} + +std::error_code DataAggregator::parseBasicEvents() { + outs() << "PERF2BOLT: parsing basic events (without LBR)...\n"; + NamedRegionTimer T("parseBasic", "Parsing basic events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + while (hasData()) { + ErrorOr Sample = parseBasicSample(); + if (std::error_code EC = Sample.getError()) + return EC; + + if (!Sample->PC) + continue; + + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC)) + BF->setHasProfileAvailable(); + + ++BasicSamples[Sample->PC]; + EventNames.insert(Sample->EventName); + } + + return std::error_code(); +} + +void DataAggregator::processBasicEvents() { + outs() << "PERF2BOLT: processing basic events (without LBR)...\n"; + NamedRegionTimer T("processBasic", "Processing basic events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + uint64_t OutOfRangeSamples = 0; + uint64_t NumSamples = 0; + for (auto &Sample : BasicSamples) { + const uint64_t PC = Sample.first; + const uint64_t HitCount = Sample.second; + NumSamples += HitCount; + BinaryFunction *Func = getBinaryFunctionContainingAddress(PC); + if (!Func) { + OutOfRangeSamples += HitCount; + continue; + } + + doSample(*Func, PC, HitCount); + } + outs() << "PERF2BOLT: read " << NumSamples << " samples\n"; + + outs() << "PERF2BOLT: out of range samples recorded in unknown regions: " + << OutOfRangeSamples; + float Perc = 0.0f; + if (NumSamples > 0) { + outs() << " ("; + Perc = OutOfRangeSamples * 100.0f / NumSamples; + if (outs().has_colors()) { + if (Perc > 60.0f) { + outs().changeColor(raw_ostream::RED); + } else if (Perc > 40.0f) { + outs().changeColor(raw_ostream::YELLOW); + } else { + outs().changeColor(raw_ostream::GREEN); + } + } + outs() << format("%.1f%%", Perc); + if (outs().has_colors()) + outs().resetColor(); + outs() << ")"; + } + outs() << "\n"; + if (Perc > 80.0f) { + outs() << "\n !! WARNING !! 
This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. The generated data may be ineffective for improving " + "performance.\n\n"; + } +} + +std::error_code DataAggregator::parseMemEvents() { + outs() << "PERF2BOLT: parsing memory events...\n"; + NamedRegionTimer T("parseMemEvents", "Parsing mem events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + while (hasData()) { + ErrorOr Sample = parseMemSample(); + if (std::error_code EC = Sample.getError()) + return EC; + + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC)) + BF->setHasProfileAvailable(); + + MemSamples.emplace_back(std::move(Sample.get())); + } + + return std::error_code(); +} + +void DataAggregator::processMemEvents() { + NamedRegionTimer T("ProcessMemEvents", "Processing mem events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + for (const PerfMemSample &Sample : MemSamples) { + uint64_t PC = Sample.PC; + uint64_t Addr = Sample.Addr; + StringRef FuncName; + StringRef MemName; + + // Try to resolve symbol for PC + BinaryFunction *Func = getBinaryFunctionContainingAddress(PC); + if (!Func) { + LLVM_DEBUG(if (PC != 0) { + dbgs() << "Skipped mem event: 0x" << Twine::utohexstr(PC) << " => 0x" + << Twine::utohexstr(Addr) << "\n"; + }); + continue; + } + + FuncName = Func->getOneName(); + PC -= Func->getAddress(); + + // Try to resolve symbol for memory load + if (BinaryData *BD = BC->getBinaryDataContainingAddress(Addr)) { + MemName = BD->getName(); + Addr -= BD->getAddress(); + } else if (opts::FilterMemProfile) { + // Filter out heap/stack accesses + continue; + } + + const Location FuncLoc(!FuncName.empty(), FuncName, PC); + const Location AddrLoc(!MemName.empty(), MemName, Addr); + + FuncMemData *MemData = &NamesToMemEvents[FuncName]; + setMemData(*Func, MemData); + MemData->update(FuncLoc, AddrLoc); + LLVM_DEBUG(dbgs() << "Mem event: " << FuncLoc << " = " << AddrLoc << "\n"); + } +} + +std::error_code DataAggregator::parsePreAggregatedLBRSamples() { + outs() << "PERF2BOLT: parsing pre-aggregated profile...\n"; + NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + while (hasData()) { + ErrorOr AggrEntry = parseAggregatedLBREntry(); + if (std::error_code EC = AggrEntry.getError()) + return EC; + + if (BinaryFunction *BF = + getBinaryFunctionContainingAddress(AggrEntry->From.Offset)) + BF->setHasProfileAvailable(); + if (BinaryFunction *BF = + getBinaryFunctionContainingAddress(AggrEntry->To.Offset)) + BF->setHasProfileAvailable(); + + AggregatedLBRs.emplace_back(std::move(AggrEntry.get())); + } + + return std::error_code(); +} + +void DataAggregator::processPreAggregated() { + outs() << "PERF2BOLT: processing pre-aggregated profile...\n"; + NamedRegionTimer T("processAggregated", "Processing aggregated branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + uint64_t NumTraces = 0; + for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) { + switch (AggrEntry.EntryType) { + case AggregatedLBREntry::BRANCH: + doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count, + AggrEntry.Mispreds); + break; + case AggregatedLBREntry::FT: + case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: { + LBREntry First{AggrEntry.EntryType == AggregatedLBREntry::FT + ? 
AggrEntry.From.Offset + : 0, + AggrEntry.From.Offset, false}; + LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false}; + doTrace(First, Second, AggrEntry.Count); + NumTraces += AggrEntry.Count; + break; + } + } + } + + outs() << "PERF2BOLT: read " << AggregatedLBRs.size() + << " aggregated LBR entries\n"; + outs() << "PERF2BOLT: traces mismatching disassembled function contents: " + << NumInvalidTraces; + float Perc = 0.0f; + if (NumTraces > 0) { + outs() << " ("; + Perc = NumInvalidTraces * 100.0f / NumTraces; + if (outs().has_colors()) { + if (Perc > 10.0f) { + outs().changeColor(raw_ostream::RED); + } else if (Perc > 5.0f) { + outs().changeColor(raw_ostream::YELLOW); + } else { + outs().changeColor(raw_ostream::GREEN); + } + } + outs() << format("%.1f%%", Perc); + if (outs().has_colors()) + outs().resetColor(); + outs() << ")"; + } + outs() << "\n"; + if (Perc > 10.0f) { + outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. The generated data may be ineffective for improving " + "performance.\n\n"; + } + + outs() << "PERF2BOLT: Out of range traces involving unknown regions: " + << NumLongRangeTraces; + if (NumTraces > 0) { + outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); + } + outs() << "\n"; +} + +Optional +DataAggregator::parseCommExecEvent() { + size_t LineEnd = ParsingBuf.find_first_of("\n"); + if (LineEnd == StringRef::npos) { + reportError("expected rest of line"); + Diag << "Found: " << ParsingBuf << "\n"; + return NoneType(); + } + StringRef Line = ParsingBuf.substr(0, LineEnd); + + size_t Pos = Line.find("PERF_RECORD_COMM exec"); + if (Pos == StringRef::npos) { + return NoneType(); + } + Line = Line.drop_front(Pos); + + // Line: + // PERF_RECORD_COMM exec: :/" + StringRef PIDStr = Line.rsplit(':').second.split('/').first; + pid_t PID; + if (PIDStr.getAsInteger(10, PID)) { + reportError("expected PID"); + Diag << "Found: " << PIDStr << "in '" << Line << "'\n"; + return NoneType(); + } + + return PID; +} + +namespace { +Optional parsePerfTime(const StringRef TimeStr) { + const StringRef SecTimeStr = TimeStr.split('.').first; + const StringRef USecTimeStr = TimeStr.split('.').second; + uint64_t SecTime; + uint64_t USecTime; + if (SecTimeStr.getAsInteger(10, SecTime) || + USecTimeStr.getAsInteger(10, USecTime)) { + return NoneType(); + } + return SecTime * 1000000ULL + USecTime; +} +} + +Optional +DataAggregator::parseForkEvent() { + while (checkAndConsumeFS()) {} + + size_t LineEnd = ParsingBuf.find_first_of("\n"); + if (LineEnd == StringRef::npos) { + reportError("expected rest of line"); + Diag << "Found: " << ParsingBuf << "\n"; + return NoneType(); + } + StringRef Line = ParsingBuf.substr(0, LineEnd); + + size_t Pos = Line.find("PERF_RECORD_FORK"); + if (Pos == StringRef::npos) { + consumeRestOfLine(); + return NoneType(); + } + + ForkInfo FI; + + const StringRef TimeStr = + Line.substr(0, Pos).rsplit(':').first.rsplit(FieldSeparator).second; + if (Optional TimeRes = parsePerfTime(TimeStr)) { + FI.Time = *TimeRes; + } + + Line = Line.drop_front(Pos); + + // Line: + // PERF_RECORD_FORK(:):(:) + const StringRef ChildPIDStr = Line.split('(').second.split(':').first; + if (ChildPIDStr.getAsInteger(10, FI.ChildPID)) { + reportError("expected PID"); + Diag << "Found: " << ChildPIDStr << "in '" << Line << "'\n"; + return NoneType(); + } + + const StringRef ParentPIDStr = Line.rsplit('(').second.split(':').first; + if 
(ParentPIDStr.getAsInteger(10, FI.ParentPID)) { + reportError("expected PID"); + Diag << "Found: " << ParentPIDStr << "in '" << Line << "'\n"; + return NoneType(); + } + + consumeRestOfLine(); + + return FI; +} + +ErrorOr> +DataAggregator::parseMMapEvent() { + while (checkAndConsumeFS()) {} + + MMapInfo ParsedInfo; + + size_t LineEnd = ParsingBuf.find_first_of("\n"); + if (LineEnd == StringRef::npos) { + reportError("expected rest of line"); + Diag << "Found: " << ParsingBuf << "\n"; + return make_error_code(llvm::errc::io_error); + } + StringRef Line = ParsingBuf.substr(0, LineEnd); + + size_t Pos = Line.find("PERF_RECORD_MMAP2"); + if (Pos == StringRef::npos) { + consumeRestOfLine(); + return std::make_pair(StringRef(), ParsedInfo); + } + + // Line: + // { .* .: }PERF_RECORD_MMAP2 /: .* + + const StringRef TimeStr = + Line.substr(0, Pos).rsplit(':').first.rsplit(FieldSeparator).second; + if (Optional TimeRes = parsePerfTime(TimeStr)) { + ParsedInfo.Time = *TimeRes; + } + + Line = Line.drop_front(Pos); + + // Line: + // PERF_RECORD_MMAP2 /: [() .*]: .* + + StringRef FileName = Line.rsplit(FieldSeparator).second; + if (FileName.startswith("//") || FileName.startswith("[")) { + consumeRestOfLine(); + return std::make_pair(StringRef(), ParsedInfo); + } + FileName = sys::path::filename(FileName); + + const StringRef PIDStr = Line.split(FieldSeparator).second.split('/').first; + if (PIDStr.getAsInteger(10, ParsedInfo.PID)) { + reportError("expected PID"); + Diag << "Found: " << PIDStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + const StringRef BaseAddressStr = Line.split('[').second.split('(').first; + if (BaseAddressStr.getAsInteger(0, ParsedInfo.BaseAddress)) { + reportError("expected base address"); + Diag << "Found: " << BaseAddressStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + const StringRef SizeStr = Line.split('(').second.split(')').first; + if (SizeStr.getAsInteger(0, ParsedInfo.Size)) { + reportError("expected mmaped size"); + Diag << "Found: " << SizeStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + const StringRef OffsetStr = + Line.split('@').second.ltrim().split(FieldSeparator).first; + if (OffsetStr.getAsInteger(0, ParsedInfo.Offset)) { + reportError("expected mmaped page-aligned offset"); + Diag << "Found: " << OffsetStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + consumeRestOfLine(); + + return std::make_pair(FileName, ParsedInfo); +} + +std::error_code DataAggregator::parseMMapEvents() { + outs() << "PERF2BOLT: parsing perf-script mmap events output\n"; + NamedRegionTimer T("parseMMapEvents", "Parsing mmap events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + std::multimap GlobalMMapInfo; + while (hasData()) { + ErrorOr> FileMMapInfoRes = parseMMapEvent(); + if (std::error_code EC = FileMMapInfoRes.getError()) + return EC; + + std::pair FileMMapInfo = FileMMapInfoRes.get(); + if (FileMMapInfo.second.PID == -1) + continue; + + // Consider only the first mapping of the file for any given PID + bool PIDExists = false; + auto Range = GlobalMMapInfo.equal_range(FileMMapInfo.first); + for (auto MI = Range.first; MI != Range.second; ++MI) { + if (MI->second.PID == FileMMapInfo.second.PID) { + PIDExists = true; + break; + } + } + if (PIDExists) + continue; + + GlobalMMapInfo.insert(FileMMapInfo); + } + + LLVM_DEBUG({ + dbgs() << "FileName -> mmap info:\n"; + for (const std::pair &Pair : GlobalMMapInfo) { + 
dbgs() << " " << Pair.first << " : " << Pair.second.PID << " [0x" + << Twine::utohexstr(Pair.second.BaseAddress) << ", " + << Twine::utohexstr(Pair.second.Size) << " @ " + << Twine::utohexstr(Pair.second.Offset) << "]\n"; + } + }); + + StringRef NameToUse = llvm::sys::path::filename(BC->getFilename()); + if (GlobalMMapInfo.count(NameToUse) == 0 && !BuildIDBinaryName.empty()) { + errs() << "PERF2BOLT-WARNING: using \"" << BuildIDBinaryName + << "\" for profile matching\n"; + NameToUse = BuildIDBinaryName; + } + + auto Range = GlobalMMapInfo.equal_range(NameToUse); + for (auto I = Range.first; I != Range.second; ++I) { + const MMapInfo &MMapInfo = I->second; + if (BC->HasFixedLoadAddress && MMapInfo.BaseAddress) { + // Check that the binary mapping matches one of the segments. + bool MatchFound = false; + for (auto &KV : BC->SegmentMapInfo) { + SegmentInfo &SegInfo = KV.second; + // The mapping is page-aligned and hence the BaseAddress could be + // different from the segment start address. We cannot know the page + // size of the mapping, but we know it should not exceed the segment + // alignment value. Hence we are performing an approximate check. + if (SegInfo.Address >= MMapInfo.BaseAddress && + SegInfo.Address - MMapInfo.BaseAddress < SegInfo.Alignment) { + MatchFound = true; + break; + } + } + if (!MatchFound) { + errs() << "PERF2BOLT-WARNING: ignoring mapping of " << NameToUse + << " at 0x" << Twine::utohexstr(MMapInfo.BaseAddress) << '\n'; + continue; + } + } + + BinaryMMapInfo.insert(std::make_pair(MMapInfo.PID, MMapInfo)); + } + + if (BinaryMMapInfo.empty()) { + if (errs().has_colors()) + errs().changeColor(raw_ostream::RED); + errs() << "PERF2BOLT-ERROR: could not find a profile matching binary \"" + << BC->getFilename() << "\"."; + if (!GlobalMMapInfo.empty()) { + errs() << " Profile for the following binary name(s) is available:\n"; + for (auto I = GlobalMMapInfo.begin(), IE = GlobalMMapInfo.end(); I != IE; + I = GlobalMMapInfo.upper_bound(I->first)) { + errs() << " " << I->first << '\n'; + } + errs() << "Please rename the input binary.\n"; + } else { + errs() << " Failed to extract any binary name from a profile.\n"; + } + if (errs().has_colors()) + errs().resetColor(); + + exit(1); + } + + return std::error_code(); +} + +std::error_code DataAggregator::parseTaskEvents() { + outs() << "PERF2BOLT: parsing perf-script task events output\n"; + NamedRegionTimer T("parseTaskEvents", "Parsing task events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + while (hasData()) { + if (Optional CommInfo = parseCommExecEvent()) { + // Remove forked child that ran execve + auto MMapInfoIter = BinaryMMapInfo.find(*CommInfo); + if (MMapInfoIter != BinaryMMapInfo.end() && + MMapInfoIter->second.Forked) { + BinaryMMapInfo.erase(MMapInfoIter); + } + consumeRestOfLine(); + continue; + } + + Optional ForkInfo = parseForkEvent(); + if (!ForkInfo) + continue; + + if (ForkInfo->ParentPID == ForkInfo->ChildPID) + continue; + + if (ForkInfo->Time == 0) { + // Process was forked and mmaped before perf ran. In this case the child + // should have its own mmap entry unless it was execve'd. 
+ continue; + } + + auto MMapInfoIter = BinaryMMapInfo.find(ForkInfo->ParentPID); + if (MMapInfoIter == BinaryMMapInfo.end()) + continue; + + MMapInfo MMapInfo = MMapInfoIter->second; + MMapInfo.PID = ForkInfo->ChildPID; + MMapInfo.Forked = true; + BinaryMMapInfo.insert(std::make_pair(MMapInfo.PID, MMapInfo)); + } + + outs() << "PERF2BOLT: input binary is associated with " + << BinaryMMapInfo.size() << " PID(s)\n"; + + LLVM_DEBUG({ + for (std::pair &MMI : BinaryMMapInfo) { + outs() << " " << MMI.second.PID << (MMI.second.Forked ? " (forked)" : "") + << ": (0x" << Twine::utohexstr(MMI.second.BaseAddress) + << ": 0x" << Twine::utohexstr(MMI.second.Size) << ")\n"; + } + }); + + return std::error_code(); +} + +Optional> +DataAggregator::parseNameBuildIDPair() { + while (checkAndConsumeFS()) {} + + ErrorOr BuildIDStr = parseString(FieldSeparator, true); + if (std::error_code EC = BuildIDStr.getError()) + return NoneType(); + + ErrorOr NameStr = parseString(FieldSeparator, true); + if (std::error_code EC = NameStr.getError()) + return NoneType(); + + consumeRestOfLine(); + return std::make_pair(NameStr.get(), BuildIDStr.get()); +} + +Optional +DataAggregator::getFileNameForBuildID(StringRef FileBuildID) { + while (hasData()) { + Optional> IDPair = parseNameBuildIDPair(); + if (!IDPair) + return NoneType(); + + if (IDPair->second.startswith(FileBuildID)) + return sys::path::filename(IDPair->first); + } + return NoneType(); +} + +std::error_code +DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { + std::error_code EC; + raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::OF_None); + if (EC) + return EC; + + bool WriteMemLocs = false; + + auto writeLocation = [&OutFile,&WriteMemLocs](const Location &Loc) { + if (WriteMemLocs) + OutFile << (Loc.IsSymbol ? "4 " : "3 "); + else + OutFile << (Loc.IsSymbol ? "1 " : "0 "); + OutFile << (Loc.Name.empty() ? 
"[unknown]" : Loc.Name) << " " + << Twine::utohexstr(Loc.Offset) + << FieldSeparator; + }; + + uint64_t BranchValues = 0; + uint64_t MemValues = 0; + + if (BAT) + OutFile << "boltedcollection\n"; + if (opts::BasicAggregation) { + OutFile << "no_lbr"; + for (const StringMapEntry &Entry : EventNames) { + OutFile << " " << Entry.getKey(); + } + OutFile << "\n"; + + for (const StringMapEntry &Func : NamesToSamples) { + for (const SampleInfo &SI : Func.getValue().Data) { + writeLocation(SI.Loc); + OutFile << SI.Hits << "\n"; + ++BranchValues; + } + } + } else { + for (const StringMapEntry &Func : NamesToBranches) { + for (const llvm::bolt::BranchInfo &BI : Func.getValue().Data) { + writeLocation(BI.From); + writeLocation(BI.To); + OutFile << BI.Mispreds << " " << BI.Branches << "\n"; + ++BranchValues; + } + for (const llvm::bolt::BranchInfo &BI : Func.getValue().EntryData) { + // Do not output if source is a known symbol, since this was already + // accounted for in the source function + if (BI.From.IsSymbol) + continue; + writeLocation(BI.From); + writeLocation(BI.To); + OutFile << BI.Mispreds << " " << BI.Branches << "\n"; + ++BranchValues; + } + } + + WriteMemLocs = true; + for (const StringMapEntry &Func : NamesToMemEvents) { + for (const MemInfo &MemEvent : Func.getValue().Data) { + writeLocation(MemEvent.Offset); + writeLocation(MemEvent.Addr); + OutFile << MemEvent.Count << "\n"; + ++MemValues; + } + } + } + + outs() << "PERF2BOLT: wrote " << BranchValues << " objects and " + << MemValues << " memory objects to " << OutputFilename << "\n"; + + return std::error_code(); +} + +void DataAggregator::dump() const { + DataReader::dump(); +} + +void DataAggregator::dump(const LBREntry &LBR) const { + Diag << "From: " << Twine::utohexstr(LBR.From) + << " To: " << Twine::utohexstr(LBR.To) << " Mispred? " << LBR.Mispred + << "\n"; +} + +void DataAggregator::dump(const PerfBranchSample &Sample) const { + Diag << "Sample LBR entries: " << Sample.LBR.size() << "\n"; + for (const LBREntry &LBR : Sample.LBR) { + dump(LBR); + } +} + +void DataAggregator::dump(const PerfMemSample &Sample) const { + Diag << "Sample mem entries: " << Sample.PC << ": " << Sample.Addr << "\n"; +} diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h new file mode 100644 index 000000000000..0d153e7af454 --- /dev/null +++ b/bolt/src/DataAggregator.h @@ -0,0 +1,487 @@ +//===-- DataAggregator.h - Perf data aggregator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by perf record, +// aggregates it and then writes it back to an output file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DATA_AGGREGATOR_H +#define LLVM_TOOLS_LLVM_BOLT_DATA_AGGREGATOR_H + +#include "DataReader.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/Program.h" +#include + +namespace llvm { +namespace bolt { + +class BinaryFunction; +class BinaryContext; +class BoltAddressTranslation; + +/// DataAggregator inherits all parsing logic from DataReader as well as +/// its data structures used to represent aggregated profile data in memory. 
+/// +/// The aggregator works by dispatching two separate perf-script jobs that +/// read perf samples and perf task annotations. Later, we read the output +/// files to extract information about which PID was used for this binary. +/// With the PID, we filter the samples and extract all LBR entries. +/// +/// To aggregate LBR entries, we rely on a BinaryFunction map to locate the +/// original function where the event happened. Then, we convert a raw address +/// to an offset relative to the start of this function and aggregate branch +/// information for each function. +/// +/// This must be coordinated with RewriteInstance so we have BinaryFunctions in +/// State::Disassembled. After this state, BinaryFunction will drop the +/// instruction map with original addresses we rely on to validate the traces +/// found in the LBR. +/// +/// The last step is to write the aggregated data to disk in the output file +/// specified by the user. +class DataAggregator : public DataReader { +public: + explicit DataAggregator(StringRef Filename) + : DataReader(Filename) { + start(); + } + + ~DataAggregator(); + + StringRef getReaderName() const override { + return "perf data aggregator"; + } + + bool isTrustedSource() const override { + return true; + } + + Error preprocessProfile(BinaryContext &BC) override; + + Error readProfilePreCFG(BinaryContext &BC) override { + return Error::success(); + } + + Error readProfile(BinaryContext &BC) override; + + bool mayHaveProfileData(const BinaryFunction &BF) override; + + /// Set Bolt Address Translation Table when processing samples collected in + /// bolted binaries + void setBAT(BoltAddressTranslation *B) override { BAT = B; } + + /// Check whether \p FileName is a perf.data file + static bool checkPerfDataMagic(StringRef FileName); + +private: + struct PerfBranchSample { + SmallVector LBR; + uint64_t PC; + }; + + struct PerfBasicSample { + StringRef EventName; + uint64_t PC; + }; + + struct PerfMemSample { + uint64_t PC; + uint64_t Addr; + }; + + /// Used for parsing specific pre-aggregated input files. + struct AggregatedLBREntry { + enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN }; + Location From; + Location To; + uint64_t Count; + uint64_t Mispreds; + Type EntryType; + }; + + struct Trace { + uint64_t From; + uint64_t To; + Trace(uint64_t From, uint64_t To) + : From(From), To(To) {} + bool operator==(const Trace &Other) const { + return From == Other.From && To == Other.To; + } + }; + + struct TraceHash { + size_t operator()(const Trace &L) const { + return std::hash()(L.From << 32 | L.To); + } + }; + + struct FTInfo { + uint64_t InternCount{0}; + uint64_t ExternCount{0}; + }; + + struct BranchInfo { + uint64_t TakenCount{0}; + uint64_t MispredCount{0}; + }; + + /// Intermediate storage for profile data. We save the results of parsing + /// and use them later for processing and assigning profile. 
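+  ///
+  /// For illustration (addresses are made up): a taken branch
+  /// 0x400100 -> 0x400150 observed 10 times with 2 mispredictions ends up
+  /// stored as
+  ///   BranchLBRs[Trace(0x400100, 0x400150)] == BranchInfo{10, 2}
+  /// while fall-through traces accumulate in FallthroughLBRs the same way.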
+ std::unordered_map BranchLBRs; + std::unordered_map FallthroughLBRs; + std::vector AggregatedLBRs; + std::unordered_map BasicSamples; + std::vector MemSamples; + + template void clear(T& Container) { + T TempContainer; + TempContainer.swap(Container); + } + + /// Perf utility full path name + std::string PerfPath; + + /// Perf process spawning bookkeeping + struct PerfProcessInfo { + bool IsFinished{false}; + sys::ProcessInfo PI; + SmallVector StdoutPath; + SmallVector StderrPath; + }; + + /// Process info for spawned processes + PerfProcessInfo MainEventsPPI; + PerfProcessInfo MemEventsPPI; + PerfProcessInfo MMapEventsPPI; + PerfProcessInfo TaskEventsPPI; + + /// Kernel VM starts at fixed based address + /// https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt + static constexpr uint64_t KernelBaseAddr = 0xffff800000000000; + + /// Current list of created temporary files + std::vector TempFiles; + + /// Name of the binary with matching build-id from perf.data if different + /// from the file name in BC. + std::string BuildIDBinaryName; + + /// Memory map info for a single file + struct MMapInfo { + pid_t PID{-1LL}; + uint64_t BaseAddress; + uint64_t Size; + uint64_t Offset; + bool Forked{false}; + uint64_t Time{0ULL}; // time in micro seconds + }; + + /// Per-PID map info for the binary + std::unordered_map BinaryMMapInfo; + + /// Fork event info + struct ForkInfo { + pid_t ParentPID; + pid_t ChildPID; + uint64_t Time{0ULL}; + }; + + /// References to core BOLT data structures + BinaryContext *BC{nullptr}; + + BoltAddressTranslation *BAT{nullptr}; + + /// Update function execution profile with a recorded trace. + /// A trace is region of code executed between two LBR entries supplied in + /// execution order. + /// + /// Return true if the trace is valid, false otherwise. + bool recordTrace(BinaryFunction &BF, + const LBREntry &First, + const LBREntry &Second, + uint64_t Count = 1, + SmallVector, 16> *Branches = nullptr) const; + + /// Return a vector of offsets corresponding to a trace in a function + /// (see recordTrace() above). + Optional, 16>> + getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First, + const LBREntry &Second, uint64_t Count = 1) const; + + /// Record external entry into the function \p BF. + /// + /// Return true if the entry is valid, false otherwise. + bool recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, + uint64_t Count = 1) const; + + /// Record exit from the function \p BF via a call or return. + /// + /// Return true if the exit point is valid, false otherwise. + bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, + uint64_t Count = 1) const; + + /// Aggregation statistics + uint64_t NumInvalidTraces{0}; + uint64_t NumLongRangeTraces{0}; + uint64_t NumColdSamples{0}; + + /// Looks into system PATH for Linux Perf and set up the aggregator to use it + void findPerfExecutable(); + + /// Launch a perf subprocess with given args and save output for later + /// parsing. + void launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, + const char *ArgsString, bool Wait); + + /// Delete all temporary files created to hold the output generated by spawned + /// subprocesses during the aggregation job + void deleteTempFiles(); + + // Semantic pass helpers + + /// Look up which function contains an address by using out map of + /// disassembled BinaryFunctions + BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const; + + /// Retrieve the location name to be used for samples recorded in \p Func. 
+  /// If doing BAT translation, link cold parts to the hot part names (used by
+  /// the original binary). \p Count specifies how many samples were recorded
+  /// at that location, so we can tally total activity in cold areas if we are
+  /// dealing with profiling data collected in a bolted binary. For LBRs,
+  /// \p Count should only be used for the source of the branch to avoid
+  /// counting cold activity twice (one for source and another for destination).
+  StringRef getLocationName(BinaryFunction &Func, uint64_t Count);
+
+  /// Semantic actions - parser hooks to interpret parsed perf samples
+  /// Register a sample (non-LBR mode), i.e. a new hit at \p Address
+  bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count);
+
+  /// Register an intraprocedural branch \p Branch.
+  bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To,
+                     uint64_t Count, uint64_t Mispreds);
+
+  /// Register an interprocedural branch from \p FromFunc to \p ToFunc with
+  /// offsets \p From and \p To, respectively.
+  bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc,
+                     uint64_t From, uint64_t To, uint64_t Count,
+                     uint64_t Mispreds);
+
+  /// Register a \p Branch.
+  bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
+
+  /// Register a trace between two LBR entries supplied in execution order.
+  bool doTrace(const LBREntry &First, const LBREntry &Second,
+               uint64_t Count = 1);
+
+  /// Parser helpers
+  /// Return false if we exhausted our parser buffer and finished parsing
+  /// everything
+  bool hasData();
+
+  /// Print heat map based on LBR samples.
+  std::error_code printLBRHeatMap();
+
+  /// Parse a single perf sample containing a PID associated with a sequence of
+  /// LBR entries. If the PID does not correspond to the binary we are looking
+  /// for, return std::errc::no_such_process. If other parsing errors occur,
+  /// return the error. Otherwise, return the parsed sample.
+  ErrorOr<PerfBranchSample> parseBranchSample();
+
+  /// Parse a single perf sample containing a PID associated with an event name
+  /// and a PC
+  ErrorOr<PerfBasicSample> parseBasicSample();
+
+  /// Parse a single perf sample containing a PID associated with an IP and
+  /// address.
+  ErrorOr<PerfMemSample> parseMemSample();
+
+  /// Parse pre-aggregated LBR samples created by an external tool
+  ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry();
+
+  /// Parse either buildid:offset or just offset, representing a location in the
+  /// binary. Used exclusively for pre-aggregated LBR samples.
+  ErrorOr<Location> parseLocationOrOffset();
+
+  /// Check if a field separator is the next char to parse and, if yes, consume
+  /// it and return true
+  bool checkAndConsumeFS();
+
+  /// Consume the entire line
+  void consumeRestOfLine();
+
+  /// Parse a single LBR entry as output by perf script -Fbrstack
+  ErrorOr<LBREntry> parseLBREntry();
+
+  /// Parse and pre-aggregate branch events.
+  std::error_code parseBranchEvents();
+
+  /// Process all branch events.
+  void processBranchEvents();
+
+  /// This member function supports generating data for AutoFDO LLVM tools.
+  std::error_code writeAutoFDOData(StringRef OutputFilename);
+
+  /// Parse the full output generated by perf script to report non-LBR samples.
+  std::error_code parseBasicEvents();
+
+  /// Process non-LBR events.
+  void processBasicEvents();
+
+  /// Parse the full output generated by perf script to report memory events.
+  std::error_code parseMemEvents();
+
+  /// Process parsed memory events profile.
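+  /// For example (illustrative values): a load sampled at PC foo+0xd0 that
+  /// touched address bar+0x8 is aggregated as the Location pair
+  /// (foo, 0xd0) -> (bar, 0x8) under NamesToMemEvents["foo"].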
+  void processMemEvents();
+
+  /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping
+  /// between the binary name and its memory layout in a process with a given
+  /// PID.
+  /// On success return a pair of <file name, mmap info>.
+  ErrorOr<std::pair<StringRef, MMapInfo>> parseMMapEvent();
+
+  /// Parse PERF_RECORD_FORK event.
+  Optional<ForkInfo> parseForkEvent();
+
+  /// Parse 'PERF_RECORD_COMM exec'. Don't consume the string.
+  Optional<pid_t> parseCommExecEvent();
+
+  /// Parse the full output generated by `perf script --show-mmap-events`
+  /// to generate mapping between binary files and their memory mappings for
+  /// all PIDs.
+  std::error_code parseMMapEvents();
+
+  /// Parse output of `perf script --show-task-events`, and add forked
+  /// processes to the set of tracked PIDs.
+  std::error_code parseTaskEvents();
+
+  /// Parse a single pair of binary full path and associated build-id
+  Optional<std::pair<StringRef, StringRef>> parseNameBuildIDPair();
+
+  /// Parse the output generated by "perf buildid-list" to extract build-ids
+  /// and return a file name matching a given \p FileBuildID.
+  Optional<StringRef> getFileNameForBuildID(StringRef FileBuildID);
+
+  /// Coordinate reading and parsing of pre-aggregated file
+  ///
+  /// The regular perf2bolt aggregation job is to read perf output directly.
+  /// However, if the data is coming from a database instead of perf, one could
+  /// write a query to produce a pre-aggregated file. This function deals with
+  /// this case.
+  ///
+  /// The pre-aggregated file contains aggregated LBR data, but without binary
+  /// knowledge. BOLT will parse it and, using information from the disassembled
+  /// binary, augment it with fall-through edge frequency information. After
+  /// this step is finished, this data can be either written to disk to be
+  /// consumed by BOLT later, or can be used by BOLT immediately if kept in
+  /// memory.
+  ///
+  /// File format syntax:
+  /// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
+  ///       [<mispred_count>]
+  ///
+  /// B - indicates an aggregated branch
+  /// F - an aggregated fall-through
+  /// f - an aggregated fall-through with external origin - used to disambiguate
+  ///       between a return hitting a basic block head and a regular internal
+  ///       jump to the block
+  ///
+  /// <start_id> - build id of the object containing the start address. We can
+  /// skip it for the main binary and use "X" for an unknown object. This will
+  /// save some space and facilitate human parsing.
+  ///
+  /// <start_offset> - hex offset from the object base load address (0 for the
+  /// main executable unless it's PIE) to the start address.
+  ///
+  /// <end_id>, <end_offset> - same for the end address.
+  ///
+  /// <count> - total aggregated count of the branch or a fall-through.
+  ///
+  /// <mispred_count> - the number of times the branch was mispredicted.
+  /// Omitted for fall-throughs.
+  ///
+  /// Example:
+  /// F 41be50 41be50 3
+  /// F 41be90 41be90 4
+  /// B 4b1942 39b57f0 3 0
+  /// B 4b196f 4b19e0 2 0
+  void parsePreAggregated();
+
+  /// Parse the full output of pre-aggregated LBR samples generated by
+  /// an external tool.
+  std::error_code parsePreAggregatedLBRSamples();
+
+  /// Process parsed pre-aggregated data.
+  void processPreAggregated();
+
+  /// If \p Address falls into the binary address space based on memory
+  /// mapping info \p MMI, then adjust it for further processing by subtracting
+  /// the base load address. External addresses, i.e. addresses that do not
+  /// correspond to the binary allocated address space, are adjusted to avoid
+  /// conflicts.
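+  ///
+  /// For example (illustrative numbers): with BaseAddress = 0x200000,
+  /// Size = 0x100000, and Offset = 0, an address 0x200400 becomes 0x400,
+  /// while a stray address 0x400 (smaller than Size but outside the mapping)
+  /// is rewritten to -1ULL so it cannot be mistaken for a binary offset.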
+ void adjustAddress(uint64_t &Address, const MMapInfo &MMI) const { + if (Address >= MMI.BaseAddress && Address < MMI.BaseAddress + MMI.Size) { + // NOTE: Assumptions about the binary segment load table (PH for ELF) + // Segment file offset equals virtual address (which is true for .so) + // There aren't multiple executable segments loaded because MMapInfo + // doesn't support them. + Address -= MMI.BaseAddress - MMI.Offset; + } else if (Address < MMI.Size) { + // Make sure the address is not treated as belonging to the binary. + Address = (-1ULL); + } + } + + /// Adjust addresses in \p LBR entry. + void adjustLBR(LBREntry &LBR, const MMapInfo &MMI) const { + adjustAddress(LBR.From, MMI); + adjustAddress(LBR.To, MMI); + } + + /// Ignore kernel/user transition LBR if requested + bool ignoreKernelInterrupt(LBREntry &LBR) const; + + /// Populate functions in \p BC with profile. + void processProfile(BinaryContext &BC); + + /// Start an aggregation job asynchronously. + void start(); + + /// Returns true if this aggregation job is using a translation table to + /// remap samples collected on binaries already processed by BOLT. + bool usesBAT() const { return BAT; } + + /// Force all subprocesses to stop and cancel aggregation + void abort(); + + /// Dump data structures into a file readable by llvm-bolt + std::error_code writeAggregatedFile(StringRef OutputFilename) const; + + /// Filter out binaries based on PID + void filterBinaryMMapInfo(); + + /// If we have a build-id available for the input file, use it to assist + /// matching profile to a binary. + /// + /// If the binary name changed after profile collection, use build-id + /// to get the proper name in perf data when build-ids are available. + /// If \p FileBuildID has no match, then issue an error and exit. + void processFileBuildID(StringRef FileBuildID); + + /// Debugging dump methods + void dump() const; + void dump(const LBREntry &LBR) const; + void dump(const PerfBranchSample &Sample) const; + void dump(const PerfMemSample &Sample) const; +}; +} +} + +#endif diff --git a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp new file mode 100644 index 000000000000..42b995b19586 --- /dev/null +++ b/bolt/src/DataReader.cpp @@ -0,0 +1,1448 @@ +//===-- DataReader.cpp - Perf data reader -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by the perf2bolt +// utility and stores it in memory for llvm-bolt consumption. 
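+//
+// For reference, a branch record in that format looks like (illustrative
+// values):
+//   1 main a 1 foo 0 0 225
+// i.e. <from-is-symbol> <from-name> <from-offset> <to-is-symbol> <to-name>
+// <to-offset> <mispredictions> <count>, mirroring what BranchInfo::print()
+// below emits.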
+// +//===----------------------------------------------------------------------===// + + +#include "BinaryFunction.h" +#include "DataReader.h" +#include "Passes/MCF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-prof" + +using namespace llvm; + +namespace opts { + +extern cl::OptionCategory BoltCategory; +extern llvm::cl::opt Verbosity; + +static cl::opt +DumpData("dump-data", + cl::desc("dump parsed bolt data for debugging"), + cl::Hidden, + cl::cat(BoltCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +Optional getLTOCommonName(const StringRef Name) { + size_t LTOSuffixPos = Name.find(".lto_priv."); + if (LTOSuffixPos != StringRef::npos) { + return Name.substr(0, LTOSuffixPos + 10); + } else if ((LTOSuffixPos = Name.find(".constprop.")) != StringRef::npos) { + return Name.substr(0, LTOSuffixPos + 11); + } else { + return NoneType(); + } +} + +namespace { + +/// Return true if the function name can change across compilations. +bool hasVolatileName(const BinaryFunction &BF) { + for (const StringRef Name : BF.getNames()) { + if (getLTOCommonName(Name)) + return true; + } + return false; +} + +/// Return standard name of the function possibly renamed by BOLT. +StringRef normalizeName(StringRef Name) { + // Strip "PG." prefix used for globalized locals. + return Name.startswith("PG.") ? Name.substr(2) : Name; +} + +} // anonymous namespace + +raw_ostream &operator<<(raw_ostream &OS, const Location &Loc) { + if (Loc.IsSymbol) { + OS << Loc.Name; + if (Loc.Offset) + OS << "+" << Twine::utohexstr(Loc.Offset); + } else { + OS << Twine::utohexstr(Loc.Offset); + } + return OS; +} + +iterator_range +FuncBranchData::getBranchRange(uint64_t From) const { + assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const BranchInfo &BI, const uint64_t Val) const { + return BI.From.Offset < Val; + } + bool operator()(const uint64_t Val, const BranchInfo &BI) const { + return Val < BI.From.Offset; + } + }; + auto Range = std::equal_range(Data.begin(), Data.end(), From, Compare()); + return iterator_range(Range.first, Range.second); +} + +void FuncBranchData::appendFrom(const FuncBranchData &FBD, uint64_t Offset) { + Data.insert(Data.end(), FBD.Data.begin(), FBD.Data.end()); + for (auto I = Data.begin(), E = Data.end(); I != E; ++I) { + if (I->From.Name == FBD.Name) { + I->From.Name = this->Name; + I->From.Offset += Offset; + } + if (I->To.Name == FBD.Name) { + I->To.Name = this->Name; + I->To.Offset += Offset; + } + } + std::stable_sort(Data.begin(), Data.end()); + ExecutionCount += FBD.ExecutionCount; + for (auto I = FBD.EntryData.begin(), E = FBD.EntryData.end(); I != E; ++I) { + assert(I->To.Name == FBD.Name); + auto NewElmt = EntryData.insert(EntryData.end(), *I); + NewElmt->To.Name = this->Name; + NewElmt->To.Offset += Offset; + } +} + +uint64_t FuncBranchData::getNumExecutedBranches() const { + uint64_t ExecutedBranches = 0; + for (const BranchInfo &BI : Data) { + int64_t BranchCount = BI.Branches; + assert(BranchCount >= 0 && "branch execution count should not be negative"); + ExecutedBranches += BranchCount; + } + return ExecutedBranches; +} + +void SampleInfo::mergeWith(const SampleInfo &SI) { + Hits += SI.Hits; +} + +void SampleInfo::print(raw_ostream &OS) const { + OS << Loc.IsSymbol << " " << Loc.Name << " " << Twine::utohexstr(Loc.Offset) + << " " << Hits << "\n"; +} + +uint64_t +FuncSampleData::getSamples(uint64_t Start, uint64_t End) const { + 
assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const SampleInfo &SI, const uint64_t Val) const { + return SI.Loc.Offset < Val; + } + bool operator()(const uint64_t Val, const SampleInfo &SI) const { + return Val < SI.Loc.Offset; + } + }; + uint64_t Result = 0; + for (auto I = std::lower_bound(Data.begin(), Data.end(), Start, Compare()), + E = std::lower_bound(Data.begin(), Data.end(), End, Compare()); + I != E; ++I) { + Result += I->Hits; + } + return Result; +} + +void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) { + auto Iter = Index.find(Offset); + if (Iter == Index.end()) { + Data.emplace_back(Location(true, Name, Offset), Count); + Index[Offset] = Data.size() - 1; + return; + } + SampleInfo &SI = Data[Iter->second]; + SI.Hits += Count; +} + +void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, + uint64_t Count, uint64_t Mispreds) { + auto Iter = IntraIndex[OffsetFrom].find(OffsetTo); + if (Iter == IntraIndex[OffsetFrom].end()) { + Data.emplace_back(Location(true, Name, OffsetFrom), + Location(true, Name, OffsetTo), Mispreds, Count); + IntraIndex[OffsetFrom][OffsetTo] = Data.size() - 1; + return; + } + BranchInfo &BI = Data[Iter->second]; + BI.Branches += Count; + BI.Mispreds += Mispreds; +} + +void FuncBranchData::bumpCallCount(uint64_t OffsetFrom, const Location &To, + uint64_t Count, uint64_t Mispreds) { + auto Iter = InterIndex[OffsetFrom].find(To); + if (Iter == InterIndex[OffsetFrom].end()) { + Data.emplace_back(Location(true, Name, OffsetFrom), To, Mispreds, Count); + InterIndex[OffsetFrom][To] = Data.size() - 1; + return; + } + BranchInfo &BI = Data[Iter->second]; + BI.Branches += Count; + BI.Mispreds += Mispreds; +} + +void FuncBranchData::bumpEntryCount(const Location &From, uint64_t OffsetTo, + uint64_t Count, uint64_t Mispreds) { + auto Iter = EntryIndex[OffsetTo].find(From); + if (Iter == EntryIndex[OffsetTo].end()) { + EntryData.emplace_back(From, Location(true, Name, OffsetTo), Mispreds, + Count); + EntryIndex[OffsetTo][From] = EntryData.size() - 1; + return; + } + BranchInfo &BI = EntryData[Iter->second]; + BI.Branches += Count; + BI.Mispreds += Mispreds; +} + +void BranchInfo::mergeWith(const BranchInfo &BI) { + Branches += BI.Branches; + Mispreds += BI.Mispreds; +} + +void BranchInfo::print(raw_ostream &OS) const { + OS << From.IsSymbol << " " << From.Name << " " + << Twine::utohexstr(From.Offset) << " " + << To.IsSymbol << " " << To.Name << " " + << Twine::utohexstr(To.Offset) << " " + << Mispreds << " " << Branches << '\n'; +} + +ErrorOr FuncBranchData::getBranch(uint64_t From, + uint64_t To) const { + for (const BranchInfo &I : Data) { + if (I.From.Offset == From && I.To.Offset == To && + I.From.Name == I.To.Name) + return I; + } + return make_error_code(llvm::errc::invalid_argument); +} + +ErrorOr +FuncBranchData::getDirectCallBranch(uint64_t From) const { + // Commented out because it can be expensive. 
+ // assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const BranchInfo &BI, const uint64_t Val) const { + return BI.From.Offset < Val; + } + bool operator()(const uint64_t Val, const BranchInfo &BI) const { + return Val < BI.From.Offset; + } + }; + auto Range = std::equal_range(Data.begin(), Data.end(), From, Compare()); + for (auto I = Range.first; I != Range.second; ++I) { + if (I->From.Name != I->To.Name) + return *I; + } + return make_error_code(llvm::errc::invalid_argument); +} + +void MemInfo::print(raw_ostream &OS) const { + OS << (Offset.IsSymbol + 3) << " " << Offset.Name << " " + << Twine::utohexstr(Offset.Offset) << " " + << (Addr.IsSymbol + 3) << " " << Addr.Name << " " + << Twine::utohexstr(Addr.Offset) << " " + << Count << "\n"; +} + +void MemInfo::prettyPrint(raw_ostream &OS) const { + OS << "(PC: " << Offset << ", M: " << Addr << ", C: " << Count << ")"; +} + +void FuncMemData::update(const Location &Offset, const Location &Addr) { + auto Iter = EventIndex[Offset.Offset].find(Addr); + if (Iter == EventIndex[Offset.Offset].end()) { + Data.emplace_back(MemInfo(Offset, Addr, 1)); + EventIndex[Offset.Offset][Addr] = Data.size() - 1; + return; + } + ++Data[Iter->second].Count; +} + +Error DataReader::preprocessProfile(BinaryContext &BC) { + if (std::error_code EC = parseInput()) { + return errorCodeToError(EC); + } + + if (opts::DumpData) { + dump(); + } + + if (collectedInBoltedBinary()) { + outs() << "BOLT-INFO: profile collection done on a binary already " + "processed by BOLT\n"; + } + + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + if (FuncMemData *MemData = getMemDataForNames(Function.getNames())) { + setMemData(Function, MemData); + MemData->Used = true; + } + if (FuncBranchData *FuncData = getBranchDataForNames(Function.getNames())) { + setBranchData(Function, FuncData); + Function.ExecutionCount = FuncData->ExecutionCount; + FuncData->Used = true; + } + } + + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + matchProfileMemData(Function); + } + + return Error::success(); +} + +Error DataReader::readProfilePreCFG(BinaryContext &BC) { + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + FuncMemData *MemoryData = getMemData(Function); + if (!MemoryData) + continue; + + for (MemInfo &MI : MemoryData->Data) { + const uint64_t Offset = MI.Offset.Offset; + auto II = Function.Instructions.find(Offset); + if (II == Function.Instructions.end()) { + // Ignore bad instruction address. + continue; + } + + auto &MemAccessProfile = + BC.MIB->getOrCreateAnnotationAs( + II->second, "MemoryAccessProfile"); + BinaryData *BD = nullptr; + if (MI.Addr.IsSymbol) { + BD = BC.getBinaryDataByName(MI.Addr.Name); + } + MemAccessProfile.AddressAccessInfo. 
+ push_back({BD, MI.Addr.Offset, MI.Count}); + auto NextII = std::next(II); + if (NextII == Function.Instructions.end()) { + MemAccessProfile.NextInstrOffset = Function.getSize(); + } else { + MemAccessProfile.NextInstrOffset = II->first; + } + } + Function.HasMemoryProfile = true; + } + + return Error::success(); +} + +Error DataReader::readProfile(BinaryContext &BC) { + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + readProfile(Function); + } + + uint64_t NumUnused = 0; + for (const StringMapEntry &FuncData : NamesToBranches) + if (!FuncData.getValue().Used) + ++NumUnused; + BC.setNumUnusedProfiledObjects(NumUnused); + + return Error::success(); +} + +std::error_code DataReader::parseInput() { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = MB.getError()) { + Diag << "cannot open " << Filename << ": " << EC.message() << "\n"; + return EC; + } + FileBuf = std::move(MB.get()); + ParsingBuf = FileBuf->getBuffer(); + if (std::error_code EC = parse()) { + return EC; + } + if (!ParsingBuf.empty()) { + Diag << "WARNING: invalid profile data detected at line " << Line + << ". Possibly corrupted profile.\n"; + } + + buildLTONameMaps(); + + return std::error_code(); +} + +void DataReader::readProfile(BinaryFunction &BF) { + if (BF.empty()) + return; + + if (!hasLBR()) { + BF.ProfileFlags = BinaryFunction::PF_SAMPLE; + readSampleData(BF); + return; + } + + BF.ProfileFlags = BinaryFunction::PF_LBR; + + // Possibly assign/re-assign branch profile data. + matchProfileData(BF); + + FuncBranchData *FBD = getBranchData(BF); + if (!FBD) + return; + + // Assign basic block counts to function entry points. These only include + // counts for outside entries. + // + // There is a slight skew introduced here as branches originated from RETs + // may be accounted for in the execution count of an entry block if the last + // instruction in a predecessor fall-through block is a call. This situation + // should rarely happen because there are few multiple-entry functions. + for (const BranchInfo &BI : FBD->EntryData) { + BinaryBasicBlock *BB = BF.getBasicBlockAtOffset(BI.To.Offset); + if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { + uint64_t Count = BB->getExecutionCount(); + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE) + Count = 0; + BB->setExecutionCount(Count + BI.Branches); + } + } + + uint64_t MismatchedBranches = 0; + for (const BranchInfo &BI : FBD->Data) { + if (BI.From.Name != BI.To.Name) { + continue; + } + + if (!recordBranch(BF, BI.From.Offset, BI.To.Offset, + BI.Branches, BI.Mispreds)) { + LLVM_DEBUG(dbgs() << "bad branch : " << BI.From.Offset << " -> " + << BI.To.Offset << '\n'); + ++MismatchedBranches; + } + } + + // Convert branch data into annotations. 
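+  // (This attaches per-instruction profile data, such as call counts, so
+  // later rewriting passes can query it without going back to the reader.)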
+ convertBranchData(BF); +} + +void DataReader::matchProfileData(BinaryFunction &BF) { + // This functionality is available for LBR-mode only + // TODO: Implement evaluateProfileData() for samples, checking whether + // sample addresses match instruction addresses in the function + if (!hasLBR()) + return; + + FuncBranchData *FBD = getBranchData(BF); + if (FBD) { + BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); + BF.RawBranchCount = FBD->getNumExecutedBranches(); + if (BF.ProfileMatchRatio == 1.0f) { + if (fetchProfileForOtherEntryPoints(BF)) { + BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); + BF.ExecutionCount = FBD->ExecutionCount; + BF.RawBranchCount = FBD->getNumExecutedBranches(); + } + return; + } + } + + // Check if the function name can fluctuate between several compilations + // possibly triggered by minor unrelated code changes in the source code + // of the input binary. + if (!hasVolatileName(BF)) + return; + + // Check for a profile that matches with 100% confidence. + const std::vector AllBranchData = + getBranchDataForNamesRegex(BF.getNames()); + for (FuncBranchData *NewBranchData : AllBranchData) { + // Prevent functions from sharing the same profile. + if (NewBranchData->Used) + continue; + + if (evaluateProfileData(BF, *NewBranchData) != 1.0f) + continue; + + if (FBD) + FBD->Used = false; + + // Update function profile data with the new set. + setBranchData(BF, NewBranchData); + NewBranchData->Used = true; + BF.ExecutionCount = NewBranchData->ExecutionCount; + BF.ProfileMatchRatio = 1.0f; + break; + } +} + +void DataReader::matchProfileMemData(BinaryFunction &BF) { + const std::vector AllMemData = + getMemDataForNamesRegex(BF.getNames()); + for (FuncMemData *NewMemData : AllMemData) { + // Prevent functions from sharing the same profile. + if (NewMemData->Used) + continue; + + if (FuncMemData *MD = getMemData(BF)) + MD->Used = false; + + // Update function profile data with the new set. + setMemData(BF, NewMemData); + NewMemData->Used = true; + break; + } +} + +bool DataReader::fetchProfileForOtherEntryPoints(BinaryFunction &BF) { + BinaryContext &BC = BF.getBinaryContext(); + + FuncBranchData *FBD = getBranchData(BF); + if (!FBD) + return false; + + // Check if we are missing profiling data for secondary entry points + bool First = true; + bool Updated = false; + for (BinaryBasicBlock *BB : BF.BasicBlocks) { + if (First) { + First = false; + continue; + } + if (BB->isEntryPoint()) { + uint64_t EntryAddress = BB->getOffset() + BF.getAddress(); + // Look for branch data associated with this entry point + if (BinaryData *BD = BC.getBinaryDataAtAddress(EntryAddress)) { + if (FuncBranchData *Data = getBranchDataForSymbols(BD->getSymbols())) { + FBD->appendFrom(*Data, BB->getOffset()); + Data->Used = true; + Updated = true; + } + } + } + } + + return Updated; +} + +float DataReader::evaluateProfileData(BinaryFunction &BF, + const FuncBranchData &BranchData) const { + BinaryContext &BC = BF.getBinaryContext(); + + // Until we define a minimal profile, we consider an empty branch data to be + // a valid profile. It could happen to a function without branches when we + // still have an EntryData for the execution count. + if (BranchData.Data.empty()) { + return 1.0f; + } + + uint64_t NumMatchedBranches = 0; + for (const BranchInfo &BI : BranchData.Data) { + bool IsValid = false; + if (BI.From.Name == BI.To.Name) { + // Try to record information with 0 count. 
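+      // Passing a zero count makes recordBranch() only verify that the edge
+      // exists in the reconstructed CFG, without bumping any profile
+      // counters.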
+ IsValid = recordBranch(BF, BI.From.Offset, BI.To.Offset, 0); + } else if (collectedInBoltedBinary()) { + // We can't check branch source for collections in bolted binaries because + // the source of the branch may be mapped to the first instruction in a BB + // instead of the original branch (which may not exist in the source bin). + IsValid = true; + } else { + // The branch has to originate from this function. + // Check for calls, tail calls, rets and indirect branches. + // When matching profiling info, we did not reach the stage + // when we identify tail calls, so they are still represented + // by regular branch instructions and we need isBranch() here. + MCInst *Instr = BF.getInstructionAtOffset(BI.From.Offset); + // If it's a prefix - skip it. + if (Instr && BC.MIB->isPrefix(*Instr)) + Instr = BF.getInstructionAtOffset(BI.From.Offset + 1); + if (Instr && + (BC.MIB->isCall(*Instr) || + BC.MIB->isBranch(*Instr) || + BC.MIB->isReturn(*Instr))) { + IsValid = true; + } + } + + if (IsValid) { + ++NumMatchedBranches; + continue; + } + + LLVM_DEBUG(dbgs() << "\tinvalid branch in " << BF << " : 0x" + << Twine::utohexstr(BI.From.Offset) << " -> "; + if (BI.From.Name == BI.To.Name) dbgs() + << "0x" << Twine::utohexstr(BI.To.Offset) << '\n'; + else dbgs() << "\n";); + } + + const float MatchRatio = (float)NumMatchedBranches / BranchData.Data.size(); + if (opts::Verbosity >= 2 && NumMatchedBranches < BranchData.Data.size()) { + errs() << "BOLT-WARNING: profile branches match only " + << format("%.1f%%", MatchRatio * 100.0f) << " (" + << NumMatchedBranches << '/' << BranchData.Data.size() + << ") for function " << BF << '\n'; + } + + return MatchRatio; +} + +void DataReader::readSampleData(BinaryFunction &BF) { + FuncSampleData *SampleDataOrErr = getFuncSampleData(BF.getNames()); + if (!SampleDataOrErr) + return; + + // Basic samples mode territory (without LBR info) + // First step is to assign BB execution count based on samples from perf + BF.ProfileMatchRatio = 1.0f; + BF.removeTagsFromProfile(); + bool NormalizeByInsnCount = usesEvent("cycles") || usesEvent("instructions"); + bool NormalizeByCalls = usesEvent("branches"); + static bool NagUser = true; + if (NagUser) { + outs() + << "BOLT-INFO: operating with basic samples profiling data (no LBR).\n"; + if (NormalizeByInsnCount) { + outs() << "BOLT-INFO: normalizing samples by instruction count.\n"; + } else if (NormalizeByCalls) { + outs() << "BOLT-INFO: normalizing samples by branches.\n"; + } + NagUser = false; + } + uint64_t LastOffset = BF.getSize(); + uint64_t TotalEntryCount = 0; + for (auto I = BF.BasicBlockOffsets.rbegin(), E = BF.BasicBlockOffsets.rend(); + I != E; ++I) { + uint64_t CurOffset = I->first; + // Always work with samples multiplied by 1000 to avoid losing them if we + // later need to normalize numbers + uint64_t NumSamples = + SampleDataOrErr->getSamples(CurOffset, LastOffset) * 1000; + if (NormalizeByInsnCount && I->second->getNumNonPseudos()) + NumSamples /= I->second->getNumNonPseudos(); + else if (NormalizeByCalls) { + uint32_t NumCalls = I->second->getNumCalls(); + NumSamples /= NumCalls + 1; + } + I->second->setExecutionCount(NumSamples); + if (I->second->isEntryPoint()) + TotalEntryCount += NumSamples; + LastOffset = CurOffset; + } + + BF.ExecutionCount = TotalEntryCount; + + estimateEdgeCounts(BF); +} + +void DataReader::convertBranchData(BinaryFunction &BF) const { + BinaryContext &BC = BF.getBinaryContext(); + + if (BF.empty()) + return; + + FuncBranchData *FBD = getBranchData(BF); + if (!FBD) + return; + 
+ // Profile information for calls. + // + // There are 3 cases that we annotate differently: + // 1) Conditional tail calls that could be mispredicted. + // 2) Indirect calls to multiple destinations with mispredictions. + // Before we validate CFG we have to handle indirect branches here too. + // 3) Regular direct calls. The count could be different from containing + // basic block count. Keep this data in case we find it useful. + // + for (BranchInfo &BI : FBD->Data) { + // Ignore internal branches. + if (BI.To.IsSymbol && BI.To.Name == BI.From.Name && BI.To.Offset != 0) + continue; + + MCInst *Instr = BF.getInstructionAtOffset(BI.From.Offset); + if (!Instr || + (!BC.MIB->isCall(*Instr) && !BC.MIB->isIndirectBranch(*Instr))) + continue; + + auto setOrUpdateAnnotation = [&](StringRef Name, uint64_t Count) { + if (opts::Verbosity >= 1 && BC.MIB->hasAnnotation(*Instr, Name)) { + errs() << "BOLT-WARNING: duplicate " << Name << " info for offset 0x" + << Twine::utohexstr(BI.From.Offset) + << " in function " << BF << '\n'; + } + auto &Value = BC.MIB->getOrCreateAnnotationAs(*Instr, Name); + Value += Count; + }; + + if (BC.MIB->isIndirectCall(*Instr) || BC.MIB->isIndirectBranch(*Instr)) { + IndirectCallSiteProfile &CSP = + BC.MIB->getOrCreateAnnotationAs( + *Instr, "CallProfile"); + MCSymbol *CalleeSymbol = nullptr; + if (BI.To.IsSymbol) { + if (BinaryData *BD = BC.getBinaryDataByName(BI.To.Name)) { + CalleeSymbol = BD->getSymbol(); + } + } + CSP.emplace_back(CalleeSymbol, BI.Branches, BI.Mispreds); + } else if (BC.MIB->getConditionalTailCall(*Instr)) { + setOrUpdateAnnotation("CTCTakenCount", BI.Branches); + setOrUpdateAnnotation("CTCMispredCount", BI.Mispreds); + } else { + setOrUpdateAnnotation("Count", BI.Branches); + } + } +} + +bool DataReader::recordBranch(BinaryFunction &BF, + uint64_t From, uint64_t To, + uint64_t Count, uint64_t Mispreds) const { + BinaryContext &BC = BF.getBinaryContext(); + + BinaryBasicBlock *FromBB = BF.getBasicBlockContainingOffset(From); + BinaryBasicBlock *ToBB = BF.getBasicBlockContainingOffset(To); + + if (!FromBB || !ToBB) { + LLVM_DEBUG(dbgs() << "failed to get block for recorded branch\n"); + return false; + } + + // Could be bad LBR data; ignore the branch. In the case of data collected + // in binaries optimized by BOLT, a source BB may be mapped to two output + // BBs as a result of optimizations. In that case, a branch between these + // two will be recorded as a branch from A going to A in the source address + // space. Keep processing. + if (From == To) { + return true; + } + + if (FromBB->succ_size() == 0) { + // Return from a tail call. + return true; + } + + // Very rarely we will see ignored branches. Do a linear check. + for (std::pair &Branch : BF.IgnoredBranches) { + if (Branch == std::make_pair(static_cast(From), + static_cast(To))) + return true; + } + + if (To != ToBB->getOffset()) { + // "To" could be referring to nop instructions in between 2 basic blocks. + // While building the CFG we make sure these nops are attributed to the + // previous basic block, thus we check if the destination belongs to the + // gap past the last instruction. + const MCInst *LastInstr = ToBB->getLastNonPseudoInstr(); + if (LastInstr) { + const uint32_t LastInstrOffset = + BC.MIB->getAnnotationWithDefault(*LastInstr, "Offset"); + + // With old .fdata we are getting FT branches for "jcc,jmp" sequences. 
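+      // For instance (hypothetical layout), a block ending in "jcc L1; jmp L2"
+      // could have a fall-through recorded as landing on the jmp itself; the
+      // check below accepts such records as valid.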
+      if (To == LastInstrOffset && BC.MIB->isUnconditionalBranch(*LastInstr)) {
+        return true;
+      }
+
+      if (To <= LastInstrOffset) {
+        LLVM_DEBUG(dbgs() << "branch recorded into the middle of the block"
+                          << " in " << BF << " : " << From << " -> " << To
+                          << '\n');
+        return false;
+      }
+    }
+
+    // The real destination is the layout successor of the detected ToBB.
+    if (ToBB == BF.BasicBlocksLayout.back())
+      return false;
+    BinaryBasicBlock *NextBB = BF.BasicBlocksLayout[ToBB->getIndex() + 1];
+    assert((NextBB && NextBB->getOffset() > ToBB->getOffset()) && "bad layout");
+    ToBB = NextBB;
+  }
+
+  // If there's no corresponding instruction for 'From', we have probably
+  // discarded it as a FT from __builtin_unreachable.
+  MCInst *FromInstruction = BF.getInstructionAtOffset(From);
+  if (!FromInstruction) {
+    // If the data was collected in a bolted binary, the From addresses may be
+    // translated to the first instruction of the source BB if BOLT inserted
+    // a new branch that did not exist in the source (we can't map it to the
+    // source instruction, so we map it to the first instr of the source BB).
+    // We do not keep offsets for random instructions. So the check above will
+    // evaluate to true if the first instr is not a branch (call/jmp/ret/etc).
+    if (collectedInBoltedBinary()) {
+      if (FromBB->getInputOffset() != From) {
+        LLVM_DEBUG(dbgs() << "offset " << From << " does not match a BB in "
+                          << BF << '\n');
+        return false;
+      }
+      FromInstruction = nullptr;
+    } else {
+      LLVM_DEBUG(dbgs() << "no instruction for offset " << From << " in " << BF
+                        << '\n');
+      return false;
+    }
+  }
+
+  if (!FromBB->getSuccessor(ToBB->getLabel())) {
+    // Check if this is a recursive call or a return from a recursive call.
+    if (FromInstruction && ToBB->isEntryPoint() &&
+        (BC.MIB->isCall(*FromInstruction) ||
+         BC.MIB->isIndirectBranch(*FromInstruction))) {
+      // Execution count is already accounted for.
+      return true;
+    }
+    // For data collected in a bolted binary, we may have created two output
+    // BBs that map to one original block. Branches between these two blocks
+    // will appear here as one BB jumping to itself, even though it has no
+    // loop edges. Ignore these.
+    if (collectedInBoltedBinary() && FromBB == ToBB)
+      return true;
+
+    LLVM_DEBUG(dbgs() << "invalid branch in " << BF << '\n'
+                      << Twine::utohexstr(From) << " -> "
+                      << Twine::utohexstr(To) << '\n');
+    return false;
+  }
+
+  BinaryBasicBlock::BinaryBranchInfo &BI = FromBB->getBranchInfo(*ToBB);
+  BI.Count += Count;
+  // Only update the mispredicted count if the branch count was real.
+  if (Count) {
+    BI.MispredictedCount += Mispreds;
+  }
+
+  return true;
+}
+
+void DataReader::reportError(StringRef ErrorMsg) {
+  Diag << "Error reading BOLT data input file: line " << Line << ", column "
+       << Col << ": " << ErrorMsg << '\n';
+}
+
+bool DataReader::expectAndConsumeFS() {
+  if (ParsingBuf[0] != FieldSeparator) {
+    reportError("expected field separator");
+    return false;
+  }
+  ParsingBuf = ParsingBuf.drop_front(1);
+  Col += 1;
+  return true;
+}
+
+void DataReader::consumeAllRemainingFS() {
+  while (ParsingBuf[0] == FieldSeparator) {
+    ParsingBuf = ParsingBuf.drop_front(1);
+    Col += 1;
+  }
+}
+
+bool DataReader::checkAndConsumeNewLine() {
+  if (ParsingBuf[0] != '\n')
+    return false;
+
+  ParsingBuf = ParsingBuf.drop_front(1);
+  Col = 0;
+  Line += 1;
+  return true;
+}
+
+ErrorOr<StringRef> DataReader::parseString(char EndChar, bool EndNl) {
+  std::string EndChars(1, EndChar);
+  if (EndNl)
+    EndChars.push_back('\n');
+  size_t StringEnd = ParsingBuf.find_first_of(EndChars);
+  if (StringEnd == StringRef::npos || StringEnd == 0) {
+    reportError("malformed field");
+    return make_error_code(llvm::errc::io_error);
+  }
+
+  StringRef Str = ParsingBuf.substr(0, StringEnd);
+
+  // If EndNl was set and nl was found instead of EndChar, do not consume the
+  // new line.
+  bool EndNlInsteadOfEndChar =
+      ParsingBuf[StringEnd] == '\n' && EndChar != '\n';
+  unsigned End = EndNlInsteadOfEndChar ? StringEnd : StringEnd + 1;
+
+  ParsingBuf = ParsingBuf.drop_front(End);
+  if (EndChar == '\n') {
+    Col = 0;
+    Line += 1;
+  } else {
+    Col += End;
+  }
+  return Str;
+}
+
+ErrorOr<int64_t> DataReader::parseNumberField(char EndChar, bool EndNl) {
+  ErrorOr<StringRef> NumStrRes = parseString(EndChar, EndNl);
+  if (std::error_code EC = NumStrRes.getError())
+    return EC;
+  StringRef NumStr = NumStrRes.get();
+  int64_t Num;
+  if (NumStr.getAsInteger(10, Num)) {
+    reportError("expected decimal number");
+    Diag << "Found: " << NumStr << "\n";
+    return make_error_code(llvm::errc::io_error);
+  }
+  return Num;
+}
+
+ErrorOr<uint64_t> DataReader::parseHexField(char EndChar, bool EndNl) {
+  ErrorOr<StringRef> NumStrRes = parseString(EndChar, EndNl);
+  if (std::error_code EC = NumStrRes.getError())
+    return EC;
+  StringRef NumStr = NumStrRes.get();
+  uint64_t Num;
+  if (NumStr.getAsInteger(16, Num)) {
+    reportError("expected hexadecimal number");
+    Diag << "Found: " << NumStr << "\n";
+    return make_error_code(llvm::errc::io_error);
+  }
+  return Num;
+}
+
+ErrorOr<Location> DataReader::parseLocation(char EndChar,
+                                            bool EndNl,
+                                            bool ExpectMemLoc) {
+  // Read whether the location of the branch should be DSO or a symbol.
+  // 0 means it is a DSO. 1 means it is a global symbol. 2 means it is a local
+  // symbol.
+  // The symbol flag is also used to tag memory load events by adding 3 to the
+  // base values, i.e. 3 not a symbol, 4 global symbol and 5 local symbol.
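+  // For example, in the branch record "1 main 3fb 0 /lib/ld-2.21.so 12 4 221"
+  // (see the parse() documentation in DataReader.h), the leading "1" marks a
+  // global-symbol source and the "0" a DSO destination; a memory record would
+  // use "4" and "3" respectively for the same two cases.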
+ if (!ExpectMemLoc && + ParsingBuf[0] != '0' && ParsingBuf[0] != '1' && ParsingBuf[0] != '2') { + reportError("expected 0, 1 or 2"); + return make_error_code(llvm::errc::io_error); + } + + if (ExpectMemLoc && + ParsingBuf[0] != '3' && ParsingBuf[0] != '4' && ParsingBuf[0] != '5') { + reportError("expected 3, 4 or 5"); + return make_error_code(llvm::errc::io_error); + } + + bool IsSymbol = + (!ExpectMemLoc && (ParsingBuf[0] == '1' || ParsingBuf[0] == '2')) || + (ExpectMemLoc && (ParsingBuf[0] == '4' || ParsingBuf[0] == '5')); + ParsingBuf = ParsingBuf.drop_front(1); + Col += 1; + + if (!expectAndConsumeFS()) + return make_error_code(llvm::errc::io_error); + consumeAllRemainingFS(); + + // Read the string containing the symbol or the DSO name + ErrorOr NameRes = parseString(FieldSeparator); + if (std::error_code EC = NameRes.getError()) + return EC; + StringRef Name = NameRes.get(); + consumeAllRemainingFS(); + + // Read the offset + ErrorOr Offset = parseHexField(EndChar, EndNl); + if (std::error_code EC = Offset.getError()) + return EC; + + return Location(IsSymbol, Name, Offset.get()); +} + +ErrorOr DataReader::parseBranchInfo() { + ErrorOr Res = parseLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location From = Res.get(); + + consumeAllRemainingFS(); + Res = parseLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location To = Res.get(); + + consumeAllRemainingFS(); + ErrorOr MRes = parseNumberField(FieldSeparator); + if (std::error_code EC = MRes.getError()) + return EC; + int64_t NumMispreds = MRes.get(); + + consumeAllRemainingFS(); + ErrorOr BRes = parseNumberField(FieldSeparator, /* EndNl = */ true); + if (std::error_code EC = BRes.getError()) + return EC; + int64_t NumBranches = BRes.get(); + + consumeAllRemainingFS(); + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return BranchInfo(std::move(From), std::move(To), NumMispreds, NumBranches); +} + +ErrorOr DataReader::parseMemInfo() { + ErrorOr Res = parseMemLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location Offset = Res.get(); + + consumeAllRemainingFS(); + Res = parseMemLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location Addr = Res.get(); + + consumeAllRemainingFS(); + ErrorOr CountRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = CountRes.getError()) + return EC; + + consumeAllRemainingFS(); + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return MemInfo(Offset, Addr, CountRes.get()); +} + +ErrorOr DataReader::parseSampleInfo() { + ErrorOr Res = parseLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location Address = Res.get(); + + consumeAllRemainingFS(); + ErrorOr BRes = parseNumberField(FieldSeparator, /* EndNl = */ true); + if (std::error_code EC = BRes.getError()) + return EC; + int64_t Occurrences = BRes.get(); + + consumeAllRemainingFS(); + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return SampleInfo(std::move(Address), Occurrences); +} + +ErrorOr DataReader::maybeParseNoLBRFlag() { + if (ParsingBuf.size() < 6 || ParsingBuf.substr(0, 6) != "no_lbr") + return false; + ParsingBuf = ParsingBuf.drop_front(6); + Col += 6; + + if (ParsingBuf.size() > 0 && ParsingBuf[0] == ' ') 
+ ParsingBuf = ParsingBuf.drop_front(1); + + while (ParsingBuf.size() > 0 && ParsingBuf[0] != '\n') { + ErrorOr EventName = parseString(' ', true); + if (!EventName) + return make_error_code(llvm::errc::io_error); + EventNames.insert(EventName.get()); + } + + if (!checkAndConsumeNewLine()) { + reportError("malformed no_lbr line"); + return make_error_code(llvm::errc::io_error); + } + return true; +} + +ErrorOr DataReader::maybeParseBATFlag() { + if (ParsingBuf.size() < 16 || ParsingBuf.substr(0, 16) != "boltedcollection") + return false; + ParsingBuf = ParsingBuf.drop_front(16); + Col += 16; + + if (!checkAndConsumeNewLine()) { + reportError("malformed boltedcollection line"); + return make_error_code(llvm::errc::io_error); + } + return true; +} + + +bool DataReader::hasBranchData() { + if (ParsingBuf.size() == 0) + return false; + + if (ParsingBuf[0] == '0' || ParsingBuf[0] == '1' || ParsingBuf[0] == '2') + return true; + return false; +} + +bool DataReader::hasMemData() { + if (ParsingBuf.size() == 0) + return false; + + if (ParsingBuf[0] == '3' || ParsingBuf[0] == '4' || ParsingBuf[0] == '5') + return true; + return false; +} + +std::error_code DataReader::parseInNoLBRMode() { + auto GetOrCreateFuncEntry = [&](StringRef Name) { + auto I = NamesToSamples.find(Name); + if (I == NamesToSamples.end()) { + bool Success; + std::tie(I, Success) = NamesToSamples.insert(std::make_pair( + Name, FuncSampleData(Name, FuncSampleData::ContainerTy()))); + + assert(Success && "unexpected result of insert"); + } + return I; + }; + + auto GetOrCreateFuncMemEntry = [&](StringRef Name) { + auto I = NamesToMemEvents.find(Name); + if (I == NamesToMemEvents.end()) { + bool Success; + std::tie(I, Success) = NamesToMemEvents.insert( + std::make_pair(Name, FuncMemData(Name, FuncMemData::ContainerTy()))); + assert(Success && "unexpected result of insert"); + } + return I; + }; + + while (hasBranchData()) { + ErrorOr Res = parseSampleInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + SampleInfo SI = Res.get(); + + // Ignore samples not involving known locations + if (!SI.Loc.IsSymbol) + continue; + + StringMapIterator I = GetOrCreateFuncEntry(SI.Loc.Name); + I->getValue().Data.emplace_back(std::move(SI)); + } + + while (hasMemData()) { + ErrorOr Res = parseMemInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + MemInfo MI = Res.get(); + + // Ignore memory events not involving known pc. 
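+    // Events whose PC resolves only to a raw DSO address cannot be
+    // attributed to any function and are dropped.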
+ if (!MI.Offset.IsSymbol) + continue; + + StringMapIterator I = GetOrCreateFuncMemEntry(MI.Offset.Name); + I->getValue().Data.emplace_back(std::move(MI)); + } + + for (StringMapEntry &FuncSamples : NamesToSamples) { + std::stable_sort(FuncSamples.second.Data.begin(), + FuncSamples.second.Data.end()); + } + + for (StringMapEntry &MemEvents : NamesToMemEvents) { + std::stable_sort(MemEvents.second.Data.begin(), + MemEvents.second.Data.end()); + } + + return std::error_code(); +} + +std::error_code DataReader::parse() { + auto GetOrCreateFuncEntry = [&](StringRef Name) { + auto I = NamesToBranches.find(Name); + if (I == NamesToBranches.end()) { + bool Success; + std::tie(I, Success) = NamesToBranches.insert(std::make_pair( + Name, FuncBranchData(Name, FuncBranchData::ContainerTy(), + FuncBranchData::ContainerTy()))); + assert(Success && "unexpected result of insert"); + } + return I; + }; + + auto GetOrCreateFuncMemEntry = [&](StringRef Name) { + auto I = NamesToMemEvents.find(Name); + if (I == NamesToMemEvents.end()) { + bool Success; + std::tie(I, Success) = NamesToMemEvents.insert( + std::make_pair(Name, FuncMemData(Name, FuncMemData::ContainerTy()))); + assert(Success && "unexpected result of insert"); + } + return I; + }; + + Col = 0; + Line = 1; + ErrorOr FlagOrErr = maybeParseNoLBRFlag(); + if (!FlagOrErr) + return FlagOrErr.getError(); + NoLBRMode = *FlagOrErr; + + ErrorOr BATFlagOrErr = maybeParseBATFlag(); + if (!BATFlagOrErr) + return BATFlagOrErr.getError(); + BATMode = *BATFlagOrErr; + + if (!hasBranchData() && !hasMemData()) { + Diag << "ERROR: no valid profile data found\n"; + return make_error_code(llvm::errc::io_error); + } + + if (NoLBRMode) + return parseInNoLBRMode(); + + while (hasBranchData()) { + ErrorOr Res = parseBranchInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + BranchInfo BI = Res.get(); + + // Ignore branches not involving known location. + if (!BI.From.IsSymbol && !BI.To.IsSymbol) + continue; + + StringMapIterator I = GetOrCreateFuncEntry(BI.From.Name); + I->getValue().Data.emplace_back(std::move(BI)); + + // Add entry data for branches to another function or branches + // to entry points (including recursive calls) + if (BI.To.IsSymbol && + (!BI.From.Name.equals(BI.To.Name) || BI.To.Offset == 0)) { + I = GetOrCreateFuncEntry(BI.To.Name); + I->getValue().EntryData.emplace_back(std::move(BI)); + } + + // If destination is the function start - update execution count. + // NB: the data is skewed since we cannot tell tail recursion from + // branches to the function start. + if (BI.To.IsSymbol && BI.To.Offset == 0) { + I = GetOrCreateFuncEntry(BI.To.Name); + I->getValue().ExecutionCount += BI.Branches; + } + } + + while (hasMemData()) { + ErrorOr Res = parseMemInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + MemInfo MI = Res.get(); + + // Ignore memory events not involving known pc. 
+    if (!MI.Offset.IsSymbol)
+      continue;
+
+    StringMapIterator<FuncMemData> I = GetOrCreateFuncMemEntry(MI.Offset.Name);
+    I->getValue().Data.emplace_back(std::move(MI));
+  }
+
+  for (StringMapEntry<FuncBranchData> &FuncBranches : NamesToBranches) {
+    std::stable_sort(FuncBranches.second.Data.begin(),
+                     FuncBranches.second.Data.end());
+  }
+
+  for (StringMapEntry<FuncMemData> &MemEvents : NamesToMemEvents) {
+    std::stable_sort(MemEvents.second.Data.begin(),
+                     MemEvents.second.Data.end());
+  }
+
+  return std::error_code();
+}
+
+void DataReader::buildLTONameMaps() {
+  for (StringMapEntry<FuncBranchData> &FuncData : NamesToBranches) {
+    const StringRef FuncName = FuncData.getKey();
+    const Optional<StringRef> CommonName = getLTOCommonName(FuncName);
+    if (CommonName)
+      LTOCommonNameMap[*CommonName].push_back(&FuncData.getValue());
+  }
+
+  for (StringMapEntry<FuncMemData> &FuncData : NamesToMemEvents) {
+    const StringRef FuncName = FuncData.getKey();
+    const Optional<StringRef> CommonName = getLTOCommonName(FuncName);
+    if (CommonName)
+      LTOCommonNameMemMap[*CommonName].push_back(&FuncData.getValue());
+  }
+}
+
+namespace {
+template <typename MapTy>
+decltype(MapTy::MapEntryTy::second) *
+fetchMapEntry(MapTy &Map, const std::vector<MCSymbol *> &Symbols) {
+  // Do a reverse order iteration since the name in profile has a higher chance
+  // of matching a name at the end of the list.
+  for (auto SI = Symbols.rbegin(), SE = Symbols.rend(); SI != SE; ++SI) {
+    auto I = Map.find(normalizeName((*SI)->getName()));
+    if (I != Map.end())
+      return &I->getValue();
+  }
+  return nullptr;
+}
+
+template <typename MapTy>
+decltype(MapTy::MapEntryTy::second) *
+fetchMapEntry(MapTy &Map, const std::vector<StringRef> &FuncNames) {
+  // Do a reverse order iteration since the name in profile has a higher chance
+  // of matching a name at the end of the list.
+  for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) {
+    auto I = Map.find(normalizeName(*FI));
+    if (I != Map.end())
+      return &I->getValue();
+  }
+  return nullptr;
+}
+
+template <typename MapTy>
+std::vector<decltype(MapTy::MapEntryTy::second) *>
+fetchMapEntriesRegex(
+    MapTy &Map,
+    const StringMap<std::vector<decltype(MapTy::MapEntryTy::second) *>>
+        &LTOCommonNameMap,
+    const std::vector<StringRef> &FuncNames) {
+  std::vector<decltype(MapTy::MapEntryTy::second) *> AllData;
+  // Do a reverse order iteration since the name in profile has a higher chance
+  // of matching a name at the end of the list.
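+  // For LTO-mangled names such as "foo.lto_priv.42" (hypothetical), every
+  // profile filed under the common prefix "foo.lto_priv." is collected, since
+  // the numeric suffix is not stable across builds.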
+ for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) { + StringRef Name = *FI; + Name = normalizeName(Name); + const Optional LTOCommonName = getLTOCommonName(Name); + if (LTOCommonName) { + auto I = LTOCommonNameMap.find(*LTOCommonName); + if (I != LTOCommonNameMap.end()) { + const std::vector &CommonData = + I->getValue(); + AllData.insert(AllData.end(), CommonData.begin(), CommonData.end()); + } + } else { + auto I = Map.find(Name); + if (I != Map.end()) { + return {&I->getValue()}; + } + } + } + return AllData; +} + +} + +bool DataReader::mayHaveProfileData(const BinaryFunction &Function) { + if (getBranchData(Function) || getMemData(Function)) + return true; + + if (getBranchDataForNames(Function.getNames()) || + getMemDataForNames(Function.getNames())) + return true; + + if (!hasVolatileName(Function)) + return false; + + const std::vector AllBranchData = + getBranchDataForNamesRegex(Function.getNames()); + if (!AllBranchData.empty()) + return true; + + const std::vector AllMemData = + getMemDataForNamesRegex(Function.getNames()); + if (!AllMemData.empty()) + return true; + + return false; +} + +FuncBranchData * +DataReader::getBranchDataForNames(const std::vector &FuncNames) { + return fetchMapEntry(NamesToBranches, FuncNames); +} + +FuncBranchData * +DataReader::getBranchDataForSymbols(const std::vector &Symbols) { + return fetchMapEntry(NamesToBranches, Symbols); +} + +FuncMemData * +DataReader::getMemDataForNames(const std::vector &FuncNames) { + return fetchMapEntry(NamesToMemEvents, FuncNames); +} + +FuncSampleData * +DataReader::getFuncSampleData(const std::vector &FuncNames) { + return fetchMapEntry(NamesToSamples, FuncNames); +} + +std::vector +DataReader::getBranchDataForNamesRegex( + const std::vector &FuncNames) { + return fetchMapEntriesRegex(NamesToBranches, LTOCommonNameMap, FuncNames); +} + +std::vector +DataReader::getMemDataForNamesRegex(const std::vector &FuncNames) { + return fetchMapEntriesRegex(NamesToMemEvents, LTOCommonNameMemMap, FuncNames); +} + +bool DataReader::hasLocalsWithFileName() const { + for (const StringMapEntry &Func : NamesToBranches) { + const StringRef &FuncName = Func.getKey(); + if (FuncName.count('/') == 2 && FuncName[0] != '/') + return true; + } + return false; +} + +void DataReader::dump() const { + for (const StringMapEntry &Func : NamesToBranches) { + Diag << Func.getKey() << " branches:\n"; + for (const BranchInfo &BI : Func.getValue().Data) { + Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " + << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; + } + Diag << Func.getKey() << " entry points:\n"; + for (const BranchInfo &BI : Func.getValue().EntryData) { + Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " + << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; + } + } + + for (auto I = EventNames.begin(), E = EventNames.end(); I != E; ++I) { + StringRef Event = I->getKey(); + Diag << "Data was collected with event: " << Event << "\n"; + } + for (const StringMapEntry &Func : NamesToSamples) { + Diag << Func.getKey() << " samples:\n"; + for (const SampleInfo &SI : Func.getValue().Data) { + Diag << SI.Loc.Name << " " << SI.Loc.Offset << " " + << SI.Hits << "\n"; + } + } + + for (const StringMapEntry &Func : NamesToMemEvents) { + Diag << "Memory events for " << Func.getValue().Name; + Location LastOffset(0); + for (const MemInfo &MI : Func.getValue().Data) { + if (MI.Offset == LastOffset) { + Diag << ", " << MI.Addr << "/" << MI.Count; + 
} else { + Diag << "\n" << MI.Offset << ": " << MI.Addr << "/" << MI.Count; + } + LastOffset = MI.Offset; + } + Diag << "\n"; + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/DataReader.h b/bolt/src/DataReader.h new file mode 100644 index 000000000000..294dec7f2cb3 --- /dev/null +++ b/bolt/src/DataReader.h @@ -0,0 +1,551 @@ +//===-- Reader/DataReader.h - Perf data reader ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by the perf2bolt +// utility and stores it in memory for llvm-bolt consumption. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DATA_READER_H +#define LLVM_TOOLS_LLVM_BOLT_DATA_READER_H + +#include "ProfileReaderBase.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +namespace llvm { +class MCSymbol; + +namespace bolt { + +class BinaryFunction; + +struct LBREntry { + uint64_t From; + uint64_t To; + bool Mispred; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const LBREntry &LBR) { + OS << "0x" << Twine::utohexstr(LBR.From) + << " -> 0x" << Twine::utohexstr(LBR.To); + return OS; +} + +struct Location { + bool IsSymbol; + StringRef Name; + uint64_t Offset; + + explicit Location(uint64_t Offset) + : IsSymbol(false), Name("[unknown]"), Offset(Offset) {} + + Location(bool IsSymbol, StringRef Name, uint64_t Offset) + : IsSymbol(IsSymbol), Name(Name), Offset(Offset) {} + + bool operator==(const Location &RHS) const { + return IsSymbol == RHS.IsSymbol && + Name == RHS.Name && + (Name == "[heap]" || Offset == RHS.Offset); + } + + bool operator<(const Location &RHS) const { + if (IsSymbol != RHS.IsSymbol) + return IsSymbol < RHS.IsSymbol; + + if (Name != RHS.Name) + return Name < RHS.Name; + + return Name != "[heap]" && Offset < RHS.Offset; + } + + friend raw_ostream &operator<<(raw_ostream &OS, const Location &Loc); +}; + +typedef std::vector> BranchContext; + +struct BranchInfo { + Location From; + Location To; + int64_t Mispreds; + int64_t Branches; + + BranchInfo(Location From, Location To, int64_t Mispreds, int64_t Branches) + : From(std::move(From)), To(std::move(To)), Mispreds(Mispreds), + Branches(Branches) {} + + bool operator==(const BranchInfo &RHS) const { + return From == RHS.From && + To == RHS.To; + } + + bool operator<(const BranchInfo &RHS) const { + if (From < RHS.From) + return true; + + if (From == RHS.From) + return (To < RHS.To); + + return false; + } + + /// Merges branch and misprediction counts of \p BI with those of this object. + void mergeWith(const BranchInfo &BI); + + void print(raw_ostream &OS) const; +}; + +struct FuncBranchData { + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + ContainerTy EntryData; + + /// Total execution count for the function. + int64_t ExecutionCount{0}; + + /// Indicate if the data was used. 
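+  /// Set once the profile has been attached to a BinaryFunction, both to
+  /// keep two functions from sharing one profile and to let the reader
+  /// report how many profiled objects went unused.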
+ bool Used{false}; + + FuncBranchData() {} + + FuncBranchData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} + + FuncBranchData(StringRef Name, ContainerTy Data, ContainerTy EntryData) + : Name(Name), Data(std::move(Data)), EntryData(std::move(EntryData)) {} + + ErrorOr getBranch(uint64_t From, uint64_t To) const; + + /// Returns the branch info object associated with a direct call originating + /// from the given offset. If no branch info object is found, an error is + /// returned. If the offset corresponds to an indirect call the behavior is + /// undefined. + ErrorOr getDirectCallBranch(uint64_t From) const; + + /// Find all the branches originating at From. + iterator_range getBranchRange( + uint64_t From) const; + + /// Append the branch data of another function located \p Offset bytes away + /// from the entry of this function. + void appendFrom(const FuncBranchData &FBD, uint64_t Offset); + + /// Returns the total number of executed branches in this function + /// by counting the number of executed branches for each BranchInfo + uint64_t getNumExecutedBranches() const; + + /// Aggregation helpers + DenseMap> IntraIndex; + DenseMap> InterIndex; + DenseMap> EntryIndex; + + void bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, uint64_t Count, + uint64_t Mispreds); + void bumpCallCount(uint64_t OffsetFrom, const Location &To, uint64_t Count, + uint64_t Mispreds); + void bumpEntryCount(const Location &From, uint64_t OffsetTo, uint64_t Count, + uint64_t Mispreds); +}; + +/// MemInfo represents a single memory load from an address \p Addr at an \p +/// Offset within a function. \p Count represents how many times a particular +/// address was seen. +struct MemInfo { + Location Offset; + Location Addr; + uint64_t Count; + + bool operator==(const MemInfo &RHS) const { + return Offset == RHS.Offset && Addr == RHS.Addr; + } + + bool operator<(const MemInfo &RHS) const { + if (Offset < RHS.Offset) + return true; + + if (Offset == RHS.Offset) + return (Addr < RHS.Addr); + + return false; + } + + void mergeWith(const MemInfo &MI) { + Count += MI.Count; + } + + void print(raw_ostream &OS) const; + void prettyPrint(raw_ostream &OS) const; + + MemInfo(const Location &Offset, const Location &Addr, uint64_t Count = 0) + : Offset(Offset), Addr(Addr), Count(Count) {} + + friend raw_ostream &operator<<(raw_ostream &OS, const MemInfo &MI) { + MI.prettyPrint(OS); + return OS; + } +}; + +/// Helper class to store memory load events recorded in the address space of +/// a given function, analogous to FuncBranchData but for memory load events +/// instead of branches. +struct FuncMemData { + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + + /// Indicate if the data was used. + bool Used{false}; + + DenseMap> EventIndex; + + /// Update \p Data with a memory event. Events with the same + /// \p Offset and \p Addr will be coalesced. + void update(const Location &Offset, const Location &Addr); + + FuncMemData() {} + + FuncMemData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} +}; + +/// Similar to BranchInfo, but instead of recording from-to address (an edge), +/// it records the address of a perf event and the number of times samples hit +/// this address. 
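+/// For example, the no_lbr line "1 BZ2_compressBlock 466c 3" documented for
+/// parseInNoLBRMode() below decodes to Loc = (symbol "BZ2_compressBlock",
+/// offset 0x466c) and Hits = 3.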
+struct SampleInfo { + Location Loc; + int64_t Hits; + + SampleInfo(Location Loc, int64_t Hits) + : Loc(std::move(Loc)), Hits(Hits) {} + + bool operator==(const SampleInfo &RHS) const { + return Loc == RHS.Loc; + } + + bool operator<(const SampleInfo &RHS) const { + if (Loc < RHS.Loc) + return true; + + return false; + } + + void print(raw_ostream &OS) const; + + void mergeWith(const SampleInfo &SI); +}; + +/// Helper class to store samples recorded in the address space of a given +/// function, analogous to FuncBranchData but for samples instead of branches. +struct FuncSampleData { + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + + FuncSampleData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} + + /// Get the number of samples recorded in [Start, End) + uint64_t getSamples(uint64_t Start, uint64_t End) const; + + /// Aggregation helper + DenseMap Index; + + void bumpCount(uint64_t Offset, uint64_t Count); +}; + +/// DataReader Class +/// +class DataReader : public ProfileReaderBase { +public: + explicit DataReader(StringRef Filename) + : ProfileReaderBase(Filename), + Diag(errs()) {} + + StringRef getReaderName() const override { + return "branch profile reader"; + } + + bool isTrustedSource() const override { + return false; + } + + virtual Error preprocessProfile(BinaryContext &BC) override; + + virtual Error readProfilePreCFG(BinaryContext &BC) override; + + virtual Error readProfile(BinaryContext &BC) override; + + virtual bool hasLocalsWithFileName() const override; + + virtual bool mayHaveProfileData(const BinaryFunction &BF) override; + + /// Return all event names used to collect this profile + virtual StringSet<> getEventNames() const override { + return EventNames; + } + +protected: + /// Read profile information available for the function. + void readProfile(BinaryFunction &BF); + + /// In functions with multiple entry points, the profile collection records + /// data for other entry points in a different function entry. This function + /// attempts to fetch extra profile data for each secondary entry point. + bool fetchProfileForOtherEntryPoints(BinaryFunction &BF); + + /// Find the best matching profile for a function after the creation of basic + /// blocks. + void matchProfileData(BinaryFunction &BF); + + /// Find the best matching memory data profile for a function before the + /// creation of basic blocks. + void matchProfileMemData(BinaryFunction &BF); + + /// Check how closely \p BranchData matches the function \p BF. + /// Return accuracy (ranging from 0.0 to 1.0) of the matching. + float evaluateProfileData(BinaryFunction &BF, + const FuncBranchData &BranchData) const; + + /// If our profile data comes from sample addresses instead of LBR entries, + /// collect sample count for all addresses in this function address space, + /// aggregating them per basic block and assigning an execution count to each + /// basic block based on the number of samples recorded at those addresses. + /// The last step is to infer edge counts based on BB execution count. Note + /// this is the opposite of the LBR way, where we infer BB execution count + /// based on edge counts. + void readSampleData(BinaryFunction &BF); + + /// Convert function-level branch data into instruction annotations. + void convertBranchData(BinaryFunction &BF) const; + + /// Update function \p BF profile with a taken branch. + /// \p Count could be 0 if verification of the branch is required. + /// + /// Return true if the branch is valid, false otherwise. 
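+  /// A zero \p Count is how evaluateProfileData() probes whether a recorded
+  /// branch is consistent with the CFG without altering any edge weights.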
+  bool recordBranch(BinaryFunction &BF,
+                    uint64_t From, uint64_t To, uint64_t Count = 1,
+                    uint64_t Mispreds = 0) const;
+
+  /// Parses the input bolt data file into internal data structures. We expect
+  /// the file format to follow the syntax below.
+  ///
+  /// <is symbol?> <closest elf symbol or DSO name> <relative FROM address>
+  /// <is symbol?> <closest elf symbol or DSO name> <relative TO address>
+  /// <number of mispredictions> <number of branches>
+  ///
+  /// In the <is symbol?> field we record 0 if our closest address is a DSO
+  /// load address or 1 if our closest address is an ELF symbol.
+  ///
+  /// Examples:
+  ///
+  /// 1 main 3fb 0 /lib/ld-2.21.so 12 4 221
+  ///
+  /// The example records branches from symbol main, offset 3fb, to DSO
+  /// ld-2.21, offset 12, with 4 mispredictions and 221 branches.
+  ///
+  /// 2 t2.c/func 11 1 globalfunc 1d 0 1775 2
+  /// 0 1002 2
+  /// 2 t2.c/func 31 2 t2.c/func d
+  /// 2 t2.c/func 18 2 t2.c/func 20
+  /// 0 773 2
+  /// 2 t2.c/func 71 2 t2.c/func d
+  /// 2 t2.c/func 18 2 t2.c/func 60
+  ///
+  /// The example records branches from local symbol func (from t2.c), offset
+  /// 11, to global symbol globalfunc, offset 1d, with 1775 branches, no
+  /// mispreds. Of these branches, 1002 were preceded by a sequence of
+  /// branches from func, offset 18 to offset 20 and then from offset 31 to
+  /// offset d. The remaining 773 branches were preceded by a different
+  /// sequence of branches, from func, offset 18 to offset 60 and then from
+  /// offset 71 to offset d.
+  std::error_code parse();
+
+  /// When no_lbr is the first line of the file, activate No LBR mode. In this
+  /// mode we read the addresses where samples were recorded directly instead
+  /// of LBR entries. The line format is almost the same, except for a missing
+  /// <TO> triple and a missing mispredictions field:
+  ///
+  /// no_lbr
+  /// <is symbol?> <closest elf symbol or DSO name> <relative address> <count>
+  /// ...
+  ///
+  /// Example:
+  ///
+  /// no_lbr                           # First line of fdata file
+  /// 1 BZ2_compressBlock 466c 3
+  /// 1 BZ2_hbMakeCodeLengths 29c 1
+  ///
+  std::error_code parseInNoLBRMode();
+
+  /// Return branch data matching one of the names in \p FuncNames.
+  FuncBranchData *
+  getBranchDataForNames(const std::vector<StringRef> &FuncNames);
+
+  /// Return branch data matching one of the \p Symbols.
+  FuncBranchData *
+  getBranchDataForSymbols(const std::vector<MCSymbol *> &Symbols);
+
+  /// Return mem data matching one of the names in \p FuncNames.
+  FuncMemData *getMemDataForNames(const std::vector<StringRef> &FuncNames);
+
+  FuncSampleData *
+  getFuncSampleData(const std::vector<StringRef> &FuncNames);
+
+  /// Return a vector of all FuncBranchData matching the list of names.
+  /// Internally use fuzzy matching to match special names like LTO-generated
+  /// function names.
+  std::vector<FuncBranchData *>
+  getBranchDataForNamesRegex(const std::vector<StringRef> &FuncNames);
+
+  /// Return a vector of all FuncMemData matching the list of names.
+  /// Internally use fuzzy matching to match special names like LTO-generated
+  /// function names.
+  std::vector<FuncMemData *>
+  getMemDataForNamesRegex(const std::vector<StringRef> &FuncNames);
+
+  /// Return branch data profile associated with function \p BF or nullptr
+  /// if the function has no associated profile.
+  FuncBranchData *getBranchData(const BinaryFunction &BF) const {
+    auto FBDI = FuncsToBranches.find(&BF);
+    if (FBDI == FuncsToBranches.end())
+      return nullptr;
+    return FBDI->second;
+  }
+
+  /// Updates branch profile data associated with function \p BF.
+  void setBranchData(const BinaryFunction &BF, FuncBranchData *FBD) {
+    FuncsToBranches[&BF] = FBD;
+  }
+
+  /// Return memory profile data associated with function \p BF, or nullptr
+  /// if the function has no associated profile.
+ FuncMemData *getMemData(const BinaryFunction &BF) const { + auto FMDI = FuncsToMemData.find(&BF); + if (FMDI == FuncsToMemData.end()) + return nullptr; + return FMDI->second; + } + + /// Updates the memory profile data associated with function \p BF. + void setMemData(const BinaryFunction &BF, FuncMemData *FMD) { + FuncsToMemData[&BF] = FMD; + } + + using NamesToBranchesMapTy = StringMap; + using NamesToSamplesMapTy = StringMap; + using NamesToMemEventsMapTy = StringMap; + using FuncsToBranchesMapTy = + std::unordered_map; + using FuncsToMemDataMapTy = + std::unordered_map; + + /// Dumps the entire data structures parsed. Used for debugging. + void dump() const; + + /// Return false only if we are running with profiling data that lacks LBR. + bool hasLBR() const { return !NoLBRMode; } + + /// Return true if the profiling data was collected in a bolted binary. This + /// means we lose the ability to identify stale data at some branch locations, + /// since we have to be more permissive in some cases. + bool collectedInBoltedBinary() const { return BATMode; } + + /// Return true if event named \p Name was used to collect this profile data. + bool usesEvent(StringRef Name) const { + for (auto I = EventNames.begin(), E = EventNames.end(); I != E; ++I) { + StringRef Event = I->getKey(); + if (Event.find(Name) != StringRef::npos) + return true; + } + return false; + } + + /// Open the file and parse the contents. + std::error_code parseInput(); + + /// Build suffix map once the profile data is parsed. + void buildLTONameMaps(); + + void reportError(StringRef ErrorMsg); + bool expectAndConsumeFS(); + void consumeAllRemainingFS(); + bool checkAndConsumeNewLine(); + ErrorOr parseString(char EndChar, bool EndNl=false); + ErrorOr parseNumberField(char EndChar, bool EndNl=false); + ErrorOr parseHexField(char EndChar, bool EndNl=false); + ErrorOr parseLocation(char EndChar, bool EndNl, bool ExpectMemLoc); + ErrorOr parseLocation(char EndChar, bool EndNl=false) { + return parseLocation(EndChar, EndNl, false); + } + ErrorOr parseMemLocation(char EndChar, bool EndNl=false) { + return parseLocation(EndChar, EndNl, true); + } + ErrorOr parseBranchInfo(); + ErrorOr parseSampleInfo(); + ErrorOr parseMemInfo(); + ErrorOr maybeParseNoLBRFlag(); + ErrorOr maybeParseBATFlag(); + bool hasBranchData(); + bool hasMemData(); + + /// An in-memory copy of the input data file - owns strings used in reader. + std::unique_ptr FileBuf; + raw_ostream &Diag; + StringRef ParsingBuf; + unsigned Line{0}; + unsigned Col{0}; + NamesToBranchesMapTy NamesToBranches; + NamesToSamplesMapTy NamesToSamples; + NamesToMemEventsMapTy NamesToMemEvents; + FuncsToBranchesMapTy FuncsToBranches; + FuncsToMemDataMapTy FuncsToMemData; + bool NoLBRMode{false}; + bool BATMode{false}; + StringSet<> EventNames; + static const char FieldSeparator = ' '; + + /// Maps of common LTO names to possible matching profiles. 
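+  /// E.g. both "foo.lto_priv.1" and "foo.lto_priv.7" (hypothetical names)
+  /// would be filed under the common key "foo.lto_priv.".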
+ StringMap> LTOCommonNameMap; + StringMap> LTOCommonNameMemMap; +}; + +} + +/// DenseMapInfo allows us to use the DenseMap LLVM data structure to store +/// Locations +template<> struct DenseMapInfo { + static inline bolt::Location getEmptyKey() { + return bolt::Location(true, StringRef(), static_cast(-1LL)); + } + static inline bolt::Location getTombstoneKey() { + return bolt::Location(true, StringRef(), static_cast(-2LL));; + } + static unsigned getHashValue(const bolt::Location &L) { + return (unsigned(DenseMapInfo::getHashValue(L.Name)) >> 4) ^ + (unsigned(L.Offset)); + } + static bool isEqual(const bolt::Location &LHS, + const bolt::Location &RHS) { + return LHS.IsSymbol == RHS.IsSymbol && LHS.Name == RHS.Name && + LHS.Offset == RHS.Offset; + } +}; + +} + +#endif diff --git a/bolt/src/DebugData.cpp b/bolt/src/DebugData.cpp new file mode 100644 index 000000000000..4681a27cfd50 --- /dev/null +++ b/bolt/src/DebugData.cpp @@ -0,0 +1,473 @@ +//===- DebugData.cpp - Representation and writing of debugging information. ==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DebugData.h" +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" + +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/LEB128.h" +#include +#include +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-debug-info" + +namespace opts { +extern llvm::cl::opt Verbosity; +} + +namespace llvm { +namespace bolt { + +const DebugLineTableRowRef DebugLineTableRowRef::NULL_ROW{0, 0}; + +namespace { + +// Writes address ranges to Writer as pairs of 64-bit (address, size). +// If RelativeRange is true, assumes the address range to be written must be of +// the form (begin address, range size), otherwise (begin address, end address). +// Terminates the list by writing a pair of two zeroes. +// Returns the number of written bytes. +uint64_t writeAddressRanges( + raw_svector_ostream &Stream, + const DebugAddressRangesVector &AddressRanges, + const bool WriteRelativeRanges = false) { + for (const DebugAddressRange &Range : AddressRanges) { + support::endian::write(Stream, Range.LowPC, support::little); + support::endian::write( + Stream, WriteRelativeRanges ? Range.HighPC - Range.LowPC : Range.HighPC, + support::little); + } + // Finish with 0 entries. 
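+  // A (0, 0) pair is the end-of-list marker that DWARF consumers expect in
+  // .debug_ranges; this is also why an empty range list still takes 16 bytes.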
+ support::endian::write(Stream, 0ULL, support::little); + support::endian::write(Stream, 0ULL, support::little); + return AddressRanges.size() * 16 + 16; +} + +} // namespace + +DebugRangesSectionWriter::DebugRangesSectionWriter() { + RangesBuffer = std::make_unique(); + RangesStream = std::make_unique(*RangesBuffer); + + // Add an empty range as the first entry; + SectionOffset += + writeAddressRanges(*RangesStream.get(), DebugAddressRangesVector{}); +} + +uint64_t DebugRangesSectionWriter::addRanges( + DebugAddressRangesVector &&Ranges, + std::map &CachedRanges) { + if (Ranges.empty()) + return getEmptyRangesOffset(); + + const auto RI = CachedRanges.find(Ranges); + if (RI != CachedRanges.end()) + return RI->second; + + const uint64_t EntryOffset = addRanges(Ranges); + CachedRanges.emplace(std::move(Ranges), EntryOffset); + + return EntryOffset; +} + +uint64_t +DebugRangesSectionWriter::addRanges(const DebugAddressRangesVector &Ranges) { + if (Ranges.empty()) + return getEmptyRangesOffset(); + + // Reading the SectionOffset and updating it should be atomic to guarantee + // unique and correct offsets in patches. + std::lock_guard Lock(WriterMutex); + const uint32_t EntryOffset = SectionOffset; + SectionOffset += writeAddressRanges(*RangesStream.get(), Ranges); + + return EntryOffset; +} + +uint64_t DebugRangesSectionWriter::getSectionOffset() { + std::lock_guard Lock(WriterMutex); + return SectionOffset; +} + +void DebugARangesSectionWriter::addCURanges(uint64_t CUOffset, + DebugAddressRangesVector &&Ranges) { + std::lock_guard Lock(CUAddressRangesMutex); + CUAddressRanges.emplace(CUOffset, std::move(Ranges)); +} + +void DebugARangesSectionWriter::writeARangesSection( + raw_svector_ostream &RangesStream) const { + // For reference on the format of the .debug_aranges section, see the DWARF4 + // specification, section 6.1.4 Lookup by Address + // http://www.dwarfstd.org/doc/DWARF4.pdf + for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { + const uint64_t Offset = CUOffsetAddressRangesPair.first; + const DebugAddressRangesVector &AddressRanges = + CUOffsetAddressRangesPair.second; + + // Emit header. + + // Size of this set: 8 (size of the header) + 4 (padding after header) + // + 2*sizeof(uint64_t) bytes for each of the ranges, plus an extra + // pair of uint64_t's for the terminating, zero-length range. + // Does not include size field itself. + uint32_t Size = 8 + 4 + 2*sizeof(uint64_t) * (AddressRanges.size() + 1); + + // Header field #1: set size. + support::endian::write(RangesStream, Size, support::little); + + // Header field #2: version number, 2 as per the specification. + support::endian::write(RangesStream, static_cast(2), + support::little); + + // Header field #3: debug info offset of the correspondent compile unit. + support::endian::write(RangesStream, static_cast(Offset), + support::little); + + // Header field #4: address size. + // 8 since we only write ELF64 binaries for now. + RangesStream << char(8); + + // Header field #5: segment size of target architecture. + RangesStream << char(0); + + // Padding before address table - 4 bytes in the 64-bit-pointer case. + support::endian::write(RangesStream, static_cast(0), + support::little); + + writeAddressRanges(RangesStream, AddressRanges, true); + } +} + +DebugAddrWriter::DebugAddrWriter(BinaryContext *Bc) { BC = Bc; } + +void DebugAddrWriter::AddressForDWOCU::dump() { + std::vector SortedMap(indexToAddressBegin(), + indexToAdddessEnd()); + // Sorting address in increasing order of indices. 
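+  // E.g. {(2, 0x2000), (0, 0x1000)} becomes {(0, 0x1000), (2, 0x2000)}, so
+  // the addresses below print in index order.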
+  std::sort(SortedMap.begin(), SortedMap.end(),
+            [](const IndexAddressPair &A, const IndexAddressPair &B) {
+              return A.first < B.first;
+            });
+  for (auto &Pair : SortedMap)
+    dbgs() << Twine::utohexstr(Pair.second) << "\t" << Pair.first << "\n";
+}
+
+uint32_t DebugAddrWriter::getIndexFromAddress(uint64_t Address,
+                                              uint64_t DWOId) {
+  if (!AddressMaps.count(DWOId))
+    AddressMaps[DWOId] = AddressForDWOCU();
+
+  AddressForDWOCU &Map = AddressMaps[DWOId];
+  auto Entry = Map.find(Address);
+  if (Entry == Map.end()) {
+    auto Index = Map.getNextIndex();
+    Entry = Map.insert(Address, Index).first;
+  }
+  return Entry->second;
+}
+
+// Case 1) Address is not in the map: insert into both AddressToIndex and
+// IndexToAddress.
+// Case 2) Address is in the map and the new Index is higher or equal: only
+// IndexToAddress needs updating.
+// Case 3) Address is in the map and the new Index is lower: both
+// AddressToIndex and IndexToAddress need updating.
+void DebugAddrWriter::addIndexAddress(uint64_t Address, uint32_t Index,
+                                      uint64_t DWOId) {
+  AddressForDWOCU &Map = AddressMaps[DWOId];
+  auto Entry = Map.find(Address);
+  if (Entry != Map.end()) {
+    if (Entry->second > Index)
+      Map.updateAddressToIndex(Address, Index);
+    Map.updateIndexToAddrss(Address, Index);
+  } else
+    Map.insert(Address, Index);
+}
+
+AddressSectionBuffer DebugAddrWriter::finalize() {
+  // Need to lay out all sections within .debug_addr.
+  // Within each section, sort addresses by index.
+  AddressSectionBuffer Buffer;
+  raw_svector_ostream AddressStream(Buffer);
+  for (std::unique_ptr<DWARFUnit> &CU : BC->DwCtx->compile_units()) {
+    Optional<uint64_t> DWOId = CU->getDWOId();
+    // Handle the case where debug information is a mix of Debug Fission and
+    // monolithic.
+    if (!DWOId)
+      continue;
+    auto AM = AddressMaps.find(*DWOId);
+    // Adding to the map even if it did not contribute to .debug_addr.
+    // The Skeleton CU will still have DW_AT_GNU_addr_base.
+    DWOIdToOffsetMap[*DWOId] = Buffer.size();
+    // If no entry exists, this CU's DWO section didn't contribute to
+    // .debug_addr.
+    if (AM == AddressMaps.end())
+      continue;
+    std::vector<IndexAddressPair> SortedMap(AM->second.indexToAddressBegin(),
+                                            AM->second.indexToAdddessEnd());
+    // Sorting addresses in increasing order of indices.
+    std::sort(SortedMap.begin(), SortedMap.end(),
+              [](const IndexAddressPair &A, const IndexAddressPair &B) {
+                return A.first < B.first;
+              });
+
+    uint8_t AddrSize = CU->getAddressByteSize();
+    uint32_t Counter = 0;
+    auto WriteAddress = [&](uint64_t Address) -> void {
+      ++Counter;
+      switch (AddrSize) {
+      default:
+        assert(false && "Address Size is invalid.");
+        break;
+      case 4:
+        support::endian::write(AddressStream, static_cast<uint32_t>(Address),
+                               support::little);
+        break;
+      case 8:
+        support::endian::write(AddressStream, Address, support::little);
+        break;
+      }
+    };
+
+    for (const IndexAddressPair &Val : SortedMap) {
+      while (Val.first > Counter)
+        WriteAddress(0);
+      WriteAddress(Val.second);
+    }
+  }
+
+  return Buffer;
+}
+
+uint64_t DebugAddrWriter::getOffset(uint64_t DWOId) {
+  auto Iter = DWOIdToOffsetMap.find(DWOId);
+  assert(Iter != DWOIdToOffsetMap.end() &&
+         "Offset into .debug_addr was not found for DWO ID.");
+  return Iter->second;
+}
+
+DebugLocWriter::DebugLocWriter(BinaryContext *BC) {
+  LocBuffer = std::make_unique<DebugBufferVector>();
+  LocStream = std::make_unique<raw_svector_ostream>(*LocBuffer);
+}
+
+// DWARF 4: 2.6.2
+uint64_t
+DebugLocWriter::addList(const DebugLocationsVector &LocList) {
+  if (LocList.empty())
+    return EmptyListTag;
+
+  // Since there is a separate DebugLocWriter for each thread,
+  // we don't need a lock to read the SectionOffset and update it.
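+  // Each entry written below is (LowPC, HighPC, expression size, expression
+  // bytes); the list is terminated by 16 zero bytes, i.e. a DWARF
+  // end-of-list entry.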
+ const uint32_t EntryOffset = SectionOffset; + + for (const DebugLocationEntry &Entry : LocList) { + support::endian::write(*LocStream, static_cast(Entry.LowPC), + support::little); + support::endian::write(*LocStream, static_cast(Entry.HighPC), + support::little); + support::endian::write(*LocStream, static_cast(Entry.Expr.size()), + support::little); + *LocStream << StringRef(reinterpret_cast(Entry.Expr.data()), + Entry.Expr.size()); + SectionOffset += 2 * 8 + 2 + Entry.Expr.size(); + } + LocStream->write_zeros(16); + SectionOffset += 16; + + return EntryOffset; +} + +DebugAddrWriter *DebugLoclistWriter::AddrWriter = nullptr; +void DebugLoclistWriter::finalizePatches() { + auto numOfBytes = [](uint32_t Val) -> uint32_t { + int LogVal = (int)std::log2(Val) + 1; + uint32_t CeilVal = (LogVal + 8 - 1) / 8; + return !Val ? 1 : CeilVal; + }; + (void)numOfBytes; + + for (const auto &Patch : IndexPatches) { + uint32_t Index = AddrWriter->getIndexFromAddress(Patch.Address, DWOId); + assert(numOfBytes(Index) <= DebugLoclistWriter::NumBytesForIndex && + "Index size in DebugLocation too large."); + std::string Buff; + raw_string_ostream OS(Buff); + encodeULEB128(Index, OS, DebugLoclistWriter::NumBytesForIndex); + for (uint32_t I = 0; I < DebugLoclistWriter::NumBytesForIndex; ++I) { + (*LocBuffer)[Patch.Offset + I] = Buff[I]; + } + } +} + +uint64_t DebugLoclistWriter::addList(const DebugLocationsVector &LocList) { + if (LocList.empty()) + return EmptyListTag; + uint64_t EntryOffset = LocBuffer->size(); + + for (const DebugLocationEntry &Entry : LocList) { + support::endian::write(*LocStream, + static_cast(dwarf::DW_LLE_startx_length), + support::little); + IndexPatches.emplace_back(static_cast(LocBuffer->size()), + Entry.LowPC); + LocStream->write_zeros(DebugLoclistWriter::NumBytesForIndex); + // TODO: Support DWARF5 + support::endian::write(*LocStream, + static_cast(Entry.HighPC - Entry.LowPC), + support::little); + support::endian::write(*LocStream, static_cast(Entry.Expr.size()), + support::little); + *LocStream << StringRef(reinterpret_cast(Entry.Expr.data()), + Entry.Expr.size()); + } + support::endian::write(*LocStream, + static_cast(dwarf::DW_LLE_end_of_list), + support::little); + return EntryOffset; +} + +void SimpleBinaryPatcher::addBinaryPatch(uint32_t Offset, + const std::string &NewValue) { + Patches.emplace_back(Offset, NewValue); +} + +void SimpleBinaryPatcher::addBytePatch(uint32_t Offset, uint8_t Value) { + Patches.emplace_back(Offset, std::string(1, Value)); +} + +void SimpleBinaryPatcher::addLEPatch(uint32_t Offset, uint64_t NewValue, + size_t ByteSize) { + std::string LE64(ByteSize, 0); + for (size_t I = 0; I < ByteSize; ++I) { + LE64[I] = NewValue & 0xff; + NewValue >>= 8; + } + Patches.emplace_back(Offset, LE64); +} + +void SimpleBinaryPatcher::addUDataPatch(uint32_t Offset, uint64_t Value, uint64_t Size) { + std::string Buff; + raw_string_ostream OS(Buff); + encodeULEB128(Value, OS, Size); + + Patches.emplace_back(Offset, OS.str()); +} + +void SimpleBinaryPatcher::addLE64Patch(uint32_t Offset, uint64_t NewValue) { + addLEPatch(Offset, NewValue, 8); +} + +void SimpleBinaryPatcher::addLE32Patch(uint32_t Offset, uint32_t NewValue) { + addLEPatch(Offset, NewValue, 4); +} + +void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents, + uint32_t DWPOffset = 0) { + for (const auto &Patch : Patches) { + uint32_t Offset = Patch.first - DWPOffset; + const std::string &ByteSequence = Patch.second; + assert(Offset + ByteSequence.size() <= BinaryContents.size() && + "Applied patch 
+void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents,
+                                      uint32_t DWPOffset = 0) {
+  for (const auto &Patch : Patches) {
+    uint32_t Offset = Patch.first - DWPOffset;
+    const std::string &ByteSequence = Patch.second;
+    assert(Offset + ByteSequence.size() <= BinaryContents.size() &&
+           "Applied patch runs over binary size.");
+    for (uint64_t I = 0, Size = ByteSequence.size(); I < Size; ++I) {
+      BinaryContents[Offset + I] = ByteSequence[I];
+    }
+  }
+}
+
+void DebugAbbrevPatcher::addAttributePatch(
+    const DWARFAbbreviationDeclaration *Abbrev, dwarf::Attribute AttrTag,
+    dwarf::Attribute NewAttrTag, uint8_t NewAttrForm) {
+  assert(Abbrev && "no abbreviation specified");
+
+  if (std::numeric_limits<uint8_t>::max() >= NewAttrTag)
+    AbbrevPatches.emplace(
+        AbbrevAttrPatch{Abbrev, AttrTag, NewAttrTag, NewAttrForm});
+  else
+    AbbrevNonStandardPatches.emplace_back(
+        AbbrevAttrPatch{Abbrev, AttrTag, NewAttrTag, NewAttrForm});
+}
+
+void DebugAbbrevPatcher::patchBinary(std::string &Contents,
+                                     uint32_t DWPOffset = 0) {
+  SimpleBinaryPatcher Patcher;
+
+  for (const AbbrevAttrPatch &Patch : AbbrevPatches) {
+    const DWARFAbbreviationDeclaration::AttributeSpec *const Attribute =
+        Patch.Abbrev->findAttribute(Patch.Attr);
+    assert(Attribute && "Specified attribute doesn't occur in abbreviation.");
+
+    Patcher.addBytePatch(Attribute->AttrOffset - DWPOffset,
+                         static_cast<uint8_t>(Patch.NewAttr));
+    Patcher.addBytePatch(Attribute->FormOffset - DWPOffset, Patch.NewForm);
+  }
+  Patcher.patchBinary(Contents);
+
+  if (AbbrevNonStandardPatches.empty())
+    return;
+
+  std::string Section;
+  Section.reserve(Contents.size() + AbbrevNonStandardPatches.size() / 2);
+
+  const char *Ptr = Contents.c_str();
+  uint32_t Start = 0;
+  std::string Buff;
+  auto Encode = [&](uint16_t Value) -> std::string {
+    Buff.clear();
+    raw_string_ostream OS(Buff);
+    encodeULEB128(Value, OS, 2);
+    return Buff;
+  };
+
+  for (const AbbrevAttrPatch &Patch : AbbrevNonStandardPatches) {
+    const DWARFAbbreviationDeclaration::AttributeSpec *const Attribute =
+        Patch.Abbrev->findAttribute(Patch.Attr);
+    assert(Attribute && "Specified attribute doesn't occur in abbreviation.");
+    assert(Start <= Attribute->AttrOffset &&
+           "Offsets are not in sequential order.");
+    // Assuming the old attribute is 1 byte. Otherwise we will need to add
+    // logic to determine its size at runtime.
+    assert(std::numeric_limits<uint8_t>::max() >= Patch.Attr &&
+           "Old attribute is greater than 1 byte.");
+
+    Section.append(Ptr + Start, Attribute->AttrOffset - Start);
+    Section.append(Encode(Patch.NewAttr));
+    Section.append(std::string(1, Patch.NewForm));
+
+    Start = Attribute->FormOffset + 1;
+  }
+  Section.append(Ptr + Start, Contents.size() - Start);
+  Contents = std::move(Section);
+}
+
+void DebugStrWriter::create() {
+  StrBuffer = std::make_unique<DebugStrBufferVector>();
+  StrStream = std::make_unique<raw_svector_ostream>(*StrBuffer);
+}
+
+void DebugStrWriter::initialize() {
+  auto StrSection = BC->DwCtx->getDWARFObj().getStrSection();
+  (*StrStream) << StrSection;
+}
+
+uint32_t DebugStrWriter::addString(StringRef Str) {
+  if (StrBuffer->empty())
+    initialize();
+  auto Offset = StrBuffer->size();
+  (*StrStream) << Str;
+  StrStream->write_zeros(1);
+  return Offset;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/DebugData.h b/bolt/src/DebugData.h
new file mode 100644
index 000000000000..47aa86da8dbf
--- /dev/null
+++ b/bolt/src/DebugData.h
@@ -0,0 +1,491 @@
+//===-- DebugData.h - Representation and writing of debugging information. -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Classes that represent and serialize DWARF-related entities.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUG_DATA_H
+#define LLVM_TOOLS_LLVM_BOLT_DEBUG_DATA_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+namespace bolt {
+
+class BinaryContext;
+
+/// Address range representation. Takes less space than DWARFAddressRange.
+struct DebugAddressRange {
+  uint64_t LowPC{0};
+  uint64_t HighPC{0};
+
+  DebugAddressRange() = default;
+
+  DebugAddressRange(uint64_t LowPC, uint64_t HighPC)
+      : LowPC(LowPC), HighPC(HighPC) {}
+};
+
+static inline bool operator<(const DebugAddressRange &LHS,
+                             const DebugAddressRange &RHS) {
+  return std::tie(LHS.LowPC, LHS.HighPC) < std::tie(RHS.LowPC, RHS.HighPC);
+}
+
+/// DebugAddressRangesVector - represents a set of absolute address ranges.
+using DebugAddressRangesVector = SmallVector<DebugAddressRange, 2>;
+
+/// Address range with location used by .debug_loc section.
+/// More compact than DWARFLocationEntry and uses absolute addresses.
+struct DebugLocationEntry {
+  uint64_t LowPC;
+  uint64_t HighPC;
+  SmallVector<uint8_t, 8> Expr;
+};
+
+using DebugLocationsVector = SmallVector<DebugLocationEntry, 4>;
+
+/// References a row in a DWARFDebugLine::LineTable by the DWARF
+/// Context index of the DWARF Compile Unit that owns the Line Table and the
+/// row index. This is tied to our IR during disassembly so that we can later
+/// update .debug_line information. RowIndex has a base of 1, which means a
+/// RowIndex of 1 maps to the first row of the line table and a RowIndex of 0
+/// is invalid.
+struct DebugLineTableRowRef {
+  uint32_t DwCompileUnitIndex;
+  uint32_t RowIndex;
+
+  const static DebugLineTableRowRef NULL_ROW;
+
+  bool operator==(const DebugLineTableRowRef &Rhs) const {
+    return DwCompileUnitIndex == Rhs.DwCompileUnitIndex &&
+           RowIndex == Rhs.RowIndex;
+  }
+
+  bool operator!=(const DebugLineTableRowRef &Rhs) const {
+    return !(*this == Rhs);
+  }
+
+  static DebugLineTableRowRef fromSMLoc(const SMLoc &Loc) {
+    union {
+      decltype(Loc.getPointer()) Ptr;
+      DebugLineTableRowRef Ref;
+    } U;
+    U.Ptr = Loc.getPointer();
+    return U.Ref;
+  }
+
+  SMLoc toSMLoc() const {
+    union {
+      decltype(SMLoc().getPointer()) Ptr;
+      DebugLineTableRowRef Ref;
+    } U;
+    U.Ref = *this;
+    return SMLoc::getFromPointer(U.Ptr);
+  }
+};
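DebugLineTableRowRef packs two 32-bit indices into the pointer-sized payload of an SMLoc via a union, so a line-table reference can ride along on an instruction's debug location. A minimal standalone sketch of the same round-trip technique (plain C++, no LLVM types; the struct and values are illustrative):

#include <cassert>
#include <cstdint>
#include <cstring>

struct RowRef {
  uint32_t UnitIndex;
  uint32_t RowIndex; // base 1; 0 means "invalid"
};

int main() {
  static_assert(sizeof(RowRef) == sizeof(const char *),
                "assumes a 64-bit pointer to hold both indices");

  RowRef Ref = {7, 42};

  // Pack the struct into a pointer-sized value and back. memcpy is the
  // well-defined way to do this; the union in the original code relies on
  // the same object representation.
  const char *Opaque;
  std::memcpy(&Opaque, &Ref, sizeof(Ref));

  RowRef Decoded;
  std::memcpy(&Decoded, &Opaque, sizeof(Decoded));

  assert(Decoded.UnitIndex == 7 && Decoded.RowIndex == 42);
  return 0;
}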
+
+using RangesBufferVector = SmallVector<char, 16>;
+
+/// Serializes the .debug_ranges DWARF section.
+class DebugRangesSectionWriter {
+public:
+  DebugRangesSectionWriter();
+
+  /// Add ranges with caching.
+  uint64_t
+  addRanges(DebugAddressRangesVector &&Ranges,
+            std::map<DebugAddressRangesVector, uint64_t> &CachedRanges);
+
+  /// Add ranges and return offset into section.
+  uint64_t addRanges(const DebugAddressRangesVector &Ranges);
+
+  /// Returns an offset of an empty address ranges list that is always written
+  /// to .debug_ranges
+  uint64_t getEmptyRangesOffset() const { return EmptyRangesOffset; }
+
+  /// Returns the SectionOffset.
+  uint64_t getSectionOffset();
+
+  std::unique_ptr<RangesBufferVector> finalize() {
+    return std::move(RangesBuffer);
+  }
+
+private:
+  std::unique_ptr<RangesBufferVector> RangesBuffer;
+
+  std::unique_ptr<raw_svector_ostream> RangesStream;
+
+  std::mutex WriterMutex;
+
+  /// Current offset in the section (updated as new entries are written).
+  /// Starts with 16 since the first 16 bytes are reserved for an empty range.
+  uint32_t SectionOffset{0};
+
+  /// Offset of an empty address ranges list.
+  static constexpr uint64_t EmptyRangesOffset{0};
+};
+
+/// Serializes the .debug_aranges DWARF section.
+class DebugARangesSectionWriter {
+public:
+  /// Add ranges for CU matching \p CUOffset.
+  void addCURanges(uint64_t CUOffset, DebugAddressRangesVector &&Ranges);
+
+  /// Writes .debug_aranges with the added ranges to the MCObjectWriter.
+  void writeARangesSection(raw_svector_ostream &RangesStream) const;
+
+  /// Resets the writer to a clear state.
+  void reset() {
+    CUAddressRanges.clear();
+  }
+
+  /// Map DWARFCompileUnit index to ranges.
+  using CUAddressRangesType = std::map<uint64_t, DebugAddressRangesVector>;
+
+  /// Return ranges for a given CU.
+  const CUAddressRangesType &getCUAddressRanges() const {
+    return CUAddressRanges;
+  }
+
+private:
+  /// Map from compile unit offset to the list of address intervals that belong
+  /// to that compile unit. Each interval is a pair
+  /// (first address, interval size).
+  CUAddressRangesType CUAddressRanges;
+
+  std::mutex CUAddressRangesMutex;
+};
+
+using IndexAddressPair = std::pair<uint32_t, uint64_t>;
+using AddressToIndexMap = std::unordered_map<uint64_t, uint32_t>;
+using IndexToAddressMap = std::unordered_map<uint32_t, uint64_t>;
+using AddressSectionBuffer = SmallVector<char, 4>;
+class DebugAddrWriter {
+public:
+  DebugAddrWriter() = delete;
+  DebugAddrWriter(BinaryContext *BC_);
+  /// Given an address returns an index in .debug_addr.
+  /// Adds Address to map.
+  uint32_t getIndexFromAddress(uint64_t Address, uint64_t DWOId);
+
+  /// Adds {Address, Index} to DWO ID CU.
+  void addIndexAddress(uint64_t Address, uint32_t Index, uint64_t DWOId);
+
+  /// Creates consolidated .debug_addr section, and builds DWOID to offset map.
+  AddressSectionBuffer finalize();
+
+  /// Given DWOID returns offset of this CU into the .debug_addr section.
+  uint64_t getOffset(uint64_t DWOId);
+
+  /// Returns true if any addresses were added, i.e. a .debug_addr section
+  /// will be created.
+  bool isInitialized() const { return !AddressMaps.empty(); }
+
+private:
+  class AddressForDWOCU {
+  public:
+    AddressToIndexMap::iterator find(uint64_t Address) {
+      return AddressToIndex.find(Address);
+    }
+    AddressToIndexMap::iterator end() { return AddressToIndex.end(); }
+    AddressToIndexMap::iterator begin() { return AddressToIndex.begin(); }
+
+    IndexToAddressMap::iterator indexToAdddessEnd() {
+      return IndexToAddress.end();
+    }
+    IndexToAddressMap::iterator indexToAddressBegin() {
+      return IndexToAddress.begin();
+    }
+    uint32_t getNextIndex() {
+      while (IndexToAddress.count(CurrentIndex))
+        ++CurrentIndex;
+      return CurrentIndex;
+    }
+
+    /// Inserts elements into IndexToAddress and AddressToIndex.
+    /// Follows the same semantics as unordered_map insert.
+    std::pair<AddressToIndexMap::iterator, bool> insert(uint64_t Address,
+                                                        uint32_t Index) {
+      IndexToAddress.insert({Index, Address});
+      return AddressToIndex.insert({Address, Index});
+    }
+
+    /// Updates the AddressToIndex map.
+    /// Follows the same semantics as unordered_map operator[].
+    void updateAddressToIndex(uint64_t Address, uint32_t Index) {
+      AddressToIndex[Address] = Index;
+    }
+
+    /// Updates the IndexToAddress map.
+    /// Follows the same semantics as unordered_map operator[].
+    void updateIndexToAddress(uint64_t Address, uint32_t Index) {
+      IndexToAddress[Index] = Address;
+    }
+
+    void dump();
+
+  private:
+    AddressToIndexMap AddressToIndex;
+    IndexToAddressMap IndexToAddress;
+    uint32_t CurrentIndex{0};
+  };
+  BinaryContext *BC;
+  /// Maps DWOID to AddressForDWOCU.
+  std::unordered_map<uint64_t, AddressForDWOCU> AddressMaps;
+  /// Maps DWOID to offset within .debug_addr section.
+  std::unordered_map<uint64_t, uint64_t> DWOIdToOffsetMap;
+};
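DebugAddrWriter keeps a pair of hash maps per DWO CU so addresses and .debug_addr indices can be looked up in both directions, with getNextIndex handing out the lowest index not already pinned by addIndexAddress. A standalone sketch of that allocation scheme (plain C++; the class and names are illustrative, not BOLT's API):

#include <cassert>
#include <cstdint>
#include <unordered_map>

// Two-way address <-> index map with lowest-free-index allocation.
class AddressIndexMap {
public:
  // Pin a pre-existing pair (e.g. one read from the input binary).
  void pin(uint64_t Address, uint32_t Index) {
    IndexToAddress[Index] = Address;
    AddressToIndex[Address] = Index;
  }

  // Return the index for Address, allocating the next free one if needed.
  uint32_t indexFor(uint64_t Address) {
    auto It = AddressToIndex.find(Address);
    if (It != AddressToIndex.end())
      return It->second;
    while (IndexToAddress.count(CurrentIndex)) // skip pinned indices
      ++CurrentIndex;
    pin(Address, CurrentIndex);
    return CurrentIndex;
  }

private:
  std::unordered_map<uint64_t, uint32_t> AddressToIndex;
  std::unordered_map<uint32_t, uint64_t> IndexToAddress;
  uint32_t CurrentIndex = 0;
};

int main() {
  AddressIndexMap M;
  M.pin(0x1000, 0);                // index 0 is taken
  assert(M.indexFor(0x2000) == 1); // next free index is 1
  assert(M.indexFor(0x2000) == 1); // stable on repeated queries
  return 0;
}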
+
+using DebugStrBufferVector = SmallVector<char, 16>;
+class DebugStrWriter {
+public:
+  DebugStrWriter() = delete;
+  DebugStrWriter(BinaryContext *Bc) : BC(Bc) { create(); }
+  std::unique_ptr<DebugStrBufferVector> finalize() {
+    return std::move(StrBuffer);
+  }
+
+  /// Adds string to .debug_str.
+  /// On first invocation it initializes internal data structures.
+  uint32_t addString(StringRef Str);
+
+  /// Returns false if no strings were added to .debug_str.
+  bool isInitialized() const { return !StrBuffer->empty(); }
+
+private:
+  /// Initializes Buffer and Stream.
+  void initialize();
+  /// Creates internal data structures.
+  void create();
+  std::unique_ptr<DebugStrBufferVector> StrBuffer;
+  std::unique_ptr<raw_svector_ostream> StrStream;
+  BinaryContext *BC;
+};
+
+using LocBufferVector = SmallVector<char, 16>;
+
+enum class LocWriterKind { DebugLocWriter, DebugLoclistWriter };
+
+/// Serializes part of a .debug_loc DWARF section with LocationLists.
+class DebugLocWriter {
+public:
+  DebugLocWriter() = delete;
+  DebugLocWriter(BinaryContext *BC);
+  virtual ~DebugLocWriter(){};
+
+  virtual uint64_t addList(const DebugLocationsVector &LocList);
+
+  virtual std::unique_ptr<LocBufferVector> finalize() {
+    return std::move(LocBuffer);
+  }
+
+  /// Offset of an empty location list.
+  static constexpr uint32_t EmptyListOffset = 0;
+
+  /// Value returned by addList if the list is empty.
+  /// Use 64 bits here so that a max 32-bit value can still
+  /// be stored while we use the max 64-bit value as the empty tag.
+  static constexpr uint64_t EmptyListTag = -1;
+
+  LocWriterKind getKind() const { return Kind; }
+
+  static bool classof(const DebugLocWriter *Writer) {
+    return Writer->getKind() == LocWriterKind::DebugLocWriter;
+  }
+
+protected:
+  std::unique_ptr<LocBufferVector> LocBuffer;
+
+  std::unique_ptr<raw_svector_ostream> LocStream;
+  /// Current offset in the section (updated as new entries are written).
+  /// Starts with 0 here since this only writes part of a full location lists
+  /// section. In the final section, the first 16 bytes are reserved for an
+  /// empty list.
+  uint32_t SectionOffset{0};
+  LocWriterKind Kind{LocWriterKind::DebugLocWriter};
+};
+
+class DebugLoclistWriter : public DebugLocWriter {
+public:
+  ~DebugLoclistWriter() {}
+  DebugLoclistWriter() = delete;
+  DebugLoclistWriter(BinaryContext *BC, uint64_t DWOId_)
+      : DebugLocWriter(BC), DWOId(DWOId_) {
+    Kind = LocWriterKind::DebugLoclistWriter;
+    assert(DebugLoclistWriter::AddrWriter &&
+           "Please use setAddressWriter to initialize "
+           "DebugAddrWriter before instantiation.");
+  }
+
+  static void setAddressWriter(DebugAddrWriter *AddrW) { AddrWriter = AddrW; }
+
+  /// Writes out a location list, with indices into .debug_addr to be patched
+  /// later.
+  uint64_t addList(const DebugLocationsVector &LocList) override;
+
+  /// Finalizes all location lists by patching the correct indices into
+  /// .debug_addr.
+  void finalizePatches();
+
+  static bool classof(const DebugLocWriter *Writer) {
+    return Writer->getKind() == LocWriterKind::DebugLoclistWriter;
+  }
+
+private:
+  class Patch {
+  public:
+    Patch() = delete;
+    Patch(uint64_t O, uint64_t A) : Offset(O), Address(A) {}
+    uint64_t Offset{0};
+    uint64_t Address{0};
+  };
+  static DebugAddrWriter *AddrWriter;
+  uint64_t DWOId{0};
+  std::unordered_map<uint64_t, uint32_t> AddressMap;
+  std::vector<Patch> IndexPatches;
+  constexpr static uint32_t NumBytesForIndex{3};
+};
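DebugLoclistWriter reserves NumBytesForIndex zero bytes for each DW_LLE_startx_length index and later overwrites them with a ULEB128 value padded to exactly that width, so the offsets recorded in IndexPatches stay valid. A standalone sketch of padded-ULEB128 encoding and in-place patching (plain C++; mirrors the idea, not BOLT's API):

#include <cassert>
#include <cstdint>
#include <string>

// Encode Value as ULEB128, padded to exactly PadTo bytes.
static std::string encodeULEB128Padded(uint64_t Value, unsigned PadTo) {
  std::string Out;
  unsigned Count = 0;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    ++Count;
    if (Value != 0 || Count < PadTo)
      Byte |= 0x80; // continuation bit
    Out.push_back(static_cast<char>(Byte));
  } while (Value != 0);
  // Pad with 0x80 continuation bytes, then a final 0x00, so decoders
  // still read the same value.
  while (Count < PadTo) {
    ++Count;
    Out.push_back(static_cast<char>(Count < PadTo ? 0x80 : 0x00));
  }
  return Out;
}

int main() {
  std::string Section(8, '\0'); // pretend section with a 3-byte hole at 2
  const unsigned Offset = 2, Width = 3;
  std::string Bytes = encodeULEB128Padded(42, Width);
  assert(Bytes.size() == Width);
  Section.replace(Offset, Width, Bytes); // patch the reserved bytes in place
  return 0;
}

LLVM's encodeULEB128 with a PadTo argument, used in finalizePatches above, produces the same fixed-width encoding.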
+
+/// Abstract interface for classes that apply modifications to a binary string.
+class BinaryPatcher {
+public:
+  virtual ~BinaryPatcher() {}
+  /// Applies in-place modifications to the binary string \p BinaryContents.
+  /// \p DWPOffset is used to correctly patch sections that come from a DWP
+  /// file.
+  virtual void patchBinary(std::string &BinaryContents,
+                           uint32_t DWPOffset) = 0;
+};
+
+/// Applies simple modifications to a binary string, such as directly replacing
+/// the contents of a certain portion with a string or an integer.
+class SimpleBinaryPatcher : public BinaryPatcher {
+private:
+  std::vector<std::pair<uint32_t, std::string>> Patches;
+
+  /// Adds a patch to replace the contents of \p ByteSize bytes with the
+  /// integer \p NewValue encoded in little-endian, with the least-significant
+  /// byte being written at the offset \p Offset.
+  void addLEPatch(uint32_t Offset, uint64_t NewValue, size_t ByteSize);
+
+  /// RangeBase for DWO DebugInfo Patcher.
+  uint64_t RangeBase{0};
+
+  /// Gets reset to false when setRangeBase is invoked.
+  /// Gets set to true when getRangeBase is called.
+  uint64_t WasRangeBaseUsed{false};
+
+public:
+  virtual ~SimpleBinaryPatcher() {}
+
+  /// Adds a patch to replace the contents of the binary string starting at the
+  /// specified \p Offset with the string \p NewValue.
+  void addBinaryPatch(uint32_t Offset, const std::string &NewValue);
+
+  /// Adds a patch to replace the contents of a single byte of the string, at
+  /// the offset \p Offset, with the value \p Value.
+  void addBytePatch(uint32_t Offset, uint8_t Value);
+
+  /// Adds a patch to put the integer \p NewValue encoded as a 64-bit
+  /// little-endian value at offset \p Offset.
+  void addLE64Patch(uint32_t Offset, uint64_t NewValue);
+
+  /// Adds a patch to put the integer \p NewValue encoded as a 32-bit
+  /// little-endian value at offset \p Offset.
+  void addLE32Patch(uint32_t Offset, uint32_t NewValue);
+
+  /// Add a patch at \p Offset with \p Value using unsigned LEB128 encoding
+  /// with size \p Size. \p Size should not be less than the minimum number of
+  /// bytes needed to encode \p Value.
+  void addUDataPatch(uint32_t Offset, uint64_t Value, uint64_t Size);
+
+  /// Sets DW_AT_GNU_ranges_base.
+  void setRangeBase(uint64_t Rb) {
+    WasRangeBaseUsed = false;
+    RangeBase = Rb;
+  }
+
+  /// Gets DW_AT_GNU_ranges_base.
+  uint64_t getRangeBase() {
+    WasRangeBaseUsed = true;
+    return RangeBase;
+  }
+
+  /// Returns true if low_pc/high_pc was broken up into ranges, i.e. if
+  /// getRangeBase was called.
+  bool getWasRangBasedUsed() const { return WasRangeBaseUsed; }
+
+  virtual void patchBinary(std::string &BinaryContents,
+                           uint32_t DWPOffset) override;
+};
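A standalone sketch of the patch-list idea behind SimpleBinaryPatcher: collect (offset, bytes) pairs, then splat them over a string buffer in one pass (plain C++; not BOLT's API, values are illustrative):

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<uint32_t, std::string>> Patches;

  // Queue a 32-bit little-endian value at offset 4.
  uint32_t V = 0xdeadbeef;
  std::string LE32;
  for (int I = 0; I < 4; ++I, V >>= 8)
    LE32.push_back(static_cast<char>(V & 0xff));
  Patches.emplace_back(4, LE32);

  // Queue a single-byte patch at offset 0.
  Patches.emplace_back(0, std::string(1, '\x2a'));

  // Apply all patches in place.
  std::string Contents(8, '\0');
  for (const auto &P : Patches) {
    assert(P.first + P.second.size() <= Contents.size());
    Contents.replace(P.first, P.second.size(), P.second);
  }

  assert(Contents[0] == '\x2a');
  assert(static_cast<uint8_t>(Contents[4]) == 0xef); // LSB written first
  return 0;
}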
+
+/// Apply small modifications to the .debug_abbrev DWARF section.
+class DebugAbbrevPatcher : public BinaryPatcher {
+public:
+  /// Patch of changing one attribute to another.
+  struct AbbrevAttrPatch {
+    const DWARFAbbreviationDeclaration *Abbrev;
+    dwarf::Attribute Attr;    // ID of attribute to be replaced.
+    dwarf::Attribute NewAttr; // ID of the new attribute.
+    uint8_t NewForm;          // Form of the new attribute.
+
+    bool operator==(const AbbrevAttrPatch &RHS) const {
+      return Abbrev == RHS.Abbrev && Attr == RHS.Attr;
+    }
+  };
+
+  struct AbbrevHash {
+    size_t operator()(const AbbrevAttrPatch &P) const {
+      return std::hash<uint64_t>()(((uint64_t)P.Abbrev << 16) + P.Attr);
+    }
+  };
+
+private:
+  std::unordered_set<AbbrevAttrPatch, AbbrevHash> AbbrevPatches;
+  /// Using a vector because this is currently used for the specific purpose
+  /// of patching DW_AT_GNU_ranges_base. We don't need fast lookup, but we do
+  /// need the patches to be in order.
+  std::vector<AbbrevAttrPatch> AbbrevNonStandardPatches;
+
+public:
+  virtual ~DebugAbbrevPatcher() {}
+  /// Adds a patch to change an attribute of the abbreviation.
+  /// \p Abbrev the abbreviation to be modified.
+  /// \p AttrTag ID of the attribute to be replaced.
+  /// \p NewAttrTag ID of the new attribute.
+  /// \p NewAttrForm Form of the new attribute.
+  /// If a non-standard form is added through this API, the section will be
+  /// expanded to accommodate it.
+  void addAttributePatch(const DWARFAbbreviationDeclaration *Abbrev,
+                         dwarf::Attribute AttrTag, dwarf::Attribute NewAttrTag,
+                         uint8_t NewAttrForm);
+
+  virtual void patchBinary(std::string &Contents, uint32_t DWPOffset) override;
+
+  /// Finds an abbreviation patch.
+  /// \p AbbrevDecl the abbreviation declaration.
+  /// \p AttrTag Attribute to search for.
+  /// Returns nullptr if the patch doesn't exist.
+  const AbbrevAttrPatch *find(const DWARFAbbreviationDeclaration *AbbrevDecl,
+                              dwarf::Attribute AttrTag) {
+    auto Iter = AbbrevPatches.find({AbbrevDecl, AttrTag, AttrTag, 0});
+    if (Iter == AbbrevPatches.end())
+      return nullptr;
+    return &(*Iter);
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/DynoStats.cpp b/bolt/src/DynoStats.cpp
new file mode 100644
index 000000000000..1aa787df5872
--- /dev/null
+++ b/bolt/src/DynoStats.cpp
@@ -0,0 +1,251 @@
+//===--- DynoStats.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "DynoStats.h"
+#include "BinaryBasicBlock.h"
+#include "BinaryFunction.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <string>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory BoltCategory;
+
+static cl::opt<unsigned>
+DynoStatsScale("dyno-stats-scale",
+  cl::desc("scale to be applied while reporting dyno stats"),
+  cl::Optional,
+  cl::init(1),
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+constexpr const char *DynoStats::Desc[];
+
+bool DynoStats::operator<(const DynoStats &Other) const {
+  return std::lexicographical_compare(
+    &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT],
+    &Other.Stats[FIRST_DYNO_STAT], &Other.Stats[LAST_DYNO_STAT]
+  );
+}
+
+bool DynoStats::operator==(const DynoStats &Other) const {
+  return std::equal(
+    &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT],
+    &Other.Stats[FIRST_DYNO_STAT]
+  );
+}
+
+bool DynoStats::lessThan(const DynoStats &Other,
+                         ArrayRef<Category> Keys) const {
+  return std::lexicographical_compare(
+    Keys.begin(), Keys.end(),
+    Keys.begin(), Keys.end(),
+    [this,&Other](const Category A, const Category) {
+      return Stats[A] < Other.Stats[A];
+    }
+  );
+}
+
+void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const {
+  auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat,
+                                uint64_t OtherStat) {
+    OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name;
+    if (Other) {
+      if (Stat != OtherStat) {
+        OtherStat = std::max(OtherStat, uint64_t(1)); // to prevent divide by 0
+        OS << format(" (%+.1f%%)",
+                     ((float)Stat - (float)OtherStat) * 100.0 /
+                       (float)(OtherStat));
+      } else {
+        OS << " (=)";
+      }
+    }
+    OS << '\n';
+  };
+
+  for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
+       Stat < DynoStats::LAST_DYNO_STAT;
+       ++Stat) {
+
+    if (!PrintAArch64Stats && Stat == DynoStats::VENEER_CALLS_AARCH64)
+      continue;
+
+    printStatWithDelta(Desc[Stat], Stats[Stat], Other ? (*Other)[Stat] : 0);
+  }
+}
+
+void DynoStats::operator+=(const DynoStats &Other) {
+  for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
+       Stat < DynoStats::LAST_DYNO_STAT;
+       ++Stat) {
+    Stats[Stat] += Other[Stat];
+  }
+}
+
+DynoStats getDynoStats(const BinaryFunction &BF) {
+  auto &BC = BF.getBinaryContext();
+
+  DynoStats Stats(/*PrintAArch64Stats*/ BC.isAArch64());
+
+  // Return empty stats for functions we don't completely understand.
+  if (!BF.isSimple() || !BF.hasValidProfile() || !BF.hasCanonicalCFG())
+    return Stats;
+
+  // Update enumeration of basic blocks for correct detection of branch
+  // direction.
+  BF.updateLayoutIndices();
+
+  for (BinaryBasicBlock *const &BB : BF.layout()) {
+    // The basic block execution count equals the sum of incoming branch
+    // frequencies. This may deviate from the sum of outgoing branches of the
+    // basic block, especially since the block may contain a call to a
+    // function that does not return or that throws an exception.
+    const uint64_t BBExecutionCount = BB->getKnownExecutionCount();
+
+    // Ignore empty blocks and blocks that were not executed.
+    if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0)
+      continue;
+
+    // Count AArch64 linker-inserted veneers.
+    if (BF.isAArch64Veneer())
+      Stats[DynoStats::VENEER_CALLS_AARCH64] += BF.getKnownExecutionCount();
+
+    // Count the number of calls by iterating through all instructions.
+    for (const MCInst &Instr : *BB) {
+      if (BC.MIB->isStore(Instr)) {
+        Stats[DynoStats::STORES] += BBExecutionCount;
+      }
+      if (BC.MIB->isLoad(Instr)) {
+        Stats[DynoStats::LOADS] += BBExecutionCount;
+      }
+
+      if (!BC.MIB->isCall(Instr))
+        continue;
+
+      uint64_t CallFreq = BBExecutionCount;
+      if (BC.MIB->getConditionalTailCall(Instr)) {
+        CallFreq =
+          BC.MIB->getAnnotationWithDefault<uint64_t>(Instr, "CTCTakenCount");
+      }
+      Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
+      if (BC.MIB->isIndirectCall(Instr)) {
+        Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
+      } else if (const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(Instr)) {
+        const BinaryFunction *BF = BC.getFunctionForSymbol(CallSymbol);
+        if (BF && BF->isPLTFunction()) {
+          Stats[DynoStats::PLT_CALLS] += CallFreq;
+
+          // We don't process PLT functions and hence have to adjust relevant
+          // dynostats here for:
+          //
+          //   jmp *GOT_ENTRY(%rip)
+          //
+          // NOTE: this is arch-specific.
+          Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
+          Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
+          Stats[DynoStats::LOADS] += CallFreq;
+          Stats[DynoStats::INSTRUCTIONS] += CallFreq;
+        }
+      }
+    }
+
+    Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount;
+
+    // Jump tables.
+    const MCInst *LastInstr = BB->getLastNonPseudoInstr();
+    if (BC.MIB->getJumpTable(*LastInstr)) {
+      Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount;
+      LLVM_DEBUG(
+        static uint64_t MostFrequentJT;
+        if (BBExecutionCount > MostFrequentJT) {
+          MostFrequentJT = BBExecutionCount;
+          dbgs() << "BOLT-INFO: most frequently executed jump table is in "
+                 << "function " << BF << " in basic block " << BB->getName()
+                 << " executed a total of " << BBExecutionCount << " times.\n";
+        }
+      );
+      continue;
+    }
+
+    if (BC.MIB->isIndirectBranch(*LastInstr) && !BC.MIB->isCall(*LastInstr)) {
+      Stats[DynoStats::UNKNOWN_INDIRECT_BRANCHES] += BBExecutionCount;
+      continue;
+    }
+
+    // Update stats for branches.
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
+      continue;
+    }
+
+    if (!CondBranch && !UncondBranch) {
+      continue;
+    }
+
+    // Simple unconditional branch.
+    if (!CondBranch) {
+      Stats[DynoStats::UNCOND_BRANCHES] += BBExecutionCount;
+      continue;
+    }
+
+    // CTCs: instruction annotations could be stripped, hence check the number
+    // of successors to identify conditional tail calls.
+    if (BB->succ_size() == 1) {
+      if (BB->branch_info_begin() != BB->branch_info_end())
+        Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count;
+      continue;
+    }
+
+    // Conditional branch that could be followed by an unconditional branch.
+    uint64_t TakenCount = BB->getTakenBranchInfo().Count;
+    if (TakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      TakenCount = 0;
+
+    uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
+    if (NonTakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      NonTakenCount = 0;
+
+    if (BF.isForwardBranch(BB, BB->getConditionalSuccessor(true))) {
+      Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount;
+      Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount;
+    } else {
+      Stats[DynoStats::BACKWARD_COND_BRANCHES] += BBExecutionCount;
+      Stats[DynoStats::BACKWARD_COND_BRANCHES_TAKEN] += TakenCount;
+    }
+
+    if (UncondBranch) {
+      Stats[DynoStats::UNCOND_BRANCHES] += NonTakenCount;
+    }
+  }
+
+  return Stats;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/DynoStats.h b/bolt/src/DynoStats.h
new file mode 100644
index 000000000000..f3a9b3042764
--- /dev/null
+++ b/bolt/src/DynoStats.h
@@ -0,0 +1,177 @@
+//===--- DynoStats.h ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
+#define LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+namespace bolt {
+class BinaryFunction;
+
+/// Class encapsulating runtime statistics about an execution unit.
+class DynoStats {
+
+#define DYNO_STATS\
+  D(FIRST_DYNO_STAT, "", Fn)\
+  D(FORWARD_COND_BRANCHES, "executed forward branches", Fn)\
+  D(FORWARD_COND_BRANCHES_TAKEN, "taken forward branches", Fn)\
+  D(BACKWARD_COND_BRANCHES, "executed backward branches", Fn)\
+  D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\
+  D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\
+  D(FUNCTION_CALLS, "all function calls", Fn)\
+  D(INDIRECT_CALLS, "indirect calls", Fn)\
+  D(PLT_CALLS, "PLT calls", Fn)\
+  D(INSTRUCTIONS, "executed instructions", Fn)\
+  D(LOADS, "executed load instructions", Fn)\
+  D(STORES, "executed store instructions", Fn)\
+  D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\
+  D(UNKNOWN_INDIRECT_BRANCHES, "taken unknown indirect branches", Fn)\
+  D(ALL_BRANCHES, "total branches",\
+      Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\
+  D(ALL_TAKEN, "taken branches",\
+      Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\
+  D(NONTAKEN_CONDITIONAL, "non-taken conditional branches",\
+      Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\
+  D(TAKEN_CONDITIONAL, "taken conditional branches",\
+      Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\
+  D(ALL_CONDITIONAL, "all conditional branches",\
+      Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\
+  D(VENEER_CALLS_AARCH64, "linker-inserted veneer calls", Fn)\
+  D(LAST_DYNO_STAT, "", 0)
+
+public:
+#define D(name, ...) name,
+  enum Category : uint8_t { DYNO_STATS };
+#undef D
+
+
+private:
+  uint64_t Stats[LAST_DYNO_STAT+1];
+  bool PrintAArch64Stats;
+
+#define D(name, desc, ...) desc,
+  static constexpr const char *Desc[] = { DYNO_STATS };
+#undef D
+
+public:
+  DynoStats(bool PrintAArch64Stats) {
+    this->PrintAArch64Stats = PrintAArch64Stats;
+    for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat)
+      Stats[Stat] = 0;
+  }
+
+  uint64_t &operator[](size_t I) {
+    assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT &&
+           "index out of bounds");
+    return Stats[I];
+  }
+
+  uint64_t operator[](size_t I) const {
+    switch (I) {
+#define D(name, desc, func) \
+    case name: \
+      return func;
+#define Fn Stats[I]
+#define Fadd(a, b) operator[](a) + operator[](b)
+#define Fsub(a, b) operator[](a) - operator[](b)
+#define F(a) operator[](a)
+#define Radd(a, b) (a + b)
+#define Rsub(a, b) (a - b)
+    DYNO_STATS
+#undef Rsub
+#undef Radd
+#undef F
+#undef Fsub
+#undef Fadd
+#undef Fn
+#undef D
+    default:
+      llvm_unreachable("index out of bounds");
+    }
+    return 0;
+  }
+
+  void print(raw_ostream &OS, const DynoStats *Other = nullptr) const;
+
+  void operator+=(const DynoStats &Other);
+  bool operator<(const DynoStats &Other) const;
+  bool operator==(const DynoStats &Other) const;
+  bool operator!=(const DynoStats &Other) const { return !operator==(Other); }
+  bool lessThan(const DynoStats &Other, ArrayRef<Category> Keys) const;
+
+  static const char* Description(const Category C) {
+    return Desc[C];
+  }
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) {
+  Stats.print(OS, nullptr);
+  return OS;
+}
+
+DynoStats operator+(const DynoStats &A, const DynoStats &B);
+
+/// Return dynostats for the function.
+///
+/// The function relies on branch instructions being in-sync with CFG for
+/// branch instructions stats. Thus it is better to call it after
+/// fixBranches().
+DynoStats getDynoStats(const BinaryFunction &BF);
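The DYNO_STATS X-macro above expands once into the Category enum, once into the description array, and once into the switch that derives aggregate stats such as ALL_CONDITIONAL from the raw counters. A self-contained miniature of the same pattern (illustrative names, not the BOLT list):

#include <cassert>
#include <cstdint>

#define MINI_STATS \
  X(FIRST, "", Raw) \
  X(LOADS, "loads", Raw) \
  X(STORES, "stores", Raw) \
  X(MEM_OPS, "memory ops", Add(LOADS, STORES)) \
  X(LAST, "", 0)

struct MiniStats {
#define X(name, desc, expr) name,
  enum Category : uint8_t { MINI_STATS };
#undef X

  uint64_t Raw[LAST + 1] = {};

  // Computed lookup: raw categories read the array, derived ones recurse.
  uint64_t get(Category C) const {
    switch (C) {
#define X(name, desc, expr) case name: return expr;
#define Raw Raw[C]
#define Add(a, b) get(a) + get(b)
      MINI_STATS
#undef Add
#undef Raw
#undef X
    default:
      return 0;
    }
  }
};

int main() {
  MiniStats S;
  S.Raw[MiniStats::LOADS] = 3;
  S.Raw[MiniStats::STORES] = 4;
  assert(S.get(MiniStats::MEM_OPS) == 7); // derived from the two raw stats
  return 0;
}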
+/// Return program-wide dynostats.
+template <typename FuncsType>
+inline DynoStats getDynoStats(const FuncsType &Funcs) {
+  bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
+  DynoStats dynoStats(IsAArch64);
+  for (auto &BFI : Funcs) {
+    auto &BF = BFI.second;
+    if (BF.isSimple()) {
+      dynoStats += getDynoStats(BF);
+    }
+  }
+  return dynoStats;
+}
+
+/// Call a function with optional before and after dynostats printing.
+template <typename FnType, typename FuncsType>
+inline void
+callWithDynoStats(FnType &&Func,
+                  const FuncsType &Funcs,
+                  StringRef Phase,
+                  const bool Flag) {
+  bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
+  DynoStats DynoStatsBefore(IsAArch64);
+  if (Flag) {
+    DynoStatsBefore = getDynoStats(Funcs);
+  }
+
+  Func();
+
+  if (Flag) {
+    const DynoStats DynoStatsAfter = getDynoStats(Funcs);
+    const bool Changed = (DynoStatsAfter != DynoStatsBefore);
+    outs() << "BOLT-INFO: program-wide dynostats after running "
+           << Phase << (Changed ? "" : " (no change)") << ":\n\n"
+           << DynoStatsBefore << '\n';
+    if (Changed) {
+      DynoStatsAfter.print(outs(), &DynoStatsBefore);
+    }
+    outs() << '\n';
+  }
+}
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
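callWithDynoStats wraps an arbitrary pass in an optional before/after measurement and only prints a delta when something changed. The same wrapper shape in miniature (plain C++; the stats type and names are stand-ins, not BOLT's API):

#include <cstdint>
#include <functional>
#include <iostream>

struct Stats {
  uint64_t Instructions = 0;
  bool operator!=(const Stats &O) const {
    return Instructions != O.Instructions;
  }
};

// Run Phase, sampling Stats before and after only when Enabled.
template <typename FnType>
void callWithStats(FnType &&Phase, const std::function<Stats()> &Sample,
                   const char *Name, bool Enabled) {
  Stats Before;
  if (Enabled)
    Before = Sample();

  Phase();

  if (Enabled) {
    const Stats After = Sample();
    const bool Changed = After != Before;
    std::cout << "stats after " << Name << (Changed ? "" : " (no change)")
              << ": " << After.Instructions << "\n";
  }
}

int main() {
  uint64_t Count = 100;
  auto Sample = [&] { return Stats{Count}; };
  callWithStats([&] { Count -= 10; }, Sample, "shrink-pass", true);
  return 0;
}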
diff --git a/bolt/src/Exceptions.cpp b/bolt/src/Exceptions.cpp
new file mode 100644
index 000000000000..22b1a8439f08
--- /dev/null
+++ b/bolt/src/Exceptions.cpp
@@ -0,0 +1,782 @@
+//===-- Exceptions.cpp - Helpers for processing C++ exceptions ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Some of the code is taken from examples/ExceptionDemo
+//
+//===----------------------------------------------------------------------===//
+
+#include "Exceptions.h"
+#include "BinaryFunction.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt-exceptions"
+
+using namespace llvm::dwarf;
+
+namespace opts {
+
+extern llvm::cl::OptionCategory BoltCategory;
+
+extern llvm::cl::opt<unsigned> Verbosity;
+
+static llvm::cl::opt<bool>
+PrintExceptions("print-exceptions",
+  llvm::cl::desc("print exception handling data"),
+  llvm::cl::ZeroOrMore,
+  llvm::cl::Hidden,
+  llvm::cl::cat(BoltCategory));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+// Read and dump the .gcc_except_table section entry.
+//
+// The .gcc_except_table section contains a set of Language-Specific Data
+// Areas - a fancy name for exception handling tables. There's one LSDA entry
+// per function. However, we can't actually tell which function an LSDA refers
+// to unless we parse the .eh_frame entry that refers to the LSDA.
+// Then, inside the LSDA, most addresses are encoded relative to the function
+// start, so we need the function context in order to get to real addresses.
+//
+// The best visual representation of the tables comprising an LSDA and the
+// relationships between them is illustrated at:
+// https://github.com/itanium-cxx-abi/cxx-abi/blob/master/exceptions.pdf
+// Keep in mind that the GCC implementation deviates slightly from that
+// document.
+//
+// To summarize, there are 4 tables in an LSDA: the call site table, the
+// action table, the type table, and the type index table (for indirection).
+// The main table contains call site entries. Each call site includes a PC
+// range that can throw an exception, a handler (landing pad), and a reference
+// to an entry in the action table. The handler and/or action could be 0. An
+// action entry is the head of a list of actions associated with a call site.
+// The action table contains all such lists (it could be optimized to share
+// list tails). Each action could be either to catch an exception of a given
+// type, to perform a cleanup, or to propagate the exception after filtering
+// it out (e.g. to make sure the function exception specification is not
+// violated). A catch action contains a reference to an entry in the type
+// table, and a filter action refers to an entry in the type index table to
+// encode a set of types to filter.
+//
+// The call site table follows the LSDA header. The action table immediately
+// follows the call site table.
+//
+// Both the type table and the type index table start at the same location,
+// but they grow in opposite directions (types go up, indices go down). The
+// beginning of these tables is encoded in the LSDA header. Sizes for both of
+// the tables are not included anywhere.
+//
+// We have to parse all of the tables to determine their sizes. Then we have
+// to parse the call site table and associate discovered information with
+// actual call instructions and landing pad blocks.
+//
+// For the purpose of rewriting exception handling tables, we can reuse the
+// action table and the type index table in their original binary format.
+//
+// The type table could be encoded using position-independent references, and
+// thus may require relocation.
+//
+// Ideally we should be able to re-write the LSDA in-place, without the need
+// to allocate a new space for it. Sadly, there's no guarantee that the new
+// call site table will be the same size, since GCC uses uleb128 encodings for
+// PC offsets.
+//
+// Note: some functions have LSDA entries with 0 call site entries.
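A standalone sketch of walking a call-site table in the spirit of the layout described above, using a small ULEB128 reader. The byte values and the fixed "all fields are ULEB128" encoding are simplifying assumptions for illustration; real tables use the encoding byte recorded in the LSDA header:

#include <cstdint>
#include <cstdio>
#include <vector>

// Read one ULEB128 value, advancing Off.
static uint64_t readULEB128(const std::vector<uint8_t> &Buf, size_t &Off) {
  uint64_t Result = 0;
  unsigned Shift = 0;
  uint8_t Byte;
  do {
    Byte = Buf[Off++];
    Result |= uint64_t(Byte & 0x7f) << Shift;
    Shift += 7;
  } while (Byte & 0x80);
  return Result;
}

int main() {
  // One call-site record: start=0x10, length=0x20, landing pad=0x40,
  // action=1 (first action-table entry), each ULEB128-encoded.
  const std::vector<uint8_t> CallSiteTable = {0x10, 0x20, 0x40, 0x01};

  size_t Off = 0;
  while (Off < CallSiteTable.size()) {
    uint64_t Start = readULEB128(CallSiteTable, Off);
    uint64_t Length = readULEB128(CallSiteTable, Off);
    uint64_t LandingPad = readULEB128(CallSiteTable, Off);
    uint64_t Action = readULEB128(CallSiteTable, Off);
    std::printf("call site [+0x%llx, +0x%llx) -> LP +0x%llx, action %llu\n",
                (unsigned long long)Start,
                (unsigned long long)(Start + Length),
                (unsigned long long)LandingPad,
                (unsigned long long)Action);
  }
  return 0;
}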
+void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
+                               uint64_t LSDASectionAddress) {
+  assert(CurrentState == State::Disassembled && "unexpected function state");
+
+  if (!getLSDAAddress())
+    return;
+
+  DWARFDataExtractor Data(
+      StringRef(reinterpret_cast<const char *>(LSDASectionData.data()),
+                LSDASectionData.size()),
+      BC.DwCtx->getDWARFObj().isLittleEndian(), 8);
+  uint64_t Offset = getLSDAAddress() - LSDASectionAddress;
+  assert(Data.isValidOffset(Offset) && "wrong LSDA address");
+
+  uint8_t LPStartEncoding = Data.getU8(&Offset);
+  uint64_t LPStart = 0;
+  if (Optional<uint64_t> MaybeLPStart = Data.getEncodedPointer(
+          &Offset, LPStartEncoding, Offset + LSDASectionAddress))
+    LPStart = *MaybeLPStart;
+
+  assert(LPStart == 0 && "support for split functions not implemented");
+
+  const uint8_t TTypeEncoding = Data.getU8(&Offset);
+  size_t TTypeEncodingSize = 0;
+  uintptr_t TTypeEnd = 0;
+  if (TTypeEncoding != DW_EH_PE_omit) {
+    TTypeEnd = Data.getULEB128(&Offset);
+    TTypeEncodingSize = BC.getDWARFEncodingSize(TTypeEncoding);
+  }
+
+  if (opts::PrintExceptions) {
+    outs() << "[LSDA at 0x" << Twine::utohexstr(getLSDAAddress())
+           << " for function " << *this << "]:\n";
+    outs() << "LPStart Encoding = 0x"
+           << Twine::utohexstr(LPStartEncoding) << '\n';
+    outs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n';
+    outs() << "TType Encoding = 0x" << Twine::utohexstr(TTypeEncoding) << '\n';
+    outs() << "TType End = " << TTypeEnd << '\n';
+  }
+
+  // Table to store list of indices in type table. Entries are uleb128 values.
+  const uint64_t TypeIndexTableStart = Offset + TTypeEnd;
+
+  // Offset past the last decoded index.
+  uint64_t MaxTypeIndexTableOffset = 0;
+
+  // Max positive index used in type table.
+  unsigned MaxTypeIndex = 0;
+
+  // The actual type info table starts at the same location, but grows in
+  // the opposite direction. TTypeEncoding is used to encode stored values.
+  const uint64_t TypeTableStart = Offset + TTypeEnd;
+
+  uint8_t CallSiteEncoding = Data.getU8(&Offset);
+  uint32_t CallSiteTableLength = Data.getULEB128(&Offset);
+  uint64_t CallSiteTableStart = Offset;
+  uint64_t CallSiteTableEnd = CallSiteTableStart + CallSiteTableLength;
+  uint64_t CallSitePtr = CallSiteTableStart;
+  uint64_t ActionTableStart = CallSiteTableEnd;
+
+  if (opts::PrintExceptions) {
+    outs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n';
+    outs() << "CallSite table length = " << CallSiteTableLength << '\n';
+    outs() << '\n';
+  }
+
+  this->HasEHRanges = CallSitePtr < CallSiteTableEnd;
+  const uint64_t RangeBase = getAddress();
+  while (CallSitePtr < CallSiteTableEnd) {
+    uint64_t Start = *Data.getEncodedPointer(&CallSitePtr, CallSiteEncoding,
+                                             CallSitePtr + LSDASectionAddress);
+    uint64_t Length = *Data.getEncodedPointer(
+        &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress);
+    uint64_t LandingPad = *Data.getEncodedPointer(
+        &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress);
+    uint64_t ActionEntry = Data.getULEB128(&CallSitePtr);
+
+    if (opts::PrintExceptions) {
+      outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start)
+             << ", 0x" << Twine::utohexstr(RangeBase + Start + Length)
+             << "); landing pad: 0x" << Twine::utohexstr(LPStart + LandingPad)
+             << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n";
+      outs() << " current offset is " << (CallSitePtr - CallSiteTableStart)
+             << '\n';
+    }
+
+    // Create a handler entry if necessary.
+    MCSymbol *LPSymbol = nullptr;
+    if (LandingPad) {
+      if (Instructions.find(LandingPad) == Instructions.end()) {
+        if (opts::Verbosity >= 1) {
+          errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LandingPad)
+                 << " not pointing to an instruction in function "
+                 << *this << " - ignoring.\n";
+        }
+      } else {
+        auto Label = Labels.find(LandingPad);
+        if (Label != Labels.end()) {
+          LPSymbol = Label->second;
+        } else {
+          LPSymbol = BC.Ctx->createNamedTempSymbol("LP");
+          Labels[LandingPad] = LPSymbol;
+        }
+      }
+    }
+
+    // Mark all call instructions in the range.
+    auto II = Instructions.find(Start);
+    auto IE = Instructions.end();
+    assert(II != IE && "exception range not pointing to an instruction");
+    do {
+      MCInst &Instruction = II->second;
+      if (BC.MIB->isCall(Instruction) &&
+          !BC.MIB->getConditionalTailCall(Instruction)) {
+        assert(!BC.MIB->isInvoke(Instruction) &&
+               "overlapping exception ranges detected");
+        // Add extra operands to a call instruction making it an invoke from
+        // now on.
+        BC.MIB->addEHInfo(Instruction,
+                          MCPlus::MCLandingPad(LPSymbol, ActionEntry));
+      }
+      ++II;
+    } while (II != IE && II->first < Start + Length);
+
+    if (ActionEntry != 0) {
+      auto printType = [&](int Index, raw_ostream &OS) {
+        assert(Index > 0 && "only positive indices are valid");
+        uint64_t TTEntry = TypeTableStart - Index * TTypeEncodingSize;
+        const uint64_t TTEntryAddress = TTEntry + LSDASectionAddress;
+        uint64_t TypeAddress =
+            *Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress);
+        if ((TTypeEncoding & DW_EH_PE_pcrel) && TypeAddress == TTEntryAddress) {
+          TypeAddress = 0;
+        }
+        if (TypeAddress == 0) {
+          OS << "<all>";
+          return;
+        }
+        if (TTypeEncoding & DW_EH_PE_indirect) {
+          ErrorOr<uint64_t> PointerOrErr = BC.getPointerAtAddress(TypeAddress);
+          assert(PointerOrErr && "failed to decode indirect address");
+          TypeAddress = *PointerOrErr;
+        }
+        if (BinaryData *TypeSymBD = BC.getBinaryDataAtAddress(TypeAddress)) {
+          OS << TypeSymBD->getName();
+        } else {
+          OS << "0x" << Twine::utohexstr(TypeAddress);
+        }
+      };
+      if (opts::PrintExceptions)
+        outs() << " actions: ";
+      uint64_t ActionPtr = ActionTableStart + ActionEntry - 1;
+      int64_t ActionType;
+      int64_t ActionNext;
+      const char *Sep = "";
+      do {
+        ActionType = Data.getSLEB128(&ActionPtr);
+        const uint32_t Self = ActionPtr;
+        ActionNext = Data.getSLEB128(&ActionPtr);
+        if (opts::PrintExceptions)
+          outs() << Sep << "(" << ActionType << ", " << ActionNext << ") ";
+        if (ActionType == 0) {
+          if (opts::PrintExceptions)
+            outs() << "cleanup";
+        } else if (ActionType > 0) {
+          // It's an index into a type table.
+          MaxTypeIndex = std::max(MaxTypeIndex,
+                                  static_cast<unsigned>(ActionType));
+          if (opts::PrintExceptions) {
+            outs() << "catch type ";
+            printType(ActionType, outs());
+          }
+        } else { // ActionType < 0
+          if (opts::PrintExceptions)
+            outs() << "filter exception types ";
+          const char *TSep = "";
+          // ActionType is a negative *byte* offset into the *uleb128-encoded*
+          // table of indices with base 1.
+          // E.g. -1 means offset 0, -2 is offset 1, etc. The indices are
+          // encoded using uleb128, thus we cannot directly dereference them.
+          uint64_t TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1;
+          while (uint64_t Index = Data.getULEB128(&TypeIndexTablePtr)) {
+            MaxTypeIndex = std::max(MaxTypeIndex, static_cast<unsigned>(Index));
+            if (opts::PrintExceptions) {
+              outs() << TSep;
+              printType(Index, outs());
+              TSep = ", ";
+            }
+          }
+          MaxTypeIndexTableOffset =
+              std::max(MaxTypeIndexTableOffset,
+                       TypeIndexTablePtr - TypeIndexTableStart);
+        }
+
+        Sep = "; ";
+
+        ActionPtr = Self + ActionNext;
+      } while (ActionNext);
+      if (opts::PrintExceptions)
+        outs() << '\n';
+    }
+  }
+  if (opts::PrintExceptions)
+    outs() << '\n';
+
+  assert(TypeIndexTableStart + MaxTypeIndexTableOffset <=
+             Data.getData().size() &&
+         "LSDA entry has crossed section boundary");
+
+  if (TTypeEnd) {
+    LSDAActionTable = LSDASectionData.slice(
+        ActionTableStart, TypeIndexTableStart -
+                              MaxTypeIndex * TTypeEncodingSize -
+                              ActionTableStart);
+    for (unsigned Index = 1; Index <= MaxTypeIndex; ++Index) {
+      uint64_t TTEntry = TypeTableStart - Index * TTypeEncodingSize;
+      const uint64_t TTEntryAddress = TTEntry + LSDASectionAddress;
+      uint64_t TypeAddress =
+          *Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress);
+      if ((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress))
+        TypeAddress = 0;
+      if (TTypeEncoding & DW_EH_PE_indirect) {
+        LSDATypeAddressTable.emplace_back(TypeAddress);
+        if (TypeAddress) {
+          ErrorOr<uint64_t> PointerOrErr = BC.getPointerAtAddress(TypeAddress);
+          assert(PointerOrErr && "failed to decode indirect address");
+          TypeAddress = *PointerOrErr;
+        }
+      }
+      LSDATypeTable.emplace_back(TypeAddress);
+    }
+    LSDATypeIndexTable =
+        LSDASectionData.slice(TypeIndexTableStart, MaxTypeIndexTableOffset);
+  }
+}
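Action records in the code above chain through self-relative SLEB128 "next" offsets: ActionPtr = Self + ActionNext, with 0 terminating the list. A compact standalone model of that chaining (plain C++; synthetic one-byte SLEB128 values for brevity):

#include <cstdint>
#include <cstdio>
#include <vector>

// Minimal SLEB128 reader (enough for the tiny values used below).
static int64_t readSLEB128(const std::vector<uint8_t> &Buf, size_t &Off) {
  int64_t Result = 0;
  unsigned Shift = 0;
  uint8_t Byte;
  do {
    Byte = Buf[Off++];
    Result |= int64_t(Byte & 0x7f) << Shift;
    Shift += 7;
  } while (Byte & 0x80);
  if (Shift < 64 && (Byte & 0x40)) // sign-extend
    Result |= -(int64_t(1) << Shift);
  return Result;
}

int main() {
  // Two chained actions: (type 1, next +1) -> (type 2, next 0 = end).
  // Each record is: SLEB128 type index, SLEB128 self-relative next offset.
  const std::vector<uint8_t> ActionTable = {0x01, 0x01, 0x02, 0x00};

  size_t Ptr = 0;
  int64_t Next;
  do {
    int64_t Type = readSLEB128(ActionTable, Ptr);
    size_t Self = Ptr; // offset the 'next' field is relative to
    Next = readSLEB128(ActionTable, Ptr);
    std::printf("action: type %lld, next %lld\n", (long long)Type,
                (long long)Next);
    Ptr = Self + Next; // follow the chain
  } while (Next);
  return 0;
}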
+
+void BinaryFunction::updateEHRanges() {
+  if (getSize() == 0)
+    return;
+
+  assert(CurrentState == State::CFG_Finalized && "unexpected state");
+
+  // Build call sites table.
+  struct EHInfo {
+    const MCSymbol *LP; // landing pad
+    uint64_t Action;
+  };
+
+  // If previous call can throw, this is its exception handler.
+  EHInfo PreviousEH = {nullptr, 0};
+
+  // Marker for the beginning of exceptions range.
+  const MCSymbol *StartRange = nullptr;
+
+  // Indicates whether the start range is located in a cold part.
+  bool IsStartInCold = false;
+
+  // Have we crossed hot/cold border for split functions?
+  bool SeenCold = false;
+
+  // Sites to update - either regular or cold.
+  std::vector<CallSite> *Sites = &CallSites;
+
+  for (BinaryBasicBlock *&BB : BasicBlocksLayout) {
+
+    if (BB->isCold() && !SeenCold) {
+      SeenCold = true;
+
+      // Close the range (if any) and change the target call sites.
+      if (StartRange) {
+        Sites->emplace_back(CallSite{StartRange, getFunctionEndLabel(),
+                                     PreviousEH.LP, PreviousEH.Action});
+      }
+      Sites = &ColdCallSites;
+
+      // Reset the range.
+      StartRange = nullptr;
+      PreviousEH = {nullptr, 0};
+    }
+
+    for (auto II = BB->begin(); II != BB->end(); ++II) {
+      if (!BC.MIB->isCall(*II))
+        continue;
+
+      // Instruction can throw an exception that should be handled.
+      const bool Throws = BC.MIB->isInvoke(*II);
+
+      // Ignore the call if it's a continuation of a no-throw gap.
+      if (!Throws && !StartRange)
+        continue;
+
+      // Extract exception handling information from the instruction.
+      const MCSymbol *LP = nullptr;
+      uint64_t Action = 0;
+      if (const Optional<MCPlus::MCLandingPad> EHInfo = BC.MIB->getEHInfo(*II))
+        std::tie(LP, Action) = *EHInfo;
+
+      // No action if the exception handler has not changed.
+      if (Throws &&
+          StartRange &&
+          PreviousEH.LP == LP &&
+          PreviousEH.Action == Action)
+        continue;
+
+      // Same symbol is used for the beginning and the end of the range.
+      const MCSymbol *EHSymbol;
+      MCInst EHLabel;
+      {
+        std::unique_lock<std::mutex> Lock(BC.CtxMutex);
+        EHSymbol = BC.Ctx->createNamedTempSymbol("EH");
+        BC.MIB->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get());
+      }
+
+      II = std::next(BB->insertPseudoInstr(II, EHLabel));
+
+      // At this point we could be in one of the following states:
+      //
+      // I. Exception handler has changed and we need to close the previous
+      //    range and start a new one.
+      //
+      // II. Start a new exception range after the gap.
+      //
+      // III. Close the current exception range and start a new gap.
+      const MCSymbol *EndRange;
+      if (StartRange) {
+        // I, III:
+        EndRange = EHSymbol;
+      } else {
+        // II:
+        StartRange = EHSymbol;
+        IsStartInCold = SeenCold;
+        EndRange = nullptr;
+      }
+
+      // Close the previous range.
+      if (EndRange) {
+        Sites->emplace_back(CallSite{StartRange, EndRange,
+                                     PreviousEH.LP, PreviousEH.Action});
+      }
+
+      if (Throws) {
+        // I, II:
+        StartRange = EHSymbol;
+        IsStartInCold = SeenCold;
+        PreviousEH = EHInfo{LP, Action};
+      } else {
+        StartRange = nullptr;
+      }
+    }
+  }
+
+  // Check if we need to close the range.
+  if (StartRange) {
+    assert((!isSplit() || Sites == &ColdCallSites) && "sites mismatch");
+    const MCSymbol *EndRange =
+        IsStartInCold ? getFunctionColdEndLabel() : getFunctionEndLabel();
+    Sites->emplace_back(CallSite{StartRange, EndRange,
+                                 PreviousEH.LP, PreviousEH.Action});
+  }
+}
+
+const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0;
+
+CFIReaderWriter::CFIReaderWriter(const DWARFDebugFrame &EHFrame) {
+  // Prepare FDEs for fast lookup.
+  for (const dwarf::FrameEntry &Entry : EHFrame.entries()) {
+    const auto *CurFDE = dyn_cast<dwarf::FDE>(&Entry);
+    // Skip CIEs.
+    if (!CurFDE)
+      continue;
+    // There could be multiple FDEs with the same initial address, and perhaps
+    // different sizes (address ranges). Use the first entry with non-zero
+    // size.
+    auto FDEI = FDEs.lower_bound(CurFDE->getInitialLocation());
+    if (FDEI != FDEs.end() && FDEI->first == CurFDE->getInitialLocation()) {
+      if (CurFDE->getAddressRange()) {
+        if (FDEI->second->getAddressRange() == 0) {
+          FDEI->second = CurFDE;
+        } else if (opts::Verbosity > 0) {
+          errs() << "BOLT-WARNING: different FDEs for function at 0x"
+                 << Twine::utohexstr(FDEI->first)
+                 << " detected; sizes: "
+                 << FDEI->second->getAddressRange() << " and "
+                 << CurFDE->getAddressRange() << '\n';
+        }
+      }
+    } else {
+      FDEs.emplace_hint(FDEI, CurFDE->getInitialLocation(), CurFDE);
+    }
+  }
+}
+
+bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
+  uint64_t Address = Function.getAddress();
+  auto I = FDEs.find(Address);
+  // Ignore zero-length FDE ranges.
+  if (I == FDEs.end() || !I->second->getAddressRange())
+    return true;
+
+  const FDE &CurFDE = *I->second;
+  Optional<uint64_t> LSDA = CurFDE.getLSDAAddress();
+  Function.setLSDAAddress(LSDA ? *LSDA : 0);
+
+  uint64_t Offset = 0;
+  uint64_t CodeAlignment = CurFDE.getLinkedCIE()->getCodeAlignmentFactor();
+  uint64_t DataAlignment = CurFDE.getLinkedCIE()->getDataAlignmentFactor();
+  if (CurFDE.getLinkedCIE()->getPersonalityAddress()) {
+    Function.setPersonalityFunction(
+        *CurFDE.getLinkedCIE()->getPersonalityAddress());
+    Function.setPersonalityEncoding(
+        *CurFDE.getLinkedCIE()->getPersonalityEncoding());
+  }
+
+  auto decodeFrameInstruction =
+      [&Function, &Offset, Address, CodeAlignment, DataAlignment](
+          const CFIProgram::Instruction &Instr) {
+        uint8_t Opcode = Instr.Opcode;
+        if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK)
+          Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK;
+        switch (Instr.Opcode) {
+        case DW_CFA_nop:
+          break;
+        case DW_CFA_advance_loc4:
+        case DW_CFA_advance_loc2:
+        case DW_CFA_advance_loc1:
+        case DW_CFA_advance_loc:
+          // Advance our current address.
+          Offset += CodeAlignment * int64_t(Instr.Ops[0]);
+          break;
+        case DW_CFA_offset_extended_sf:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createOffset(
+                          nullptr, Instr.Ops[0],
+                          DataAlignment * int64_t(Instr.Ops[1])));
+          break;
+        case DW_CFA_offset_extended:
+        case DW_CFA_offset:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createOffset(
+                          nullptr, Instr.Ops[0], DataAlignment * Instr.Ops[1]));
+          break;
+        case DW_CFA_restore_extended:
+        case DW_CFA_restore:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createRestore(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_set_loc:
+          assert(Instr.Ops[0] >= Address && "set_loc out of function bounds");
+          assert(Instr.Ops[0] <= Address + Function.getSize() &&
+                 "set_loc out of function bounds");
+          Offset = Instr.Ops[0] - Address;
+          break;
+
+        case DW_CFA_undefined:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createUndefined(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_same_value:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createSameValue(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_register:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createRegister(nullptr, Instr.Ops[0],
+                                                       Instr.Ops[1]));
+          break;
+        case DW_CFA_remember_state:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createRememberState(nullptr));
+          break;
+        case DW_CFA_restore_state:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createRestoreState(nullptr));
+          break;
+        case DW_CFA_def_cfa:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::cfiDefCfa(nullptr, Instr.Ops[0],
+                                                  Instr.Ops[1]));
+          break;
+        case DW_CFA_def_cfa_sf:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::cfiDefCfa(
+                          nullptr, Instr.Ops[0],
+                          DataAlignment * int64_t(Instr.Ops[1])));
+          break;
+        case DW_CFA_def_cfa_register:
+          Function.addCFIInstruction(
+              Offset,
+              MCCFIInstruction::createDefCfaRegister(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_def_cfa_offset:
+          Function.addCFIInstruction(
+              Offset,
+              MCCFIInstruction::cfiDefCfaOffset(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_def_cfa_offset_sf:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::cfiDefCfaOffset(
+                          nullptr, DataAlignment * int64_t(Instr.Ops[0])));
+          break;
+        case DW_CFA_GNU_args_size:
+          Function.addCFIInstruction(
+              Offset,
+              MCCFIInstruction::createGnuArgsSize(nullptr, Instr.Ops[0]));
+          Function.setUsesGnuArgsSize();
+          break;
+        case DW_CFA_val_offset_sf:
+        case DW_CFA_val_offset:
+          if (opts::Verbosity >= 1) {
+            errs() << "BOLT-WARNING: DWARF val_offset() unimplemented\n";
+          }
+          return false;
+        case DW_CFA_expression:
+        case DW_CFA_def_cfa_expression:
+        case DW_CFA_val_expression: {
+          MCDwarfExprBuilder Builder;
+          for (DWARFExpression::Operation &ExprOp : *Instr.Expression) {
+            const DWARFExpression::Operation::Description &Desc =
+                ExprOp.getDescription();
+            if (Desc.Op[0] == DWARFExpression::Operation::SizeNA) {
+              Builder.appendOperation(ExprOp.getCode());
+            } else if (Desc.Op[1] == DWARFExpression::Operation::SizeNA) {
+              Builder.appendOperation(ExprOp.getCode(),
+                                      ExprOp.getRawOperand(0));
+            } else {
+              Builder.appendOperation(ExprOp.getCode(), ExprOp.getRawOperand(0),
+                                      ExprOp.getRawOperand(1));
+            }
+          }
+          if (Opcode == DW_CFA_expression) {
+            Function.addCFIInstruction(
+                Offset, MCCFIInstruction::createExpression(
+                            nullptr, Instr.Ops[0], Builder.take()));
+          } else if (Opcode == DW_CFA_def_cfa_expression) {
+            Function.addCFIInstruction(
+                Offset, MCCFIInstruction::createDefCfaExpression(
+                            nullptr, Builder.take()));
+          } else {
+            assert(Opcode == DW_CFA_val_expression && "Unexpected opcode");
+            Function.addCFIInstruction(
+                Offset, MCCFIInstruction::createValExpression(
+                            nullptr, Instr.Ops[0], Builder.take()));
+          }
+          break;
+        }
+        case DW_CFA_MIPS_advance_loc8:
+          if (opts::Verbosity >= 1) {
+            errs() << "BOLT-WARNING: DW_CFA_MIPS_advance_loc unimplemented\n";
+          }
+          return false;
+        case DW_CFA_GNU_window_save:
+        case DW_CFA_lo_user:
+        case DW_CFA_hi_user:
+          if (opts::Verbosity >= 1) {
+            errs() << "BOLT-WARNING: DW_CFA_GNU_* and DW_CFA_*_user "
+                      "unimplemented\n";
+          }
+          return false;
+        default:
+          if (opts::Verbosity >= 1) {
+            errs() << "BOLT-WARNING: Unrecognized CFI instruction: "
+                   << Instr.Opcode << '\n';
+          }
+          return false;
+        }
+
+        return true;
+      };
+
+  for (const CFIProgram::Instruction &Instr : CurFDE.getLinkedCIE()->cfis()) {
+    if (!decodeFrameInstruction(Instr))
+      return false;
+  }
+
+  for (const CFIProgram::Instruction &Instr : CurFDE.cfis()) {
+    if (!decodeFrameInstruction(Instr))
+      return false;
+  }
+
+  return true;
+}
+
+std::vector<char> CFIReaderWriter::generateEHFrameHeader(
+    const DWARFDebugFrame &OldEHFrame,
+    const DWARFDebugFrame &NewEHFrame,
+    uint64_t EHFrameHeaderAddress,
+    std::vector<uint64_t> &FailedAddresses) const {
+  // Common PC -> FDE map to be written into .eh_frame_hdr.
+  std::map<uint64_t, uint64_t> PCToFDE;
+
+  // Presort array for binary search.
+  std::sort(FailedAddresses.begin(), FailedAddresses.end());
+
+  // Initialize PCToFDE using NewEHFrame.
+  for (dwarf::FrameEntry &Entry : NewEHFrame.entries()) {
+    const dwarf::FDE *FDE = dyn_cast<dwarf::FDE>(&Entry);
+    if (FDE == nullptr)
+      continue;
+    const uint64_t FuncAddress = FDE->getInitialLocation();
+    const uint64_t FDEAddress =
+        NewEHFrame.getEHFrameAddress() + FDE->getOffset();
+
+    // Ignore unused FDEs.
+    if (FuncAddress == 0)
+      continue;
+
+    // Add the address to the map unless we failed to write it.
+    if (!std::binary_search(FailedAddresses.begin(), FailedAddresses.end(),
+                            FuncAddress)) {
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: FDE for function at 0x"
+                        << Twine::utohexstr(FuncAddress) << " is at 0x"
+                        << Twine::utohexstr(FDEAddress) << '\n');
+      PCToFDE[FuncAddress] = FDEAddress;
+    }
+  };
+
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: new .eh_frame contains "
+                    << std::distance(NewEHFrame.entries().begin(),
+                                     NewEHFrame.entries().end())
+                    << " entries\n");
+
+  // Add entries from the original .eh_frame corresponding to the functions
+  // that we did not update.
+  for (const dwarf::FrameEntry &Entry : OldEHFrame) {
+    const dwarf::FDE *FDE = dyn_cast<dwarf::FDE>(&Entry);
+    if (FDE == nullptr)
+      continue;
+    const uint64_t FuncAddress = FDE->getInitialLocation();
+    const uint64_t FDEAddress =
+        OldEHFrame.getEHFrameAddress() + FDE->getOffset();
+
+    // Add the address if we failed to write it.
+    if (PCToFDE.count(FuncAddress) == 0) {
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: old FDE for function at 0x"
+                        << Twine::utohexstr(FuncAddress) << " is at 0x"
+                        << Twine::utohexstr(FDEAddress) << '\n');
+      PCToFDE[FuncAddress] = FDEAddress;
+    }
+  };
+
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: old .eh_frame contains "
+                    << std::distance(OldEHFrame.entries().begin(),
+                                     OldEHFrame.entries().end())
+                    << " entries\n");
+
+  // Generate a new .eh_frame_hdr based on the new map.
+
+  // Header plus table of entries of size 8 bytes.
+  std::vector<char> EHFrameHeader(12 + PCToFDE.size() * 8);
+
+  // Version is 1.
+  EHFrameHeader[0] = 1;
+  // Encoding of the eh_frame pointer.
+  EHFrameHeader[1] = DW_EH_PE_pcrel | DW_EH_PE_sdata4;
+  // Encoding of the count field to follow.
+  EHFrameHeader[2] = DW_EH_PE_udata4;
+  // Encoding of the table entries - 4-byte offset from the start of the
+  // header.
+  EHFrameHeader[3] = DW_EH_PE_datarel | DW_EH_PE_sdata4;
+
+  // Address of eh_frame. Use the new one.
+  support::ulittle32_t::ref(EHFrameHeader.data() + 4) =
+      NewEHFrame.getEHFrameAddress() - (EHFrameHeaderAddress + 4);
+
+  // Number of entries in the table (FDE count).
+  support::ulittle32_t::ref(EHFrameHeader.data() + 8) = PCToFDE.size();
+
+  // Write the table at offset 12.
+  char *Ptr = EHFrameHeader.data();
+  uint32_t Offset = 12;
+  for (const auto &PCI : PCToFDE) {
+    int64_t InitialPCOffset = PCI.first - EHFrameHeaderAddress;
+    assert(isInt<32>(InitialPCOffset) && "PC offset out of bounds");
+    support::ulittle32_t::ref(Ptr + Offset) = InitialPCOffset;
+    Offset += 4;
+    int64_t FDEOffset = PCI.second - EHFrameHeaderAddress;
+    assert(isInt<32>(FDEOffset) && "FDE offset out of bounds");
+    support::ulittle32_t::ref(Ptr + Offset) = FDEOffset;
+    Offset += 4;
+  }
+
+  return EHFrameHeader;
+}
+
+} // namespace bolt
+} // namespace llvm
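The .eh_frame_hdr produced above is a 12-byte header followed by a binary-search table of (PC offset, FDE offset) pairs, both relative to the header's own address. A standalone sketch that lays out the same structure (plain C++; the addresses are made up, and the DW_EH_PE encoding constants are written as their raw byte values):

#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

// Store a 32-bit little-endian value at Buf[Off..Off+3].
static void putLE32(std::vector<uint8_t> &Buf, size_t Off, uint32_t V) {
  for (int I = 0; I < 4; ++I)
    Buf[Off + I] = (V >> (8 * I)) & 0xff;
}

int main() {
  const uint64_t HdrAddress = 0x500000; // where .eh_frame_hdr will live
  const uint64_t EHFrameAddress = 0x501000;
  // PC -> FDE address map; std::map keeps the table sorted by PC,
  // which the binary-search table format requires.
  const std::map<uint64_t, uint64_t> PCToFDE = {
      {0x400100, 0x501010}, {0x400200, 0x501040}};

  std::vector<uint8_t> Hdr(12 + PCToFDE.size() * 8);
  Hdr[0] = 1;    // version
  Hdr[1] = 0x1b; // DW_EH_PE_pcrel | DW_EH_PE_sdata4: eh_frame pointer
  Hdr[2] = 0x03; // DW_EH_PE_udata4: FDE count encoding
  Hdr[3] = 0x3b; // DW_EH_PE_datarel | DW_EH_PE_sdata4: table entries

  putLE32(Hdr, 4, uint32_t(EHFrameAddress - (HdrAddress + 4))); // pcrel ptr
  putLE32(Hdr, 8, uint32_t(PCToFDE.size()));                    // FDE count

  size_t Off = 12;
  for (const auto &PCI : PCToFDE) { // (PC, FDE), both datarel to the header
    putLE32(Hdr, Off, uint32_t(PCI.first - HdrAddress));
    putLE32(Hdr, Off + 4, uint32_t(PCI.second - HdrAddress));
    Off += 8;
  }
  assert(Off == Hdr.size());
  return 0;
}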
All other entries are taken from the
+  /// \p OldEHFrame.
+  ///
+  /// \p EHFrameHeaderAddress specifies location of .eh_frame_hdr,
+  /// and is required for relative addressing used in the section.
+  std::vector<char> generateEHFrameHeader(
+      const DWARFDebugFrame &OldEHFrame,
+      const DWARFDebugFrame &NewEHFrame,
+      uint64_t EHFrameHeaderAddress,
+      std::vector<uint64_t> &FailedAddresses) const;
+
+  using FDEsMap = std::map<uint64_t, const dwarf::FDE *>;
+
+  const FDEsMap &getFDEs() const {
+    return FDEs;
+  }
+
+private:
+  FDEsMap FDEs;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/ExecutableFileMemoryManager.cpp b/bolt/src/ExecutableFileMemoryManager.cpp
new file mode 100644
index 000000000000..c56cfef094ff
--- /dev/null
+++ b/bolt/src/ExecutableFileMemoryManager.cpp
@@ -0,0 +1,100 @@
+//===--- ExecutableFileMemoryManager.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ExecutableFileMemoryManager.h"
+#include "RewriteInstance.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "efmm"
+
+using namespace llvm;
+using namespace object;
+using namespace bolt;
+
+namespace llvm {
+
+namespace bolt {
+
+uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size,
+                                                      unsigned Alignment,
+                                                      unsigned SectionID,
+                                                      StringRef SectionName,
+                                                      bool IsCode,
+                                                      bool IsReadOnly) {
+  // Register a debug section as a note section.
+  if (!ObjectsLoaded && RewriteInstance::isDebugSection(SectionName)) {
+    uint8_t *DataCopy = new uint8_t[Size];
+    BinarySection &Section =
+        BC.registerOrUpdateNoteSection(SectionName, DataCopy, Size, Alignment);
+    Section.setSectionID(SectionID);
+    assert(!Section.isAllocatable() && "note sections cannot be allocatable");
+    return DataCopy;
+  }
+
+  if (!IsCode &&
+      (SectionName == ".strtab" ||
+       SectionName == ".symtab" ||
+       SectionName == "" ||
+       SectionName.startswith(".rela."))) {
+    return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID,
+                                                     SectionName, IsReadOnly);
+  }
+
+  uint8_t *Ret;
+  if (IsCode) {
+    Ret = SectionMemoryManager::allocateCodeSection(Size, Alignment,
+                                                    SectionID, SectionName);
+  } else {
+    Ret = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID,
+                                                    SectionName, IsReadOnly);
+  }
+
+  SmallVector<char, 256> Buf;
+  if (ObjectsLoaded > 0) {
+    if (BC.isELF()) {
+      SectionName = (Twine(SectionName) + ".bolt.extra." + Twine(ObjectsLoaded))
+                        .toStringRef(Buf);
+    } else if (BC.isMachO()) {
+      assert((SectionName == "__text" || SectionName == "__data" ||
+              SectionName == "__fini" || SectionName == "__setup" ||
+              SectionName == "__cstring" || SectionName == "__literal16") &&
+             "Unexpected section in the instrumentation library");
+      // Sections coming from the instrumentation runtime are prefixed with "I".
+      SectionName = ("I" + Twine(SectionName)).toStringRef(Buf);
+    }
+  }
+
+  BinarySection &Section = BC.registerOrUpdateSection(
+      SectionName, ELF::SHT_PROGBITS,
+      BinarySection::getFlags(IsReadOnly, IsCode, true), Ret, Size, Alignment);
+  Section.setSectionID(SectionID);
+  assert(Section.isAllocatable() &&
+         "verify that allocatable is marked as allocatable");
+
+  LLVM_DEBUG(
+      dbgs() << "BOLT: allocating "
+             << (IsCode ? "code" : (IsReadOnly ?
"read-only data" : "data")) + << " section : " << SectionName << " with size " << Size + << ", alignment " << Alignment << " at 0x" << Ret + << ", ID = " << SectionID << "\n"); + return Ret; +} + +bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { + LLVM_DEBUG(dbgs() << "BOLT: finalizeMemory()\n"); + ++ObjectsLoaded; + return SectionMemoryManager::finalizeMemory(ErrMsg); +} + +ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { } + +} + +} diff --git a/bolt/src/ExecutableFileMemoryManager.h b/bolt/src/ExecutableFileMemoryManager.h new file mode 100644 index 000000000000..1b52162b4173 --- /dev/null +++ b/bolt/src/ExecutableFileMemoryManager.h @@ -0,0 +1,71 @@ +//===--- ExecutableFileMemoryManager.h ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H +#define LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include +#include + +namespace llvm { + +namespace bolt { +class BinaryContext; + +/// Class responsible for allocating and managing code and data sections. +class ExecutableFileMemoryManager : public SectionMemoryManager { +private: + uint8_t *allocateSection(intptr_t Size, + unsigned Alignment, + unsigned SectionID, + StringRef SectionName, + bool IsCode, + bool IsReadOnly); + BinaryContext &BC; + bool AllowStubs; + +public: + // Our linker's main purpose is to handle a single object file, created + // by RewriteInstance after reading the input binary and reordering it. + // After objects finish loading, we increment this. Therefore, whenever + // this is greater than zero, we are dealing with additional objects that + // will not be managed by BinaryContext but only exist to support linking + // user-supplied objects into the main input executable. + uint32_t ObjectsLoaded{0}; + + ExecutableFileMemoryManager(BinaryContext &BC, bool AllowStubs) + : BC(BC), AllowStubs(AllowStubs) {} + + ~ExecutableFileMemoryManager(); + + uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, + StringRef SectionName) override { + return allocateSection(Size, Alignment, SectionID, SectionName, + /*IsCode=*/true, true); + } + + uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, StringRef SectionName, + bool IsReadOnly) override { + return allocateSection(Size, Alignment, SectionID, SectionName, + /*IsCode=*/false, IsReadOnly); + } + bool allowStubAllocation() const override { return AllowStubs; } + + bool finalizeMemory(std::string *ErrMsg = nullptr) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/src/Heatmap.cpp b/bolt/src/Heatmap.cpp new file mode 100644 index 000000000000..c9b497b820df --- /dev/null +++ b/bolt/src/Heatmap.cpp @@ -0,0 +1,280 @@ +//===-- Heatmap.cpp ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Heatmap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#define DEBUG_TYPE "bolt-heatmap"
+
+using namespace llvm;
+
+namespace opts {
+
+extern cl::SubCommand HeatmapCommand;
+
+static cl::opt<unsigned>
+BucketsPerLine("line-size",
+  cl::desc("number of entries per line (default 256)"),
+  cl::init(256),
+  cl::Optional,
+  cl::sub(HeatmapCommand));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+void Heatmap::registerAddressRange(uint64_t StartAddress, uint64_t EndAddress,
+                                   uint64_t Count) {
+  if (ignoreAddress(StartAddress)) {
+    ++NumSkippedRanges;
+    return;
+  }
+
+  if (StartAddress > EndAddress ||
+      EndAddress - StartAddress > 64 * 1024) {
+    LLVM_DEBUG(dbgs() << "invalid range : 0x" << Twine::utohexstr(StartAddress)
+                      << " -> 0x" << Twine::utohexstr(EndAddress) << '\n');
+    ++NumSkippedRanges;
+    return;
+  }
+
+  for (uint64_t Bucket = StartAddress / BucketSize;
+       Bucket <= EndAddress / BucketSize; ++Bucket) {
+    Map[Bucket] += Count;
+  }
+}
+
+void Heatmap::print(StringRef FileName) const {
+  std::error_code EC;
+  raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::OF_None);
+  if (EC) {
+    errs() << "error opening output file: " << EC.message() << '\n';
+    exit(1);
+  }
+  print(OS);
+}
+
+void Heatmap::print(raw_ostream &OS) const {
+  const char FillChar = '.';
+
+  const auto DefaultColor = raw_ostream::WHITE;
+  auto changeColor = [&](raw_ostream::Colors Color) -> void {
+    static auto CurrentColor = raw_ostream::BLACK;
+    if (CurrentColor == Color)
+      return;
+    OS.changeColor(Color);
+    CurrentColor = Color;
+  };
+
+  const uint64_t BytesPerLine = opts::BucketsPerLine * BucketSize;
+
+  // Calculate the max value for scaling.
+  uint64_t MaxValue = 0;
+  for (const std::pair<const uint64_t, uint64_t> &Entry : Map) {
+    MaxValue = std::max(MaxValue, Entry.second);
+  }
+
+  // Print start of the line and fill it with an empty space right before
+  // the Address.
+  auto startLine = [&](uint64_t Address, bool Empty = false) {
+    changeColor(DefaultColor);
+    const uint64_t LineAddress = Address / BytesPerLine * BytesPerLine;
+
+    if (MaxAddress > 0xffffffff)
+      OS << format("0x%016" PRIx64 ": ", LineAddress);
+    else
+      OS << format("0x%08" PRIx64 ": ", LineAddress);
+
+    if (Empty)
+      Address = LineAddress + BytesPerLine;
+    for (uint64_t Fill = LineAddress; Fill < Address; Fill += BucketSize) {
+      OS << FillChar;
+    }
+  };
+
+  // Finish line after \p Address was printed.
+  auto finishLine = [&](uint64_t Address) {
+    const uint64_t End = alignTo(Address + 1, BytesPerLine);
+    for (uint64_t Fill = Address + BucketSize; Fill < End; Fill += BucketSize)
+      OS << FillChar;
+    OS << '\n';
+  };
+
+  // Fill empty space in (Start, End) range.
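+  // A gap that stays within the current line is padded with FillChar. A gap
+  // spanning whole lines prints them as empty lines, except that a run of
+  // more than 32 empty lines is collapsed into a single blank line.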
+  auto fillRange = [&](uint64_t Start, uint64_t End) {
+    if ((Start / BytesPerLine) == (End / BytesPerLine)) {
+      for (uint64_t Fill = Start + BucketSize; Fill < End; Fill += BucketSize) {
+        changeColor(DefaultColor);
+        OS << FillChar;
+      }
+      return;
+    }
+
+    changeColor(DefaultColor);
+    finishLine(Start);
+    Start = alignTo(Start, BytesPerLine);
+
+    uint64_t NumEmptyLines = (End - Start) / BytesPerLine;
+
+    if (NumEmptyLines > 32) {
+      OS << '\n';
+    } else {
+      while (NumEmptyLines--) {
+        startLine(Start, /*Empty=*/true);
+        OS << '\n';
+        Start += BytesPerLine;
+      }
+    }
+
+    startLine(End);
+  };
+
+  static raw_ostream::Colors Colors[] = {
+    raw_ostream::WHITE,
+    raw_ostream::WHITE,
+    raw_ostream::CYAN,
+    raw_ostream::GREEN,
+    raw_ostream::YELLOW,
+    raw_ostream::RED
+  };
+  constexpr size_t NumRanges = sizeof(Colors) / sizeof(Colors[0]);
+
+  uint64_t Range[NumRanges];
+  for (uint64_t I = 0; I < NumRanges; ++I)
+    Range[I] = std::max(I + 1,
+                        (uint64_t) std::pow((double) MaxValue,
+                                            (double) (I + 1) / NumRanges));
+  Range[NumRanges - 1] = std::max((uint64_t) NumRanges, MaxValue);
+
+  // Print scaled value.
+  auto printValue = [&](uint64_t Value, bool ResetColor = false) {
+    assert(Value && "should only print positive values");
+    for (unsigned I = 0; I < NumRanges; ++I) {
+      if (Value <= Range[I]) {
+        changeColor(Colors[I]);
+        break;
+      }
+    }
+    if (Value <= Range[0]) {
+      OS << 'o';
+    } else {
+      OS << 'O';
+    }
+    if (ResetColor)
+      changeColor(DefaultColor);
+  };
+
+  // Print against black background.
+  OS.changeColor(raw_ostream::BLACK, /*Bold=*/false, /*Background=*/true);
+  changeColor(DefaultColor);
+
+  // Print map legend.
+  OS << "Legend:\n";
+  uint64_t PrevValue = 0;
+  for (unsigned I = 0; I < NumRanges; ++I) {
+    const uint64_t Value = Range[I];
+    OS << "  ";
+    printValue(Value, true);
+    OS << " : (" << PrevValue << ", " << Value << "]\n";
+    PrevValue = Value;
+  }
+
+  // Pos - character position from right in hex form.
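+  // The calls below print a vertical hex ruler over the buckets of a line:
+  // row Pos carries the Pos-th hex digit (counted from the right) of each
+  // bucket's byte offset within the line, and a digit is printed only when
+  // it changes from the previous column.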
+  auto printHeader = [&](unsigned Pos) {
+    OS << "            ";
+    if (MaxAddress > 0xffffffff)
+      OS << "        ";
+    unsigned PrevValue = unsigned(-1);
+    for (unsigned I = 0; I < BytesPerLine; I += BucketSize) {
+      const unsigned Value = (I & ((1 << Pos * 4) - 1)) >> (Pos - 1) * 4;
+      if (Value != PrevValue) {
+        OS << Twine::utohexstr(Value);
+        PrevValue = Value;
+      } else {
+        OS << ' ';
+      }
+    }
+    OS << '\n';
+  };
+  for (unsigned I = 5; I > 0; --I)
+    printHeader(I);
+
+  uint64_t PrevAddress = 0;
+  for (auto MI = Map.begin(), ME = Map.end(); MI != ME; ++MI) {
+    const std::pair<const uint64_t, uint64_t> &Entry = *MI;
+    uint64_t Address = Entry.first * BucketSize;
+
+    if (PrevAddress) {
+      fillRange(PrevAddress, Address);
+    } else {
+      startLine(Address);
+    }
+
+    printValue(Entry.second);
+
+    PrevAddress = Address;
+  }
+
+  if (PrevAddress) {
+    changeColor(DefaultColor);
+    finishLine(PrevAddress);
+  }
+}
+
+void Heatmap::printCDF(StringRef FileName) const {
+  std::error_code EC;
+  raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::OF_None);
+  if (EC) {
+    errs() << "error opening output file: " << EC.message() << '\n';
+    exit(1);
+  }
+  printCDF(OS);
+}
+
+void Heatmap::printCDF(raw_ostream &OS) const {
+  uint64_t NumTotalCounts = 0;
+  std::vector<uint64_t> Counts;
+
+  for (const std::pair<const uint64_t, uint64_t> &KV : Map) {
+    Counts.push_back(KV.second);
+    NumTotalCounts += KV.second;
+  }
+
+  std::sort(Counts.begin(), Counts.end(), std::greater<uint64_t>());
+
+  double RatioLeftInKB = (1.0 * BucketSize) / 1024;
+  assert(NumTotalCounts > 0 &&
+         "total number of heatmap buckets should be greater than 0");
+  double RatioRightInPercent = 100.0 / NumTotalCounts;
+  uint64_t RunningCount = 0;
+
+  OS << "Bucket counts, Size (KB), CDF (%)\n";
+  for (uint64_t I = 0; I < Counts.size(); I++) {
+    RunningCount += Counts[I];
+    OS << format("%llu", (I + 1)) << ", "
+       << format("%.4f", RatioLeftInKB * (I + 1)) << ", "
+       << format("%.4f", RatioRightInPercent * (RunningCount)) << "\n";
+  }
+
+  Counts.clear();
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/Heatmap.h b/bolt/src/Heatmap.h
new file mode 100644
index 000000000000..71144119c62e
--- /dev/null
+++ b/bolt/src/Heatmap.h
@@ -0,0 +1,81 @@
+//===-- Heatmap.h -----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_HEATMAP_H
+#define LLVM_TOOLS_LLVM_BOLT_HEATMAP_H
+
+#include "llvm/ADT/StringRef.h"
+#include <limits>
+#include <map>
+
+namespace llvm {
+class raw_ostream;
+
+namespace bolt {
+
+class Heatmap {
+  /// Number of bytes per entry in the heat map.
+  size_t BucketSize;
+
+  /// Minimum address that is considered to be valid.
+  uint64_t MinAddress;
+
+  /// Maximum address that is considered to be valid.
+  uint64_t MaxAddress;
+
+  /// Count invalid ranges.
+  uint64_t NumSkippedRanges{0};
+
+  /// Map buckets to the number of samples.
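+  /// Keyed by Address / BucketSize; the mapped value is the accumulated
+  /// sample count for that bucket.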
+  std::map<uint64_t, uint64_t> Map;
+
+public:
+  explicit Heatmap(uint64_t BucketSize = 4096,
+                   uint64_t MinAddress = 0,
+                   uint64_t MaxAddress = std::numeric_limits<uint64_t>::max())
+      : BucketSize(BucketSize), MinAddress(MinAddress), MaxAddress(MaxAddress)
+  {}
+
+  inline bool ignoreAddress(uint64_t Address) const {
+    return (Address > MaxAddress) || (Address < MinAddress);
+  }
+
+  /// Register a single sample at \p Address.
+  void registerAddress(uint64_t Address) {
+    if (!ignoreAddress(Address))
+      ++Map[Address / BucketSize];
+  }
+
+  /// Register \p Count samples at [\p StartAddress, \p EndAddress].
+  void registerAddressRange(uint64_t StartAddress, uint64_t EndAddress,
+                            uint64_t Count);
+
+  /// Return the number of ranges that failed to register.
+  uint64_t getNumInvalidRanges() const {
+    return NumSkippedRanges;
+  }
+
+  void print(StringRef FileName) const;
+
+  void print(raw_ostream &OS) const;
+
+  void printCDF(StringRef FileName) const;
+
+  void printCDF(raw_ostream &OS) const;
+
+  size_t size() const {
+    return Map.size();
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/JumpTable.cpp b/bolt/src/JumpTable.cpp
new file mode 100644
index 000000000000..700c9bf881a2
--- /dev/null
+++ b/bolt/src/JumpTable.cpp
@@ -0,0 +1,128 @@
+//===--- JumpTable.cpp - Representation of a jump table -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "JumpTable.h"
+#include "BinaryFunction.h"
+#include "BinarySection.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+using JumpTable = bolt::JumpTable;
+
+namespace opts {
+extern cl::opt<JumpTableSupportLevel> JumpTables;
+extern cl::opt<unsigned> Verbosity;
+}
+
+bolt::JumpTable::JumpTable(MCSymbol &Symbol, uint64_t Address, size_t EntrySize,
+                           JumpTableType Type, LabelMapType &&Labels,
+                           BinaryFunction &BF, BinarySection &Section)
+    : BinaryData(Symbol, Address, 0, EntrySize, Section), EntrySize(EntrySize),
+      OutputEntrySize(EntrySize), Type(Type), Labels(Labels), Parent(&BF) {}
+
+std::pair<size_t, size_t>
+bolt::JumpTable::getEntriesForAddress(const uint64_t Addr) const {
+  // Check if this is not an address, but a cloned JT id.
+  if ((int64_t)Addr < 0ll)
+    return std::make_pair(0, Entries.size());
+
+  const uint64_t InstOffset = Addr - getAddress();
+  size_t StartIndex = 0, EndIndex = 0;
+  uint64_t Offset = 0;
+
+  for (size_t I = 0; I < Entries.size(); ++I) {
+    auto LI = Labels.find(Offset);
+    if (LI != Labels.end()) {
+      const auto NextLI = std::next(LI);
+      const uint64_t NextOffset =
+          NextLI == Labels.end() ?
getSize() : NextLI->first;
+      if (InstOffset >= LI->first && InstOffset < NextOffset) {
+        StartIndex = I;
+        EndIndex = I;
+        while (Offset < NextOffset) {
+          ++EndIndex;
+          Offset += EntrySize;
+        }
+        break;
+      }
+    }
+    Offset += EntrySize;
+  }
+
+  return std::make_pair(StartIndex, EndIndex);
+}
+
+bool bolt::JumpTable::replaceDestination(uint64_t JTAddress,
+                                         const MCSymbol *OldDest,
+                                         MCSymbol *NewDest) {
+  bool Patched = false;
+  const std::pair<size_t, size_t> Range = getEntriesForAddress(JTAddress);
+  for (auto I = &Entries[Range.first], E = &Entries[Range.second]; I != E;
+       ++I) {
+    MCSymbol *&Entry = *I;
+    if (Entry == OldDest) {
+      Patched = true;
+      Entry = NewDest;
+    }
+  }
+  return Patched;
+}
+
+void bolt::JumpTable::updateOriginal() {
+  BinaryContext &BC = getSection().getBinaryContext();
+  const uint64_t BaseOffset = getAddress() - getSection().getAddress();
+  uint64_t EntryOffset = BaseOffset;
+  for (MCSymbol *Entry : Entries) {
+    const uint64_t RelType =
+        Type == JTT_NORMAL ? ELF::R_X86_64_64 : ELF::R_X86_64_PC32;
+    const uint64_t RelAddend =
+        Type == JTT_NORMAL ? 0 : EntryOffset - BaseOffset;
+    // Replace existing relocation with the new one to allow any modifications
+    // to the original jump table.
+    if (BC.HasRelocations)
+      getOutputSection().removeRelocationAt(EntryOffset);
+    getOutputSection().addRelocation(EntryOffset, Entry, RelType, RelAddend);
+    EntryOffset += EntrySize;
+  }
+}
+
+void bolt::JumpTable::print(raw_ostream &OS) const {
+  uint64_t Offset = 0;
+  if (Type == JTT_PIC)
+    OS << "PIC ";
+  OS << "Jump table " << getName() << " for function " << *Parent << " at 0x"
+     << Twine::utohexstr(getAddress()) << " with a total count of " << Count
+     << ":\n";
+  for (const uint64_t EntryOffset : OffsetEntries) {
+    OS << "  0x" << Twine::utohexstr(EntryOffset) << '\n';
+  }
+  for (const MCSymbol *Entry : Entries) {
+    auto LI = Labels.find(Offset);
+    if (Offset && LI != Labels.end()) {
+      OS << "Jump Table " << LI->second->getName() << " at 0x"
+         << Twine::utohexstr(getAddress() + Offset)
+         << " (possibly part of larger jump table):\n";
+    }
+    OS << format("  0x%04" PRIx64 " : ", Offset) << Entry->getName();
+    if (!Counts.empty()) {
+      OS << " : " << Counts[Offset / EntrySize].Mispreds
+         << "/" << Counts[Offset / EntrySize].Count;
+    }
+    OS << '\n';
+    Offset += EntrySize;
+  }
+  OS << "\n\n";
+}
diff --git a/bolt/src/JumpTable.h b/bolt/src/JumpTable.h
new file mode 100644
index 000000000000..40619972c2fe
--- /dev/null
+++ b/bolt/src/JumpTable.h
@@ -0,0 +1,131 @@
+//===--- JumpTable.h - Representation of a jump table ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_JUMP_TABLE_H
+#define LLVM_TOOLS_LLVM_BOLT_JUMP_TABLE_H
+
+#include "BinaryData.h"
+#include <map>
+#include <vector>
+
+namespace llvm {
+class MCSymbol;
+class raw_ostream;
+
+namespace bolt {
+
+enum JumpTableSupportLevel : char {
+  JTS_NONE = 0,       /// Disable jump tables support.
+  JTS_BASIC = 1,      /// Enable basic jump tables support (in-place).
+  JTS_MOVE = 2,       /// Move jump tables to a separate section.
+  JTS_SPLIT = 3,      /// Enable hot/cold splitting of jump tables.
+  JTS_AGGRESSIVE = 4, /// Aggressive splitting of jump tables.
+};
+
+class BinaryFunction;
+
+/// Representation of a jump table.
+///
+/// The jump table may include other jump tables that are referenced by
+/// a different label at a different offset in this jump table.
+class JumpTable : public BinaryData {
+  friend class BinaryContext;
+
+  JumpTable() = delete;
+  JumpTable(const JumpTable &) = delete;
+  JumpTable &operator=(const JumpTable &) = delete;
+
+public:
+  enum JumpTableType : char {
+    JTT_NORMAL,
+    JTT_PIC,
+  };
+
+  /// Branch statistics for jump table entries.
+  struct JumpInfo {
+    uint64_t Mispreds{0};
+    uint64_t Count{0};
+  };
+
+  /// Size of the entry used for storage.
+  size_t EntrySize;
+
+  /// Size of the entry we will write (we may use a more compact layout).
+  size_t OutputEntrySize;
+
+  /// The type of this jump table.
+  JumpTableType Type;
+
+  /// All the entries as labels.
+  std::vector<MCSymbol *> Entries;
+
+  /// All the entries as offsets into a function. Invalid after CFG is built.
+  using OffsetsType = std::vector<uint64_t>;
+  OffsetsType OffsetEntries;
+
+  /// Map ->