diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000000..6b8710a711f3 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.git diff --git a/.github/workflows/Dockerfile b/.github/workflows/Dockerfile new file mode 100644 index 000000000000..e08a623f6dbd --- /dev/null +++ b/.github/workflows/Dockerfile @@ -0,0 +1,31 @@ +FROM ubuntu:20.04 AS builder + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=UTC + +RUN apt-get update && \ + apt-get install -y --no-install-recommends ca-certificates git \ + build-essential cmake ninja-build python3 libjemalloc-dev && \ + rm -rf /var/lib/apt/lists + +WORKDIR /home/bolt + +COPY . llvm-bolt + +WORKDIR build + +RUN cmake -G Ninja ../llvm-bolt/llvm \ + -DLLVM_ENABLE_PROJECTS="bolt" \ + -DLLVM_TARGETS_TO_BUILD="X86;AArch64" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DCMAKE_EXE_LINKER_FLAGS="-Wl,--push-state -Wl,-whole-archive -ljemalloc_pic -Wl,--pop-state -lpthread -lstdc++ -lm -ldl" \ + -DCMAKE_INSTALL_PREFIX=/home/bolt/install + +RUN ninja check-bolt && \ + ninja install-llvm-bolt install-perf2bolt install-merge-fdata \ + install-llvm-boltdiff install-bolt_rt + +FROM ubuntu:20.04 + +COPY --from=builder /home/bolt/install /usr/local diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml new file mode 100644 index 000000000000..21c505d85c0a --- /dev/null +++ b/.github/workflows/docker-image.yml @@ -0,0 +1,16 @@ +name: Docker Image CI + +on: + push: + branches: [ rebased ] + pull_request: + branches: [ rebased ] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Build the Docker image and test + run: docker build . --file .github/workflows/Dockerfile --tag ubuntu-bolt:$(date +%s) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt new file mode 100644 index 000000000000..4f5664d7abc6 --- /dev/null +++ b/bolt/CMakeLists.txt @@ -0,0 +1,29 @@ +include(ExternalProject) + +set(BOLT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(CMAKE_CXX_STANDARD 14) + +ExternalProject_Add(bolt_rt + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/runtime" + STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-stamps + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} + -DCMAKE_INSTALL_PREFIX=${LLVM_BINARY_DIR} + # You might want to set this to True if actively developing bolt_rt, otherwise + # cmake will not rebuild it after source code changes + BUILD_ALWAYS True + ) + +install(CODE "execute_process\(COMMAND \${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=\${CMAKE_INSTALL_PREFIX} -P ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/cmake_install.cmake \)" + COMPONENT bolt_rt) + +add_llvm_install_targets(install-bolt_rt + DEPENDS bolt_rt + COMPONENT bolt_rt) + +add_subdirectory(src) +add_subdirectory(test) diff --git a/bolt/LICENSE.TXT b/bolt/LICENSE.TXT new file mode 100644 index 000000000000..fa6ac5400070 --- /dev/null +++ b/bolt/LICENSE.TXT @@ -0,0 +1,279 @@ +============================================================================== +The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: +============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + +---- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + +============================================================================== +Software from third parties included in the LLVM Project: +============================================================================== +The LLVM Project contains third party software which is under different license +terms. All such code will be identified clearly using at least one of two +mechanisms: +1) It will be in a separate directory tree with its own `LICENSE.txt` or + `LICENSE` file at the top containing the specific license and restrictions + which apply to that software, or +2) It will contain specific license and restriction terms at the top of every + file. + +============================================================================== +Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): +============================================================================== +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2019 University of Illinois at Urbana-Champaign. +All rights reserved. + +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. 
+
diff --git a/bolt/README.md b/bolt/README.md
new file mode 100644
index 000000000000..ffa2119fd501
--- /dev/null
+++ b/bolt/README.md
@@ -0,0 +1,221 @@
+# BOLT
+
+BOLT is a post-link optimizer developed to speed up large applications.
+It achieves improvements by optimizing the application's code layout based on
+an execution profile gathered by a sampling profiler, such as the Linux `perf`
+tool. An overview of the ideas implemented in BOLT along with a discussion of
+its potential and current results is available in the
+[CGO'19 paper](https://research.fb.com/publications/bolt-a-practical-binary-optimizer-for-data-centers-and-beyond/).
+
+## Input Binary Requirements
+
+BOLT operates on X86-64 and AArch64 ELF binaries. At a minimum, the binaries
+should have an unstripped symbol table, and, to get maximum performance gains,
+they should be linked with relocations (`--emit-relocs` or `-q` linker flag).
+
+BOLT disassembles functions and reconstructs the control flow graph (CFG)
+before it runs optimizations. Since this is a nontrivial task,
+especially when indirect branches are present, we rely on certain heuristics
+to accomplish it. These heuristics have been tested on code generated with
+the Clang and GCC compilers. The main requirement for C/C++ code is that it
+does not rely on code layout properties, such as function pointer deltas.
+Assembly code can be processed too. Requirements for it include a clear
+separation of code and data, with data objects being placed into data
+sections/segments. If indirect jumps are used for intra-function control
+transfer (e.g., jump tables), the code patterns should match those
+generated by Clang/GCC.
+
+NOTE: BOLT is currently incompatible with the `-freorder-blocks-and-partition`
+compiler option. Since GCC8 enables this option by default, you have to
+explicitly disable it by adding the `-fno-reorder-blocks-and-partition` flag
+if you are compiling with GCC8.
+
+PIE and .so support has been added recently. Please report bugs if you
+encounter any issues.
+
+## Installation
+
+### Docker Image
+
+You can build and use the Docker image containing BOLT using our [docker file](./utils/docker/Dockerfile).
+Alternatively, you can build BOLT manually using the steps below.
+
+### Manual Build
+
+BOLT heavily uses LLVM libraries, and by design, it is built as one of the
+LLVM tools. The build process is not much different from a regular LLVM build.
+The following instructions assume that you are running under Linux.
+
+Start by cloning the LLVM and BOLT repos:
+
+```
+> git clone https://github.com/llvm-mirror/llvm llvm
+> cd llvm/tools
+> git checkout -b llvm-bolt f137ed238db11440f03083b1c88b7ffc0f4af65e
+> git clone https://github.com/facebookincubator/BOLT llvm-bolt
+> cd ..
+> patch -p 1 < tools/llvm-bolt/llvm.patch
+```
+
+Proceed to a normal LLVM build using a compiler with C++11 support (for GCC
+use version 4.9 or later):
+
+```
+> cd ..
+> mkdir build
+> cd build
+> cmake -G Ninja ../llvm -DLLVM_TARGETS_TO_BUILD="X86;AArch64" -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON
+> ninja
+```
+
+`llvm-bolt` will be available under `bin/`. Add this directory to your path to
+ensure the rest of the commands in this tutorial work.
+
+Note that we use a specific revision of LLVM as we currently rely on a set of
+patches that are not yet upstreamed.
+
+## Optimizing BOLT's Performance
+
+BOLT runs many internal passes in parallel.
+If you foresee heavy usage of
+BOLT, you can improve the processing time by linking against one of the
+memory allocation libraries with good support for concurrency. E.g., to use
+jemalloc:
+
+```
+> sudo yum install jemalloc-devel
+> LD_PRELOAD=/usr/lib64/libjemalloc.so llvm-bolt ....
+```
+Or, if you'd rather use tcmalloc:
+```
+> sudo yum install gperftools-devel
+> LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so llvm-bolt ....
+```
+
+## Usage
+
+For a complete practical guide to using BOLT, see [Optimizing Clang with BOLT](./docs/OptimizingClang.md).
+
+### Step 0
+
+In order to allow BOLT to re-arrange functions (in addition to re-arranging
+code within functions) in your program, it needs a little help from the linker.
+Add `--emit-relocs` to the final link step of your application. You can verify
+the presence of relocations by checking for the `.rela.text` section in the
+binary. BOLT will also report if it detects relocations while processing the
+binary.
+
+### Step 1: Collect Profile
+
+This step is different for different kinds of executables. If you can invoke
+your program to run on a representative input from a command line, then check
+the **For Applications** section below. If your program typically runs as a
+server/service, then skip to the **For Services** section.
+
+The version of the `perf` command used for the following steps has to support
+the `-F brstack` option. We recommend using `perf` version 4.5 or later.
+
+#### For Applications
+
+This assumes you can run your program from a command line with a typical input.
+In this case, simply prepend the command line invocation with `perf`:
+```
+$ perf record -e cycles:u -j any,u -o perf.data -- <executable> <args> ...
+```
+
+#### For Services
+
+Once you get the service deployed and warmed-up, it is time to collect perf
+data with LBR (branch information). The exact perf command to use will depend
+on the service. E.g., to collect the data for all processes running on the
+server for the next 3 minutes use:
+```
+$ perf record -e cycles:u -j any,u -a -o perf.data -- sleep 180
+```
+
+Depending on the application, you may need more samples to be included with
+your profile. It's hard to tell upfront what would be a sweet spot for your
+application. We recommend that the profile cover 1B instructions as reported
+by BOLT's `-dyno-stats` option. If you need to increase the number of samples
+in the profile, you can either run the `sleep` command for longer or use the
+`-F` option with `perf` to increase the sampling frequency.
+
+Note that for profile collection we recommend using cycle events and not
+`BR_INST_RETIRED.*`. Empirically, we found them to produce better results.
+
+If the collection of a profile with branches is not available, e.g., when you
+run on a VM or on hardware that does not support it, then you can use only
+sample events, such as cycles. In this case, the quality of the profile
+information would not be as good, and performance gains with BOLT are
+expected to be lower.
+
+#### With instrumentation
+
+If `perf record` is not available to you, you may collect a profile by first
+instrumenting the binary with BOLT and then running it.
+```
+llvm-bolt <executable> -instrument -o <instrumented-executable>
+```
+
+After you run the instrumented executable with the desired workload, its BOLT
+profile should be ready for you in `/tmp/prof.fdata` and you can skip
+**Step 2**.
+
+Run BOLT with the `-help` option and check the category "BOLT instrumentation
+options" for a quick reference on instrumentation knobs.
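+
+Putting these pieces together, an end-to-end instrumentation-based flow might
+look like the sketch below (`my-app` and its workload are hypothetical
+placeholders; the flags are the ones described in this document):
+
+```
+$ llvm-bolt my-app -instrument -o my-app.instrumented
+$ ./my-app.instrumented <representative workload>
+$ llvm-bolt my-app -o my-app.bolt -data=/tmp/prof.fdata -reorder-blocks=cache+
+```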
+
+### Step 2: Convert Profile to BOLT Format
+
+NOTE: you can skip this step and feed `perf.data` directly to BOLT using the
+experimental `-p perf.data` option.
+
+For this step, you will need the `perf.data` file collected in the previous
+step and a copy of the binary that was running. The binary has to be either
+unstripped or have its symbol table intact (i.e., running `strip -g` is
+okay).
+
+Make sure `perf` is in your `PATH`, and execute `perf2bolt`:
+```
+$ perf2bolt <executable> -p perf.data -o perf.fdata
+```
+
+This command will aggregate branch data from `perf.data` and store it in a
+format that is both more compact and more resilient to binary modifications.
+
+If the profile was collected without LBRs, you will need to add the `-nl` flag
+to the command line above.
+
+### Step 3: Optimize with BOLT
+
+Once you have `perf.fdata` ready, you can use it for optimizations with
+BOLT. Assuming your environment is set up to include the right path, execute
+`llvm-bolt`:
+```
+$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=cache+ -reorder-functions=hfsort -split-functions=2 -split-all-cold -split-eh -dyno-stats
+```
+
+If you need updated debug info, then add the `-update-debug-sections` option
+to the command above. The processing time will be slightly longer.
+
+For a full list of options see `-help`/`-help-hidden` output.
+
+The input binary for this step does not have to 100% match the binary used for
+profile collection in **Step 1**. This could happen when you are doing active
+development, and the source code constantly changes, yet you want to benefit
+from profile-guided optimizations. However, since the binary is not precisely
+the same, the profile information could become invalid or stale, and BOLT will
+report the number of functions with a stale profile. The higher the
+number, the less performance improvement should be expected. Thus, it is
+crucial to update `.fdata` for release branches.
+
+## Multiple Profiles
+
+Suppose your application can run in different modes, and you can generate
+multiple profiles for each one of them. To generate a single binary that can
+benefit all modes (assuming the profiles don't contradict each other) you can
+use the `merge-fdata` tool:
+```
+$ merge-fdata *.fdata > combined.fdata
+```
+Use `combined.fdata` for **Step 3** above to generate a universally optimized
+binary.
+
+## License
+
+BOLT is licensed under the [Apache License v2.0 with LLVM Exceptions](./LICENSE.TXT).
diff --git a/bolt/docs/Heatmap.png b/bolt/docs/Heatmap.png
new file mode 100644
index 000000000000..e3f76fb22932
Binary files /dev/null and b/bolt/docs/Heatmap.png differ
diff --git a/bolt/docs/Heatmaps.md b/bolt/docs/Heatmaps.md
new file mode 100644
index 000000000000..526b3900b2d1
--- /dev/null
+++ b/bolt/docs/Heatmaps.md
@@ -0,0 +1,50 @@
+# Code Heatmaps
+
+BOLT has gained the ability to print code heatmaps based on
+sampling-based LBR profiles generated by `perf`. The output is produced
+in colored ASCII to be displayed in a color-capable terminal. It looks
+something like this:
+
+![](./Heatmap.png)
+
+Heatmaps can be generated for BOLTed and non-BOLTed binaries. You can
+use them to compare the code layout before and after optimizations.
+
+To generate a heatmap, start by running your app under `perf`:
+
+```bash
+$ perf record -e cycles:u -j any,u -- <executable> <args>
+```
+or if you want to monitor the existing process(es):
+```bash
+$ perf record -e cycles:u -j any,u [-p PID|-a] -- sleep <interval>
+```
+
+Note that at the moment running with LBR (`-j any,u` or `-b`) is
+a requirement.
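+
+A quick way to verify that LBR collection works on your machine is a short
+dummy recording (a sketch; on hardware or in virtualized environments without
+branch-stack support, `perf record` will fail with an error instead of
+producing a usable `perf.data`):
+
+```bash
+$ perf record -e cycles:u -j any,u -- sleep 1
+```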
+
+Once the run is complete, and `perf.data` is generated, run BOLT in
+heatmap mode:
+
+```bash
+$ llvm-bolt heatmap -p perf.data <executable>
+```
+
+By default the heatmap will be dumped to *stdout*. You can change it
+with the `-o <heatmapfile>` option. Each character/block in the heatmap
+shows the execution data accumulated for the corresponding 64 bytes of
+code. You can change this granularity with the `-block-size` option.
+E.g., set it to 4096 to see code usage grouped by 4K pages.
+Other useful options are:
+
+```bash
+-line-size=<uint>   - number of entries per line (default 256)
+-max-address=<uint> - maximum address considered valid for heatmap (default 4GB)
+```
+
+If you prefer to look at the data in a browser (or would like to share
+it that way), then you can use an HTML conversion tool. E.g.:
+
+```bash
+$ aha -b -f <heatmap file> > <heatmap file>.html
+```
diff --git a/bolt/docs/OptimizingClang.md b/bolt/docs/OptimizingClang.md
new file mode 100644
index 000000000000..12ac2fe19e23
--- /dev/null
+++ b/bolt/docs/OptimizingClang.md
@@ -0,0 +1,266 @@
+# Optimizing Clang: A Practical Example of Applying BOLT
+
+## Preface
+
+*BOLT* (Binary Optimization and Layout Tool) is designed to improve
+application performance by laying out code in a manner that helps the CPU
+better utilize its caching and branch prediction resources.
+
+The most obvious candidates for BOLT optimizations
+are programs that suffer from many instruction cache and iTLB misses, such as
+large applications measuring hundreds of megabytes in size. However,
+medium-sized programs can benefit too. Clang, one of the most popular
+open-source C/C++ compilers, is a good example of the latter. Its code size
+can easily be on the order of tens of megabytes.
+As we will see, the Clang binary suffers from many instruction cache
+misses and can be significantly improved with BOLT, even on top of
+profile-guided and link-time optimizations.
+
+In this tutorial we will first build Clang with PGO and LTO, and then show how
+to apply BOLT optimizations to make Clang up to 15% faster. We will also
+analyze where the compile-time performance gains are coming from, and verify
+that the speed-ups are sustainable while building other applications.
+
+## Building Clang
+
+The process of getting Clang sources and performing the build is very similar
+to the one described at http://clang.llvm.org/get_started.html. For
+completeness, we provide the detailed steps on how to obtain and build Clang
+in the [Bootstrapping Clang-7 with PGO and LTO](#bootstrapping-clang-7-with-pgo-and-lto)
+section.
+
+The only difference from the standard Clang build is that we require the
+`-Wl,-q` flag to be present during the final link. This option saves
+relocation metadata in the executable file, but does not affect the generated
+code in any way.
+
+## Optimizing Clang with BOLT
+
+We will use the setup described in [Bootstrapping Clang-7 with PGO and LTO](#bootstrapping-clang-7-with-pgo-and-lto).
+Adjust the steps accordingly if you skipped that section. We will also assume
+that `llvm-bolt` is present in your `$PATH`.
+
+Before we can run BOLT optimizations, we need to collect the profile for
+Clang, and we will use Clang/LLVM sources for that.
+Collecting an accurate profile requires running `perf` on hardware that
+implements taken-branch sampling (the `-b/-j` flag). For that reason, it may
+not be possible to collect an accurate profile in a virtualized environment,
+e.g. in the cloud.
+We do support regular sampling profiles, but the performance
+improvements are expected to be more modest.
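+
+Before collecting the profile, it may also be worth double-checking that the
+final binary was linked with `-Wl,-q` and still carries relocations, since
+BOLT depends on them (a sketch; `readelf` is part of GNU binutils, and the
+path assumes the stage2 setup from the appendix):
+
+```bash
+$ readelf -S ${TOPLEV}/stage2-prof-use-lto/install/bin/clang-7 | grep '.rela.text'
+```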
+
+```bash
+$ mkdir ${TOPLEV}/stage3
+$ cd ${TOPLEV}/stage3
+$ CPATH=${TOPLEV}/stage2-prof-use-lto/install/bin/
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+    -DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage3/install
+$ perf record -e cycles:u -j any,u -- ninja clang
+```
+
+Once the last command finishes, it will have created a `perf.data` file larger
+than 10GiB. We will first convert this profile into a more compact aggregated
+form suitable to be consumed by BOLT:
+```bash
+$ perf2bolt $CPATH/clang-7 -p perf.data -o clang-7.fdata -w clang-7.yaml
+```
+Notice that we are passing `clang-7` to `perf2bolt`, which is the real binary
+that `clang` and `clang++` are symlinked to. The next step will optimize Clang
+using the generated profile:
+```bash
+$ llvm-bolt $CPATH/clang-7 -o $CPATH/clang-7.bolt -b clang-7.yaml \
+    -reorder-blocks=cache+ -reorder-functions=hfsort+ -split-functions=3 \
+    -split-all-cold -dyno-stats -icf=1 -use-gnu-stack
+```
+The output will look similar to the one below:
+```t
+...
+BOLT-INFO: enabling relocation mode
+BOLT-INFO: 11415 functions out of 104526 simple functions (10.9%) have non-empty execution profile.
+...
+BOLT-INFO: ICF folded 29144 out of 105177 functions in 8 passes. 82 functions had jump tables.
+BOLT-INFO: Removing all identical functions will save 5466.69 KB of code space. Folded functions were called 2131985 times based on profile.
+BOLT-INFO: basic block reordering modified layout of 7848 (10.32%) functions
+...
+   660155947 : executed forward branches (-2.3%)
+    48252553 : taken forward branches (-57.2%)
+   129897961 : executed backward branches (+13.8%)
+    52389551 : taken backward branches (-19.5%)
+    35650038 : executed unconditional branches (-33.2%)
+   128338874 : all function calls (=)
+    19010563 : indirect calls (=)
+     9918250 : PLT calls (=)
+  6113398840 : executed instructions (-0.6%)
+  1519537463 : executed load instructions (=)
+   943321306 : executed store instructions (=)
+    20467109 : taken jump table branches (=)
+   825703946 : total branches (-2.1%)
+   136292142 : taken branches (-41.1%)
+   689411804 : non-taken conditional branches (+12.6%)
+   100642104 : taken conditional branches (-43.4%)
+   790053908 : all conditional branches (=)
+...
+```
+The statistics in the output are based on the LBR profile collected with
+`perf`, and since we were using the `cycles` counter, its accuracy is
+affected. However, the relative improvement in `taken conditional branches`
+is a good indication that BOLT was able to straighten out the code even
+after PGO.
+
+## Measuring Compile-time Improvement
+
+`clang-7.bolt` can be used as a replacement for *PGO+LTO* Clang:
+```bash
+$ mv $CPATH/clang-7 $CPATH/clang-7.org
+$ ln -fs $CPATH/clang-7.bolt $CPATH/clang-7
+```
+Doing a new build of Clang using the new binary shows a significant overall
+build time reduction on a 48-core Haswell system:
+```bash
+$ ln -fs $CPATH/clang-7.org $CPATH/clang-7
+$ ninja clean && /bin/time -f %e ninja clang -j48
+202.72
+$ ln -fs $CPATH/clang-7.bolt $CPATH/clang-7
+$ ninja clean && /bin/time -f %e ninja clang -j48
+180.11
+```
+That's 22.61 seconds (or 12%) faster compared to the *PGO+LTO* build.
+Notice that we are measuring an improvement in the total build time, which
+includes the time spent in the linker. Compilation time improvements for
+individual files differ, and speedups over 15% are not uncommon.
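+
+Build-time measurements like these are sensitive to page cache state and
+machine load, so it can be worth repeating each timing a few times and
+comparing the best runs (a sketch using only the commands shown above):
+
+```bash
+$ for clang in clang-7.org clang-7.bolt; do
+    ln -fs $CPATH/$clang $CPATH/clang-7
+    for i in 1 2 3; do
+      ninja clean && /bin/time -f "$clang: %e" ninja clang -j48
+    done
+  done
+```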
+
+If we run BOLT on a Clang binary compiled without *PGO+LTO* (in which case the
+build finishes in 253.32 seconds), the gains we see are over 50 seconds (25%),
+but, as expected, the result is still slower than the *PGO+LTO+BOLT* build.
+
+## Source of the Wins
+
+We mentioned that Clang suffers from considerable instruction cache misses.
+This can be measured with `perf`:
+```bash
+$ ln -fs $CPATH/clang-7.org $CPATH/clang-7
+$ ninja clean && perf stat -e instructions,L1-icache-misses -- ninja clang -j48
+  ...
+  16,366,101,626,647      instructions
+     359,996,216,537      L1-icache-misses
+```
+That's about 22 instruction cache misses per thousand instructions. As a rule
+of thumb, if the application has over 10 misses per thousand instructions, it
+is a good indication that it will be improved by BOLT.
+Now let's see how many misses are in the BOLTed binary:
+```bash
+$ ln -fs $CPATH/clang-7.bolt $CPATH/clang-7
+$ ninja clean && perf stat -e instructions,L1-icache-misses -- ninja clang -j48
+  ...
+  16,319,818,488,769      instructions
+     244,888,677,972      L1-icache-misses
+```
+The number of misses per thousand instructions went down from 22 to 15,
+significantly reducing the number of stalls in the CPU front-end.
+Notice how the number of executed instructions stayed roughly the same. That's
+because we didn't run any optimizations beyond the ones affecting the code
+layout. In addition to instruction cache misses, BOLT also reduces branch
+mispredictions, iTLB misses, and misses in L2 and L3.
+
+## Using Clang for Other Applications
+
+We have collected the profile for Clang using its own source code. Would it be
+enough to speed up the compilation of other projects? We picked `mysqld`, an
+open-source database, to do the test.
+
+On our 48-core Haswell system using the *PGO+LTO* Clang, the build finished in
+136.06 seconds, while using the *PGO+LTO+BOLT* Clang it took 126.10 seconds.
+That's a noticeable improvement, but not as significant as the one we saw on
+Clang itself. This is partially because the number of instruction cache misses
+is slightly lower in this scenario: 19 vs. 22.
+Another reason is that Clang is run with a different set of options while
+building `mysqld` compared to the training run.
+
+Different options exercise different code paths, and
+if we trained without a specific option, we may have misplaced parts of the
+code responsible for handling it. To test this theory, we collected another
+`perf` profile while building `mysqld`, and merged it with the existing
+profile using the `merge-fdata` utility that comes with BOLT. Optimized with
+that profile, the *PGO+LTO+BOLT* Clang was able to perform the `mysqld` build
+in 124.74 seconds, i.e. 11 seconds or 9% faster compared to *PGO+LTO* Clang.
+The merged profile didn't make the original Clang compilation slower either,
+while the number of profiled functions in Clang increased from 11,415 to
+14,025.
+
+Ideally, the profile run should be done with a superset of all commonly used
+options. However, the main improvement is expected with just the basic set.
+
+## Summary
+
+In this tutorial we demonstrated how to use BOLT to improve the
+performance of the Clang compiler. Similarly, BOLT could be used to improve
+the performance of GCC, or any other application suffering from a high number
+of instruction cache misses.
+
+----
+# Appendix
+
+## Bootstrapping Clang-7 with PGO and LTO
+
+Below we describe the detailed steps to build Clang and make it ready for
+BOLT optimizations.
+If you already have the build set up, you can skip this section, except for
+the last step, which adds the `-Wl,-q` linker flag to the final build.
+
+### Getting Clang-7 Sources
+
+Set `$TOPLEV` to the directory of your preference where you would like to do
+the builds, e.g. `TOPLEV=~/clang-7/`. Follow with commands to clone the
+`release_70` branches of LLVM, Clang, the lld linker, and the compiler
+runtime:
+```bash
+$ cd ${TOPLEV}
+$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/llvm.git/ llvm
+$ cd llvm/tools
+$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/clang.git/
+$ cd ../projects
+$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/lld.git/
+$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/compiler-rt.git/
+```
+
+### Building Stage 1 Compiler
+
+Stage 1 will be the first build we are going to do, and we will be using the
+default system compiler to build Clang. If your system lacks a compiler, use
+your distribution's package manager to install one that supports C++11. In
+this example we are going to use GCC. In addition to the compiler,
+you will need the `cmake` and `ninja` packages.
+```bash
+$ mkdir ${TOPLEV}/stage1
+$ cd ${TOPLEV}/stage1
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_ASM_COMPILER=gcc \
+    -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage1/install
+$ ninja install
+```
+
+### Building Stage 2 Compiler With Instrumentation
+
+Using the freshly-baked stage 1 Clang compiler, we are going to build Clang
+with profile generation capabilities:
+```bash
+$ mkdir ${TOPLEV}/stage2-prof-gen
+$ cd ${TOPLEV}/stage2-prof-gen
+$ CPATH=${TOPLEV}/stage1/install/bin/
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+    -DLLVM_USE_LINKER=lld -DLLVM_BUILD_INSTRUMENTED=ON \
+    -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage2-prof-gen/install
+$ ninja install
+```
+
+### Generating Profile for PGO
+
+While there are many ways to obtain the profile data, we are going to use the
+source code already at our disposal, i.e. we are going to collect the profile
+while building Clang itself:
+```bash
+$ mkdir ${TOPLEV}/stage3-train
+$ cd ${TOPLEV}/stage3-train
+$ CPATH=${TOPLEV}/stage2-prof-gen/install/bin
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+    -DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage3-train/install
+$ ninja clang
+```
+Once the build is complete, the profile files will be saved under
+`${TOPLEV}/stage2-prof-gen/profiles`. We will merge them before they can be
+passed back into Clang:
+```bash
+$ cd ${TOPLEV}/stage2-prof-gen/profiles
+$ ${TOPLEV}/stage1/install/bin/llvm-profdata merge -output=clang.profdata *
+```
+
+### Building Clang with PGO and LTO
+
+Now the profile can be used to guide optimizations to produce better code for
+our scenario, i.e. building Clang. We will also enable link-time optimization
+to allow cross-module inlining and other optimizations. Finally, we are going
+to add one extra step that is useful for BOLT: a linker flag instructing it
+to preserve relocations in the output binary.
+Note that this flag does not affect the generated code or data used at
+runtime; it only writes metadata to the file on disk:
+```bash
+$ mkdir ${TOPLEV}/stage2-prof-use-lto
+$ cd ${TOPLEV}/stage2-prof-use-lto
+$ CPATH=${TOPLEV}/stage1/install/bin/
+$ export LDFLAGS="-Wl,-q"
+$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+    -DLLVM_ENABLE_LTO=Full -DLLVM_PROFDATA_FILE=${TOPLEV}/stage2-prof-gen/profiles/clang.profdata \
+    -DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage2-prof-use-lto/install
+$ ninja install
+```
+Now we have a Clang compiler that can build itself much faster. As we will
+see, it builds other applications faster as well, and, with BOLT, the compile
+time can be improved even further.
diff --git a/bolt/docs/RuntimeLibrary.md b/bolt/docs/RuntimeLibrary.md
new file mode 100644
index 000000000000..58d9497a195b
--- /dev/null
+++ b/bolt/docs/RuntimeLibrary.md
@@ -0,0 +1,37 @@
+# BOLT ORC-based linker
+
+A high-level view of the simple linker used to insert auxiliary/library code
+into the final binary produced by BOLT. This is built on top of LLVM's ORC
+infrastructure (the newest iteration of JIT support in LLVM).
+
+## Several levels of code injection
+
+When BOLT starts processing an input executable, its first task is to raise
+the binary to a low-level IR with a CFG. After this is done, we are ready to
+change code in this binary. Throughout BOLT's pipeline of code
+transformations, there are plenty of situations when we need to insert new
+code or fix existing code.
+
+If operating with small code changes inside a basic block, we typically defer
+this work to MCPlusBuilder. This is our target-independent interface to create
+new instructions, but it also contains some functions that may create code
+spanning multiple basic blocks (for instance, when doing indirect call
+promotion and unrolling an indirect call into a ladder of comparisons/direct
+calls). The implementation here usually boils down to programmatically
+creating new MCInst instructions while setting their opcodes according to the
+target list (see X86GenInstOpcodes.inc generated by tablegen in an LLVM
+build).
+
+However, this approach quickly becomes awkward if we want to insert a lot of
+code, especially if this code is frozen and never changes. In these
+situations, it is more convenient to have a runtime library with all the code
+you need to insert. This library defines some symbols and can be linked into
+the final binary. In this case, all you need to do in a BOLT transformation
+is to insert a call to your library.
+
+## The runtime library
+
+Currently, our runtime library is written in C++ and contains code that helps
+us instrument a binary.
+
+### Limitations
+Our library is not written in regular C++ as it is not linked against any
+other libraries (this means we cannot rely on anything defined in libstdc++,
+glibc, libgcc, etc.), but is self-sufficient. In runtime/CMakeLists.txt, we
+can see it is built with -ffreestanding, which tells the compiler not to
+assume that a standard runtime library is available.
+
+While this requires us to make our own syscalls, it does greatly simplify our
+linker, which is very limited and can only do basic function name resolution.
+Even so, this is a big improvement in comparison with programmatically
+generating the code in assembly language using MCInsts.
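+
+For illustration, here is roughly what "making our own syscalls" looks like.
+This sketch mirrors the `__write` wrapper in `runtime/common.h`: the Linux
+syscall number goes in `rax`, arguments in `rdi`/`rsi`/`rdx`, and `rcx`/`r11`
+are clobbered by the `syscall` instruction:
+
+```cpp
+// Freestanding replacement for write(2): no libc involved, just the
+// x86-64 Linux syscall ABI (syscall number 1 is write).
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $1, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd), "S"(buf), "d"(count)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+```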
+ +A few more quirks: + +* No BSS section: don't use uninitialized globals +* No dependencies on foreign code: self sufficient +* You should closely watch the generated bolt_rt object files, anything requiring fancy linker features will break. We only support bare bones .text, .data and nothing else. + +Read instr.cpp opening comment for more details. + + +## Linking + +While RewriteInstance::emitAndLink() will perform an initial link step to resolve all references of the input program, it will not start linking the runtime library right away. The input program lives in its own module that may end up with unresolved references to the runtime library. + +RewriteInstance::linkRuntime() has the job of actually reading individual .o files and adding them to the binary. We currently have a single .o file, so after it is read, ORC can finally resolve references from the first module to the newly inserted .o objects. + +This sequence of steps is done by calls to addObject() and emitAndFinalize(). The latter will trigger symbol resolution, relying on the symbol resolver provided by us when calling createLegacyLookupResolver(). diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt new file mode 100644 index 000000000000..cc679c31ff6d --- /dev/null +++ b/bolt/runtime/CMakeLists.txt @@ -0,0 +1,46 @@ +cmake_minimum_required(VERSION 3.1.0) + +include(CheckIncludeFiles) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +project(libbolt_rt_project) + +check_include_files(elf.h HAVE_ELF_H) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/config.h) + +add_library(bolt_rt_instr STATIC + instr.cpp + ${CMAKE_CURRENT_BINARY_DIR}/config.h + ) +add_library(bolt_rt_hugify STATIC + hugify.cpp + ${CMAKE_CURRENT_BINARY_DIR}/config.h + ) + +# Don't let the compiler think it can create calls to standard libs +target_compile_options(bolt_rt_instr PRIVATE -ffreestanding -fno-exceptions -fno-rtti -fPIE) +target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_compile_options(bolt_rt_hugify PRIVATE -ffreestanding -fno-exceptions -fno-rtti) +target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +install(TARGETS bolt_rt_instr DESTINATION lib) +install(TARGETS bolt_rt_hugify DESTINATION lib) + +if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*") + add_library(bolt_rt_instr_osx STATIC + instr.cpp + ${CMAKE_CURRENT_BINARY_DIR}/config.h + ) + target_include_directories(bolt_rt_instr_osx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + target_compile_options(bolt_rt_instr_osx PRIVATE + -target x86_64-apple-darwin19.6.0 + -ffreestanding + -fno-exceptions + -fno-rtti + -fno-stack-protector) + install(TARGETS bolt_rt_instr_osx DESTINATION lib) +endif() diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h new file mode 100644 index 000000000000..732ea3425179 --- /dev/null +++ b/bolt/runtime/common.h @@ -0,0 +1,499 @@ +//===-- common.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#if !defined(__APPLE__) + +#include +#include + +#else + +typedef __SIZE_TYPE__ size_t; +#define __SSIZE_TYPE__ \ + __typeof__(_Generic((__SIZE_TYPE__)0, unsigned long long int \ + : (long long int)0, unsigned long int \ + : (long int)0, unsigned int \ + : (int)0, unsigned short \ + : (short)0, unsigned char \ + : (signed char)0)) +typedef __SSIZE_TYPE__ ssize_t; + +typedef unsigned long long uint64_t; +typedef unsigned uint32_t; +typedef unsigned char uint8_t; + +typedef long long int64_t; +typedef int int32_t; + +#endif + +#include "config.h" + +#ifdef HAVE_ELF_H +#include +#endif + +// Save all registers while keeping 16B stack alignment +#define SAVE_ALL \ + "push %%rax\n" \ + "push %%rbx\n" \ + "push %%rcx\n" \ + "push %%rdx\n" \ + "push %%rdi\n" \ + "push %%rsi\n" \ + "push %%rbp\n" \ + "push %%r8\n" \ + "push %%r9\n" \ + "push %%r10\n" \ + "push %%r11\n" \ + "push %%r12\n" \ + "push %%r13\n" \ + "push %%r14\n" \ + "push %%r15\n" \ + "sub $8, %%rsp\n" + +// Mirrors SAVE_ALL +#define RESTORE_ALL \ + "add $8, %%rsp\n" \ + "pop %%r15\n" \ + "pop %%r14\n" \ + "pop %%r13\n" \ + "pop %%r12\n" \ + "pop %%r11\n" \ + "pop %%r10\n" \ + "pop %%r9\n" \ + "pop %%r8\n" \ + "pop %%rbp\n" \ + "pop %%rsi\n" \ + "pop %%rdi\n" \ + "pop %%rdx\n" \ + "pop %%rcx\n" \ + "pop %%rbx\n" \ + "pop %%rax\n" + +// Anonymous namespace covering everything but our library entry point +namespace { + +constexpr uint32_t BufSize = 10240; + +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + +uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; +#if defined(__APPLE__) +#define READ_SYSCALL 0x2000003 +#else +#define READ_SYSCALL 0 +#endif + __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(buf), "d"(count) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; +#if defined(__APPLE__) +#define WRITE_SYSCALL 0x2000004 +#else +#define WRITE_SYSCALL 1 +#endif + __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(buf), "d"(count) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, + uint64_t fd, uint64_t offset) { +#if defined(__APPLE__) +#define MMAP_SYSCALL 0x20000c5 +#else +#define MMAP_SYSCALL 9 +#endif + void *ret; + register uint64_t r8 asm("r8") = fd; + register uint64_t r9 asm("r9") = offset; + register uint64_t r10 asm("r10") = flags; + __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), + "r"(r9) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __munmap(void *addr, uint64_t size) { +#if defined(__APPLE__) +#define MUNMAP_SYSCALL 0x2000049 +#else +#define MUNMAP_SYSCALL 11 +#endif + uint64_t ret; + __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(size) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +#define SIG_BLOCK 0 +#define SIG_UNBLOCK 1 +#define SIG_SETMASK 2 + +static const uint64_t MaskAllSignals[] = {-1ULL}; + +uint64_t __sigprocmask(int how, const void *set, void *oldset) { +#if 
defined(__APPLE__) +#define SIGPROCMASK_SYSCALL 0x2000030 +#else +#define SIGPROCMASK_SYSCALL 14 +#endif + uint64_t ret; + register long r10 asm("r10") = sizeof(uint64_t); + __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(how), "S"(set), "d"(oldset), "r"(r10) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __exit(uint64_t code) { +#if defined(__APPLE__) +#define EXIT_SYSCALL 0x2000001 +#else +#define EXIT_SYSCALL 231 +#endif + uint64_t ret; + __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(code) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +// Helper functions for writing strings to the .fdata file. We intentionally +// avoid using libc names (lowercase memset) to make it clear it is our impl. + +/// Write number Num using Base to the buffer in OutBuf, returns a pointer to +/// the end of the string. +char *intToStr(char *OutBuf, uint64_t Num, uint32_t Base) { + const char *Chars = "0123456789abcdef"; + char Buf[21]; + char *Ptr = Buf; + while (Num) { + *Ptr++ = *(Chars + (Num % Base)); + Num /= Base; + } + if (Ptr == Buf) { + *OutBuf++ = '0'; + return OutBuf; + } + while (Ptr != Buf) { + *OutBuf++ = *--Ptr; + } + return OutBuf; +} + +/// Copy Str to OutBuf, returns a pointer to the end of the copied string +char *strCopy(char *OutBuf, const char *Str, int32_t Size = BufSize) { + while (*Str) { + *OutBuf++ = *Str++; + if (--Size <= 0) + return OutBuf; + } + return OutBuf; +} + +/// Compare two strings, at most Num bytes. +int strnCmp(const char *Str1, const char *Str2, size_t Num) { + while (Num && *Str1 && (*Str1 == *Str2)) { + Num--; + Str1++; + Str2++; + } + if (Num == 0) + return 0; + return *(unsigned char *)Str1 - *(unsigned char *)Str2; +} + +void memSet(char *Buf, char C, uint32_t Size) { + for (int I = 0; I < Size; ++I) + *Buf++ = C; +} + +void *memCpy(void *Dest, const void *Src, size_t Len) { + char *d = static_cast(Dest); + const char *s = static_cast(Src); + while (Len--) + *d++ = *s++; + return Dest; +} + +uint32_t strLen(const char *Str) { + uint32_t Size = 0; + while (*Str++) + ++Size; + return Size; +} + +void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) { + char Buf[BufSize]; + char *Ptr = Buf; + Ptr = strCopy(Ptr, Msg, BufSize - 23); + Ptr = intToStr(Ptr, Num, Base); + Ptr = strCopy(Ptr, "\n"); + __write(2, Buf, Ptr - Buf); +} + +void report(const char *Msg) { __write(2, Msg, strLen(Msg)); } + +unsigned long hexToLong(const char *str) { + unsigned long res = 0; + while (*str != '\0') { + res <<= 4; + if ('0' <= *str && *str <= '9') + res += *str++ - '0'; + else if ('a' <= *str && *str <= 'f') + res += *str++ - 'a' + 10; + else if ('A' <= *str && *str <= 'F') + res += *str++ - 'A' + 10; + else { + return 0; + } + } + return res; +} + +#if !defined(__APPLE__) +// We use a stack-allocated buffer for string manipulation in many pieces of +// this code, including the code that prints each line of the fdata file. This +// buffer needs to accomodate large function names, but shouldn't be arbitrarily +// large (dynamically allocated) for simplicity of our memory space usage. + +// Declare some syscall wrappers we use throughout this code to avoid linking +// against system libc. 
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { + uint64_t ret; + __asm__ __volatile__("movq $2, %%rax\n" + "syscall" + : "=a"(ret) + : "D"(pathname), "S"(flags), "d"(mode) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +struct dirent { + unsigned long d_ino; /* Inode number */ + unsigned long d_off; /* Offset to next linux_dirent */ + unsigned short d_reclen; /* Length of this linux_dirent */ + char d_name[]; /* Filename (null-terminated) */ + /* length is actually (d_reclen - 2 - + offsetof(struct linux_dirent, d_name)) */ +}; + +long __getdents(unsigned int fd, dirent *dirp, size_t count) { + long ret; + __asm__ __volatile__("movq $78, %%rax\n" + "syscall" + : "=a"(ret) + : "D"(fd), "S"(dirp), "d"(count) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { + uint64_t ret; + __asm__ __volatile__("movq $89, %%rax\n" + "syscall" + : "=a"(ret) + : "D"(pathname), "S"(buf), "d"(bufsize) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { + uint64_t ret; + __asm__ __volatile__("movq $8, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(pos), "d"(whence) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __close(uint64_t fd) { + uint64_t ret; + __asm__ __volatile__("movq $3, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __madvise(void *addr, size_t length, int advice) { + int ret; + __asm__ __volatile__("movq $28, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(length), "d"(advice) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +struct timespec { + uint64_t tv_sec; /* seconds */ + uint64_t tv_nsec; /* nanoseconds */ +}; + +uint64_t __nanosleep(const timespec *req, timespec *rem) { + uint64_t ret; + __asm__ __volatile__("movq $35, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(req), "S"(rem) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int64_t __fork() { + uint64_t ret; + __asm__ __volatile__("movq $57, %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __mprotect(void *addr, size_t len, int prot) { + int ret; + __asm__ __volatile__("movq $10, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(len), "d"(prot) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getpid() { + uint64_t ret; + __asm__ __volatile__("movq $39, %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getppid() { + uint64_t ret; + __asm__ __volatile__("movq $110, %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __setpgid(uint64_t pid, uint64_t pgid) { + int ret; + __asm__ __volatile__("movq $109, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid), "S"(pgid) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getpgid(uint64_t pid) { + uint64_t ret; + __asm__ __volatile__("movq $121, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __kill(uint64_t pid, int sig) { + int ret; + __asm__ __volatile__("movq $62, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid), "S"(sig) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +#endif + +void reportError(const char *Msg, uint64_t Size) { + __write(2, Msg, Size); + __exit(1); +} + +void assert(bool Assertion, const char *Msg) { + if (Assertion) + return; + char Buf[BufSize]; + char 
*Ptr = Buf;
+  Ptr = strCopy(Ptr, "Assertion failed: ");
+  Ptr = strCopy(Ptr, Msg, BufSize - 40);
+  Ptr = strCopy(Ptr, "\n");
+  reportError(Buf, Ptr - Buf);
+}
+
+/// 1B mutex accessed by lock xchg
+class Mutex {
+  volatile bool InUse{false};
+
+public:
+  bool acquire() {
+    bool Result = true;
+    asm volatile("lock; xchg %0, %1" : "+m"(InUse), "+r"(Result) : : "cc");
+    return !Result;
+  }
+  void release() { InUse = false; }
+};
+
+/// RAII wrapper for Mutex
+class Lock {
+  Mutex &M;
+  uint64_t SignalMask[1] = {};
+
+public:
+  Lock(Mutex &M) : M(M) {
+    __sigprocmask(SIG_BLOCK, MaskAllSignals, SignalMask);
+    while (!M.acquire()) {
+    }
+  }
+
+  ~Lock() {
+    M.release();
+    __sigprocmask(SIG_SETMASK, SignalMask, nullptr);
+  }
+};
+
+inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
+  return (Value + Align - 1) / Align * Align;
+}
+
+} // anonymous namespace
diff --git a/bolt/runtime/config.h.in b/bolt/runtime/config.h.in
new file mode 100644
index 000000000000..e52919dea83b
--- /dev/null
+++ b/bolt/runtime/config.h.in
@@ -0,0 +1 @@
+#cmakedefine HAVE_ELF_H
diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp
new file mode 100644
index 000000000000..aa3e1f78ccaf
--- /dev/null
+++ b/bolt/runtime/hugify.cpp
@@ -0,0 +1,127 @@
+//===-- hugify.cpp ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if !defined(__APPLE__)
+
+#include "common.h"
+#include <sys/mman.h>
+
+// Enables very verbose logging to stderr, useful when debugging
+//#define ENABLE_DEBUG
+
+// Function pointer to the init routine in the binary, so we can resume
+// regular execution of the function that we hooked.
+extern void (*__bolt_hugify_init_ptr)();
+
+// The __hot_start and __hot_end symbols are set by BOLT. We use them to
+// figure out the range to mark as huge pages.
+extern uint64_t __hot_start;
+extern uint64_t __hot_end;
+
+#ifdef MADV_HUGEPAGE
+/// Check whether the kernel supports THP via the corresponding sysfs entry.
+static bool has_pagecache_thp_support() {
+  char buf[256] = {0};
+  const char *madviseStr = "always [madvise] never";
+
+  int fd = __open("/sys/kernel/mm/transparent_hugepage/enabled",
+                  0 /* O_RDONLY */, 0);
+  if (fd < 0)
+    return false;
+
+  int64_t res = __read(fd, buf, 256);
+  if (res < 0)
+    return false;
+
+  int cmp = strnCmp(buf, madviseStr, strLen(madviseStr));
+  return cmp == 0;
+}
+
+static void hugify_for_old_kernel(uint8_t *from, uint8_t *to) {
+  size_t size = to - from;
+
+  uint8_t *mem = reinterpret_cast<uint8_t *>(
+      __mmap(0, size, 0x3 /* PROT_READ | PROT_WRITE */,
+             0x22 /* MAP_PRIVATE | MAP_ANONYMOUS */, -1, 0));
+
+  if (mem == (void *)MAP_FAILED) {
+    char msg[] = "Could not allocate memory for text move\n";
+    reportError(msg, sizeof(msg));
+  }
+#ifdef ENABLE_DEBUG
+  reportNumber("Allocated temporary space: ", (uint64_t)mem, 16);
+#endif
+
+  // Copy the hot code to a temporary location.
+  memCpy(mem, from, size);
+
+  // Map fresh anonymous pages over the existing hot code.
+  if (__mmap(reinterpret_cast<uint64_t>(from), size,
+             PROT_READ | PROT_WRITE | PROT_EXEC,
+             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1,
+             0) == (void *)MAP_FAILED) {
+    char msg[] = "failed to mmap memory for large page move, terminating\n";
+    reportError(msg, sizeof(msg));
+  }
+
+  // Mark the hot code page to be a huge page.
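+  // Note that MADV_HUGEPAGE is advisory: with THP set to "madvise", the
+  // kernel may collapse the range into 2MiB huge pages asynchronously and
+  // best-effort, so success below does not guarantee huge pages right away.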
+  if (__madvise(from, size, MADV_HUGEPAGE) == -1) {
+    char msg[] = "failed to allocate large page\n";
+    reportError(msg, sizeof(msg));
+  }
+
+  // Copy the hot code back.
+  memCpy(from, mem, size);
+
+  // Restore read and execute permissions, ignoring failure.
+  __mprotect(from, size, PROT_READ | PROT_EXEC);
+
+  __munmap(mem, size);
+}
+#endif
+
+extern "C" void __bolt_hugify_self_impl() {
+#ifdef MADV_HUGEPAGE
+  uint8_t *hotStart = (uint8_t *)&__hot_start;
+  uint8_t *hotEnd = (uint8_t *)&__hot_end;
+  // Make sure the start and end are aligned to huge page boundaries.
+  const size_t hugePageBytes = 2L * 1024 * 1024;
+  uint8_t *from = hotStart - ((intptr_t)hotStart & (hugePageBytes - 1));
+  uint8_t *to = hotEnd + (hugePageBytes - 1);
+  to -= (intptr_t)to & (hugePageBytes - 1);
+
+#ifdef ENABLE_DEBUG
+  reportNumber("[hugify] hot start: ", (uint64_t)hotStart, 16);
+  reportNumber("[hugify] hot end: ", (uint64_t)hotEnd, 16);
+  reportNumber("[hugify] aligned huge page from: ", (uint64_t)from, 16);
+  reportNumber("[hugify] aligned huge page to: ", (uint64_t)to, 16);
+#endif
+
+  if (!has_pagecache_thp_support()) {
+    hugify_for_old_kernel(from, to);
+    return;
+  }
+
+  if (__madvise(from, (to - from), MADV_HUGEPAGE) == -1) {
+    char msg[] = "failed to allocate large page\n";
+    // TODO: allow user to control the failure behavior.
+    reportError(msg, sizeof(msg));
+  }
+#endif
+}
+
+/// This hooks the ELF entry point; it needs to save all machine state.
+extern "C" __attribute((naked)) void __bolt_hugify_self() {
+  __asm__ __volatile__(SAVE_ALL
+                       "call __bolt_hugify_self_impl\n"
+                       RESTORE_ALL
+                       "jmp *__bolt_hugify_init_ptr(%%rip)\n"
+                       :::);
+}
+
+#endif
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
new file mode 100644
index 000000000000..160f9c1c66da
--- /dev/null
+++ b/bolt/runtime/instr.cpp
@@ -0,0 +1,1673 @@
+//===-- instr.cpp -----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// BOLT runtime instrumentation library for x86 Linux. Currently, BOLT does
+// not support linking modules with dependencies on one another into the final
+// binary (TODO?), which means this library has to be self-contained in a
+// single module.
+//
+// All extern declarations here need to be defined by BOLT itself. Those will
+// be undefined symbols that BOLT needs to resolve by emitting these symbols
+// with MCStreamer. Currently, Passes/Instrumentation.cpp is the pass
+// responsible for defining the symbols here, and these two files have a tight
+// coupling: one works statically when you run BOLT and the other at program
+// runtime when you run an instrumented binary. The main goal here is to
+// output an fdata file (BOLT profile) with the instrumentation counters
+// inserted by the static pass. Counters for indirect calls are an exception,
+// as we can't know them statically. These counters are created and managed
+// here. To allow this, we need a minimal framework for allocating memory
+// dynamically. We provide this with the BumpPtrAllocator class (not LLVM's,
+// but our own version of it).
+//
+// Since this code is intended to be inserted into any executable, we decided
+// to make it standalone and not depend on any external libraries (i.e.
+// language support libraries, such as glibc or stdc++). To allow this, we
+// provide a few light implementations of common OS interacting
+// functionalities using direct syscall wrappers. Our simple allocator doesn't
+// manage deallocations that fragment the memory space, so it's stack based.
+// This is the minimal framework provided here to allow processing
+// instrumented counters and writing fdata.
+//
+// In the C++ idiom used here, we never use or rely on constructors or
+// destructors for global objects. That's because those need support from the
+// linker in initialization/finalization code, and we want to keep our linker
+// very simple. Similarly, we don't create any global objects that are
+// zero-initialized, since those would need to go to .bss, which our simple
+// linker also doesn't support (TODO?).
+//
+//===----------------------------------------------------------------------===//
+
+#include "common.h"
+
+// Enables very verbose logging to stderr, useful when debugging
+//#define ENABLE_DEBUG
+
+#ifdef ENABLE_DEBUG
+#define DEBUG(X) \
+  { X; }
+#else
+#define DEBUG(X) \
+  {}
+#endif
+
+extern "C" {
+
+#if defined(__APPLE__)
+extern uint64_t *_bolt_instr_locations_getter();
+extern uint32_t _bolt_num_counters_getter();
+
+extern uint8_t *_bolt_instr_tables_getter();
+extern uint32_t _bolt_instr_num_funcs_getter();
+
+#else
+
+// Main counters inserted by instrumentation, incremented during runtime when
+// points of interest (locations) in the program are reached. Those are direct
+// calls and direct and indirect branches (local ones). There are also
+// counters for the execution of basic blocks that are spanning tree leaves
+// and need to be counted in order to infer the execution count of other edges
+// of the CFG.
+extern uint64_t __bolt_instr_locations[];
+extern uint32_t __bolt_num_counters;
+// Descriptions are serialized metadata about binary functions written by
+// BOLT, so we have a minimal understanding of the program structure. For a
+// reference on the exact format of this metadata, see *Description structs,
+// Location, InstrumentedNode and EntryNode.
+// Number of indirect call site descriptions
+extern uint32_t __bolt_instr_num_ind_calls;
+// Number of indirect call target descriptions
+extern uint32_t __bolt_instr_num_ind_targets;
+// Number of function descriptions
+extern uint32_t __bolt_instr_num_funcs;
+// Time to sleep across dumps (when we write the fdata profile to disk)
+extern uint32_t __bolt_instr_sleep_time;
+// Do not clear counters across dumps, rewrite file with the updated values
+extern bool __bolt_instr_no_counters_clear;
+// Wait until all forks of the instrumented process finish
+extern bool __bolt_instr_wait_forks;
+// Filename to dump data to
+extern char __bolt_instr_filename[];
+// Instrumented binary file path
+extern char __bolt_instr_binpath[];
+// If true, append current PID to the fdata filename when creating it so
+// different invocations of the same program can be differentiated.
+extern bool __bolt_instr_use_pid;
+// Functions that will be used to instrument indirect calls. The BOLT static
+// pass will identify indirect calls and modify them to load the address in
+// these trampolines and call this address instead. BOLT can't use direct
+// calls to our handlers because our addresses here are not known at analysis
+// time. We only support resolving dependencies from this file to the output
+// of BOLT, *not* the other way around.
+// TODO: We need better linking support to make that happen.
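+//
+// Conceptually (a sketch, not the exact code BOLT emits), an instrumented
+// indirect call site pushes the call site ID and the computed branch target
+// onto the stack and then does:
+//
+//   call *__bolt_ind_call_instr_pointer(%rip)
+//
+// so that instrumentIndirectCall() below can record the (call site, target)
+// pair before the original call proceeds.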
+extern void (*__bolt_ind_call_instr_pointer)();
+extern void (*__bolt_ind_tailcall_instr_pointer)();
+// Init/fini trampoline routines in the binary, so we can resume regular
+// execution of the functions that we hooked.
+extern void __bolt_start_trampoline();
+extern void __bolt_fini_trampoline();
+
+#endif
+
+}
+
+namespace {
+
+/// A simple allocator that mmaps a fixed size region and manages this space
+/// in a stack fashion, meaning you always deallocate the last element that
+/// was allocated. In practice, we don't need to deallocate individual
+/// elements. We monotonically increase our usage and then deallocate
+/// everything once we are done processing something.
+class BumpPtrAllocator {
+  /// This is written before each allocation and acts as a canary to detect
+  /// when a bug caused our program to cross allocation boundaries.
+  struct EntryMetadata {
+    uint64_t Magic;
+    uint64_t AllocSize;
+  };
+
+public:
+  void *allocate(size_t Size) {
+    Lock L(M);
+
+    if (StackBase == nullptr) {
+#if defined(__APPLE__)
+      int MAP_PRIVATE_MAP_ANONYMOUS = 0x1002;
+#else
+      int MAP_PRIVATE_MAP_ANONYMOUS = 0x22;
+#endif
+      StackBase = reinterpret_cast<uint8_t *>(
+          __mmap(0, MaxSize, 0x3 /* PROT_READ | PROT_WRITE */,
+                 Shared ? 0x21 /* MAP_SHARED | MAP_ANONYMOUS */
+                        : MAP_PRIVATE_MAP_ANONYMOUS,
+                 -1, 0));
+      StackSize = 0;
+    }
+
+    Size = alignTo(Size + sizeof(EntryMetadata), 16);
+    uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
+    auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
+    M->Magic = Magic;
+    M->AllocSize = Size;
+    StackSize += Size;
+    assert(StackSize < MaxSize, "allocator ran out of memory");
+    return AllocAddress;
+  }
+
+#ifdef DEBUG
+  /// Element-wise deallocation is only used for debugging to catch memory
+  /// bugs by checking magic bytes. Ordinarily, we reset the allocator once
+  /// we are done with it. Reset is done with clear(). There's no need
+  /// to deallocate each element individually.
+  void deallocate(void *Ptr) {
+    Lock L(M);
+    uint8_t MetadataOffset = sizeof(EntryMetadata);
+    auto *M = reinterpret_cast<EntryMetadata *>(
+        reinterpret_cast<uint8_t *>(Ptr) - MetadataOffset);
+    const uint8_t *StackTop = StackBase + StackSize + MetadataOffset;
+    // Validate size
+    if (Ptr != StackTop - M->AllocSize) {
+      // Failed validation, check if it is a pointer returned by operator new[]
+      MetadataOffset +=
+          sizeof(uint64_t); // Space for number of elements alloc'ed
+      M = reinterpret_cast<EntryMetadata *>(reinterpret_cast<uint8_t *>(Ptr) -
+                                            MetadataOffset);
+      // Ok, it failed both checks if this assertion fails. Stop the program,
+      // we have a memory bug.
+      assert(Ptr == StackTop - M->AllocSize,
+             "must deallocate the last element alloc'ed");
+    }
+    assert(M->Magic == Magic, "allocator magic is corrupt");
+    StackSize -= M->AllocSize;
+  }
+#else
+  void deallocate(void *) {}
+#endif
+
+  void clear() {
+    Lock L(M);
+    StackSize = 0;
+  }
+
+  /// Set mmap reservation size (only relevant before the first allocation)
+  void setMaxSize(uint64_t Size) { MaxSize = Size; }
+
+  /// Set mmap reservation privacy (only relevant before the first allocation)
+  void setShared(bool S) { Shared = S; }
+
+  void destroy() {
+    if (StackBase == nullptr)
+      return;
+    __munmap(StackBase, MaxSize);
+  }
+
+private:
+  static constexpr uint64_t Magic = 0x1122334455667788ull;
+  uint64_t MaxSize = 0xa00000;
+  uint8_t *StackBase{nullptr};
+  uint64_t StackSize{0};
+  bool Shared{false};
+  Mutex M;
+};
+
+/// Used for allocating indirect call instrumentation counters. Initialized by
+/// __bolt_instr_setup, our initialization routine.
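+/// Example (from __bolt_instr_setup below): objects are carved out of this
+/// allocator with the placement new operators defined next, e.g.
+///   GlobalWriteProfileMutex = new (GlobalAlloc, 0) Mutex();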
+BumpPtrAllocator GlobalAlloc;
+} // anonymous namespace
+
+// User-defined placement new operators. We only use those (as opposed to
+// overriding the regular operator new) so we can keep our allocator in the
+// stack instead of in a data section (global).
+void *operator new(size_t Sz, BumpPtrAllocator &A) { return A.allocate(Sz); }
+void *operator new(size_t Sz, BumpPtrAllocator &A, char C) {
+  auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
+  memSet(Ptr, C, Sz);
+  return Ptr;
+}
+void *operator new[](size_t Sz, BumpPtrAllocator &A) {
+  return A.allocate(Sz);
+}
+void *operator new[](size_t Sz, BumpPtrAllocator &A, char C) {
+  auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
+  memSet(Ptr, C, Sz);
+  return Ptr;
+}
+// Only called during exception unwinding (useless). We must manually dealloc.
+// C++ language weirdness
+void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); }
+
+namespace {
+
+/// Basic key-val atom stored in our hash
+struct SimpleHashTableEntryBase {
+  uint64_t Key;
+  uint64_t Val;
+};
+
+/// This hash table implementation starts by allocating a table of size
+/// InitialSize. When conflicts happen in this main table, it resolves
+/// them by chaining a new table of size IncSize. It never reallocs as our
+/// allocator doesn't support it. The keys are intended to be function
+/// pointers. There's no clever hash function (it's just x mod size, size
+/// being prime). I never tuned the coefficients in the modular equation
+/// (TODO). This is used for indirect calls (each call site has one of these,
+/// so it should have a small footprint) and for tallying call counts globally
+/// for each target to check if we missed the origin of some calls (this one
+/// is a large instantiation of this template, since it is global for all call
+/// sites).
+template <typename T = SimpleHashTableEntryBase, uint32_t InitialSize = 7,
+          uint32_t IncSize = 7>
+class SimpleHashTable {
+public:
+  using MapEntry = T;
+
+  /// Increment by 1 the value of \p Key. If it is not in this table, it will
+  /// be added to the table and its value set to 1.
+  void incrementVal(uint64_t Key, BumpPtrAllocator &Alloc) {
+    ++get(Key, Alloc).Val;
+  }
+
+  /// Basic member accessing interface. Here we pass the allocator explicitly
+  /// to avoid storing a pointer to it as part of this table (remember there
+  /// is one hash for each indirect call site, so we want to minimize our
+  /// footprint).
+  MapEntry &get(uint64_t Key, BumpPtrAllocator &Alloc) {
+    Lock L(M);
+    if (TableRoot)
+      return getEntry(TableRoot, Key, Key, Alloc, 0);
+    return firstAllocation(Key, Alloc);
+  }
+
+  /// Traverses all elements in the table
+  template <typename... Args>
+  void forEachElement(void (*Callback)(MapEntry &, Args...), Args... args) {
+    if (!TableRoot)
+      return;
+    return forEachElement(Callback, InitialSize, TableRoot, args...);
+  }
+
+  void resetCounters();
+
+private:
+  constexpr static uint64_t VacantMarker = 0;
+  constexpr static uint64_t FollowUpTableMarker = 0x8000000000000000ull;
+
+  MapEntry *TableRoot{nullptr};
+  Mutex M;
+
+  template <typename... Args>
+  void forEachElement(void (*Callback)(MapEntry &, Args...),
+                      uint32_t NumEntries, MapEntry *Entries, Args... args) {
+    for (uint32_t I = 0; I < NumEntries; ++I) {
+      MapEntry &Entry = Entries[I];
+      if (Entry.Key == VacantMarker)
+        continue;
+      if (Entry.Key & FollowUpTableMarker) {
+        forEachElement(Callback, IncSize,
+                       reinterpret_cast<MapEntry *>(Entry.Key &
+                                                    ~FollowUpTableMarker),
+                       args...);
+        continue;
+      }
+      Callback(Entry, args...);
+    }
+  }
+
+  MapEntry &firstAllocation(uint64_t Key, BumpPtrAllocator &Alloc) {
+    TableRoot = new (Alloc, 0) MapEntry[InitialSize];
+    MapEntry &Entry = TableRoot[Key % InitialSize];
+    Entry.Key = Key;
+    return Entry;
+  }
+
+  MapEntry &getEntry(MapEntry *Entries, uint64_t Key, uint64_t Selector,
+                     BumpPtrAllocator &Alloc, int CurLevel) {
+    const uint32_t NumEntries = CurLevel == 0 ? InitialSize : IncSize;
+    uint64_t Remainder = Selector / NumEntries;
+    Selector = Selector % NumEntries;
+    MapEntry &Entry = Entries[Selector];
+
+    // A hit
+    if (Entry.Key == Key) {
+      return Entry;
+    }
+
+    // Vacant - add new entry
+    if (Entry.Key == VacantMarker) {
+      Entry.Key = Key;
+      return Entry;
+    }
+
+    // Defer to the next level
+    if (Entry.Key & FollowUpTableMarker) {
+      return getEntry(
+          reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker),
+          Key, Remainder, Alloc, CurLevel + 1);
+    }
+
+    // Conflict - create the next level
+    MapEntry *NextLevelTbl = new (Alloc, 0) MapEntry[IncSize];
+    uint64_t CurEntrySelector = Entry.Key / InitialSize;
+    for (int I = 0; I < CurLevel; ++I)
+      CurEntrySelector /= IncSize;
+    CurEntrySelector = CurEntrySelector % IncSize;
+    NextLevelTbl[CurEntrySelector] = Entry;
+    Entry.Key = reinterpret_cast<uint64_t>(NextLevelTbl) | FollowUpTableMarker;
+    return getEntry(NextLevelTbl, Key, Remainder, Alloc, CurLevel + 1);
+  }
+};
+
+template <typename T> void resetIndCallCounter(T &Entry) {
+  Entry.Val = 0;
+}
+
+template <typename T, uint32_t InitialSize, uint32_t IncSize>
+void SimpleHashTable<T, InitialSize, IncSize>::resetCounters() {
+  Lock L(M);
+  forEachElement(resetIndCallCounter<T>);
+}
+
+/// Represents a hash table mapping a function target address to its counter.
+using IndirectCallHashTable = SimpleHashTable<>;
+
+/// Initialize with number 1 instead of 0 so we don't go into .bss. This is
+/// the global array of all hash tables storing indirect call destinations
+/// happening during runtime, one table per call site.
+IndirectCallHashTable *GlobalIndCallCounters{
+    reinterpret_cast<IndirectCallHashTable *>(1)};
+
+/// Don't allow reentrancy in the fdata writing phase - only one thread writes
+/// it
+Mutex *GlobalWriteProfileMutex{reinterpret_cast<Mutex *>(1)};
+
+/// Store the number of calls in addition to the target address (Key) and the
+/// frequency as perceived by the basic block counter (Val).
+struct CallFlowEntryBase : public SimpleHashTableEntryBase {
+  uint64_t Calls;
+};
+
+using CallFlowHashTableBase = SimpleHashTable<CallFlowEntryBase>;
+
+/// This is a large table indexing all possible call targets (indirect and
+/// direct ones). The goal is to find mismatches between the number of calls
+/// (for those calls we were able to track) and the entry basic block counter
+/// of the callee. In most cases, these two should be equal. If not, there are
+/// two possible scenarios here:
+///
+///  * Entry BB has higher frequency than all known calls to this function.
+///    In this case, we have dynamic library code or any uninstrumented code
+///    calling this function. We will write the profile for these untracked
+///    calls as having source "0 [unknown] 0" in the fdata file.
+///
+///  * Number of known calls is higher than the frequency of entry BB.
+///    This only happens when there is no counter for the entry BB, i.e. the
+///    callee function is not simple (in BOLT terms). We don't do anything
+///    special here and just ignore those (we still report all calls to the
+///    non-simple function, though).
+///
+class CallFlowHashTable : public CallFlowHashTableBase {
+public:
+  CallFlowHashTable(BumpPtrAllocator &Alloc) : Alloc(Alloc) {}
+
+  MapEntry &get(uint64_t Key) { return CallFlowHashTableBase::get(Key, Alloc); }
+
+private:
+  // Unlike the hash table for indirect call targets, we do store the
+  // allocator here since there is only one call flow hash and space overhead
+  // is negligible.
+  BumpPtrAllocator &Alloc;
+};
+
+///
+/// Description metadata emitted by BOLT to describe the program - refer to
+/// Passes/Instrumentation.cpp - Instrumentation::emitTablesAsELFNote()
+///
+struct Location {
+  uint32_t FunctionName;
+  uint32_t Offset;
+};
+
+struct CallDescription {
+  Location From;
+  uint32_t FromNode;
+  Location To;
+  uint32_t Counter;
+  uint64_t TargetAddress;
+};
+
+using IndCallDescription = Location;
+
+struct IndCallTargetDescription {
+  Location Loc;
+  uint64_t Address;
+};
+
+struct EdgeDescription {
+  Location From;
+  uint32_t FromNode;
+  Location To;
+  uint32_t ToNode;
+  uint32_t Counter;
+};
+
+struct InstrumentedNode {
+  uint32_t Node;
+  uint32_t Counter;
+};
+
+struct EntryNode {
+  uint64_t Node;
+  uint64_t Address;
+};
+
+struct FunctionDescription {
+  uint32_t NumLeafNodes;
+  const InstrumentedNode *LeafNodes;
+  uint32_t NumEdges;
+  const EdgeDescription *Edges;
+  uint32_t NumCalls;
+  const CallDescription *Calls;
+  uint32_t NumEntryNodes;
+  const EntryNode *EntryNodes;
+
+  /// Constructor will parse the serialized function metadata written by BOLT
+  FunctionDescription(const uint8_t *FuncDesc);
+
+  uint64_t getSize() const {
+    return 16 + NumLeafNodes * sizeof(InstrumentedNode) +
+           NumEdges * sizeof(EdgeDescription) +
+           NumCalls * sizeof(CallDescription) +
+           NumEntryNodes * sizeof(EntryNode);
+  }
+};
+
+/// The context is created when the fdata profile needs to be written to disk
+/// and we need to interpret our runtime counters. It contains pointers to the
+/// mmaped binary (only the BOLT written metadata section). Deserialization
+/// should be straightforward as most data is POD or an array of POD elements.
+/// This metadata is used to reconstruct function CFGs.
+struct ProfileWriterContext {
+  IndCallDescription *IndCallDescriptions;
+  IndCallTargetDescription *IndCallTargets;
+  uint8_t *FuncDescriptions;
+  char *Strings;  // String table with function names used in this binary
+  int FileDesc;   // File descriptor for the file on disk backing this
+                  // information in memory via mmap
+  void *MMapPtr;  // The mmap ptr
+  int MMapSize;   // The mmap size
+
+  /// Hash table storing all possible call destinations to detect untracked
+  /// calls and correctly report them as [unknown] in output fdata.
+  CallFlowHashTable *CallFlowTable;
+
+  /// Look up the sorted indirect call target vector to fetch function name
+  /// and offset for an arbitrary function pointer.
+  const IndCallTargetDescription *lookupIndCallTarget(uint64_t Target) const;
+};
+
+/// Performs a string comparison, returning zero if Str1 matches Str2.
+/// Compares at most Size characters.
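+/// Note: unlike strncmp, any mismatch yields 1, so no ordering is implied.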
+int compareStr(const char *Str1, const char *Str2, int Size) { + while (*Str1 == *Str2) { + if (*Str1 == '\0' || --Size == 0) + return 0; + ++Str1; + ++Str2; + } + return 1; +} + +/// Output Location to the fdata file +char *serializeLoc(const ProfileWriterContext &Ctx, char *OutBuf, + const Location Loc, uint32_t BufSize) { + // fdata location format: Type Name Offset + // Type 1 - regular symbol + OutBuf = strCopy(OutBuf, "1 "); + const char *Str = Ctx.Strings + Loc.FunctionName; + uint32_t Size = 25; + while (*Str) { + *OutBuf++ = *Str++; + if (++Size >= BufSize) + break; + } + assert(!*Str, "buffer overflow, function name too large"); + *OutBuf++ = ' '; + OutBuf = intToStr(OutBuf, Loc.Offset, 16); + *OutBuf++ = ' '; + return OutBuf; +} + +/// Read and deserialize a function description written by BOLT. \p FuncDesc +/// points at the beginning of the function metadata structure in the file. +/// See Instrumentation::emitTablesAsELFNote() +FunctionDescription::FunctionDescription(const uint8_t *FuncDesc) { + NumLeafNodes = *reinterpret_cast(FuncDesc); + DEBUG(reportNumber("NumLeafNodes = ", NumLeafNodes, 10)); + LeafNodes = reinterpret_cast(FuncDesc + 4); + + NumEdges = *reinterpret_cast( + FuncDesc + 4 + NumLeafNodes * sizeof(InstrumentedNode)); + DEBUG(reportNumber("NumEdges = ", NumEdges, 10)); + Edges = reinterpret_cast( + FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode)); + + NumCalls = *reinterpret_cast( + FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode) + + NumEdges * sizeof(EdgeDescription)); + DEBUG(reportNumber("NumCalls = ", NumCalls, 10)); + Calls = reinterpret_cast( + FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) + + NumEdges * sizeof(EdgeDescription)); + NumEntryNodes = *reinterpret_cast( + FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) + + NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription)); + DEBUG(reportNumber("NumEntryNodes = ", NumEntryNodes, 10)); + EntryNodes = reinterpret_cast( + FuncDesc + 16 + NumLeafNodes * sizeof(InstrumentedNode) + + NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription)); +} + +/// Read and mmap descriptions written by BOLT from the executable's notes +/// section +#if defined(HAVE_ELF_H) and !defined(__APPLE__) +#define BUF_SIZE 1024 +#define NAME_MAX 256 +#define ADDR_SIZE 32 + +void *__attribute__((noinline)) __get_pc() { + return __builtin_extract_return_addr(__builtin_return_address(0)); +} + +/// Get full path to the real binary by getting current virtual address +/// and searching for the appropriate link in address range in +/// /proc/self/map_files +static char *getBinaryPath() { + const char dirPath[] = "/proc/self/map_files/"; + static char target_path[NAME_MAX] = {}; + char buf[BUF_SIZE], start[ADDR_SIZE], end[ADDR_SIZE]; + char findBuf[NAME_MAX]; + char *strcp; + int lenS, lenE; + long nread; + unsigned long curAddr, saddr, eaddr; + struct dirent *d; + + if (__bolt_instr_binpath[0] != '\0') + return __bolt_instr_binpath; + + if (target_path[0] != '\0') + return target_path; + + curAddr = (unsigned long)__get_pc(); + uint64_t FDdir = __open(dirPath, + /*flags=*/0 /*O_RDONLY*/, + /*mode=*/0666); + assert(static_cast(FDdir) > 0, + "failed to open /proc/self/map_files"); + + for (;;) { + nread = __getdents(FDdir, (struct dirent *)buf, BUF_SIZE); + assert(static_cast(nread) != -1, "failed to get folder entries"); + if (nread == 0) + break; + for (long bpos = 0; bpos < nread; bpos += d->d_reclen) { + d = (struct dirent *)(buf + bpos); + strcp = d->d_name; + + lenS = 
0, lenE = 0;
+      while (*strcp != '-') {
+        if (*strcp == '\0') {
+          lenS = 0;
+          break;
+        }
+        ++strcp;
+        lenS++;
+      }
+      if (lenS == 0)
+        continue;
+      ++strcp;
+      while (*strcp != '\0') {
+        ++strcp;
+        lenE++;
+      }
+      assert(lenS + lenE < ADDR_SIZE, "file name too long");
+
+      strCopy(start, d->d_name, lenS);
+      strCopy(end, d->d_name + lenS + 1, lenE);
+      start[lenS] = '\0';
+      end[lenE] = '\0';
+      saddr = hexToLong(start);
+      eaddr = hexToLong(end);
+
+      if (curAddr >= saddr && curAddr <= eaddr) {
+        strCopy(findBuf, dirPath);
+        strCopy(findBuf + sizeof(dirPath) - 1, d->d_name);
+        findBuf[sizeof(dirPath) + lenS + lenE] = '\0';
+        lenS = __readlink(findBuf, target_path, sizeof(target_path));
+        assert(lenS != -1 && lenS != BUF_SIZE, "readlink error");
+        target_path[lenS] = '\0';
+        goto end;
+      }
+    }
+  }
+  return NULL;
+end:
+  return target_path;
+}
+
+ProfileWriterContext readDescriptions() {
+  ProfileWriterContext Result;
+  char *binPath = getBinaryPath();
+  assert(binPath[0] != '\0', "failed to find binary path");
+
+  uint64_t FD = __open(binPath,
+                       /*flags=*/0 /*O_RDONLY*/,
+                       /*mode=*/0666);
+  assert(static_cast<int64_t>(FD) > 0, "failed to open binary path");
+
+  Result.FileDesc = FD;
+
+  // mmap our binary to memory
+  uint64_t Size = __lseek(FD, 0, 2 /*SEEK_END*/);
+  uint8_t *BinContents = reinterpret_cast<uint8_t *>(
+      __mmap(0, Size, 0x1 /* PROT_READ */, 0x2 /* MAP_PRIVATE */, FD, 0));
+  Result.MMapPtr = BinContents;
+  Result.MMapSize = Size;
+  Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
+  Elf64_Shdr *Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff);
+  Elf64_Shdr *StringTblHeader = reinterpret_cast<Elf64_Shdr *>(
+      BinContents + Hdr->e_shoff + Hdr->e_shstrndx * Hdr->e_shentsize);
+
+  // Find .bolt.instr.tables with the data we need and set pointers to it
+  for (int I = 0; I < Hdr->e_shnum; ++I) {
+    char *SecName = reinterpret_cast<char *>(
+        BinContents + StringTblHeader->sh_offset + Shdr->sh_name);
+    if (compareStr(SecName, ".bolt.instr.tables", 64) != 0) {
+      Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff +
+                                            (I + 1) * Hdr->e_shentsize);
+      continue;
+    }
+    // Actual contents of the ELF note start after offset 20 decimal:
+    // Offset 0: Producer name size (4 bytes)
+    // Offset 4: Contents size (4 bytes)
+    // Offset 8: Note type (4 bytes)
+    // Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
+    // Offset 20: Contents
+    uint32_t IndCallDescSize =
+        *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 20);
+    uint32_t IndCallTargetDescSize = *reinterpret_cast<uint32_t *>(
+        BinContents + Shdr->sh_offset + 24 + IndCallDescSize);
+    uint32_t FuncDescSize =
+        *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 28 +
+                                      IndCallDescSize + IndCallTargetDescSize);
+    Result.IndCallDescriptions = reinterpret_cast<IndCallDescription *>(
+        BinContents + Shdr->sh_offset + 24);
+    Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
+        BinContents + Shdr->sh_offset + 28 + IndCallDescSize);
+    Result.FuncDescriptions = BinContents + Shdr->sh_offset + 32 +
+                              IndCallDescSize + IndCallTargetDescSize;
+    Result.Strings = reinterpret_cast<char *>(
+        BinContents + Shdr->sh_offset + 32 + IndCallDescSize +
+        IndCallTargetDescSize + FuncDescSize);
+    return Result;
+  }
+  const char ErrMsg[] =
+      "BOLT instrumentation runtime error: could not find section "
+      ".bolt.instr.tables\n";
+  reportError(ErrMsg, sizeof(ErrMsg));
+  return Result;
+}
+
+#else
+
+ProfileWriterContext readDescriptions() {
+  ProfileWriterContext Result;
+  uint8_t *Tables = _bolt_instr_tables_getter();
+  uint32_t IndCallDescSize = *reinterpret_cast<uint32_t *>(Tables);
+  uint32_t IndCallTargetDescSize = *reinterpret_cast<uint32_t *>(Tables + 4 +
IndCallDescSize); + uint32_t FuncDescSize = *reinterpret_cast( + Tables + 8 + IndCallDescSize + IndCallTargetDescSize); + Result.IndCallDescriptions = + reinterpret_cast(Tables + 4); + Result.IndCallTargets = reinterpret_cast( + Tables + 8 + IndCallDescSize); + Result.FuncDescriptions = + Tables + 12 + IndCallDescSize + IndCallTargetDescSize; + Result.Strings = reinterpret_cast( + Tables + 12 + IndCallDescSize + IndCallTargetDescSize + FuncDescSize); + return Result; +} + +#endif + +#if !defined(__APPLE__) +/// Debug by printing overall metadata global numbers to check it is sane +void printStats(const ProfileWriterContext &Ctx) { + char StatMsg[BufSize]; + char *StatPtr = StatMsg; + StatPtr = + strCopy(StatPtr, + "\nBOLT INSTRUMENTATION RUNTIME STATISTICS\n\nIndCallDescSize: "); + StatPtr = intToStr(StatPtr, + Ctx.FuncDescriptions - + reinterpret_cast(Ctx.IndCallDescriptions), + 10); + StatPtr = strCopy(StatPtr, "\nFuncDescSize: "); + StatPtr = intToStr( + StatPtr, + reinterpret_cast(Ctx.Strings) - Ctx.FuncDescriptions, 10); + StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_ind_calls: "); + StatPtr = intToStr(StatPtr, __bolt_instr_num_ind_calls, 10); + StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_funcs: "); + StatPtr = intToStr(StatPtr, __bolt_instr_num_funcs, 10); + StatPtr = strCopy(StatPtr, "\n"); + __write(2, StatMsg, StatPtr - StatMsg); +} +#endif + + +/// This is part of a simple CFG representation in memory, where we store +/// a dynamically sized array of input and output edges per node, and store +/// a dynamically sized array of nodes per graph. We also store the spanning +/// tree edges for that CFG in a separate array of nodes in +/// \p SpanningTreeNodes, while the regular nodes live in \p CFGNodes. +struct Edge { + uint32_t Node; // Index in nodes array regarding the destination of this edge + uint32_t ID; // Edge index in an array comprising all edges of the graph +}; + +/// A regular graph node or a spanning tree node +struct Node { + uint32_t NumInEdges{0}; // Input edge count used to size InEdge + uint32_t NumOutEdges{0}; // Output edge count used to size OutEdges + Edge *InEdges{nullptr}; // Created and managed by \p Graph + Edge *OutEdges{nullptr}; // ditto +}; + +/// Main class for CFG representation in memory. Manages object creation and +/// destruction, populates an array of CFG nodes as well as corresponding +/// spanning tree nodes. +struct Graph { + uint32_t NumNodes; + Node *CFGNodes; + Node *SpanningTreeNodes; + uint64_t *EdgeFreqs; + uint64_t *CallFreqs; + BumpPtrAllocator &Alloc; + const FunctionDescription &D; + + /// Reads a list of edges from function description \p D and builds + /// the graph from it. Allocates several internal dynamic structures that are + /// later destroyed by ~Graph() and uses \p Alloc. D.LeafNodes contain all + /// spanning tree leaf nodes descriptions (their counters). They are the seed + /// used to compute the rest of the missing edge counts in a bottom-up + /// traversal of the spanning tree. 
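+  ///
+  /// Worked example: if a node's instrumented outgoing edges carry counts
+  /// {5, 3} and its only instrumented incoming edge carries 6, then the node
+  /// frequency is 5 + 3 = 8 and the uninstrumented spanning-tree parent edge
+  /// must carry 8 - 6 = 2 for the flow through the node to balance.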
+ Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D, + const uint64_t *Counters, ProfileWriterContext &Ctx); + ~Graph(); + void dump() const; + +private: + void computeEdgeFrequencies(const uint64_t *Counters, + ProfileWriterContext &Ctx); + void dumpEdgeFreqs() const; +}; + +Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D, + const uint64_t *Counters, ProfileWriterContext &Ctx) + : Alloc(Alloc), D(D) { + DEBUG(reportNumber("G = 0x", (uint64_t)this, 16)); + // First pass to determine number of nodes + int32_t MaxNodes = -1; + CallFreqs = nullptr; + EdgeFreqs = nullptr; + for (int I = 0; I < D.NumEdges; ++I) { + if (static_cast(D.Edges[I].FromNode) > MaxNodes) + MaxNodes = D.Edges[I].FromNode; + if (static_cast(D.Edges[I].ToNode) > MaxNodes) + MaxNodes = D.Edges[I].ToNode; + } + + for (int I = 0; I < D.NumLeafNodes; ++I) { + if (static_cast(D.LeafNodes[I].Node) > MaxNodes) + MaxNodes = D.LeafNodes[I].Node; + } + for (int I = 0; I < D.NumCalls; ++I) { + if (static_cast(D.Calls[I].FromNode) > MaxNodes) + MaxNodes = D.Calls[I].FromNode; + } + // No nodes? Nothing to do + if (MaxNodes < 0) { + DEBUG(report("No nodes!\n")); + CFGNodes = nullptr; + SpanningTreeNodes = nullptr; + NumNodes = 0; + return; + } + ++MaxNodes; + DEBUG(reportNumber("NumNodes = ", MaxNodes, 10)); + NumNodes = static_cast(MaxNodes); + + // Initial allocations + CFGNodes = new (Alloc) Node[MaxNodes]; + + DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16)); + SpanningTreeNodes = new (Alloc) Node[MaxNodes]; + DEBUG(reportNumber("G->SpanningTreeNodes = 0x", + (uint64_t)SpanningTreeNodes, 16)); + + // Figure out how much to allocate to each vector (in/out edge sets) + for (int I = 0; I < D.NumEdges; ++I) { + CFGNodes[D.Edges[I].FromNode].NumOutEdges++; + CFGNodes[D.Edges[I].ToNode].NumInEdges++; + if (D.Edges[I].Counter != 0xffffffff) + continue; + + SpanningTreeNodes[D.Edges[I].FromNode].NumOutEdges++; + SpanningTreeNodes[D.Edges[I].ToNode].NumInEdges++; + } + + // Allocate in/out edge sets + for (int I = 0; I < MaxNodes; ++I) { + if (CFGNodes[I].NumInEdges > 0) + CFGNodes[I].InEdges = new (Alloc) Edge[CFGNodes[I].NumInEdges]; + if (CFGNodes[I].NumOutEdges > 0) + CFGNodes[I].OutEdges = new (Alloc) Edge[CFGNodes[I].NumOutEdges]; + if (SpanningTreeNodes[I].NumInEdges > 0) + SpanningTreeNodes[I].InEdges = + new (Alloc) Edge[SpanningTreeNodes[I].NumInEdges]; + if (SpanningTreeNodes[I].NumOutEdges > 0) + SpanningTreeNodes[I].OutEdges = + new (Alloc) Edge[SpanningTreeNodes[I].NumOutEdges]; + CFGNodes[I].NumInEdges = 0; + CFGNodes[I].NumOutEdges = 0; + SpanningTreeNodes[I].NumInEdges = 0; + SpanningTreeNodes[I].NumOutEdges = 0; + } + + // Fill in/out edge sets + for (int I = 0; I < D.NumEdges; ++I) { + const uint32_t Src = D.Edges[I].FromNode; + const uint32_t Dst = D.Edges[I].ToNode; + Edge *E = &CFGNodes[Src].OutEdges[CFGNodes[Src].NumOutEdges++]; + E->Node = Dst; + E->ID = I; + + E = &CFGNodes[Dst].InEdges[CFGNodes[Dst].NumInEdges++]; + E->Node = Src; + E->ID = I; + + if (D.Edges[I].Counter != 0xffffffff) + continue; + + E = &SpanningTreeNodes[Src] + .OutEdges[SpanningTreeNodes[Src].NumOutEdges++]; + E->Node = Dst; + E->ID = I; + + E = &SpanningTreeNodes[Dst] + .InEdges[SpanningTreeNodes[Dst].NumInEdges++]; + E->Node = Src; + E->ID = I; + } + + computeEdgeFrequencies(Counters, Ctx); +} + +Graph::~Graph() { + if (CallFreqs) + Alloc.deallocate(CallFreqs); + if (EdgeFreqs) + Alloc.deallocate(EdgeFreqs); + for (int I = NumNodes - 1; I >= 0; --I) { + if (SpanningTreeNodes[I].OutEdges) + 
Alloc.deallocate(SpanningTreeNodes[I].OutEdges); + if (SpanningTreeNodes[I].InEdges) + Alloc.deallocate(SpanningTreeNodes[I].InEdges); + if (CFGNodes[I].OutEdges) + Alloc.deallocate(CFGNodes[I].OutEdges); + if (CFGNodes[I].InEdges) + Alloc.deallocate(CFGNodes[I].InEdges); + } + if (SpanningTreeNodes) + Alloc.deallocate(SpanningTreeNodes); + if (CFGNodes) + Alloc.deallocate(CFGNodes); +} + +void Graph::dump() const { + reportNumber("Dumping graph with number of nodes: ", NumNodes, 10); + report(" Full graph:\n"); + for (int I = 0; I < NumNodes; ++I) { + const Node *N = &CFGNodes[I]; + reportNumber(" Node #", I, 10); + reportNumber(" InEdges total ", N->NumInEdges, 10); + for (int J = 0; J < N->NumInEdges; ++J) + reportNumber(" ", N->InEdges[J].Node, 10); + reportNumber(" OutEdges total ", N->NumOutEdges, 10); + for (int J = 0; J < N->NumOutEdges; ++J) + reportNumber(" ", N->OutEdges[J].Node, 10); + report("\n"); + } + report(" Spanning tree:\n"); + for (int I = 0; I < NumNodes; ++I) { + const Node *N = &SpanningTreeNodes[I]; + reportNumber(" Node #", I, 10); + reportNumber(" InEdges total ", N->NumInEdges, 10); + for (int J = 0; J < N->NumInEdges; ++J) + reportNumber(" ", N->InEdges[J].Node, 10); + reportNumber(" OutEdges total ", N->NumOutEdges, 10); + for (int J = 0; J < N->NumOutEdges; ++J) + reportNumber(" ", N->OutEdges[J].Node, 10); + report("\n"); + } +} + +void Graph::dumpEdgeFreqs() const { + reportNumber( + "Dumping edge frequencies for graph with num edges: ", D.NumEdges, 10); + for (int I = 0; I < D.NumEdges; ++I) { + reportNumber("* Src: ", D.Edges[I].FromNode, 10); + reportNumber(" Dst: ", D.Edges[I].ToNode, 10); + reportNumber(" Cnt: ", EdgeFreqs[I], 10); + } +} + +/// Auxiliary map structure for fast lookups of which calls map to each node of +/// the function CFG +struct NodeToCallsMap { + struct MapEntry { + uint32_t NumCalls; + uint32_t *Calls; + }; + MapEntry *Entries; + BumpPtrAllocator &Alloc; + const uint32_t NumNodes; + + NodeToCallsMap(BumpPtrAllocator &Alloc, const FunctionDescription &D, + uint32_t NumNodes) + : Alloc(Alloc), NumNodes(NumNodes) { + Entries = new (Alloc, 0) MapEntry[NumNodes]; + for (int I = 0; I < D.NumCalls; ++I) { + DEBUG(reportNumber("Registering call in node ", D.Calls[I].FromNode, 10)); + ++Entries[D.Calls[I].FromNode].NumCalls; + } + for (int I = 0; I < NumNodes; ++I) { + Entries[I].Calls = Entries[I].NumCalls ? new (Alloc) + uint32_t[Entries[I].NumCalls] + : nullptr; + Entries[I].NumCalls = 0; + } + for (int I = 0; I < D.NumCalls; ++I) { + MapEntry &Entry = Entries[D.Calls[I].FromNode]; + Entry.Calls[Entry.NumCalls++] = I; + } + } + + /// Set the frequency of all calls in node \p NodeID to Freq. However, if + /// the calls have their own counters and do not depend on the basic block + /// counter, this means they have landing pads and throw exceptions. In this + /// case, set their frequency with their counters and return the maximum + /// value observed in such counters. This will be used as the new frequency + /// at basic block entry. This is used to fix the CFG edge frequencies in the + /// presence of exceptions. 
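+  /// Example: given Freq = 100, a plain call in this node gets frequency
+  /// 100, while a call with a private counter reading 120 keeps 120, and
+  /// 120 is returned as the corrected frequency for the block entry.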
+ uint64_t visitAllCallsIn(uint32_t NodeID, uint64_t Freq, uint64_t *CallFreqs, + const FunctionDescription &D, + const uint64_t *Counters, + ProfileWriterContext &Ctx) const { + const MapEntry &Entry = Entries[NodeID]; + uint64_t MaxValue = 0ull; + for (int I = 0, E = Entry.NumCalls; I != E; ++I) { + const uint32_t CallID = Entry.Calls[I]; + DEBUG(reportNumber(" Setting freq for call ID: ", CallID, 10)); + const CallDescription &CallDesc = D.Calls[CallID]; + if (CallDesc.Counter == 0xffffffff) { + CallFreqs[CallID] = Freq; + DEBUG(reportNumber(" with : ", Freq, 10)); + } else { + const uint64_t CounterVal = Counters[CallDesc.Counter]; + CallFreqs[CallID] = CounterVal; + MaxValue = CounterVal > MaxValue ? CounterVal : MaxValue; + DEBUG(reportNumber(" with (private counter) : ", CounterVal, 10)); + } + DEBUG(reportNumber(" Address: 0x", CallDesc.TargetAddress, 16)); + if (CallFreqs[CallID] > 0) + Ctx.CallFlowTable->get(CallDesc.TargetAddress).Calls += + CallFreqs[CallID]; + } + return MaxValue; + } + + ~NodeToCallsMap() { + for (int I = NumNodes - 1; I >= 0; --I) { + if (Entries[I].Calls) + Alloc.deallocate(Entries[I].Calls); + } + Alloc.deallocate(Entries); + } +}; + +/// Fill an array with the frequency of each edge in the function represented +/// by G, as well as another array for each call. +void Graph::computeEdgeFrequencies(const uint64_t *Counters, + ProfileWriterContext &Ctx) { + if (NumNodes == 0) + return; + + EdgeFreqs = D.NumEdges ? new (Alloc, 0) uint64_t [D.NumEdges] : nullptr; + CallFreqs = D.NumCalls ? new (Alloc, 0) uint64_t [D.NumCalls] : nullptr; + + // Setup a lookup for calls present in each node (BB) + NodeToCallsMap *CallMap = new (Alloc) NodeToCallsMap(Alloc, D, NumNodes); + + // Perform a bottom-up, BFS traversal of the spanning tree in G. Edges in the + // spanning tree don't have explicit counters. We must infer their value using + // a linear combination of other counters (sum of counters of the outgoing + // edges minus sum of counters of the incoming edges). + uint32_t *Stack = new (Alloc) uint32_t [NumNodes]; + uint32_t StackTop = 0; + enum Status : uint8_t { S_NEW = 0, S_VISITING, S_VISITED }; + Status *Visited = new (Alloc, 0) Status[NumNodes]; + uint64_t *LeafFrequency = new (Alloc, 0) uint64_t[NumNodes]; + uint64_t *EntryAddress = new (Alloc, 0) uint64_t[NumNodes]; + + // Setup a fast lookup for frequency of leaf nodes, which have special + // basic block frequency instrumentation (they are not edge profiled). + for (int I = 0; I < D.NumLeafNodes; ++I) { + LeafFrequency[D.LeafNodes[I].Node] = Counters[D.LeafNodes[I].Counter]; + DEBUG({ + if (Counters[D.LeafNodes[I].Counter] > 0) { + reportNumber("Leaf Node# ", D.LeafNodes[I].Node, 10); + reportNumber(" Counter: ", Counters[D.LeafNodes[I].Counter], 10); + } + }); + } + for (int I = 0; I < D.NumEntryNodes; ++I) { + EntryAddress[D.EntryNodes[I].Node] = D.EntryNodes[I].Address; + DEBUG({ + reportNumber("Entry Node# ", D.EntryNodes[I].Node, 10); + reportNumber(" Address: ", D.EntryNodes[I].Address, 16); + }); + } + // Add all root nodes to the stack + for (int I = 0; I < NumNodes; ++I) { + if (SpanningTreeNodes[I].NumInEdges == 0) + Stack[StackTop++] = I; + } + // Empty stack? 
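+  // (No spanning-tree roots likely means malformed metadata: a valid tree
+  // always has a node without incoming edges. Give up on this function
+  // rather than traverse a broken graph.)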
+ if (StackTop == 0) { + DEBUG(report("Empty stack!\n")); + Alloc.deallocate(EntryAddress); + Alloc.deallocate(LeafFrequency); + Alloc.deallocate(Visited); + Alloc.deallocate(Stack); + CallMap->~NodeToCallsMap(); + Alloc.deallocate(CallMap); + if (CallFreqs) + Alloc.deallocate(CallFreqs); + if (EdgeFreqs) + Alloc.deallocate(EdgeFreqs); + EdgeFreqs = nullptr; + CallFreqs = nullptr; + return; + } + // Add all known edge counts, will infer the rest + for (int I = 0; I < D.NumEdges; ++I) { + const uint32_t C = D.Edges[I].Counter; + if (C == 0xffffffff) // inferred counter - we will compute its value + continue; + EdgeFreqs[I] = Counters[C]; + } + + while (StackTop > 0) { + const uint32_t Cur = Stack[--StackTop]; + DEBUG({ + if (Visited[Cur] == S_VISITING) + report("(visiting) "); + else + report("(new) "); + reportNumber("Cur: ", Cur, 10); + }); + + // This shouldn't happen in a tree + assert(Visited[Cur] != S_VISITED, "should not have visited nodes in stack"); + if (Visited[Cur] == S_NEW) { + Visited[Cur] = S_VISITING; + Stack[StackTop++] = Cur; + assert(StackTop <= NumNodes, "stack grew too large"); + for (int I = 0, E = SpanningTreeNodes[Cur].NumOutEdges; I < E; ++I) { + const uint32_t Succ = SpanningTreeNodes[Cur].OutEdges[I].Node; + Stack[StackTop++] = Succ; + assert(StackTop <= NumNodes, "stack grew too large"); + } + continue; + } + Visited[Cur] = S_VISITED; + + // Establish our node frequency based on outgoing edges, which should all be + // resolved by now. + int64_t CurNodeFreq = LeafFrequency[Cur]; + // Not a leaf? + if (!CurNodeFreq) { + for (int I = 0, E = CFGNodes[Cur].NumOutEdges; I != E; ++I) { + const uint32_t SuccEdge = CFGNodes[Cur].OutEdges[I].ID; + CurNodeFreq += EdgeFreqs[SuccEdge]; + } + } + if (CurNodeFreq < 0) + CurNodeFreq = 0; + + const uint64_t CallFreq = CallMap->visitAllCallsIn( + Cur, CurNodeFreq > 0 ? CurNodeFreq : 0, CallFreqs, D, Counters, Ctx); + + // Exception handling affected our output flow? Fix with calls info + DEBUG({ + if (CallFreq > CurNodeFreq) + report("Bumping node frequency with call info\n"); + }); + CurNodeFreq = CallFreq > CurNodeFreq ? CallFreq : CurNodeFreq; + + if (CurNodeFreq > 0) { + if (uint64_t Addr = EntryAddress[Cur]) { + DEBUG( + reportNumber(" Setting flow at entry point address 0x", Addr, 16)); + DEBUG(reportNumber(" with: ", CurNodeFreq, 10)); + Ctx.CallFlowTable->get(Addr).Val = CurNodeFreq; + } + } + + // No parent? Reached a tree root, limit to call frequency updating. + if (SpanningTreeNodes[Cur].NumInEdges == 0) { + continue; + } + + assert(SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent"); + const uint32_t Parent = SpanningTreeNodes[Cur].InEdges[0].Node; + const uint32_t ParentEdge = SpanningTreeNodes[Cur].InEdges[0].ID; + + // Calculate parent edge freq. + int64_t ParentEdgeFreq = CurNodeFreq; + for (int I = 0, E = CFGNodes[Cur].NumInEdges; I != E; ++I) { + const uint32_t PredEdge = CFGNodes[Cur].InEdges[I].ID; + ParentEdgeFreq -= EdgeFreqs[PredEdge]; + } + + // Sometimes the conservative CFG that BOLT builds will lead to incorrect + // flow computation. For example, in a BB that transitively calls the exit + // syscall, BOLT will add a fall-through successor even though it should not + // have any successors. So this block execution will likely be wrong. We + // tolerate this imperfection since this case should be quite infrequent. 
+    if (ParentEdgeFreq < 0) {
+      DEBUG(dumpEdgeFreqs());
+      DEBUG(report("WARNING: incorrect flow"));
+      ParentEdgeFreq = 0;
+    }
+    DEBUG(reportNumber("  Setting freq for ParentEdge: ", ParentEdge, 10));
+    DEBUG(reportNumber("  with ParentEdgeFreq: ", ParentEdgeFreq, 10));
+    EdgeFreqs[ParentEdge] = ParentEdgeFreq;
+  }
+
+  Alloc.deallocate(EntryAddress);
+  Alloc.deallocate(LeafFrequency);
+  Alloc.deallocate(Visited);
+  Alloc.deallocate(Stack);
+  CallMap->~NodeToCallsMap();
+  Alloc.deallocate(CallMap);
+  DEBUG(dumpEdgeFreqs());
+}
+
+/// Write to \p FD all of the edge profiles for function \p FuncDesc. Uses
+/// \p Alloc to allocate helper dynamic structures used to compute the profile
+/// for edges that we do not explicitly instrument.
+const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
+                                    const uint8_t *FuncDesc,
+                                    BumpPtrAllocator &Alloc) {
+  const FunctionDescription F(FuncDesc);
+  const uint8_t *next = FuncDesc + F.getSize();
+
+#if !defined(__APPLE__)
+  uint64_t *bolt_instr_locations = __bolt_instr_locations;
+#else
+  uint64_t *bolt_instr_locations = _bolt_instr_locations_getter();
+#endif
+
+  // Skip funcs we know are cold
+#ifndef ENABLE_DEBUG
+  uint64_t CountersFreq = 0;
+  for (int I = 0; I < F.NumLeafNodes; ++I) {
+    CountersFreq += bolt_instr_locations[F.LeafNodes[I].Counter];
+  }
+  if (CountersFreq == 0) {
+    for (int I = 0; I < F.NumEdges; ++I) {
+      const uint32_t C = F.Edges[I].Counter;
+      if (C == 0xffffffff)
+        continue;
+      CountersFreq += bolt_instr_locations[C];
+    }
+    if (CountersFreq == 0) {
+      for (int I = 0; I < F.NumCalls; ++I) {
+        const uint32_t C = F.Calls[I].Counter;
+        if (C == 0xffffffff)
+          continue;
+        CountersFreq += bolt_instr_locations[C];
+      }
+      if (CountersFreq == 0)
+        return next;
+    }
+  }
+#endif
+
+  Graph *G = new (Alloc) Graph(Alloc, F, bolt_instr_locations, Ctx);
+  DEBUG(G->dump());
+
+  if (!G->EdgeFreqs && !G->CallFreqs) {
+    G->~Graph();
+    Alloc.deallocate(G);
+    return next;
+  }
+
+  for (int I = 0; I < F.NumEdges; ++I) {
+    const uint64_t Freq = G->EdgeFreqs[I];
+    if (Freq == 0)
+      continue;
+    const EdgeDescription *Desc = &F.Edges[I];
+    char LineBuf[BufSize];
+    char *Ptr = LineBuf;
+    Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
+    Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
+    Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 22);
+    Ptr = intToStr(Ptr, Freq, 10);
+    *Ptr++ = '\n';
+    __write(FD, LineBuf, Ptr - LineBuf);
+  }
+
+  for (int I = 0; I < F.NumCalls; ++I) {
+    const uint64_t Freq = G->CallFreqs[I];
+    if (Freq == 0)
+      continue;
+    char LineBuf[BufSize];
+    char *Ptr = LineBuf;
+    const CallDescription *Desc = &F.Calls[I];
+    Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
+    Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
+    Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
+    Ptr = intToStr(Ptr, Freq, 10);
+    *Ptr++ = '\n';
+    __write(FD, LineBuf, Ptr - LineBuf);
+  }
+
+  G->~Graph();
+  Alloc.deallocate(G);
+  return next;
+}
+
+#if !defined(__APPLE__)
+const IndCallTargetDescription *
+ProfileWriterContext::lookupIndCallTarget(uint64_t Target) const {
+  uint32_t B = 0;
+  uint32_t E = __bolt_instr_num_ind_targets;
+  if (E == 0)
+    return nullptr;
+  do {
+    uint32_t I = (E - B) / 2 + B;
+    if (IndCallTargets[I].Address == Target)
+      return &IndCallTargets[I];
+    if (IndCallTargets[I].Address < Target)
+      B = I + 1;
+    else
+      E = I;
+  } while (B < E);
+  return nullptr;
+}
+
+/// Write a single indirect call pair to the fdata file
+void visitIndCallCounter(IndirectCallHashTable::MapEntry &Entry,
+                         int FD, int CallsiteID,
+                         ProfileWriterContext *Ctx) {
+  if (Entry.Val == 0)
+    return;
+  DEBUG(reportNumber("Target func 0x", Entry.Key, 16));
+  DEBUG(reportNumber("Target freq: ", Entry.Val, 10));
+  const IndCallDescription *CallsiteDesc =
+      &Ctx->IndCallDescriptions[CallsiteID];
+  const IndCallTargetDescription *TargetDesc =
+      Ctx->lookupIndCallTarget(Entry.Key);
+  if (!TargetDesc) {
+    DEBUG(report("Failed to look up indirect call target\n"));
+    char LineBuf[BufSize];
+    char *Ptr = LineBuf;
+    Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
+    Ptr = strCopy(Ptr, "0 [unknown] 0 0 ", BufSize - (Ptr - LineBuf) - 40);
+    Ptr = intToStr(Ptr, Entry.Val, 10);
+    *Ptr++ = '\n';
+    __write(FD, LineBuf, Ptr - LineBuf);
+    return;
+  }
+  Ctx->CallFlowTable->get(TargetDesc->Address).Calls += Entry.Val;
+  char LineBuf[BufSize];
+  char *Ptr = LineBuf;
+  Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
+  Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
+  Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
+  Ptr = intToStr(Ptr, Entry.Val, 10);
+  *Ptr++ = '\n';
+  __write(FD, LineBuf, Ptr - LineBuf);
+}
+
+/// Write to \p FD all of the indirect call profiles.
+void writeIndirectCallProfile(int FD, ProfileWriterContext &Ctx) {
+  for (int I = 0; I < __bolt_instr_num_ind_calls; ++I) {
+    DEBUG(reportNumber("IndCallsite #", I, 10));
+    GlobalIndCallCounters[I].forEachElement(visitIndCallCounter, FD, I, &Ctx);
+  }
+}
+
+/// Check a single call flow for a callee versus all known callers. If there
+/// are fewer callers than the callee expects, write the difference with
+/// source [unknown] in the profile.
+void visitCallFlowEntry(CallFlowHashTable::MapEntry &Entry, int FD,
+                        ProfileWriterContext *Ctx) {
+  DEBUG(reportNumber("Call flow entry address: 0x", Entry.Key, 16));
+  DEBUG(reportNumber("Calls: ", Entry.Calls, 10));
+  DEBUG(reportNumber("Reported entry frequency: ", Entry.Val, 10));
+  DEBUG({
+    if (Entry.Calls > Entry.Val)
+      report("  More calls than expected!\n");
+  });
+  if (Entry.Val <= Entry.Calls)
+    return;
+  DEBUG(reportNumber(
+      "  Balancing calls with traffic: ", Entry.Val - Entry.Calls, 10));
+  const IndCallTargetDescription *TargetDesc =
+      Ctx->lookupIndCallTarget(Entry.Key);
+  if (!TargetDesc) {
+    // There is probably something wrong with this callee and this should be
+    // investigated, but I don't want to assert and lose all data collected.
+    DEBUG(report("WARNING: failed to look up call target!\n"));
+    return;
+  }
+  char LineBuf[BufSize];
+  char *Ptr = LineBuf;
+  Ptr = strCopy(Ptr, "0 [unknown] 0 ", BufSize);
+  Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
+  Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
+  Ptr = intToStr(Ptr, Entry.Val - Entry.Calls, 10);
+  *Ptr++ = '\n';
+  __write(FD, LineBuf, Ptr - LineBuf);
+}
+
+/// Open the fdata file for writing and return a valid file descriptor,
+/// aborting the program upon failure.
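+/// E.g., with __bolt_instr_use_pid set and a (hypothetical) base name "prof",
+/// a process with PID 1234 ends up writing to "prof.1234.fdata".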
+int openProfile() {
+  // Build the profile name string by appending our PID
+  char Buf[BufSize];
+  char *Ptr = Buf;
+  uint64_t PID = __getpid();
+  Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
+  if (__bolt_instr_use_pid) {
+    Ptr = strCopy(Ptr, ".", BufSize - (Ptr - Buf + 1));
+    Ptr = intToStr(Ptr, PID, 10);
+    Ptr = strCopy(Ptr, ".fdata", BufSize - (Ptr - Buf + 1));
+  }
+  *Ptr++ = '\0';
+  uint64_t FD = __open(Buf,
+                       /*flags=*/0x241 /*O_WRONLY|O_TRUNC|O_CREAT*/,
+                       /*mode=*/0666);
+  if (static_cast<int64_t>(FD) < 0) {
+    report("Error while trying to open profile file for writing: ");
+    report(Buf);
+    reportNumber("\nFailed with error number: 0x",
+                 0 - static_cast<int64_t>(FD), 16);
+    __exit(1);
+  }
+  return FD;
+}
+
+#endif
+
+} // anonymous namespace
+
+#if !defined(__APPLE__)
+
+/// Reset all counters in case you want to start profiling a new phase of your
+/// program independently of prior phases.
+/// The address of this function is printed by BOLT and this can be called by
+/// any attached debugger during runtime. There is a useful oneliner for gdb:
+///
+///   gdb -p $(pgrep -xo PROCESSNAME) -ex 'p ((void(*)())0xdeadbeef)()' \
+///     -ex 'set confirm off' -ex quit
+///
+/// Where 0xdeadbeef is this function address and PROCESSNAME your binary file
+/// name.
+extern "C" void __bolt_instr_clear_counters() {
+  memSet(reinterpret_cast<char *>(__bolt_instr_locations), 0,
+         __bolt_num_counters * 8);
+  for (int I = 0; I < __bolt_instr_num_ind_calls; ++I) {
+    GlobalIndCallCounters[I].resetCounters();
+  }
+}
+
+/// This is the entry point for profile writing.
+/// There are three ways of getting here:
+///
+///  * Program execution ended, finalization methods are running and BOLT
+///    hooked into FINI from your binary dynamic section;
+///  * You used the sleep timer option and during initialization we forked
+///    a separate process that will call this function periodically;
+///  * BOLT prints this function address so you can attach a debugger and
+///    call this function directly to get your profile written to disk
+///    on demand.
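+/// (The gdb one-liner shown above for __bolt_instr_clear_counters works here
+/// as well, substituting this function's address.)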
+///
+extern "C" void __attribute((force_align_arg_pointer))
+__bolt_instr_data_dump() {
+  // Already dumping
+  if (!GlobalWriteProfileMutex->acquire())
+    return;
+
+  BumpPtrAllocator HashAlloc;
+  HashAlloc.setMaxSize(0x6400000);
+  ProfileWriterContext Ctx = readDescriptions();
+  Ctx.CallFlowTable = new (HashAlloc, 0) CallFlowHashTable(HashAlloc);
+
+  DEBUG(printStats(Ctx));
+
+  int FD = openProfile();
+
+  BumpPtrAllocator Alloc;
+  const uint8_t *FuncDesc = Ctx.FuncDescriptions;
+  for (int I = 0, E = __bolt_instr_num_funcs; I < E; ++I) {
+    FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
+    Alloc.clear();
+    DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
+  }
+  assert(FuncDesc == (void *)Ctx.Strings,
+         "FuncDesc ptr must be equal to stringtable");
+
+  writeIndirectCallProfile(FD, Ctx);
+  Ctx.CallFlowTable->forEachElement(visitCallFlowEntry, FD, &Ctx);
+
+  __close(FD);
+  __munmap(Ctx.MMapPtr, Ctx.MMapSize);
+  __close(Ctx.FileDesc);
+  HashAlloc.destroy();
+  GlobalWriteProfileMutex->release();
+  DEBUG(report("Finished writing profile.\n"));
+}
+
+/// Event loop for our child process spawned during setup to dump profile data
+/// at user-specified intervals
+void watchProcess() {
+  timespec ts, rem;
+  uint64_t Elapsed = 0ull;
+  uint64_t ppid;
+  if (__bolt_instr_wait_forks) {
+    // Store the parent pgid
+    ppid = -__getpgid(0);
+    // And leave the parent process group
+    __setpgid(0, 0);
+  } else {
+    // Store the parent pid
+    ppid = __getppid();
+    if (ppid == 1) {
+      // Parent is already dead
+      goto out;
+    }
+  }
+
+  ts.tv_sec = 1;
+  ts.tv_nsec = 0;
+  while (1) {
+    __nanosleep(&ts, &rem);
+    // This means our parent process or all its forks are dead,
+    // so there is no need for us to keep dumping.
+    if (__kill(ppid, 0) < 0) {
+      if (__bolt_instr_no_counters_clear)
+        __bolt_instr_data_dump();
+      break;
+    }
+
+    if (++Elapsed < __bolt_instr_sleep_time)
+      continue;
+
+    Elapsed = 0;
+    __bolt_instr_data_dump();
+    if (__bolt_instr_no_counters_clear == false)
+      __bolt_instr_clear_counters();
+  }
+
+out:;
+  DEBUG(report("My parent process is dead, bye!\n"));
+  __exit(0);
+}
+
+extern "C" void __bolt_instr_indirect_call();
+extern "C" void __bolt_instr_indirect_tailcall();
+
+/// Initialization code
+extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() {
+  const uint64_t CountersStart =
+      reinterpret_cast<uint64_t>(&__bolt_instr_locations[0]);
+  const uint64_t CountersEnd = alignTo(
+      reinterpret_cast<uint64_t>(&__bolt_instr_locations[__bolt_num_counters]),
+      0x1000);
+  DEBUG(reportNumber("replace mmap start: ", CountersStart, 16));
+  DEBUG(reportNumber("replace mmap stop: ", CountersEnd, 16));
+  assert(CountersEnd > CountersStart, "no counters");
+  // Map our counters to be shared instead of private, so we keep counting for
+  // forked processes
+  __mmap(CountersStart, CountersEnd - CountersStart,
+         0x3 /*PROT_READ|PROT_WRITE*/,
+         0x31 /*MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED*/, -1, 0);
+
+  __bolt_ind_call_instr_pointer = __bolt_instr_indirect_call;
+  __bolt_ind_tailcall_instr_pointer = __bolt_instr_indirect_tailcall;
+  // Conservatively reserve 100MiB of shared pages
+  GlobalAlloc.setMaxSize(0x6400000);
+  GlobalAlloc.setShared(true);
+  GlobalWriteProfileMutex = new (GlobalAlloc, 0) Mutex();
+  if (__bolt_instr_num_ind_calls > 0)
+    GlobalIndCallCounters =
+        new (GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls];
+
+  if (__bolt_instr_sleep_time != 0) {
+    // Move the instrumented process to its own process group
+    if (__bolt_instr_wait_forks)
+      __setpgid(0, 0);
+
+    if (long PID = __fork())
+      return;
+    watchProcess();
+  }
+}
+
+extern "C" __attribute((force_align_arg_pointer)) void
+instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) {
+  GlobalIndCallCounters[IndCallID].incrementVal(Target, GlobalAlloc);
+}
+
+/// We receive as in-stack arguments the identifier of the indirect call site
+/// as well as the target address for the call
+extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
+{
+  __asm__ __volatile__(SAVE_ALL
+                       "mov 0xa0(%%rsp), %%rdi\n"
+                       "mov 0x98(%%rsp), %%rsi\n"
+                       "call instrumentIndirectCall\n"
+                       RESTORE_ALL
+                       "ret\n"
+                       :::);
+}
+
+extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
+{
+  __asm__ __volatile__(SAVE_ALL
+                       "mov 0x98(%%rsp), %%rdi\n"
+                       "mov 0x90(%%rsp), %%rsi\n"
+                       "call instrumentIndirectCall\n"
+                       RESTORE_ALL
+                       "ret\n"
+                       :::);
+}
+
+/// This hooks the ELF entry point; it needs to save all machine state.
+extern "C" __attribute((naked)) void __bolt_instr_start()
+{
+  __asm__ __volatile__(SAVE_ALL
+                       "call __bolt_instr_setup\n"
+                       RESTORE_ALL
+                       "jmp __bolt_start_trampoline\n"
+                       :::);
+}
+
+/// This hooks into ELF's DT_FINI
+extern "C" void __bolt_instr_fini() {
+  __bolt_fini_trampoline();
+  if (__bolt_instr_sleep_time == 0)
+    __bolt_instr_data_dump();
+  DEBUG(report("Finished.\n"));
+}
+
+#endif
+
+#if defined(__APPLE__)
+
+extern "C" void __bolt_instr_data_dump() {
+  ProfileWriterContext Ctx = readDescriptions();
+
+  int FD = 2;
+  BumpPtrAllocator Alloc;
+  const uint8_t *FuncDesc = Ctx.FuncDescriptions;
+  uint32_t bolt_instr_num_funcs = _bolt_instr_num_funcs_getter();
+
+  for (int I = 0, E = bolt_instr_num_funcs; I < E; ++I) {
+    FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
+    Alloc.clear();
+    DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
+  }
+  assert(FuncDesc == (void *)Ctx.Strings,
+         "FuncDesc ptr must be equal to stringtable");
+}
+
+// On OSX/iOS the final symbol name of an extern "C" function/variable contains
+// one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
+extern "C"
+__attribute__((section("__TEXT,__setup")))
+__attribute__((force_align_arg_pointer))
+void _bolt_instr_setup() {
+  __asm__ __volatile__(SAVE_ALL :::);
+
+  report("Hello!\n");
+
+  __asm__ __volatile__(RESTORE_ALL :::);
+}
+
+extern "C"
+__attribute__((section("__TEXT,__fini")))
+__attribute__((force_align_arg_pointer))
+void _bolt_instr_fini() {
+  report("Bye!\n");
+  __bolt_instr_data_dump();
+}
+
+#endif
diff --git a/bolt/src/BinaryBasicBlock.cpp b/bolt/src/BinaryBasicBlock.cpp
new file mode 100644
index 000000000000..7e04279cd9a6
--- /dev/null
+++ b/bolt/src/BinaryBasicBlock.cpp
@@ -0,0 +1,623 @@
+//===--- BinaryBasicBlock.cpp - Interface for assembly-level basic block --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryBasicBlock.h"
+#include "BinaryContext.h"
+#include "BinaryFunction.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Errc.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+namespace llvm {
+namespace bolt {
+
+constexpr uint32_t BinaryBasicBlock::INVALID_OFFSET;
+
+bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) {
+  return LHS.Index < RHS.Index;
+}
+
+bool BinaryBasicBlock::hasCFG() const {
+  return getParent()->hasCFG();
+}
+
+bool BinaryBasicBlock::isEntryPoint() const {
+  return getParent()->isEntryPoint(*this);
+}
+
+bool BinaryBasicBlock::hasInstructions() const {
+  return getParent()->hasInstructions();
+}
+
+void BinaryBasicBlock::adjustNumPseudos(const MCInst &Inst, int Sign) {
+  BinaryContext &BC = Function->getBinaryContext();
+  if (BC.MIB->isPseudo(Inst))
+    NumPseudos += Sign;
+}
+
+BinaryBasicBlock::iterator BinaryBasicBlock::getFirstNonPseudo() {
+  const BinaryContext &BC = Function->getBinaryContext();
+  for (auto II = Instructions.begin(), E = Instructions.end(); II != E; ++II) {
+    if (!BC.MIB->isPseudo(*II))
+      return II;
+  }
+  return end();
+}
+
+BinaryBasicBlock::reverse_iterator BinaryBasicBlock::getLastNonPseudo() {
+  const BinaryContext &BC = Function->getBinaryContext();
+  for (auto RII = Instructions.rbegin(), E = Instructions.rend();
+       RII != E; ++RII) {
+    if (!BC.MIB->isPseudo(*RII))
+      return RII;
+  }
+  return rend();
+}
+
+bool BinaryBasicBlock::validateSuccessorInvariants() {
+  const MCInst *Inst = getLastNonPseudoInstr();
+  const JumpTable *JT = Inst ? Function->getJumpTable(*Inst) : nullptr;
+  BinaryContext &BC = Function->getBinaryContext();
+  bool Valid = true;
+
+  if (JT) {
+    // Note: for now we assume that successors do not reference labels from
+    // any overlapping jump tables. We only look at the entries for the jump
+    // table that is referenced at the last instruction.
+    const auto Range = JT->getEntriesForAddress(BC.MIB->getJumpTable(*Inst));
+    const std::vector<const MCSymbol *> Entries(&JT->Entries[Range.first],
+                                                &JT->Entries[Range.second]);
+    std::set<const MCSymbol *> UniqueSyms(Entries.begin(), Entries.end());
+    for (BinaryBasicBlock *Succ : Successors) {
+      auto Itr = UniqueSyms.find(Succ->getLabel());
+      if (Itr != UniqueSyms.end()) {
+        UniqueSyms.erase(Itr);
+      } else {
+        // Work on the assumption that jump table blocks don't
+        // have a conditional successor.
+        Valid = false;
+        errs() << "BOLT-WARNING: Jump table successor "
+               << Succ->getName()
+               << " not contained in the jump table.\n";
+      }
+    }
+    // If there are any leftover entries in the jump table, they
+    // must be one of the function end labels.
+    if (Valid) {
+      for (const MCSymbol *Sym : UniqueSyms) {
+        Valid &= (Sym == Function->getFunctionEndLabel() ||
+                  Sym == Function->getFunctionColdEndLabel());
+        if (!Valid) {
+          errs() << "BOLT-WARNING: Jump table contains illegal entry: "
+                 << Sym->getName() << "\n";
+        }
+      }
+    }
+  } else {
+    // Unknown control flow.
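+    // e.g. an indirect branch with no recovered jump table: its successors
+    // cannot be checked against a target list, so accept the block as-is.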
+ if (Inst && BC.MIB->isIndirectBranch(*Inst)) + return true; + + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + + if (analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) { + switch (Successors.size()) { + case 0: + Valid = !CondBranch && !UncondBranch; + break; + case 1: { + const bool HasCondBlock = CondBranch && + Function->getBasicBlockForLabel(BC.MIB->getTargetSymbol(*CondBranch)); + Valid = !CondBranch || !HasCondBlock; + break; + } + case 2: + Valid = + (CondBranch && + (TBB == getConditionalSuccessor(true)->getLabel() && + ((!UncondBranch && !FBB) || + (UncondBranch && + FBB == getConditionalSuccessor(false)->getLabel())))); + break; + } + } + } + if (!Valid) { + errs() << "BOLT-WARNING: CFG invalid in " << *getFunction() << " @ " + << getName() << "\n"; + if (JT) { + errs() << "Jump Table instruction addr = 0x" + << Twine::utohexstr(BC.MIB->getJumpTable(*Inst)) << "\n"; + JT->print(errs()); + } + getFunction()->dump(); + } + return Valid; +} + +BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const { + if (!Label && succ_size() == 1) + return *succ_begin(); + + for (BinaryBasicBlock *BB : successors()) { + if (BB->getLabel() == Label) + return BB; + } + + return nullptr; +} + +BinaryBasicBlock * +BinaryBasicBlock::getSuccessor(const MCSymbol *Label, + BinaryBranchInfo &BI) const { + auto BIIter = branch_info_begin(); + for (BinaryBasicBlock *BB : successors()) { + if (BB->getLabel() == Label) { + BI = *BIIter; + return BB; + } + ++BIIter; + } + + return nullptr; +} + +BinaryBasicBlock *BinaryBasicBlock::getLandingPad(const MCSymbol *Label) const { + for (BinaryBasicBlock *BB : landing_pads()) { + if (BB->getLabel() == Label) + return BB; + } + + return nullptr; +} + +int32_t BinaryBasicBlock::getCFIStateAtInstr(const MCInst *Instr) const { + assert( + getFunction()->getState() >= BinaryFunction::State::CFG && + "can only calculate CFI state when function is in or past the CFG state"); + + const std::vector &FDEProgram = + getFunction()->getFDEProgram(); + + // Find the last CFI preceding Instr in this basic block. + const MCInst *LastCFI = nullptr; + bool InstrSeen = (Instr == nullptr); + for (auto RII = Instructions.rbegin(), E = Instructions.rend(); + RII != E; ++RII) { + if (!InstrSeen) { + InstrSeen = (&*RII == Instr); + continue; + } + if (Function->getBinaryContext().MIB->isCFI(*RII)) { + LastCFI = &*RII; + break; + } + } + + assert(InstrSeen && "instruction expected in basic block"); + + // CFI state is the same as at basic block entry point. + if (!LastCFI) + return getCFIState(); + + // Fold all RememberState/RestoreState sequences, such as for: + // + // [ CFI #(K-1) ] + // RememberState (#K) + // .... + // RestoreState + // RememberState + // .... + // RestoreState + // [ GNU_args_size ] + // RememberState + // .... + // RestoreState <- LastCFI + // + // we return K - the most efficient state to (re-)generate. 
+ int64_t State = LastCFI->getOperand(0).getImm(); + while (State >= 0 && + FDEProgram[State].getOperation() == MCCFIInstruction::OpRestoreState) { + int32_t Depth = 1; + --State; + assert(State >= 0 && "first CFI cannot be RestoreState"); + while (Depth && State >= 0) { + const MCCFIInstruction &CFIInstr = FDEProgram[State]; + if (CFIInstr.getOperation() == MCCFIInstruction::OpRestoreState) { + ++Depth; + } else if (CFIInstr.getOperation() == MCCFIInstruction::OpRememberState) { + --Depth; + } + --State; + } + assert(Depth == 0 && "unbalanced RememberState/RestoreState stack"); + + // Skip any GNU_args_size. + while (State >= 0 && + FDEProgram[State].getOperation() == MCCFIInstruction::OpGnuArgsSize){ + --State; + } + } + + assert((State + 1 >= 0) && "miscalculated CFI state"); + return State + 1; +} + +void BinaryBasicBlock::addSuccessor(BinaryBasicBlock *Succ, + uint64_t Count, + uint64_t MispredictedCount) { + Successors.push_back(Succ); + BranchInfo.push_back({Count, MispredictedCount}); + Succ->Predecessors.push_back(this); +} + +void BinaryBasicBlock::replaceSuccessor(BinaryBasicBlock *Succ, + BinaryBasicBlock *NewSucc, + uint64_t Count, + uint64_t MispredictedCount) { + Succ->removePredecessor(this, /*Multiple=*/false); + auto I = succ_begin(); + auto BI = BranchInfo.begin(); + for (; I != succ_end(); ++I) { + assert(BI != BranchInfo.end() && "missing BranchInfo entry"); + if (*I == Succ) + break; + ++BI; + } + assert(I != succ_end() && "no such successor!"); + + *I = NewSucc; + *BI = BinaryBranchInfo{Count, MispredictedCount}; + NewSucc->addPredecessor(this); +} + +void BinaryBasicBlock::removeAllSuccessors() { + for (BinaryBasicBlock *SuccessorBB : successors()) { + SuccessorBB->removePredecessor(this); + } + Successors.clear(); + BranchInfo.clear(); +} + +void BinaryBasicBlock::removeSuccessor(BinaryBasicBlock *Succ) { + Succ->removePredecessor(this, /*Multiple=*/false); + auto I = succ_begin(); + auto BI = BranchInfo.begin(); + for (; I != succ_end(); ++I) { + assert(BI != BranchInfo.end() && "missing BranchInfo entry"); + if (*I == Succ) + break; + ++BI; + } + assert(I != succ_end() && "no such successor!"); + + Successors.erase(I); + BranchInfo.erase(BI); +} + +void BinaryBasicBlock::addPredecessor(BinaryBasicBlock *Pred) { + Predecessors.push_back(Pred); +} + +void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred, + bool Multiple) { + // Note: the predecessor could be listed multiple times. 
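+  // (e.g. when both the taken and the fall-through edge of Pred lead here;
+  // see removeDuplicateConditionalSuccessor().)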
+ bool Erased = false; + for (auto PredI = Predecessors.begin(); PredI != Predecessors.end(); ) { + if (*PredI == Pred) { + Erased = true; + PredI = Predecessors.erase(PredI); + if (!Multiple) + return; + } else { + ++PredI; + } + } + assert(Erased && "Pred is not a predecessor of this block!"); +} + +void BinaryBasicBlock::removeDuplicateConditionalSuccessor(MCInst *CondBranch) { + assert(succ_size() == 2 && Successors[0] == Successors[1] && + "conditional successors expected"); + + BinaryBasicBlock *Succ = Successors[0]; + const BinaryBranchInfo CondBI = BranchInfo[0]; + const BinaryBranchInfo UncondBI = BranchInfo[1]; + + eraseInstruction(findInstruction(CondBranch)); + + Successors.clear(); + BranchInfo.clear(); + + Successors.push_back(Succ); + + uint64_t Count = COUNT_NO_PROFILE; + if (CondBI.Count != COUNT_NO_PROFILE && UncondBI.Count != COUNT_NO_PROFILE) + Count = CondBI.Count + UncondBI.Count; + BranchInfo.push_back({Count, 0}); +} + +void BinaryBasicBlock::adjustExecutionCount(double Ratio) { + auto adjustedCount = [&](uint64_t Count) -> uint64_t { + double NewCount = Count * Ratio; + if (!NewCount && Count && (Ratio > 0.0)) + NewCount = 1; + return NewCount; + }; + + setExecutionCount(adjustedCount(getKnownExecutionCount())); + for (BinaryBranchInfo &BI : branch_info()) { + if (BI.Count != COUNT_NO_PROFILE) + BI.Count = adjustedCount(BI.Count); + if (BI.MispredictedCount != COUNT_INFERRED) + BI.MispredictedCount = adjustedCount(BI.MispredictedCount); + } +} + +bool BinaryBasicBlock::analyzeBranch(const MCSymbol *&TBB, + const MCSymbol *&FBB, + MCInst *&CondBranch, + MCInst *&UncondBranch) { + auto &MIB = Function->getBinaryContext().MIB; + return MIB->analyzeBranch(Instructions.begin(), + Instructions.end(), + TBB, + FBB, + CondBranch, + UncondBranch); +} + +bool BinaryBasicBlock::isMacroOpFusionPair(const_iterator I) const { + auto &MIB = Function->getBinaryContext().MIB; + ArrayRef Insts = Instructions; + return MIB->isMacroOpFusionPair(Insts.slice(I - begin())); +} + +BinaryBasicBlock::const_iterator +BinaryBasicBlock::getMacroOpFusionPair() const { + if (!Function->getBinaryContext().isX86()) + return end(); + + if (getNumNonPseudos() < 2 || succ_size() != 2) + return end(); + + auto RI = getLastNonPseudo(); + assert(RI != rend() && "cannot have an empty block with 2 successors"); + + BinaryContext &BC = Function->getBinaryContext(); + + // Skip instruction if it's an unconditional branch following + // a conditional one. + if (BC.MIB->isUnconditionalBranch(*RI)) + ++RI; + + if (!BC.MIB->isConditionalBranch(*RI)) + return end(); + + // Start checking with instruction preceding the conditional branch. + ++RI; + if (RI == rend()) + return end(); + + auto II = std::prev(RI.base()); // convert to a forward iterator + if (isMacroOpFusionPair(II)) + return II; + + return end(); +} + +MCInst *BinaryBasicBlock::getTerminatorBefore(MCInst *Pos) { + BinaryContext &BC = Function->getBinaryContext(); + auto Itr = rbegin(); + bool Check = Pos ? 
false : true; + MCInst *FirstTerminator = nullptr; + while (Itr != rend()) { + if (!Check) { + if (&*Itr == Pos) + Check = true; + ++Itr; + continue; + } + if (BC.MIB->isTerminator(*Itr)) + FirstTerminator = &*Itr; + ++Itr; + } + return FirstTerminator; +} + +bool BinaryBasicBlock::hasTerminatorAfter(MCInst *Pos) { + BinaryContext &BC = Function->getBinaryContext(); + auto Itr = rbegin(); + while (Itr != rend()) { + if (&*Itr == Pos) + return false; + if (BC.MIB->isTerminator(*Itr)) + return true; + ++Itr; + } + return false; +} + +bool BinaryBasicBlock::swapConditionalSuccessors() { + if (succ_size() != 2) + return false; + + std::swap(Successors[0], Successors[1]); + std::swap(BranchInfo[0], BranchInfo[1]); + return true; +} + +void BinaryBasicBlock::addBranchInstruction(const BinaryBasicBlock *Successor) { + assert(isSuccessor(Successor)); + BinaryContext &BC = Function->getBinaryContext(); + MCInst NewInst; + std::unique_lock Lock(BC.CtxMutex); + BC.MIB->createUncondBranch(NewInst, Successor->getLabel(), BC.Ctx.get()); + Instructions.emplace_back(std::move(NewInst)); +} + +void BinaryBasicBlock::addTailCallInstruction(const MCSymbol *Target) { + BinaryContext &BC = Function->getBinaryContext(); + MCInst NewInst; + BC.MIB->createTailCall(NewInst, Target, BC.Ctx.get()); + Instructions.emplace_back(std::move(NewInst)); +} + +uint32_t BinaryBasicBlock::getNumCalls() const { + uint32_t N = 0; + BinaryContext &BC = Function->getBinaryContext(); + for (const MCInst &Instr : Instructions) { + if (BC.MIB->isCall(Instr)) + ++N; + } + return N; +} + +uint32_t BinaryBasicBlock::getNumPseudos() const { +#ifndef NDEBUG + BinaryContext &BC = Function->getBinaryContext(); + uint32_t N = 0; + for (const MCInst &Instr : Instructions) { + if (BC.MIB->isPseudo(Instr)) + ++N; + } + if (N != NumPseudos) { + errs() << "BOLT-ERROR: instructions for basic block " << getName() + << " in function " << *Function << ": calculated pseudos " + << N << ", set pseudos " << NumPseudos << ", size " << size() + << '\n'; + llvm_unreachable("pseudos mismatch"); + } +#endif + return NumPseudos; +} + +ErrorOr> +BinaryBasicBlock::getBranchStats(const BinaryBasicBlock *Succ) const { + if (Function->hasValidProfile()) { + uint64_t TotalCount = 0; + uint64_t TotalMispreds = 0; + for (const BinaryBranchInfo &BI : BranchInfo) { + if (BI.Count != COUNT_NO_PROFILE) { + TotalCount += BI.Count; + TotalMispreds += BI.MispredictedCount; + } + } + + if (TotalCount > 0) { + auto Itr = std::find(Successors.begin(), Successors.end(), Succ); + assert(Itr != Successors.end()); + const BinaryBranchInfo &BI = BranchInfo[Itr - Successors.begin()]; + if (BI.Count && BI.Count != COUNT_NO_PROFILE) { + if (TotalMispreds == 0) TotalMispreds = 1; + return std::make_pair(double(BI.Count) / TotalCount, + double(BI.MispredictedCount) / TotalMispreds); + } + } + } + return make_error_code(llvm::errc::result_out_of_range); +} + +void BinaryBasicBlock::dump() const { + BinaryContext &BC = Function->getBinaryContext(); + if (Label) outs() << Label->getName() << ":\n"; + BC.printInstructions(outs(), Instructions.begin(), Instructions.end(), + getOffset()); + outs() << "preds:"; + for (auto itr = pred_begin(); itr != pred_end(); ++itr) { + outs() << " " << (*itr)->getName(); + } + outs() << "\nsuccs:"; + for (auto itr = succ_begin(); itr != succ_end(); ++itr) { + outs() << " " << (*itr)->getName(); + } + outs() << "\n"; +} + +uint64_t BinaryBasicBlock::estimateSize(const MCCodeEmitter *Emitter) const { + return 
Function->getBinaryContext().computeCodeSize(begin(), end(), Emitter); +} + +BinaryBasicBlock::BinaryBranchInfo & +BinaryBasicBlock::getBranchInfo(const BinaryBasicBlock &Succ) { + auto BI = branch_info_begin(); + for (BinaryBasicBlock *BB : successors()) { + if (&Succ == BB) + return *BI; + ++BI; + } + + llvm_unreachable("Invalid successor"); + return *BI; +} + +BinaryBasicBlock::BinaryBranchInfo & +BinaryBasicBlock::getBranchInfo(const MCSymbol *Label) { + auto BI = branch_info_begin(); + for (BinaryBasicBlock *BB : successors()) { + if (BB->getLabel() == Label) + return *BI; + ++BI; + } + + llvm_unreachable("Invalid successor"); + return *BI; +} + +BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) { + assert(II != end() && "expected iterator pointing to instruction"); + + BinaryBasicBlock *NewBlock = getFunction()->addBasicBlock(0); + + // Adjust successors/predecessors and propagate the execution count. + moveAllSuccessorsTo(NewBlock); + addSuccessor(NewBlock, getExecutionCount(), 0); + + // Set correct CFI state for the new block. + NewBlock->setCFIState(getCFIStateAtInstr(&*II)); + + // Move instructions over. + adjustNumPseudos(II, end(), -1); + NewBlock->addInstructions(II, end()); + Instructions.erase(II, end()); + + return NewBlock; +} + +void BinaryBasicBlock::updateOutputValues(const MCAsmLayout &Layout) { + if (!LocSyms) + return; + + const uint64_t BBAddress = getOutputAddressRange().first; + const uint64_t BBOffset = Layout.getSymbolOffset(*getLabel()); + for (const auto &LocSymKV : *LocSyms) { + const uint32_t InputFunctionOffset = LocSymKV.first; + const uint32_t OutputOffset = static_cast( + Layout.getSymbolOffset(*LocSymKV.second) - BBOffset); + getOffsetTranslationTable().emplace_back( + std::make_pair(OutputOffset, InputFunctionOffset)); + + // Update reverse (relative to BAT) address lookup table for function. + if (getFunction()->requiresAddressTranslation()) { + getFunction()->getInputOffsetToAddressMap().emplace( + std::make_pair(InputFunctionOffset, OutputOffset + BBAddress)); + } + } + LocSyms.reset(nullptr); +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h new file mode 100644 index 000000000000..e6c96c78117e --- /dev/null +++ b/bolt/src/BinaryBasicBlock.h @@ -0,0 +1,1080 @@ +//===--- BinaryBasicBlock.h - Interface for assembly-level basic block ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_BASIC_BLOCK_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_BASIC_BLOCK_H + +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +namespace llvm { +class MCCodeEmitter; + +namespace bolt { + +class BinaryFunction; + +/// The intention is to keep the structure similar to MachineBasicBlock as +/// we might switch to it at some point. +class BinaryBasicBlock { +public: + + /// Profile execution information for a given edge in CFG. 
+  ///
+  /// If MispredictedCount equals COUNT_INFERRED, then we have profile
+  /// data for a fall-through edge with a Count representing an inferred
+  /// execution count, i.e. the count we calculated internally, not the one
+  /// coming from profile data.
+  ///
+  /// For all other values of MispredictedCount, Count represents the number of
+  /// branch executions from a profile, and MispredictedCount is the number
+  /// of times the branch was mispredicted according to this profile.
+  struct BinaryBranchInfo {
+    uint64_t Count;
+    uint64_t MispredictedCount; /// number of branches mispredicted
+
+    bool operator<(const BinaryBranchInfo &Other) const {
+      return (Count < Other.Count) ||
+             (Count == Other.Count &&
+              MispredictedCount < Other.MispredictedCount);
+    }
+  };
+
+  static constexpr uint32_t INVALID_OFFSET =
+      std::numeric_limits<uint32_t>::max();
+
+private:
+  /// Vector of all instructions in the block.
+  std::vector<MCInst> Instructions;
+
+  /// CFG information.
+  std::vector<BinaryBasicBlock *> Predecessors;
+  std::vector<BinaryBasicBlock *> Successors;
+  std::vector<BinaryBasicBlock *> Throwers;
+  std::vector<BinaryBasicBlock *> LandingPads;
+
+  /// Each successor has a corresponding BranchInfo entry in the list.
+  std::vector<BinaryBranchInfo> BranchInfo;
+
+  /// Function that owns this basic block.
+  BinaryFunction *Function;
+
+  /// Label associated with the block.
+  MCSymbol *Label{nullptr};
+
+  /// [Begin, End) address range for this block in the output binary.
+  std::pair<uint64_t, uint64_t> OutputAddressRange{0, 0};
+
+  /// Original offset range of the basic block in the function.
+  std::pair<uint32_t, uint32_t> InputRange{INVALID_OFFSET, INVALID_OFFSET};
+
+  /// Map input offset (from function start) of an instruction to an output
+  /// symbol. Enables writing BOLT address translation tables used for mapping
+  /// control transfer in the output binary back to the original binary.
+  using LocSymsTy = std::vector<std::pair<uint32_t, const MCSymbol *>>;
+  std::unique_ptr<LocSymsTy> LocSyms;
+
+  /// After output/codegen, map output offsets of instructions in this basic
+  /// block to instruction offsets in the original function. Note that the
+  /// output basic block could be different from the input basic block.
+  /// We only map instructions of interest, such as calls and SDT markers.
+  ///
+  /// We store the offset array in a basic block to facilitate BAT tables
+  /// generation. Otherwise, the mapping could be done at function level.
+  using OffsetTranslationTableTy = std::vector<std::pair<uint32_t, uint32_t>>;
+  std::unique_ptr<OffsetTranslationTableTy> OffsetTranslationTable;
+
+  /// Alignment requirements for the block.
+  uint32_t Alignment{1};
+
+  /// Maximum number of bytes to use for alignment of the block.
+  uint32_t AlignmentMaxBytes{0};
+
+  /// Number of times this basic block was executed.
+  uint64_t ExecutionCount{COUNT_NO_PROFILE};
+
+  static constexpr unsigned InvalidIndex = ~0u;
+
+  /// Index to BasicBlocks vector in BinaryFunction.
+  unsigned Index{InvalidIndex};
+
+  /// Index in the current layout.
+  mutable unsigned LayoutIndex{InvalidIndex};
+
+  /// Number of pseudo instructions in this block.
+  uint32_t NumPseudos{0};
+
+  /// CFI state at the entry to this basic block.
+  int32_t CFIState{-1};
+
+  /// In cases where the parent function has been split, IsCold == true means
+  /// this BB will be allocated outside its parent function.
+  bool IsCold{false};
+
+  /// Indicates if the block could be outlined.
+  bool CanOutline{true};
+
+  /// Flag to indicate whether this block is valid or not. Invalid
+  /// blocks may contain out of date or incorrect information.
+  bool IsValid{true};
+
+private:
+  BinaryBasicBlock() = delete;
+  BinaryBasicBlock(const BinaryBasicBlock &) = delete;
+  BinaryBasicBlock &operator=(const BinaryBasicBlock &) = delete;
+
+  explicit BinaryBasicBlock(
+      BinaryFunction *Function,
+      MCSymbol *Label,
+      uint32_t Offset = INVALID_OFFSET)
+    : Function(Function), Label(Label) {
+    assert(Function && "Function must be non-null");
+    InputRange.first = Offset;
+  }
+
+  // Exclusively managed by BinaryFunction.
+  friend class BinaryFunction;
+  friend bool operator<(const BinaryBasicBlock &LHS,
+                        const BinaryBasicBlock &RHS);
+
+  /// Assign new label to the basic block.
+  void setLabel(MCSymbol *Symbol) {
+    Label = Symbol;
+  }
+
+public:
+  static constexpr uint64_t COUNT_INFERRED =
+      std::numeric_limits<uint64_t>::max();
+  static constexpr uint64_t COUNT_NO_PROFILE =
+      std::numeric_limits<uint64_t>::max();
+
+  // Instructions iterators.
+  using iterator = std::vector<MCInst>::iterator;
+  using const_iterator = std::vector<MCInst>::const_iterator;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+  bool empty() const { assert(hasInstructions());
+                       return Instructions.empty(); }
+  size_t size() const { assert(hasInstructions());
+                        return Instructions.size(); }
+  MCInst &front() { assert(hasInstructions());
+                    return Instructions.front(); }
+  MCInst &back() { assert(hasInstructions());
+                   return Instructions.back(); }
+  const MCInst &front() const { assert(hasInstructions());
+                                return Instructions.front(); }
+  const MCInst &back() const { assert(hasInstructions());
+                               return Instructions.back(); }
+
+  iterator begin() { assert(hasInstructions());
+                     return Instructions.begin(); }
+  const_iterator begin() const { assert(hasInstructions());
+                                 return Instructions.begin(); }
+  iterator end () { assert(hasInstructions());
+                    return Instructions.end(); }
+  const_iterator end () const { assert(hasInstructions());
+                                return Instructions.end(); }
+  reverse_iterator rbegin() { assert(hasInstructions());
+                              return Instructions.rbegin(); }
+  const_reverse_iterator rbegin() const { assert(hasInstructions());
+                                          return Instructions.rbegin(); }
+  reverse_iterator rend () { assert(hasInstructions());
+                             return Instructions.rend(); }
+  const_reverse_iterator rend () const { assert(hasInstructions());
+                                         return Instructions.rend(); }
+
+  // CFG iterators.
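+  //
+  // The typedefs below back the predecessors()/successors() ranges used
+  // to walk the CFG, e.g.:
+  //   for (BinaryBasicBlock *Succ : BB->successors())
+  //     ...;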
+  using pred_iterator = std::vector<BinaryBasicBlock *>::iterator;
+  using const_pred_iterator = std::vector<BinaryBasicBlock *>::const_iterator;
+  using succ_iterator = std::vector<BinaryBasicBlock *>::iterator;
+  using const_succ_iterator = std::vector<BinaryBasicBlock *>::const_iterator;
+  using throw_iterator = decltype(Throwers)::iterator;
+  using const_throw_iterator = decltype(Throwers)::const_iterator;
+  using lp_iterator = decltype(LandingPads)::iterator;
+  using const_lp_iterator = decltype(LandingPads)::const_iterator;
+
+  using pred_reverse_iterator = std::reverse_iterator<pred_iterator>;
+  using const_pred_reverse_iterator =
+      std::reverse_iterator<const_pred_iterator>;
+  using succ_reverse_iterator = std::reverse_iterator<succ_iterator>;
+  using const_succ_reverse_iterator =
+      std::reverse_iterator<const_succ_iterator>;
+
+  pred_iterator pred_begin() { return Predecessors.begin(); }
+  const_pred_iterator pred_begin() const { return Predecessors.begin(); }
+  pred_iterator pred_end() { return Predecessors.end(); }
+  const_pred_iterator pred_end() const { return Predecessors.end(); }
+  pred_reverse_iterator pred_rbegin()
+    { return Predecessors.rbegin(); }
+  const_pred_reverse_iterator pred_rbegin() const
+    { return Predecessors.rbegin(); }
+  pred_reverse_iterator pred_rend()
+    { return Predecessors.rend(); }
+  const_pred_reverse_iterator pred_rend() const
+    { return Predecessors.rend(); }
+  size_t pred_size() const {
+    return Predecessors.size();
+  }
+  bool pred_empty() const { return Predecessors.empty(); }
+
+  succ_iterator succ_begin() { return Successors.begin(); }
+  const_succ_iterator succ_begin() const { return Successors.begin(); }
+  succ_iterator succ_end() { return Successors.end(); }
+  const_succ_iterator succ_end() const { return Successors.end(); }
+  succ_reverse_iterator succ_rbegin()
+    { return Successors.rbegin(); }
+  const_succ_reverse_iterator succ_rbegin() const
+    { return Successors.rbegin(); }
+  succ_reverse_iterator succ_rend()
+    { return Successors.rend(); }
+  const_succ_reverse_iterator succ_rend() const
+    { return Successors.rend(); }
+  size_t succ_size() const {
+    return Successors.size();
+  }
+  bool succ_empty() const { return Successors.empty(); }
+
+  throw_iterator throw_begin() { return Throwers.begin(); }
+  const_throw_iterator throw_begin() const { return Throwers.begin(); }
+  throw_iterator throw_end() { return Throwers.end(); }
+  const_throw_iterator throw_end() const { return Throwers.end(); }
+  size_t throw_size() const {
+    return Throwers.size();
+  }
+  bool throw_empty() const { return Throwers.empty(); }
+  bool isLandingPad() const { return !Throwers.empty(); }
+
+  lp_iterator lp_begin() { return LandingPads.begin(); }
+  const_lp_iterator lp_begin() const { return LandingPads.begin(); }
+  lp_iterator lp_end() { return LandingPads.end(); }
+  const_lp_iterator lp_end() const { return LandingPads.end(); }
+  size_t lp_size() const {
+    return LandingPads.size();
+  }
+  bool lp_empty() const { return LandingPads.empty(); }
+
+  inline iterator_range<iterator> instructions() {
+    assert(hasInstructions());
+    return iterator_range<iterator>(begin(), end());
+  }
+  inline iterator_range<const_iterator> instructions() const {
+    assert(hasInstructions());
+    return iterator_range<const_iterator>(begin(), end());
+  }
+  inline iterator_range<pred_iterator> predecessors() {
+    assert(hasCFG());
+    return iterator_range<pred_iterator>(pred_begin(), pred_end());
+  }
+  inline iterator_range<const_pred_iterator> predecessors() const {
+    assert(hasCFG());
+    return iterator_range<const_pred_iterator>(pred_begin(), pred_end());
+  }
+  inline iterator_range<succ_iterator> successors() {
+    assert(hasCFG());
+    return iterator_range<succ_iterator>(succ_begin(), succ_end());
+  }
+  inline iterator_range<const_succ_iterator> successors() const {
+    assert(hasCFG());
+    return iterator_range<const_succ_iterator>(succ_begin(), succ_end());
+  }
+  inline iterator_range<throw_iterator> throwers() {
+    assert(hasCFG());
+    return iterator_range<throw_iterator>(throw_begin(), throw_end());
+  }
+  inline iterator_range<const_throw_iterator> throwers() const {
+    assert(hasCFG());
+    return iterator_range<const_throw_iterator>(throw_begin(), throw_end());
+  }
+  inline iterator_range<lp_iterator> landing_pads() {
+    assert(hasCFG());
+    return iterator_range<lp_iterator>(lp_begin(), lp_end());
+  }
+  inline iterator_range<const_lp_iterator> landing_pads() const {
+    assert(hasCFG());
+    return iterator_range<const_lp_iterator>(lp_begin(), lp_end());
+  }
+
+  // BranchInfo iterators.
+  using branch_info_iterator = std::vector<BinaryBranchInfo>::iterator;
+  using const_branch_info_iterator =
+      std::vector<BinaryBranchInfo>::const_iterator;
+  using branch_info_reverse_iterator =
+      std::reverse_iterator<branch_info_iterator>;
+  using const_branch_info_reverse_iterator =
+      std::reverse_iterator<const_branch_info_iterator>;
+
+  branch_info_iterator branch_info_begin() { return BranchInfo.begin(); }
+  branch_info_iterator branch_info_end() { return BranchInfo.end(); }
+  const_branch_info_iterator branch_info_begin() const {
+    return BranchInfo.begin();
+  }
+  const_branch_info_iterator branch_info_end() const {
+    return BranchInfo.end();
+  }
+  branch_info_reverse_iterator branch_info_rbegin() {
+    return BranchInfo.rbegin();
+  }
+  branch_info_reverse_iterator branch_info_rend() {
+    return BranchInfo.rend();
+  }
+  const_branch_info_reverse_iterator branch_info_rbegin() const {
+    return BranchInfo.rbegin();
+  }
+  const_branch_info_reverse_iterator branch_info_rend() const {
+    return BranchInfo.rend();
+  }
+
+  size_t branch_info_size() const { return BranchInfo.size(); }
+  bool branch_info_empty() const { return BranchInfo.empty(); }
+
+  inline iterator_range<branch_info_iterator> branch_info() {
+    return iterator_range<branch_info_iterator>(
+        BranchInfo.begin(), BranchInfo.end());
+  }
+  inline iterator_range<const_branch_info_iterator> branch_info() const {
+    return iterator_range<const_branch_info_iterator>(
+        BranchInfo.begin(), BranchInfo.end());
+  }
+
+  /// Get instruction at given index.
+  MCInst &getInstructionAtIndex(unsigned Index) {
+    return Instructions.at(Index);
+  }
+
+  const MCInst &getInstructionAtIndex(unsigned Index) const {
+    return Instructions.at(Index);
+  }
+
+  /// Return symbol marking the start of this basic block.
+  MCSymbol *getLabel() {
+    return Label;
+  }
+
+  /// Return symbol marking the start of this basic block (const version).
+  const MCSymbol *getLabel() const {
+    return Label;
+  }
+
+  /// Get successor with given \p Label if \p Label != nullptr.
+  /// Returns nullptr if no such successor is found.
+  /// If the \p Label == nullptr and the block has only one successor then
+  /// return the successor.
+  BinaryBasicBlock *getSuccessor(const MCSymbol *Label = nullptr) const;
+
+  /// Return the related branch info as well as the successor.
+  BinaryBasicBlock *getSuccessor(const MCSymbol *Label,
+                                 BinaryBranchInfo &BI) const;
+
+  /// If the basic block ends with a conditional branch (possibly followed by
+  /// an unconditional branch) and thus has 2 successors, return a successor
+  /// corresponding to a jump condition which could be true or false.
+  /// Return nullptr if the basic block does not have a conditional jump.
+  BinaryBasicBlock *getConditionalSuccessor(bool Condition) {
+    if (succ_size() != 2)
+      return nullptr;
+    return Successors[Condition == true ? 0 : 1];
+  }
+
+  const BinaryBasicBlock *getConditionalSuccessor(bool Condition) const {
+    return const_cast<BinaryBasicBlock *>(this)
+        ->getConditionalSuccessor(Condition);
+  }
+
+  /// Find the fallthrough successor for a block, or nullptr if there is
+  /// none.
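+  /// For a block with two successors this is the successor taken when the
+  /// branch condition is false; for a single-successor block it is simply
+  /// that successor.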
+ BinaryBasicBlock* getFallthrough() { + if (succ_size() == 2) + return getConditionalSuccessor(false); + else + return getSuccessor(); + } + + const BinaryBasicBlock *getFallthrough() const { + return const_cast(this)->getFallthrough(); + } + + /// Return branch info corresponding to a taken branch. + const BinaryBranchInfo &getTakenBranchInfo() const { + assert(BranchInfo.size() == 2 && + "could only be called for blocks with 2 successors"); + return BranchInfo[0]; + }; + + /// Return branch info corresponding to a fall-through branch. + const BinaryBranchInfo &getFallthroughBranchInfo() const { + assert(BranchInfo.size() == 2 && + "could only be called for blocks with 2 successors"); + return BranchInfo[1]; + }; + + /// Return branch info corresponding to an edge going to \p Succ basic block. + BinaryBranchInfo &getBranchInfo(const BinaryBasicBlock &Succ); + + /// Return branch info corresponding to an edge going to a basic block with + /// label \p Label. + BinaryBranchInfo &getBranchInfo(const MCSymbol *Label); + + /// Set branch information for the outgoing edge to block \p Succ. + void setSuccessorBranchInfo(const BinaryBasicBlock &Succ, + uint64_t Count, + uint64_t MispredictedCount) { + BinaryBranchInfo &BI = getBranchInfo(Succ); + BI.Count = Count; + BI.MispredictedCount = MispredictedCount; + } + + /// Try to compute the taken and misprediction frequencies for the given + /// successor. The result is an error if no information can be found. + ErrorOr> + getBranchStats(const BinaryBasicBlock *Succ) const; + + /// If the basic block ends with a conditional branch (possibly followed by + /// an unconditional branch) and thus has 2 successor, reverse the order of + /// its successors in CFG, update branch info, and return true. If the basic + /// block does not have 2 successors return false. + bool swapConditionalSuccessors(); + + /// Add an instruction with unconditional control transfer to \p Successor + /// basic block to the end of this basic block. + void addBranchInstruction(const BinaryBasicBlock *Successor); + + /// Add an instruction with tail call control transfer to \p Target + /// to the end of this basic block. + void addTailCallInstruction(const MCSymbol *Target); + + /// Return the number of call instructions in this basic block. + uint32_t getNumCalls() const; + + /// Get landing pad with given label. Returns nullptr if no such + /// landing pad is found. + BinaryBasicBlock *getLandingPad(const MCSymbol *Label) const; + + /// Return local name for the block. + StringRef getName() const { + return Label->getName(); + } + + /// Add instruction at the end of this basic block. + /// Returns iterator pointing to the inserted instruction. + iterator addInstruction(MCInst &&Inst) { + adjustNumPseudos(Inst, 1); + Instructions.emplace_back(Inst); + return std::prev(Instructions.end()); + } + + /// Add instruction at the end of this basic block. + /// Returns iterator pointing to the inserted instruction. + iterator addInstruction(const MCInst &Inst) { + adjustNumPseudos(Inst, 1); + Instructions.push_back(Inst); + return std::prev(Instructions.end()); + } + + /// Add a range of instructions to the end of this basic block. + template + void addInstructions(Itr Begin, Itr End) { + while (Begin != End) { + addInstruction(*Begin++); + } + } + + /// Add a range of instructions to the end of this basic block. + template + void addInstructions(RangeTy R) { + for (auto &I : R) + addInstruction(I); + } + + /// Add instruction before Pos in this basic block. 
+ template + Itr insertPseudoInstr(Itr Pos, MCInst &Instr) { + ++NumPseudos; + return Instructions.emplace(Pos, Instr); + } + + /// Return the number of pseudo instructions in the basic block. + uint32_t getNumPseudos() const; + + /// Return the number of emitted instructions for this basic block. + uint32_t getNumNonPseudos() const { + return size() - getNumPseudos(); + } + + /// Return iterator to the first non-pseudo instruction or end() + /// if no such instruction was found. + iterator getFirstNonPseudo(); + + /// Return a pointer to the first non-pseudo instruction in this basic + /// block. Returns nullptr if none exists. + MCInst *getFirstNonPseudoInstr() { + auto II = getFirstNonPseudo(); + return II == Instructions.end() ? nullptr : &*II; + } + + /// Return reverse iterator to the last non-pseudo instruction or rend() + /// if no such instruction was found. + reverse_iterator getLastNonPseudo(); + const_reverse_iterator getLastNonPseudo() const { + return const_cast(this)->getLastNonPseudo(); + } + + /// Return a pointer to the last non-pseudo instruction in this basic + /// block. Returns nullptr if none exists. + MCInst *getLastNonPseudoInstr() { + auto RII = getLastNonPseudo(); + return RII == Instructions.rend() ? nullptr : &*RII; + } + const MCInst *getLastNonPseudoInstr() const { + auto RII = getLastNonPseudo(); + return RII == Instructions.rend() ? nullptr : &*RII; + } + + /// Set CFI state at entry to this basic block. + void setCFIState(int32_t NewCFIState) { + assert((CFIState == -1 || NewCFIState == CFIState) && + "unexpected change of CFI state for basic block"); + CFIState = NewCFIState; + } + + /// Return CFI state (expected) at entry of this basic block. + int32_t getCFIState() const { + assert(CFIState >= 0 && "unknown CFI state"); + return CFIState; + } + + /// Calculate and return CFI state right before instruction \p Instr in + /// this basic block. If \p Instr is nullptr then return the state at + /// the end of the basic block. + int32_t getCFIStateAtInstr(const MCInst *Instr) const; + + /// Calculate and return CFI state after execution of this basic block. + /// The state depends on CFI state at entry and CFI instructions inside the + /// basic block. + int32_t getCFIStateAtExit() const { + return getCFIStateAtInstr(nullptr); + } + + /// Set minimum alignment for the basic block. + void setAlignment(uint32_t Align) { + Alignment = Align; + } + + /// Return required alignment for the block. + uint32_t getAlignment() const { + return Alignment; + } + + /// Set the maximum number of bytes to use for the block alignment. + void setAlignmentMaxBytes(uint32_t Value) { + AlignmentMaxBytes = Value; + } + + /// Return the maximum number of bytes to use for the block alignment. + uint32_t getAlignmentMaxBytes() const { + return AlignmentMaxBytes; + } + + /// Adds block to successor list, and also updates predecessor list for + /// successor block. + /// Set branch info for this path. + void addSuccessor(BinaryBasicBlock *Succ, + uint64_t Count = 0, + uint64_t MispredictedCount = 0); + + void addSuccessor(BinaryBasicBlock *Succ, const BinaryBranchInfo &BI) { + addSuccessor(Succ, BI.Count, BI.MispredictedCount); + } + + /// Add a range of successors. + template + void addSuccessors(Itr Begin, Itr End) { + while (Begin != End) { + addSuccessor(*Begin++); + } + } + + /// Add a range of successors with branch info. 
+ template + void addSuccessors(Itr Begin, Itr End, BrItr BrBegin, BrItr BrEnd) { + assert(std::distance(Begin, End) == std::distance(BrBegin, BrEnd)); + while (Begin != End) { + addSuccessor(*Begin++, *BrBegin++); + } + } + + /// Replace Succ with NewSucc. This routine is helpful for preserving + /// the order of conditional successors when editing the CFG. + void replaceSuccessor(BinaryBasicBlock *Succ, + BinaryBasicBlock *NewSucc, + uint64_t Count = 0, + uint64_t MispredictedCount = 0); + + /// Move all of this block's successors to a new block, and set the + /// execution count of this new block with our execution count. This is + /// useful when splitting a block in two. + void moveAllSuccessorsTo(BinaryBasicBlock *New) { + New->addSuccessors(successors().begin(), + successors().end(), + branch_info_begin(), + branch_info_end()); + removeAllSuccessors(); + + // Update the execution count on the new block. + New->setExecutionCount(getExecutionCount()); + } + + /// Remove /p Succ basic block from the list of successors. Update the + /// list of predecessors of /p Succ and update branch info. + void removeSuccessor(BinaryBasicBlock *Succ); + + /// Remove all successors of the basic block, and remove the block + /// from respective lists of predecessors. + void removeAllSuccessors(); + + /// Remove useless duplicate successors. When the conditional + /// successor is the same as the unconditional successor, we can + /// remove the conditional successor and branch instruction. + void removeDuplicateConditionalSuccessor(MCInst *CondBranch); + + /// Test if BB is a predecessor of this block. + bool isPredecessor(const BinaryBasicBlock *BB) const { + auto Itr = std::find(Predecessors.begin(), Predecessors.end(), BB); + return Itr != Predecessors.end(); + } + + /// Test if BB is a successor of this block. + bool isSuccessor(const BinaryBasicBlock *BB) const { + auto Itr = std::find(Successors.begin(), Successors.end(), BB); + return Itr != Successors.end(); + } + + /// Test if this BB has a valid execution count. + bool hasProfile() const { + return ExecutionCount != COUNT_NO_PROFILE; + } + + /// Return the information about the number of times this basic block was + /// executed. + /// + /// Return COUNT_NO_PROFILE if there's no profile info. + uint64_t getExecutionCount() const { + return ExecutionCount; + } + + /// Return the execution count for blocks with known profile. + /// Return 0 if the block has no profile. + uint64_t getKnownExecutionCount() const { + return !hasProfile() ? 0 : ExecutionCount; + } + + /// Set the execution count for this block. + void setExecutionCount(uint64_t Count) { + ExecutionCount = Count; + } + + /// Apply a given \p Ratio to the profile information of this basic block. + void adjustExecutionCount(double Ratio); + + /// Return true if the basic block is an entry point into the function + /// (either primary or secondary). + bool isEntryPoint() const; + + bool isValid() const { + return IsValid; + } + + void markValid(const bool Valid) { + IsValid = Valid; + } + + bool isCold() const { + return IsCold; + } + + void setIsCold(const bool Flag) { + IsCold = Flag; + } + + /// Return true if the block can be outlined. At the moment we disallow + /// outlining of blocks that can potentially throw exceptions or are + /// the beginning of a landing pad. The entry basic block also can + /// never be outlined. 
+ bool canOutline() const { + return CanOutline; + } + + void setCanOutline(const bool Flag) { + CanOutline = Flag; + } + + /// Erase pseudo instruction at a given iterator. + /// Return iterator following the removed instruction. + iterator erasePseudoInstruction(iterator II) { + --NumPseudos; + return Instructions.erase(II); + } + + /// Erase non-pseudo instruction at a given iterator \p II. + /// Return iterator following the removed instruction. + iterator eraseInstruction(iterator II) { + adjustNumPseudos(*II, -1); + return Instructions.erase(II); + } + + /// Erase instructions in the specified range. + template + void eraseInstructions(ItrType Begin, ItrType End) { + while (End > Begin) { + eraseInstruction(findInstruction(*--End)); + } + } + + /// Erase all instructions. + void clear() { + Instructions.clear(); + NumPseudos = 0; + } + + /// Retrieve iterator for \p Inst or return end iterator if instruction is not + /// from this basic block. + decltype(Instructions)::iterator findInstruction(const MCInst *Inst) { + if (Instructions.empty()) + return Instructions.end(); + size_t Index = Inst - &Instructions[0]; + return Index >= Instructions.size() ? Instructions.end() + : Instructions.begin() + Index; + } + + /// Replace instruction referenced by iterator \II with a sequence of + /// instructions defined by [\p Begin, \p End] range. + /// + /// Return iterator pointing to the first inserted instruction. + template + iterator replaceInstruction(iterator II, Itr Begin, Itr End) { + adjustNumPseudos(*II, -1); + adjustNumPseudos(Begin, End, 1); + + auto I = II - Instructions.begin(); + Instructions.insert(Instructions.erase(II), Begin, End); + return I + Instructions.begin(); + } + + iterator replaceInstruction(iterator II, + const std::vector &Replacement) { + return replaceInstruction(II, Replacement.begin(), Replacement.end()); + } + + /// Insert \p NewInst before \p At, which must be an existing instruction in + /// this BB. Return iterator pointing to the newly inserted instruction. + iterator insertInstruction(iterator At, MCInst &&NewInst) { + adjustNumPseudos(NewInst, 1); + return Instructions.emplace(At, std::move(NewInst)); + } + + iterator insertInstruction(iterator At, MCInst &NewInst) { + adjustNumPseudos(NewInst, 1); + return Instructions.emplace(At, NewInst); + } + + /// Helper to retrieve any terminators in \p BB before \p Pos. This is used + /// to skip CFI instructions and to retrieve the first terminator instruction + /// in basic blocks with two terminators (conditional jump and unconditional + /// jump). + MCInst *getTerminatorBefore(MCInst *Pos); + + /// Used to identify whether an instruction is before a terminator and whether + /// moving it to the end of the BB would render it dead code. + bool hasTerminatorAfter(MCInst *Pos); + + /// Split apart the instructions in this basic block starting at Inst. + /// The instructions following Inst are removed and returned in a vector. + std::vector splitInstructions(const MCInst *Inst) { + std::vector SplitInst; + + assert(!Instructions.empty()); + while(&Instructions.back() != Inst) { + SplitInst.push_back(Instructions.back()); + Instructions.pop_back(); + } + std::reverse(SplitInst.begin(), SplitInst.end()); + NumPseudos = 0; + adjustNumPseudos(Instructions.begin(), Instructions.end(), 1); + return SplitInst; + } + + /// Split basic block at the instruction pointed to by II. + /// All iterators pointing after II get invalidated. 
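+  ///
+  /// A sketch of the intended use: given a valid instruction iterator II
+  /// into block BB,
+  ///   BinaryBasicBlock *Tail = BB->splitAt(II);
+  /// leaves BB ending just before *II and moves *II and all following
+  /// instructions into the returned block.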
+ /// + /// Return the new basic block that starts with the instruction + /// at the split point. + BinaryBasicBlock *splitAt(iterator II); + + /// Sets address of the basic block in the output. + void setOutputStartAddress(uint64_t Address) { + OutputAddressRange.first = Address; + } + + /// Sets address past the end of the basic block in the output. + void setOutputEndAddress(uint64_t Address) { + OutputAddressRange.second = Address; + } + + /// Gets the memory address range of this BB in the input binary. + std::pair getInputAddressRange() const { + return InputRange; + } + + /// Gets the memory address range of this BB in the output binary. + std::pair getOutputAddressRange() const { + return OutputAddressRange; + } + + /// Update addresses of special instructions inside this basic block. + void updateOutputValues(const MCAsmLayout &Layout); + + /// Return mapping of input offsets to symbols in the output. + LocSymsTy &getLocSyms() { + return LocSyms ? *LocSyms : *(LocSyms = std::make_unique()); + } + + /// Return mapping of input offsets to symbols in the output. + const LocSymsTy &getLocSyms() const { + return const_cast(this)->getLocSyms(); + } + + /// Return offset translation table for the basic block. + OffsetTranslationTableTy &getOffsetTranslationTable() { + return OffsetTranslationTable ? + *OffsetTranslationTable : + *(OffsetTranslationTable = std::make_unique()); + } + + /// Return offset translation table for the basic block. + const OffsetTranslationTableTy &getOffsetTranslationTable() const { + return const_cast(this)->getOffsetTranslationTable(); + } + + /// Return size of the basic block in the output binary. + uint64_t getOutputSize() const { + return OutputAddressRange.second - OutputAddressRange.first; + } + + BinaryFunction *getFunction() const { + return Function; + } + + /// Analyze and interpret the terminators of this basic block. TBB must be + /// initialized with the original fall-through for this BB. + bool analyzeBranch(const MCSymbol *&TBB, + const MCSymbol *&FBB, + MCInst *&CondBranch, + MCInst *&UncondBranch); + + /// Return true if iterator \p I is pointing to the first instruction in + /// a pair that could be macro-fused. + bool isMacroOpFusionPair(const_iterator I) const; + + /// If the basic block has a pair of instructions suitable for macro-fusion, + /// return iterator to the first instruction of the pair. + /// Otherwise return end(). + const_iterator getMacroOpFusionPair() const; + + /// Printer required for printing dominator trees. + void printAsOperand(raw_ostream &OS, bool PrintType = true) { + if (PrintType) { + OS << "basic block "; + } + OS << getName(); + } + + /// A simple dump function for debugging. + void dump() const; + + /// Validate successor invariants for this BB. + bool validateSuccessorInvariants(); + + /// Return offset of the basic block from the function start on input. + uint32_t getInputOffset() const { + return InputRange.first; + } + + /// Return offset from the function start to location immediately past + /// the end of the basic block. + uint32_t getEndOffset() const { + return InputRange.second; + } + + /// Return size of the basic block on input. + uint32_t getOriginalSize() const { + return InputRange.second - InputRange.first; + } + + /// Returns an estimate of size of basic block during run time optionally + /// using a user-supplied emitter for lock-free multi-thread work. + /// MCCodeEmitter is not thread safe and each thread should operate with its + /// own copy of it. 
+ uint64_t estimateSize(const MCCodeEmitter *Emitter = nullptr) const; + + /// Return index in the current layout. The user is responsible for + /// making sure the indices are up to date, + /// e.g. by calling BinaryFunction::updateLayoutIndices(); + unsigned getLayoutIndex() const { + assert(isValid()); + return LayoutIndex; + } + + /// Set layout index. To be used by BinaryFunction. + void setLayoutIndex(unsigned Index) const { + LayoutIndex = Index; + } + + /// Needed by graph traits. + BinaryFunction *getParent() const { + return getFunction(); + } + + /// Return true if the containing function is in CFG state. + bool hasCFG() const; + + /// Return true if the containing function is in a state with instructions. + bool hasInstructions() const; + + /// Return offset of the basic block from the function start. + uint32_t getOffset() const { + return InputRange.first; + } + + /// Get the index of this basic block. + unsigned getIndex() const { + assert(isValid()); + return Index; + } + +private: + void adjustNumPseudos(const MCInst &Inst, int Sign); + + template + void adjustNumPseudos(Itr Begin, Itr End, int Sign) { + while (Begin != End) { + adjustNumPseudos(*Begin++, Sign); + } + } + + /// Adds predecessor to the BB. Most likely you don't need to call this. + void addPredecessor(BinaryBasicBlock *Pred); + + /// Remove predecessor of the basic block. Don't use directly, instead + /// use removeSuccessor() function. + /// If \p Multiple is set to true, it will remove all predecessors that + /// are equal to \p Pred. Otherwise, the first instance of \p Pred found + /// will be removed. This only matters in awkward, redundant CFGs. + void removePredecessor(BinaryBasicBlock *Pred, bool Multiple=true); + + /// Set end offset of this basic block. + void setEndOffset(uint32_t Offset) { + InputRange.second = Offset; + } + + /// Set the index of this basic block. + void setIndex(unsigned I) { + Index = I; + } + + template void clearList(T& List) { + T TempList; + TempList.swap(List); + } + + /// Release memory taken by CFG edges and instructions. + void releaseCFG() { + clearList(Predecessors); + clearList(Successors); + clearList(Throwers); + clearList(LandingPads); + clearList(BranchInfo); + clearList(Instructions); + } +}; + +/// Keep the size of the BinaryBasicBlock within a reasonable size class. 
+static_assert(sizeof(BinaryBasicBlock) <= 256, "");
+
+bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS);
+
+} // namespace bolt
+
+
+// GraphTraits specializations for basic block graphs (CFGs)
+template <> struct GraphTraits<bolt::BinaryBasicBlock *> {
+  using NodeRef = bolt::BinaryBasicBlock *;
+  using ChildIteratorType = bolt::BinaryBasicBlock::succ_iterator;
+
+  static NodeRef getEntryNode(bolt::BinaryBasicBlock *BB) { return BB; }
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->succ_begin();
+  }
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->succ_end();
+  }
+};
+
+template <> struct GraphTraits<const bolt::BinaryBasicBlock *> {
+  using NodeRef = const bolt::BinaryBasicBlock *;
+  using ChildIteratorType = bolt::BinaryBasicBlock::const_succ_iterator;
+
+  static NodeRef getEntryNode(const bolt::BinaryBasicBlock *BB) {
+    return BB;
+  }
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->succ_begin();
+  }
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->succ_end();
+  }
+};
+
+template <> struct GraphTraits<Inverse<bolt::BinaryBasicBlock *>> {
+  using NodeRef = bolt::BinaryBasicBlock *;
+  using ChildIteratorType = bolt::BinaryBasicBlock::pred_iterator;
+  static NodeRef getEntryNode(Inverse<bolt::BinaryBasicBlock *> G) {
+    return G.Graph;
+  }
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->pred_begin();
+  }
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->pred_end();
+  }
+};
+
+template <> struct GraphTraits<Inverse<const bolt::BinaryBasicBlock *>> {
+  using NodeRef = const bolt::BinaryBasicBlock *;
+  using ChildIteratorType = bolt::BinaryBasicBlock::const_pred_iterator;
+  static NodeRef getEntryNode(Inverse<const bolt::BinaryBasicBlock *> G) {
+    return G.Graph;
+  }
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->pred_begin();
+  }
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->pred_end();
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp
new file mode 100644
index 000000000000..15f756b8ddaf
--- /dev/null
+++ b/bolt/src/BinaryContext.cpp
@@ -0,0 +1,2179 @@
+//===--- BinaryContext.cpp - Interface for machine-level context ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryContext.h"
+#include "BinaryEmitter.h"
+#include "BinaryFunction.h"
+#include "NameResolver.h"
+#include "Utils.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Regex.h"
+#include
+#include
+
+using namespace llvm;
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+namespace opts {
+
+extern cl::OptionCategory BoltCategory;
+
+extern cl::opt<bool> AggregateOnly;
+extern cl::opt<bool> HotText;
+extern cl::opt<bool> HotData;
+extern cl::opt<bool> StrictMode;
+extern cl::opt<bool> UseOldText;
+extern cl::opt<unsigned> Verbosity;
+extern cl::opt<uint64_t> ExecutionCountThreshold;
+
+extern bool processAllFunctions();
+
+cl::opt<bool>
+NoHugePages("no-huge-pages",
+  cl::desc("use regular size pages for code alignment"),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+PrintDebugInfo("print-debug-info",
+  cl::desc("print debug info when printing functions"),
+  cl::Hidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+
+cl::opt<bool>
+PrintRelocations("print-relocations",
+  cl::desc("print relocations when printing functions/objects"),
+  cl::Hidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+PrintMemData("print-mem-data",
+  cl::desc("print memory data annotations when printing functions"),
+  cl::Hidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+BinaryContext::BinaryContext(std::unique_ptr<MCContext> Ctx,
+                             std::unique_ptr<DWARFContext> DwCtx,
+                             std::unique_ptr<Triple> TheTriple,
+                             const Target *TheTarget,
+                             std::string TripleName,
+                             std::unique_ptr<MCCodeEmitter> MCE,
+                             std::unique_ptr<MCObjectFileInfo> MOFI,
+                             std::unique_ptr<MCAsmInfo> AsmInfo,
+                             std::unique_ptr<MCInstrInfo> MII,
+                             std::unique_ptr<MCSubtargetInfo> STI,
+                             std::unique_ptr<MCInstPrinter> InstPrinter,
+                             std::unique_ptr<MCInstrAnalysis> MIA,
+                             std::unique_ptr<MCPlusBuilder> MIB,
+                             std::unique_ptr<MCRegisterInfo> MRI,
+                             std::unique_ptr<MCDisassembler> DisAsm)
+    : Ctx(std::move(Ctx)),
+      DwCtx(std::move(DwCtx)),
+      TheTriple(std::move(TheTriple)),
+      TheTarget(TheTarget),
+      TripleName(TripleName),
+      MCE(std::move(MCE)),
+      MOFI(std::move(MOFI)),
+      AsmInfo(std::move(AsmInfo)),
+      MII(std::move(MII)),
+      STI(std::move(STI)),
+      InstPrinter(std::move(InstPrinter)),
+      MIA(std::move(MIA)),
+      MIB(std::move(MIB)),
+      MRI(std::move(MRI)),
+      DisAsm(std::move(DisAsm)) {
+  Relocation::Arch = this->TheTriple->getArch();
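+  // Unless -no-huge-pages is given, align code at huge-page boundaries so
+  // the hot text can be backed by 2MiB pages, reducing iTLB pressure.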
+
+BinaryContext::~BinaryContext() {
+  for (BinarySection *Section : Sections) {
+    delete Section;
+  }
+  for (BinaryFunction *InjectedFunction : InjectedBinaryFunctions) {
+    delete InjectedFunction;
+  }
+  for (std::pair<const uint64_t, JumpTable *> JTI : JumpTables) {
+    delete JTI.second;
+  }
+  clearBinaryData();
+}
+
+extern MCPlusBuilder *createX86MCPlusBuilder(const MCInstrAnalysis *,
+                                             const MCInstrInfo *,
+                                             const MCRegisterInfo *);
+extern MCPlusBuilder *createAArch64MCPlusBuilder(const MCInstrAnalysis *,
+                                                 const MCInstrInfo *,
+                                                 const MCRegisterInfo *);
+
+namespace {
+
+MCPlusBuilder *createMCPlusBuilder(const Triple::ArchType Arch,
+                                   const MCInstrAnalysis *Analysis,
+                                   const MCInstrInfo *Info,
+                                   const MCRegisterInfo *RegInfo) {
+#ifdef X86_AVAILABLE
+  if (Arch == Triple::x86_64)
+    return createX86MCPlusBuilder(Analysis, Info, RegInfo);
+#endif
+
+#ifdef AARCH64_AVAILABLE
+  if (Arch == Triple::aarch64)
+    return createAArch64MCPlusBuilder(Analysis, Info, RegInfo);
+#endif
+
+  llvm_unreachable("architecture unsupported by MCPlusBuilder");
+}
+
+} // anonymous namespace
+
+/// Create BinaryContext matching the architecture and triple of the object
+/// file \p File.
+std::unique_ptr<BinaryContext>
+BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC,
+                                   std::unique_ptr<DWARFContext> DwCtx) {
+  StringRef ArchName = "";
+  StringRef FeaturesStr = "";
+  switch (File->getArch()) {
+  case llvm::Triple::x86_64:
+    ArchName = "x86-64";
+    FeaturesStr = "+nopl";
+    break;
+  case llvm::Triple::aarch64:
+    ArchName = "aarch64";
+    FeaturesStr = "+fp-armv8,+neon,+crypto,+dotprod,+crc,+lse,+ras,+rdm,"
+                  "+fullfp16,+spe,+fuse-aes,+rcpc";
+    break;
+  default:
+    errs() << "BOLT-ERROR: Unrecognized machine in ELF file.\n";
+    return nullptr;
+  }
+
+  auto TheTriple = std::make_unique<Triple>(File->makeTriple());
+  const std::string TripleName = TheTriple->str();
+
+  std::string Error;
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget(std::string(ArchName), *TheTriple, Error);
+  if (!TheTarget) {
+    errs() << "BOLT-ERROR: " << Error;
+    return nullptr;
+  }
+
+  std::unique_ptr<const MCRegisterInfo> MRI(
+      TheTarget->createMCRegInfo(TripleName));
+  if (!MRI) {
+    errs() << "BOLT-ERROR: no register info for target " << TripleName << "\n";
+    return nullptr;
+  }
+
+  // Set up disassembler.
+  std::unique_ptr<const MCAsmInfo> AsmInfo(
+      TheTarget->createMCAsmInfo(*MRI, TripleName, MCTargetOptions()));
+  if (!AsmInfo) {
+    errs() << "BOLT-ERROR: no assembly info for target " << TripleName << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<const MCSubtargetInfo> STI(
+      TheTarget->createMCSubtargetInfo(TripleName, "", FeaturesStr));
+  if (!STI) {
+    errs() << "BOLT-ERROR: no subtarget info for target " << TripleName << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<const MCInstrInfo> MII(TheTarget->createMCInstrInfo());
+  if (!MII) {
+    errs() << "BOLT-ERROR: no instruction info for target " << TripleName
+           << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<MCContext> Ctx(
+      new MCContext(*TheTriple, AsmInfo.get(), MRI.get(), STI.get()));
+  std::unique_ptr<MCObjectFileInfo> MOFI(
+      TheTarget->createMCObjectFileInfo(*Ctx, IsPIC));
+  Ctx->setObjectFileInfo(MOFI.get());
+  // We do not support the X86 large code model. Change this in the future.
+  bool Large = false;
+  if (TheTriple->getArch() == llvm::Triple::aarch64)
+    Large = true;
+  unsigned LSDAEncoding =
+      Large ? dwarf::DW_EH_PE_absptr : dwarf::DW_EH_PE_udata4;
+  unsigned TTypeEncoding =
+      Large ? dwarf::DW_EH_PE_absptr : dwarf::DW_EH_PE_udata4;
+  if (IsPIC) {
+    LSDAEncoding = dwarf::DW_EH_PE_pcrel |
+                   (Large ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4);
+    TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+                    (Large ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4);
+  }
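The DWARF exception-header encodings combined above are single bytes: the low nibble selects the value format and the high bits the application mode. A small sanity check of the two PIC combinations, using the constants from llvm/BinaryFormat/Dwarf.h:

```cpp
#include "llvm/BinaryFormat/Dwarf.h"
#include <cassert>

void ehEncodingDemo() {
  using namespace llvm;
  // Small code model, PIC: 4-byte signed value, PC-relative.
  assert((dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4) == 0x1B);
  // Large code model, PIC: 8-byte signed value, PC-relative, read
  // through an extra indirection.
  assert((dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
          dwarf::DW_EH_PE_sdata8) == 0x9C);
}
```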
+
+  std::unique_ptr<MCDisassembler> DisAsm(
+      TheTarget->createMCDisassembler(*STI, *Ctx));
+
+  if (!DisAsm) {
+    errs() << "BOLT-ERROR: no disassembler for target " << TripleName << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<const MCInstrAnalysis> MIA(
+      TheTarget->createMCInstrAnalysis(MII.get()));
+  if (!MIA) {
+    errs() << "BOLT-ERROR: failed to create instruction analysis for target "
+           << TripleName << "\n";
+    return nullptr;
+  }
+
+  std::unique_ptr<MCPlusBuilder> MIB(createMCPlusBuilder(
+      TheTriple->getArch(), MIA.get(), MII.get(), MRI.get()));
+  if (!MIB) {
+    errs() << "BOLT-ERROR: failed to create instruction builder for target "
+           << TripleName << "\n";
+    return nullptr;
+  }
+
+  int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
+  std::unique_ptr<MCInstPrinter> InstructionPrinter(
+      TheTarget->createMCInstPrinter(*TheTriple, AsmPrinterVariant, *AsmInfo,
+                                     *MII, *MRI));
+  if (!InstructionPrinter) {
+    errs() << "BOLT-ERROR: no instruction printer for target " << TripleName
+           << '\n';
+    return nullptr;
+  }
+  InstructionPrinter->setPrintImmHex(true);
+
+  std::unique_ptr<MCCodeEmitter> MCE(
+      TheTarget->createMCCodeEmitter(*MII, *MRI, *Ctx));
+
+  // Make sure we don't miss any output on core dumps.
+  outs().SetUnbuffered();
+  errs().SetUnbuffered();
+  dbgs().SetUnbuffered();
+
+  auto BC = std::make_unique<BinaryContext>(
+      std::move(Ctx), std::move(DwCtx), std::move(TheTriple), TheTarget,
+      std::string(TripleName), std::move(MCE), std::move(MOFI),
+      std::move(AsmInfo), std::move(MII), std::move(STI),
+      std::move(InstructionPrinter), std::move(MIA), std::move(MIB),
+      std::move(MRI), std::move(DisAsm));
+
+  BC->TTypeEncoding = TTypeEncoding;
+  BC->LSDAEncoding = LSDAEncoding;
+
+  BC->MAB = std::unique_ptr<MCAsmBackend>(
+      BC->TheTarget->createMCAsmBackend(*BC->STI, *BC->MRI, MCTargetOptions()));
+
+  BC->setFilename(File->getFileName());
+
+  BC->HasFixedLoadAddress = !IsPIC;
+
+  return BC;
+}
+
+bool BinaryContext::forceSymbolRelocations(StringRef SymbolName) const {
+  if (opts::HotText && (SymbolName == "__hot_start" ||
+                        SymbolName == "__hot_end"))
+    return true;
+
+  if (opts::HotData && (SymbolName == "__hot_data_start" ||
+                        SymbolName == "__hot_data_end"))
+    return true;
+
+  if (SymbolName == "_end")
+    return true;
+
+  return false;
+}
+
+std::unique_ptr<MCObjectWriter>
+BinaryContext::createObjectWriter(raw_pwrite_stream &OS) {
+  return MAB->createObjectWriter(OS);
+}
+
+bool BinaryContext::validateObjectNesting() const {
+  auto Itr = BinaryDataMap.begin();
+  auto End = BinaryDataMap.end();
+  bool Valid = true;
+  while (Itr != End) {
+    auto Next = std::next(Itr);
+    while (Next != End &&
+           Itr->second->getSection() == Next->second->getSection() &&
+           Itr->second->containsRange(Next->second->getAddress(),
+                                      Next->second->getSize())) {
+      if (Next->second->Parent != Itr->second) {
+        errs() << "BOLT-WARNING: object nesting incorrect for:\n"
+               << "BOLT-WARNING:  " << *Itr->second << "\n"
+               << "BOLT-WARNING:  " << *Next->second << "\n";
+        Valid = false;
+      }
+      ++Next;
+    }
+    Itr = Next;
+  }
+  return Valid;
+}
+
+bool BinaryContext::validateHoles() const {
+  bool Valid = true;
+  for (BinarySection &Section : sections()) {
+    for (const Relocation &Rel : Section.relocations()) {
+      uint64_t RelAddr = Rel.Offset + Section.getAddress();
+      const BinaryData *BD = getBinaryDataContainingAddress(RelAddr);
+      if (!BD) {
+        errs() << "BOLT-WARNING: no BinaryData found for relocation at address"
+               << " 0x" << Twine::utohexstr(RelAddr) << " in "
+
<< Section.getName() << "\n"; + Valid = false; + } else if (!BD->getAtomicRoot()) { + errs() << "BOLT-WARNING: no atomic BinaryData found for relocation at " + << "address 0x" << Twine::utohexstr(RelAddr) << " in " + << Section.getName() << "\n"; + Valid = false; + } + } + } + return Valid; +} + +void BinaryContext::updateObjectNesting(BinaryDataMapType::iterator GAI) { + const uint64_t Address = GAI->second->getAddress(); + const uint64_t Size = GAI->second->getSize(); + + auto fixParents = + [&](BinaryDataMapType::iterator Itr, BinaryData *NewParent) { + BinaryData *OldParent = Itr->second->Parent; + Itr->second->Parent = NewParent; + ++Itr; + while (Itr != BinaryDataMap.end() && OldParent && + Itr->second->Parent == OldParent) { + Itr->second->Parent = NewParent; + ++Itr; + } + }; + + // Check if the previous symbol contains the newly added symbol. + if (GAI != BinaryDataMap.begin()) { + BinaryData *Prev = std::prev(GAI)->second; + while (Prev) { + if (Prev->getSection() == GAI->second->getSection() && + Prev->containsRange(Address, Size)) { + fixParents(GAI, Prev); + } else { + fixParents(GAI, nullptr); + } + Prev = Prev->Parent; + } + } + + // Check if the newly added symbol contains any subsequent symbols. + if (Size != 0) { + BinaryData *BD = GAI->second->Parent ? GAI->second->Parent : GAI->second; + auto Itr = std::next(GAI); + while (Itr != BinaryDataMap.end() && + BD->containsRange(Itr->second->getAddress(), + Itr->second->getSize())) { + Itr->second->Parent = BD; + ++Itr; + } + } +} + +iterator_range +BinaryContext::getSubBinaryData(BinaryData *BD) { + auto Start = std::next(BinaryDataMap.find(BD->getAddress())); + auto End = Start; + while (End != BinaryDataMap.end() && + BD->isAncestorOf(End->second)) { + ++End; + } + return make_range(Start, End); +} + +std::pair +BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF, + bool IsPCRel) { + uint64_t Addend = 0; + + if (isAArch64()) { + // Check if this is an access to a constant island and create bookkeeping + // to keep track of it and emit it later as part of this function. + if (MCSymbol *IslandSym = BF.getOrCreateIslandAccess(Address)) + return std::make_pair(IslandSym, Addend); + + // Detect custom code written in assembly that refers to arbitrary + // constant islands from other functions. Write this reference so we + // can pull this constant island and emit it as part of this function + // too. + auto IslandIter = AddressToConstantIslandMap.lower_bound(Address); + if (IslandIter != AddressToConstantIslandMap.end()) { + if (MCSymbol *IslandSym = + IslandIter->second->getOrCreateProxyIslandAccess(Address, BF)) { + /// Make this function depend on IslandIter->second because we have + /// a reference to its constant island. When emitting this function, + /// we will also emit IslandIter->second's constants. This only + /// happens in custom AArch64 assembly code. + BF.Islands.Dependency.insert(IslandIter->second); + BF.Islands.ProxySymbols[IslandSym] = IslandIter->second; + return std::make_pair(IslandSym, Addend); + } + } + } + + // Note that the address does not necessarily have to reside inside + // a section, it could be an absolute address too. + ErrorOr Section = getSectionForAddress(Address); + if (Section && Section->isText()) { + if (BF.containsAddress(Address, /*UseMaxSize=*/ isAArch64())) { + if (Address != BF.getAddress()) { + // The address could potentially escape. Mark it as another entry + // point into the function. 
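A concrete way such an internal address escapes is a GNU computed goto, where the address of a label inside a function body is stored in a data table; BOLT must then treat that label as a secondary entry point. An illustrative source fragment (not from this patch):

```cpp
// &&label (a GNU extension supported by GCC and Clang) takes the address
// of a point *inside* dispatch(); the table below holds text addresses
// that are not a function start, exactly the "escaped address" case
// handled above via addEntryPointAtOffset().
void dispatch(int Op) {
  static void *Targets[] = {&&handle_add, &&handle_sub};
  goto *Targets[Op & 1];
handle_add:
  return;
handle_sub:
  return;
}
```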
+ if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: potentially escaped address 0x" + << Twine::utohexstr(Address) << " in function " + << BF << '\n'; + } + BF.HasInternalLabelReference = true; + return std::make_pair( + BF.addEntryPointAtOffset(Address - BF.getAddress()), + Addend); + } + } else { + BF.InterproceduralReferences.insert(Address); + } + } + + // With relocations, catch jump table references outside of the basic block + // containing the indirect jump. + if (HasRelocations) { + const MemoryContentsType MemType = analyzeMemoryAt(Address, BF); + if (MemType == MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE && IsPCRel) { + const MCSymbol *Symbol = + getOrCreateJumpTable(BF, Address, JumpTable::JTT_PIC); + + return std::make_pair(Symbol, Addend); + } + } + + if (BinaryData *BD = getBinaryDataContainingAddress(Address)) { + return std::make_pair(BD->getSymbol(), Address - BD->getAddress()); + } + + // TODO: use DWARF info to get size/alignment here? + MCSymbol *TargetSymbol = getOrCreateGlobalSymbol(Address, "DATAat"); + LLVM_DEBUG(dbgs() << "Created symbol " << TargetSymbol->getName() << '\n'); + return std::make_pair(TargetSymbol, Addend); +} + +MemoryContentsType +BinaryContext::analyzeMemoryAt(uint64_t Address, BinaryFunction &BF) { + if (!isX86()) + return MemoryContentsType::UNKNOWN; + + ErrorOr Section = getSectionForAddress(Address); + if (!Section) { + // No section - possibly an absolute address. Since we don't allow + // internal function addresses to escape the function scope - we + // consider it a tail call. + if (opts::Verbosity > 1) { + errs() << "BOLT-WARNING: no section for address 0x" + << Twine::utohexstr(Address) << " referenced from function " + << BF << '\n'; + } + return MemoryContentsType::UNKNOWN; + } + + if (Section->isVirtual()) { + // The contents are filled at runtime. + return MemoryContentsType::UNKNOWN; + } + + // No support for jump tables in code yet. + if (Section->isText()) + return MemoryContentsType::UNKNOWN; + + // Start with checking for PIC jump table. We expect non-PIC jump tables + // to have high 32 bits set to 0. + if (analyzeJumpTable(Address, JumpTable::JTT_PIC, BF)) + return MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE; + + if (analyzeJumpTable(Address, JumpTable::JTT_NORMAL, BF)) + return MemoryContentsType::POSSIBLE_JUMP_TABLE; + + return MemoryContentsType::UNKNOWN; +} + +bool BinaryContext::analyzeJumpTable(const uint64_t Address, + const JumpTable::JumpTableType Type, + BinaryFunction &BF, + const uint64_t NextJTAddress, + JumpTable::OffsetsType *Offsets) { + // Is one of the targets __builtin_unreachable? + bool HasUnreachable = false; + + // Number of targets other than __builtin_unreachable. + uint64_t NumRealEntries = 0; + + constexpr uint64_t INVALID_OFFSET = std::numeric_limits::max(); + auto addOffset = [&](uint64_t Offset) { + if (Offsets) + Offsets->emplace_back(Offset); + }; + + auto isFragment = [](BinaryFunction &Fragment, + BinaryFunction &Parent) -> bool { + // Check if == .cold(.\d+)? + for (StringRef BFName : Parent.getNames()) { + std::string BFNamePrefix = Regex::escape(NameResolver::restore(BFName)); + std::string BFNameRegex = + Twine(BFNamePrefix, "\\.cold(\\.[0-9]+)?").str(); + if (Fragment.hasRestoredNameRegex(BFNameRegex)) + return true; + } + return false; + }; + + auto doesBelongToFunction = [&](const uint64_t Addr, + BinaryFunction *TargetBF) -> bool { + if (BF.containsAddress(Addr)) + return true; + // Nothing to do if we failed to identify the containing function. 
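Before moving on, the name matching performed by the isFragment lambda above is easiest to see on an example. For a hypothetical parent function named process_request, the constructed pattern is process_request\.cold(\.[0-9]+)?:

```cpp
#include "llvm/Support/Regex.h"
#include <cassert>

void fragmentNameDemo() {
  // Same shape as the pattern isFragment builds; the function name is
  // illustrative, and Regex::escape() would have escaped any special
  // characters in it first.
  llvm::Regex ColdRE("process_request\\.cold(\\.[0-9]+)?");
  assert(ColdRE.match("process_request.cold"));    // unnumbered fragment
  assert(ColdRE.match("process_request.cold.42")); // numbered fragment
}
```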
+ if (!TargetBF) + return false; + // Case 1: check if BF is a fragment and TargetBF is its parent. + if (BF.isFragment()) { + // BF is a fragment, but parent function is not registered. + // This means there's no direct jump between parent and fragment. + // Set parent link here in jump table analysis, based on function name + // matching heuristic. + if (!BF.getParentFragment() && isFragment(BF, *TargetBF)) + registerFragment(BF, *TargetBF); + return BF.getParentFragment() == TargetBF; + } + // Case 2: check if TargetBF is a fragment and BF is its parent. + if (TargetBF->isFragment()) { + // TargetBF is a fragment, but parent function is not registered. + // This means there's no direct jump between parent and fragment. + // Set parent link here in jump table analysis, based on function name + // matching heuristic. + if (!TargetBF->getParentFragment() && isFragment(*TargetBF, BF)) + registerFragment(*TargetBF, BF); + return TargetBF->getParentFragment() == &BF; + } + return false; + }; + + ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return false; + + // The upper bound is defined by containing object, section limits, and + // the next jump table in memory. + uint64_t UpperBound = Section->getEndAddress(); + const BinaryData *JumpTableBD = getBinaryDataAtAddress(Address); + if (JumpTableBD && JumpTableBD->getSize()) { + assert(JumpTableBD->getEndAddress() <= UpperBound && + "data object cannot cross a section boundary"); + UpperBound = JumpTableBD->getEndAddress(); + } + if (NextJTAddress) { + UpperBound = std::min(NextJTAddress, UpperBound); + } + + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: analyzeJumpTable in " << BF.getPrintName() + << '\n'); + const uint64_t EntrySize = getJumpTableEntrySize(Type); + for (uint64_t EntryAddress = Address; EntryAddress <= UpperBound - EntrySize; + EntryAddress += EntrySize) { + LLVM_DEBUG(dbgs() << " * Checking 0x" << Twine::utohexstr(EntryAddress) + << " -> "); + // Check if there's a proper relocation against the jump table entry. + if (HasRelocations) { + if (Type == JumpTable::JTT_PIC && + !DataPCRelocations.count(EntryAddress)) { + LLVM_DEBUG( + dbgs() << "FAIL: JTT_PIC table, no relocation for this address\n"); + break; + } + if (Type == JumpTable::JTT_NORMAL && !getRelocationAt(EntryAddress)) { + LLVM_DEBUG( + dbgs() + << "FAIL: JTT_NORMAL table, no relocation for this address\n"); + break; + } + } + + const uint64_t Value = (Type == JumpTable::JTT_PIC) + ? Address + *getSignedValueAtAddress(EntryAddress, EntrySize) + : *getPointerAtAddress(EntryAddress); + + // __builtin_unreachable() case. + if (Value == BF.getAddress() + BF.getSize()) { + addOffset(Value - BF.getAddress()); + HasUnreachable = true; + LLVM_DEBUG(dbgs() << "OK: __builtin_unreachable\n"); + continue; + } + + // Function or one of its fragments. + BinaryFunction *TargetBF = getBinaryFunctionContainingAddress(Value); + + // We assume that a jump table cannot have function start as an entry. + if (!doesBelongToFunction(Value, TargetBF) || Value == BF.getAddress()) { + LLVM_DEBUG({ + if (!BF.containsAddress(Value)) { + dbgs() << "FAIL: function doesn't contain this address\n"; + if (TargetBF) { + dbgs() << " ! function containing this address: " + << TargetBF->getPrintName() << '\n'; + if (TargetBF->isFragment()) + dbgs() << " ! is a fragment\n"; + BinaryFunction *TargetParent = TargetBF->getParentFragment(); + dbgs() << " ! its parent is " + << (TargetParent ? 
TargetParent->getPrintName() : "(none)") + << '\n'; + } + } + if (Value == BF.getAddress()) + dbgs() << "FAIL: jump table cannot have function start as an entry\n"; + }); + break; + } + + // Check there's an instruction at this offset. + if (TargetBF->getState() == BinaryFunction::State::Disassembled && + !TargetBF->getInstructionAtOffset(Value - TargetBF->getAddress())) { + LLVM_DEBUG(dbgs() << "FAIL: no instruction at this offset\n"); + break; + } + + ++NumRealEntries; + + if (TargetBF == &BF) { + // Address inside the function. + addOffset(Value - TargetBF->getAddress()); + LLVM_DEBUG(dbgs() << "OK: real entry\n"); + } else { + // Address in split fragment. + BF.setHasSplitJumpTable(true); + // Add invalid offset for proper identification of jump table size. + addOffset(INVALID_OFFSET); + LLVM_DEBUG(dbgs() << "OK: address in split fragment\n"); + } + } + + // It's a jump table if the number of real entries is more than 1, or there's + // one real entry and "unreachable" targets. If there are only multiple + // "unreachable" targets, then it's not a jump table. + return NumRealEntries + HasUnreachable >= 2; +} + +void BinaryContext::populateJumpTables() { + std::vector FuncsToSkip; + LLVM_DEBUG(dbgs() << "DataPCRelocations: " << DataPCRelocations.size() + << '\n'); + for (auto JTI = JumpTables.begin(), JTE = JumpTables.end(); JTI != JTE; + ++JTI) { + JumpTable *JT = JTI->second; + BinaryFunction &BF = *JT->Parent; + + if (!BF.isSimple()) + continue; + + uint64_t NextJTAddress = 0; + auto NextJTI = std::next(JTI); + if (NextJTI != JTE) { + NextJTAddress = NextJTI->second->getAddress(); + } + + const bool Success = analyzeJumpTable(JT->getAddress(), JT->Type, BF, + NextJTAddress, &JT->OffsetEntries); + if (!Success) { + dbgs() << "failed to analyze jump table in function " << BF << '\n'; + JT->print(dbgs()); + if (NextJTI != JTE) { + dbgs() << "next jump table at 0x" + << Twine::utohexstr(NextJTI->second->getAddress()) + << " belongs to function " << *NextJTI->second->Parent << '\n'; + NextJTI->second->print(dbgs()); + } + llvm_unreachable("jump table heuristic failure"); + } + + for (uint64_t EntryOffset : JT->OffsetEntries) { + if (EntryOffset == BF.getSize()) + BF.IgnoredBranches.emplace_back(EntryOffset, BF.getSize()); + else + BF.registerReferencedOffset(EntryOffset); + } + + // In strict mode, erase PC-relative relocation record. Later we check that + // all such records are erased and thus have been accounted for. + if (opts::StrictMode && JT->Type == JumpTable::JTT_PIC) { + for (uint64_t Address = JT->getAddress(); + Address < JT->getAddress() + JT->getSize(); + Address += JT->EntrySize) { + DataPCRelocations.erase(DataPCRelocations.find(Address)); + } + } + + // Mark to skip the function and all its fragments. + if (BF.hasSplitJumpTable()) + FuncsToSkip.push_back(&BF); + } + + if (opts::StrictMode && DataPCRelocations.size()) { + LLVM_DEBUG({ + dbgs() << DataPCRelocations.size() + << " unclaimed PC-relative relocations left in data:\n"; + for (uint64_t Reloc : DataPCRelocations) + dbgs() << Twine::utohexstr(Reloc) << '\n'; + }); + assert(0 && "unclaimed PC-relative relocations left in data\n"); + } + clearList(DataPCRelocations); + // Functions containing split jump tables need to be skipped with all + // fragments. 
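Stepping back to the entry decoding inside analyzeJumpTable: the two table flavors read memory differently, and a small model makes the difference explicit. These are hand-rolled stand-ins for getPointerAtAddress/getSignedValueAtAddress, assuming a little-endian 64-bit target:

```cpp
#include <cstdint>
#include <cstring>

// JTT_NORMAL: entries are absolute 8-byte pointers.
uint64_t decodeNormalEntry(const uint8_t *Entry) {
  uint64_t Target;
  std::memcpy(&Target, Entry, sizeof(Target));
  return Target;
}

// JTT_PIC: entries are 4-byte signed offsets added to the address of the
// jump table itself, mirroring "Address + *getSignedValueAtAddress(...)".
uint64_t decodePICEntry(uint64_t TableAddress, const uint8_t *Entry) {
  int32_t Delta;
  std::memcpy(&Delta, Entry, sizeof(Delta));
  return TableAddress + static_cast<int64_t>(Delta);
}
```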
+ for (BinaryFunction *BF : FuncsToSkip) { + BinaryFunction *ParentBF = + const_cast(BF->getTopmostFragment()); + LLVM_DEBUG(dbgs() << "Skipping " << ParentBF->getPrintName() + << " family\n"); + ParentBF->setIgnored(); + ParentBF->ignoreFragments(); + } +} + +MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, + Twine Prefix, + uint64_t Size, + uint16_t Alignment, + unsigned Flags) { + auto Itr = BinaryDataMap.find(Address); + if (Itr != BinaryDataMap.end()) { + assert(Itr->second->getSize() == Size || !Size); + return Itr->second->getSymbol(); + } + + std::string Name = (Prefix + "0x" + Twine::utohexstr(Address)).str(); + assert(!GlobalSymbols.count(Name) && "created name is not unique"); + return registerNameAtAddress(Name, Address, Size, Alignment, Flags); +} + +MCSymbol *BinaryContext::getOrCreateUndefinedGlobalSymbol(StringRef Name) { + return Ctx->getOrCreateSymbol(Name); +} + +BinaryFunction *BinaryContext::createBinaryFunction( + const std::string &Name, BinarySection &Section, uint64_t Address, + uint64_t Size, uint64_t SymbolSize, uint16_t Alignment) { + auto Result = BinaryFunctions.emplace( + Address, BinaryFunction(Name, Section, Address, Size, *this)); + assert(Result.second == true && "unexpected duplicate function"); + BinaryFunction *BF = &Result.first->second; + registerNameAtAddress(Name, Address, SymbolSize ? SymbolSize : Size, + Alignment); + setSymbolToFunctionMap(BF->getSymbol(), BF); + return BF; +} + +const MCSymbol * +BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address, + JumpTable::JumpTableType Type) { + if (JumpTable *JT = getJumpTableContainingAddress(Address)) { + assert(JT->Type == Type && "jump table types have to match"); + assert(JT->Parent == &Function && + "cannot re-use jump table of a different function"); + assert(Address == JT->getAddress() && "unexpected non-empty jump table"); + + return JT->getFirstLabel(); + } + + // Re-use the existing symbol if possible. + MCSymbol *JTLabel = nullptr; + if (BinaryData *Object = getBinaryDataAtAddress(Address)) { + if (!isInternalSymbolName(Object->getSymbol()->getName())) + JTLabel = Object->getSymbol(); + } + + const uint64_t EntrySize = getJumpTableEntrySize(Type); + if (!JTLabel) { + const std::string JumpTableName = generateJumpTableName(Function, Address); + JTLabel = registerNameAtAddress(JumpTableName, Address, 0, EntrySize); + } + + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: creating jump table " << JTLabel->getName() + << " in function " << Function << '\n'); + + JumpTable *JT = new JumpTable(*JTLabel, Address, EntrySize, Type, + JumpTable::LabelMapType{{0, JTLabel}}, Function, + *getSectionForAddress(Address)); + JumpTables.emplace(Address, JT); + + // Duplicate the entry for the parent function for easy access. 
+ Function.JumpTables.emplace(Address, JT); + + return JTLabel; +} + +std::pair +BinaryContext::duplicateJumpTable(BinaryFunction &Function, JumpTable *JT, + const MCSymbol *OldLabel) { + auto L = scopeLock(); + unsigned Offset = 0; + bool Found = false; + for (std::pair Elmt : JT->Labels) { + if (Elmt.second != OldLabel) + continue; + Offset = Elmt.first; + Found = true; + break; + } + assert(Found && "Label not found"); + MCSymbol *NewLabel = Ctx->createNamedTempSymbol("duplicatedJT"); + JumpTable *NewJT = + new JumpTable(*NewLabel, JT->getAddress(), JT->EntrySize, JT->Type, + JumpTable::LabelMapType{{Offset, NewLabel}}, Function, + *getSectionForAddress(JT->getAddress())); + NewJT->Entries = JT->Entries; + NewJT->Counts = JT->Counts; + uint64_t JumpTableID = ++DuplicatedJumpTables; + // Invert it to differentiate from regular jump tables whose IDs are their + // addresses in the input binary memory space + JumpTableID = ~JumpTableID; + JumpTables.emplace(JumpTableID, NewJT); + Function.JumpTables.emplace(JumpTableID, NewJT); + return std::make_pair(JumpTableID, NewLabel); +} + +std::string BinaryContext::generateJumpTableName(const BinaryFunction &BF, + uint64_t Address) { + size_t Id; + uint64_t Offset = 0; + if (const JumpTable *JT = BF.getJumpTableContainingAddress(Address)) { + Offset = Address - JT->getAddress(); + auto Itr = JT->Labels.find(Offset); + if (Itr != JT->Labels.end()) { + return std::string(Itr->second->getName()); + } + Id = JumpTableIds.at(JT->getAddress()); + } else { + Id = JumpTableIds[Address] = BF.JumpTables.size(); + } + return ("JUMP_TABLE/" + BF.getOneName().str() + "." + std::to_string(Id) + + (Offset ? ("." + std::to_string(Offset)) : "")); +} + +bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) { + // FIXME: aarch64 support is missing. + if (!isX86()) + return true; + + if (BF.getSize() == BF.getMaxSize()) + return true; + + ErrorOr> FunctionData = BF.getData(); + assert(FunctionData && "cannot get function as data"); + + uint64_t Offset = BF.getSize(); + MCInst Instr; + uint64_t InstrSize = 0; + uint64_t InstrAddress = BF.getAddress() + Offset; + using std::placeholders::_1; + + // Skip instructions that satisfy the predicate condition. + auto skipInstructions = [&](std::function Predicate) { + const uint64_t StartOffset = Offset; + for (; Offset < BF.getMaxSize(); + Offset += InstrSize, InstrAddress += InstrSize) { + if (!DisAsm->getInstruction(Instr, + InstrSize, + FunctionData->slice(Offset), + InstrAddress, + nulls())) + break; + if (!Predicate(Instr)) + break; + } + + return Offset - StartOffset; + }; + + // Skip a sequence of zero bytes. + auto skipZeros = [&]() { + const uint64_t StartOffset = Offset; + for (; Offset < BF.getMaxSize(); ++Offset) + if ((*FunctionData)[Offset] != 0) + break; + + return Offset - StartOffset; + }; + + // Accept the whole padding area filled with breakpoints. + auto isBreakpoint = std::bind(&MCPlusBuilder::isBreakpoint, MIB.get(), _1); + if (skipInstructions(isBreakpoint) && Offset == BF.getMaxSize()) + return true; + + auto isNoop = std::bind(&MCPlusBuilder::isNoop, MIB.get(), _1); + + // Some functions have a jump to the next function or to the padding area + // inserted after the body. 
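The isSkipJump predicate defined just below accepts those jumps. The remaining padding content reduces to simple byte patterns; a simplified model of what the scan tolerates on x86, where a breakpoint is the one-byte int3 (0xCC) and skipZeros eats zero fill (the real code additionally decodes nops and jumps with the disassembler):

```cpp
#include <cstddef>
#include <cstdint>

// Accept a padding tail made only of int3 bytes and zero bytes.
bool isTrivialPadding(const uint8_t *Bytes, size_t Size) {
  for (size_t I = 0; I < Size; ++I)
    if (Bytes[I] != 0xCC && Bytes[I] != 0x00)
      return false;
  return true;
}
```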
+ auto isSkipJump = [&](const MCInst &Instr) { + uint64_t TargetAddress = 0; + if (MIB->isUnconditionalBranch(Instr) && + MIB->evaluateBranch(Instr, InstrAddress, InstrSize, TargetAddress)) { + if (TargetAddress >= InstrAddress + InstrSize && + TargetAddress <= BF.getAddress() + BF.getMaxSize()) { + return true; + } + } + return false; + }; + + // Skip over nops, jumps, and zero padding. Allow interleaving (this happens). + while (skipInstructions(isNoop) || + skipInstructions(isSkipJump) || + skipZeros()) + ; + + if (Offset == BF.getMaxSize()) + return true; + + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: bad padding at address 0x" + << Twine::utohexstr(BF.getAddress() + BF.getSize()) + << " starting at offset " + << (Offset - BF.getSize()) << " in function " + << BF << '\n' + << FunctionData->slice(BF.getSize(), BF.getMaxSize() - BF.getSize()) + << '\n'; + } + + return false; +} + +void BinaryContext::adjustCodePadding() { + for (auto &BFI : BinaryFunctions) { + BinaryFunction &BF = BFI.second; + if (!shouldEmit(BF)) + continue; + + if (!hasValidCodePadding(BF)) { + if (HasRelocations) { + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: function " << BF + << " has invalid padding. Ignoring the function.\n"; + } + BF.setIgnored(); + } else { + BF.setMaxSize(BF.getSize()); + } + } + } +} + +MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, + uint64_t Address, + uint64_t Size, + uint16_t Alignment, + unsigned Flags) { + // Register the name with MCContext. + MCSymbol *Symbol = Ctx->getOrCreateSymbol(Name); + + auto GAI = BinaryDataMap.find(Address); + BinaryData *BD; + if (GAI == BinaryDataMap.end()) { + ErrorOr SectionOrErr = getSectionForAddress(Address); + BinarySection &Section = + SectionOrErr ? SectionOrErr.get() : absoluteSection(); + BD = new BinaryData(*Symbol, + Address, + Size, + Alignment ? Alignment : 1, + Section, + Flags); + GAI = BinaryDataMap.emplace(Address, BD).first; + GlobalSymbols[Name] = BD; + updateObjectNesting(GAI); + } else { + BD = GAI->second; + if (!BD->hasName(Name)) { + GlobalSymbols[Name] = BD; + BD->Symbols.push_back(Symbol); + } + } + + return Symbol; +} + +const BinaryData * +BinaryContext::getBinaryDataContainingAddressImpl(uint64_t Address) const { + auto NI = BinaryDataMap.lower_bound(Address); + auto End = BinaryDataMap.end(); + if ((NI != End && Address == NI->first) || + ((NI != BinaryDataMap.begin()) && (NI-- != BinaryDataMap.begin()))) { + if (NI->second->containsAddress(Address)) { + return NI->second; + } + + // If this is a sub-symbol, see if a parent data contains the address. + const BinaryData *BD = NI->second->getParent(); + while (BD) { + if (BD->containsAddress(Address)) + return BD; + BD = BD->getParent(); + } + } + return nullptr; +} + +bool BinaryContext::setBinaryDataSize(uint64_t Address, uint64_t Size) { + auto NI = BinaryDataMap.find(Address); + assert(NI != BinaryDataMap.end()); + if (NI == BinaryDataMap.end()) + return false; + // TODO: it's possible that a jump table starts at the same address + // as a larger blob of private data. When we set the size of the + // jump table, it might be smaller than the total blob size. In this + // case we just leave the original size since (currently) it won't really + // affect anything. See T26915981. 
+ assert((!NI->second->Size || NI->second->Size == Size || + (NI->second->isJumpTable() && NI->second->Size > Size)) && + "can't change the size of a symbol that has already had its " + "size set"); + if (!NI->second->Size) { + NI->second->Size = Size; + updateObjectNesting(NI); + return true; + } + return false; +} + +void BinaryContext::generateSymbolHashes() { + auto isPadding = [](const BinaryData &BD) { + StringRef Contents = BD.getSection().getContents(); + StringRef SymData = Contents.substr(BD.getOffset(), BD.getSize()); + return (BD.getName().startswith("HOLEat") || + SymData.find_first_not_of(0) == StringRef::npos); + }; + + uint64_t NumCollisions = 0; + for (auto &Entry : BinaryDataMap) { + BinaryData &BD = *Entry.second; + StringRef Name = BD.getName(); + + if (!isInternalSymbolName(Name)) + continue; + + // First check if a non-anonymous alias exists and move it to the front. + if (BD.getSymbols().size() > 1) { + auto Itr = std::find_if(BD.getSymbols().begin(), + BD.getSymbols().end(), + [&](const MCSymbol *Symbol) { + return !isInternalSymbolName(Symbol->getName()); + }); + if (Itr != BD.getSymbols().end()) { + size_t Idx = std::distance(BD.getSymbols().begin(), Itr); + std::swap(BD.getSymbols()[0], BD.getSymbols()[Idx]); + continue; + } + } + + // We have to skip 0 size symbols since they will all collide. + if (BD.getSize() == 0) { + continue; + } + + const uint64_t Hash = BD.getSection().hash(BD); + const size_t Idx = Name.find("0x"); + std::string NewName = (Twine(Name.substr(0, Idx)) + + "_" + Twine::utohexstr(Hash)).str(); + if (getBinaryDataByName(NewName)) { + // Ignore collisions for symbols that appear to be padding + // (i.e. all zeros or a "hole") + if (!isPadding(BD)) { + if (opts::Verbosity) { + errs() << "BOLT-WARNING: collision detected when hashing " << BD + << " with new name (" << NewName << "), skipping.\n"; + } + ++NumCollisions; + } + continue; + } + BD.Symbols.insert(BD.Symbols.begin(), + Ctx->getOrCreateSymbol(NewName)); + GlobalSymbols[NewName] = &BD; + } + if (NumCollisions) { + errs() << "BOLT-WARNING: " << NumCollisions + << " collisions detected while hashing binary objects"; + if (!opts::Verbosity) + errs() << ". Use -v=1 to see the list."; + errs() << '\n'; + } +} + +void BinaryContext::registerFragment(BinaryFunction &TargetFunction, + BinaryFunction &Function) const { + // Only a parent function (or a sibling) can reach its fragment. 
+ assert(!Function.IsFragment && + "only one cold fragment is supported at this time"); + if (BinaryFunction *TargetParent = TargetFunction.getParentFragment()) { + assert(TargetParent == &Function && "mismatching parent function"); + return; + } + TargetFunction.setParentFragment(Function); + Function.addFragment(TargetFunction); + if (!HasRelocations) { + TargetFunction.setSimple(false); + Function.setSimple(false); + } + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: marking " << TargetFunction + << " as a fragment of " << Function << '\n'; + } +} + +void BinaryContext::processInterproceduralReferences(BinaryFunction &Function) { + for (uint64_t Address : Function.InterproceduralReferences) { + if (!Address) + continue; + + BinaryFunction *TargetFunction = + getBinaryFunctionContainingAddress(Address); + if (&Function == TargetFunction) + continue; + + if (TargetFunction) { + if (TargetFunction->IsFragment) + registerFragment(*TargetFunction, Function); + if (uint64_t Offset = Address - TargetFunction->getAddress()) + TargetFunction->addEntryPointAtOffset(Offset); + + continue; + } + + // Check if address falls in function padding space - this could be + // unmarked data in code. In this case adjust the padding space size. + ErrorOr Section = getSectionForAddress(Address); + assert(Section && "cannot get section for referenced address"); + + if (!Section->isText()) + continue; + + // PLT requires special handling and could be ignored in this context. + StringRef SectionName = Section->getName(); + if (SectionName == ".plt" || SectionName == ".plt.got") + continue; + + if (opts::processAllFunctions()) { + errs() << "BOLT-ERROR: cannot process binaries with unmarked " + << "object in code at address 0x" + << Twine::utohexstr(Address) << " belonging to section " + << SectionName << " in current mode\n"; + exit(1); + } + + TargetFunction = + getBinaryFunctionContainingAddress(Address, + /*CheckPastEnd=*/false, + /*UseMaxSize=*/true); + // We are not going to overwrite non-simple functions, but for simple + // ones - adjust the padding size. + if (TargetFunction && TargetFunction->isSimple()) { + errs() << "BOLT-WARNING: function " << *TargetFunction + << " has an object detected in a padding region at address 0x" + << Twine::utohexstr(Address) << '\n'; + TargetFunction->setMaxSize(TargetFunction->getSize()); + } + } + + clearList(Function.InterproceduralReferences); +} + +void BinaryContext::postProcessSymbolTable() { + fixBinaryDataHoles(); + bool Valid = true; + for (auto &Entry : BinaryDataMap) { + BinaryData *BD = Entry.second; + if ((BD->getName().startswith("SYMBOLat") || + BD->getName().startswith("DATAat")) && + !BD->getParent() && + !BD->getSize() && + !BD->isAbsolute() && + BD->getSection()) { + errs() << "BOLT-WARNING: zero-sized top level symbol: " << *BD << "\n"; + Valid = false; + } + } + assert(Valid); + generateSymbolHashes(); +} + +void BinaryContext::foldFunction(BinaryFunction &ChildBF, + BinaryFunction &ParentBF) { + assert(!ChildBF.isMultiEntry() && !ParentBF.isMultiEntry() && + "cannot merge functions with multiple entry points"); + + std::unique_lock WriteCtxLock(CtxMutex, + std::defer_lock); + std::unique_lock WriteSymbolMapLock( + SymbolToFunctionMapMutex, std::defer_lock); + + const StringRef ChildName = ChildBF.getOneName(); + + // Move symbols over and update bookkeeping info. 
+  for (MCSymbol *Symbol : ChildBF.getSymbols()) {
+    ParentBF.getSymbols().push_back(Symbol);
+    WriteSymbolMapLock.lock();
+    SymbolToFunctionMap[Symbol] = &ParentBF;
+    WriteSymbolMapLock.unlock();
+    // NB: there's no need to update BinaryDataMap and GlobalSymbols.
+  }
+  ChildBF.getSymbols().clear();
+
+  // Move other names the child function is known under.
+  std::move(ChildBF.Aliases.begin(), ChildBF.Aliases.end(),
+            std::back_inserter(ParentBF.Aliases));
+  ChildBF.Aliases.clear();
+
+  if (HasRelocations) {
+    // Merge execution counts of ChildBF into those of ParentBF.
+    // Without relocations, we cannot reliably merge profiles as both functions
+    // continue to exist and either one can be executed.
+    ChildBF.mergeProfileDataInto(ParentBF);
+
+    std::shared_lock<std::shared_timed_mutex> ReadBfsLock(BinaryFunctionsMutex,
+                                                          std::defer_lock);
+    std::unique_lock<std::shared_timed_mutex> WriteBfsLock(BinaryFunctionsMutex,
+                                                           std::defer_lock);
+    // Remove ChildBF from the global set of functions in relocs mode.
+    ReadBfsLock.lock();
+    auto FI = BinaryFunctions.find(ChildBF.getAddress());
+    ReadBfsLock.unlock();
+
+    assert(FI != BinaryFunctions.end() && "function not found");
+    assert(&ChildBF == &FI->second && "function mismatch");
+
+    WriteBfsLock.lock();
+    ChildBF.clearDisasmState();
+    FI = BinaryFunctions.erase(FI);
+    WriteBfsLock.unlock();
+
+  } else {
+    // In non-relocation mode we keep the function, but rename it.
+    std::string NewName = "__ICF_" + ChildName.str();
+
+    WriteCtxLock.lock();
+    ChildBF.getSymbols().push_back(Ctx->getOrCreateSymbol(NewName));
+    WriteCtxLock.unlock();
+
+    ChildBF.setFolded(&ParentBF);
+  }
+}
+
+void BinaryContext::fixBinaryDataHoles() {
+  assert(validateObjectNesting() && "object nesting inconsistency detected");
+
+  for (BinarySection &Section : allocatableSections()) {
+    std::vector<std::pair<uint64_t, uint64_t>> Holes;
+
+    auto isNotHole = [&Section](const binary_data_iterator &Itr) {
+      BinaryData *BD = Itr->second;
+      bool isHole = (!BD->getParent() &&
+                     !BD->getSize() &&
+                     BD->isObject() &&
+                     (BD->getName().startswith("SYMBOLat0x") ||
+                      BD->getName().startswith("DATAat0x") ||
+                      BD->getName().startswith("ANONYMOUS")));
+      return !isHole && BD->getSection() == Section && !BD->getParent();
+    };
+
+    auto BDStart = BinaryDataMap.begin();
+    auto BDEnd = BinaryDataMap.end();
+    auto Itr = FilteredBinaryDataIterator(isNotHole, BDStart, BDEnd);
+    auto End = FilteredBinaryDataIterator(isNotHole, BDEnd, BDEnd);
+
+    uint64_t EndAddress = Section.getAddress();
+
+    while (Itr != End) {
+      if (Itr->second->getAddress() > EndAddress) {
+        uint64_t Gap = Itr->second->getAddress() - EndAddress;
+        Holes.emplace_back(EndAddress, Gap);
+      }
+      EndAddress = Itr->second->getEndAddress();
+      ++Itr;
+    }
+
+    if (EndAddress < Section.getEndAddress()) {
+      Holes.emplace_back(EndAddress, Section.getEndAddress() - EndAddress);
+    }
+
+    // If there is already a symbol at the start of the hole, grow that symbol
+    // to cover the rest. Otherwise, create a new symbol to cover the hole.
+    for (std::pair<uint64_t, uint64_t> &Hole : Holes) {
+      BinaryData *BD = getBinaryDataAtAddress(Hole.first);
+      if (BD) {
+        // BD->getSection() can be != Section if there are sections that
+        // overlap. In this case it is probably safe to just skip the holes
+        // since the overlapping section will not(?) have any symbols in it.
+        if (BD->getSection() == Section)
+          setBinaryDataSize(Hole.first, Hole.second);
+      } else {
+        getOrCreateGlobalSymbol(Hole.first, "HOLEat", Hole.second, 1);
+      }
+    }
+  }
+
+  assert(validateObjectNesting() && "object nesting inconsistency detected");
+  assert(validateHoles() && "top level hole detected in object map");
+}
+
+void BinaryContext::printGlobalSymbols(raw_ostream &OS) const {
+  const BinarySection *CurrentSection = nullptr;
+  bool FirstSection = true;
+
+  for (auto &Entry : BinaryDataMap) {
+    const BinaryData *BD = Entry.second;
+    const BinarySection &Section = BD->getSection();
+    if (FirstSection || Section != *CurrentSection) {
+      uint64_t Address, Size;
+      StringRef Name = Section.getName();
+      if (Section) {
+        Address = Section.getAddress();
+        Size = Section.getSize();
+      } else {
+        Address = BD->getAddress();
+        Size = BD->getSize();
+      }
+      OS << "BOLT-INFO: Section " << Name << ", "
+         << "0x" + Twine::utohexstr(Address) << ":"
+         << "0x" + Twine::utohexstr(Address + Size) << "/"
+         << Size << "\n";
+      CurrentSection = &Section;
+      FirstSection = false;
+    }
+
+    OS << "BOLT-INFO: ";
+    const BinaryData *P = BD->getParent();
+    while (P) {
+      OS << "  ";
+      P = P->getParent();
+    }
+    OS << *BD << "\n";
+  }
+}
+
+unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID,
+                                               const uint32_t SrcCUID,
+                                               unsigned FileIndex) {
+  DWARFCompileUnit *SrcUnit = DwCtx->getCompileUnitForOffset(SrcCUID);
+  const DWARFDebugLine::LineTable *LineTable =
+      DwCtx->getLineTableForUnit(SrcUnit);
+  const std::vector<DWARFDebugLine::FileNameEntry> &FileNames =
+      LineTable->Prologue.FileNames;
+  // Dir indexes start at 1, as DWARF file numbers, and a dir index 0
+  // means empty dir.
+  assert(FileIndex > 0 && FileIndex <= FileNames.size() &&
+         "FileIndex out of range for the compilation unit.");
+  StringRef Dir = "";
+  if (FileNames[FileIndex - 1].DirIdx != 0) {
+    if (Optional<const char *> DirName =
+            LineTable->Prologue
+                .IncludeDirectories[FileNames[FileIndex - 1].DirIdx - 1]
+                .getAsCString()) {
+      Dir = *DirName;
+    }
+  }
+  StringRef FileName = "";
+  if (Optional<const char *> FName =
+          FileNames[FileIndex - 1].Name.getAsCString())
+    FileName = *FName;
+  assert(FileName != "");
+  return cantFail(Ctx->getDwarfFile(Dir, FileName, 0, None, None, DestCUID));
+}
+
+std::vector<BinaryFunction *> BinaryContext::getSortedFunctions() {
+  std::vector<BinaryFunction *> SortedFunctions(BinaryFunctions.size());
+  std::transform(BinaryFunctions.begin(), BinaryFunctions.end(),
+                 SortedFunctions.begin(),
+                 [](std::pair<const uint64_t, BinaryFunction> &BFI) {
+                   return &BFI.second;
+                 });
+
+  std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(),
+                   [](const BinaryFunction *A, const BinaryFunction *B) {
+                     if (A->hasValidIndex() && B->hasValidIndex()) {
+                       return A->getIndex() < B->getIndex();
+                     }
+                     return A->hasValidIndex();
+                   });
+  return SortedFunctions;
+}
+
+std::vector<BinaryFunction *> BinaryContext::getAllBinaryFunctions() {
+  std::vector<BinaryFunction *> AllFunctions;
+  AllFunctions.reserve(BinaryFunctions.size() + InjectedBinaryFunctions.size());
+  std::transform(BinaryFunctions.begin(), BinaryFunctions.end(),
+                 std::back_inserter(AllFunctions),
+                 [](std::pair<const uint64_t, BinaryFunction> &BFI) {
+                   return &BFI.second;
+                 });
+  std::copy(InjectedBinaryFunctions.begin(), InjectedBinaryFunctions.end(),
+            std::back_inserter(AllFunctions));
+
+  return AllFunctions;
+}
+
+Optional<DWARFUnit *> BinaryContext::getDWOCU(uint64_t DWOId) {
+  auto Iter = DWOCUs.find(DWOId);
+  if (Iter == DWOCUs.end())
+    return None;
+
+  return Iter->second;
+}
+
+DWARFContext *BinaryContext::getDWOContext() {
+  if (DWOCUs.empty())
+    return nullptr;
+  return &DWOCUs.begin()->second->getContext();
+}
+
+/// Handles DWO sections that
can either be in .o, .dwo or .dwp files. +void BinaryContext::preprocessDWODebugInfo() { + for (const std::unique_ptr &CU : DwCtx->compile_units()) { + DWARFUnit *const DwarfUnit = CU.get(); + if (llvm::Optional DWOId = DwarfUnit->getDWOId()) { + DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false).getDwarfUnit(); + if (!DWOCU->isDWOUnit()) { + std::string DWOName = dwarf::toString( + DwarfUnit->getUnitDIE().find( + {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + outs() << "BOLT-WARNING: Debug Fission: DWO debug information for " + << DWOName + << " was not retrieved and won't be updated. Please check " + "relative path.\n"; + continue; + } + DWOCUs[*DWOId] = DWOCU; + } + } +} + +void BinaryContext::preprocessDebugInfo() { + struct CURange { + uint64_t LowPC; + uint64_t HighPC; + DWARFUnit *Unit; + + bool operator<(const CURange &Other) const { + return LowPC < Other.LowPC; + } + }; + + // Building a map of address ranges to CUs similar to .debug_aranges and use + // it to assign CU to functions. + std::vector AllRanges; + AllRanges.reserve(DwCtx->getNumCompileUnits()); + for (const std::unique_ptr &CU : DwCtx->compile_units()) { + Expected RangesOrError = + CU->getUnitDIE().getAddressRanges(); + if (!RangesOrError) { + consumeError(RangesOrError.takeError()); + continue; + } + for (DWARFAddressRange &Range : *RangesOrError) { + // Parts of the debug info could be invalidated due to corresponding code + // being removed from the binary by the linker. Hence we check if the + // address is a valid one. + if (containsAddress(Range.LowPC)) + AllRanges.emplace_back(CURange{Range.LowPC, Range.HighPC, CU.get()}); + } + } + + std::sort(AllRanges.begin(), AllRanges.end()); + for (auto &KV : BinaryFunctions) { + const uint64_t FunctionAddress = KV.first; + BinaryFunction &Function = KV.second; + + auto It = std::partition_point(AllRanges.begin(), AllRanges.end(), + [=](CURange R) { return R.HighPC <= FunctionAddress; }); + if (It != AllRanges.end() && It->LowPC <= FunctionAddress) { + Function.setDWARFUnit(It->Unit); + } + } + + // Populate MCContext with DWARF files from all units. + StringRef GlobalPrefix = AsmInfo->getPrivateGlobalPrefix(); + for (const std::unique_ptr &CU : DwCtx->compile_units()) { + const uint64_t CUID = CU->getOffset(); + const DWARFDebugLine::LineTable *LineTable = + DwCtx->getLineTableForUnit(CU.get()); + const std::vector &FileNames = + LineTable->Prologue.FileNames; + + // Assign a unique label to every line table, one per CU. + Ctx->getMCDwarfLineTable(CUID).setLabel( + Ctx->getOrCreateSymbol(GlobalPrefix + "line_table_start" + Twine(CUID))); + + // Make sure empty debug line tables are registered too. + if (FileNames.empty()) { + cantFail(Ctx->getDwarfFile("", "", 0, None, None, CUID)); + continue; + } + for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) { + // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 + // means empty dir. 
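A worked model of that off-by-one indexing, mirroring the lookup performed in the loop below, with hypothetical prologue contents (DWARF v4 numbering, where file and directory numbers start at 1 and DirIdx 0 means "no directory"):

```cpp
#include <string>
#include <vector>

struct FileEntryModel {
  std::string Name;
  unsigned DirIdx; // 0 = no directory
};

// E.g. with IncludeDirs = {"/src/lib", "/usr/include"} and a file entry
// {"stdio.h", DirIdx=2}, file number 2 resolves to "/usr/include/stdio.h".
std::string resolveFile(unsigned FileNumber, // 1-based, as in DWARF v4
                        const std::vector<std::string> &IncludeDirs,
                        const std::vector<FileEntryModel> &FileNames) {
  const FileEntryModel &FE = FileNames[FileNumber - 1];
  if (FE.DirIdx == 0)
    return FE.Name;
  return IncludeDirs[FE.DirIdx - 1] + "/" + FE.Name;
}
```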
+ StringRef Dir = ""; + if (FileNames[I].DirIdx != 0) + if (Optional DirName = + LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] + .getAsCString()) + Dir = *DirName; + StringRef FileName = ""; + if (Optional FName = FileNames[I].Name.getAsCString()) + FileName = *FName; + assert(FileName != ""); + cantFail(Ctx->getDwarfFile(Dir, FileName, 0, None, None, CUID)); + } + } + + preprocessDWODebugInfo(); +} + +bool BinaryContext::shouldEmit(const BinaryFunction &Function) const { + if (opts::processAllFunctions()) + return true; + + if (Function.isIgnored()) + return false; + + // In relocation mode we will emit non-simple functions with CFG. + // If the function does not have a CFG it should be marked as ignored. + return HasRelocations || Function.isSimple(); +} + +void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) { + uint32_t Operation = Inst.getOperation(); + switch (Operation) { + case MCCFIInstruction::OpSameValue: + OS << "OpSameValue Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpRememberState: + OS << "OpRememberState"; + break; + case MCCFIInstruction::OpRestoreState: + OS << "OpRestoreState"; + break; + case MCCFIInstruction::OpOffset: + OS << "OpOffset Reg" << Inst.getRegister() << " " << Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: + OS << "OpDefCfaRegister Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpDefCfaOffset: + OS << "OpDefCfaOffset " << Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfa: + OS << "OpDefCfa Reg" << Inst.getRegister() << " " << Inst.getOffset(); + break; + case MCCFIInstruction::OpRelOffset: + OS << "OpRelOffset Reg" << Inst.getRegister() << " " << Inst.getOffset(); + break; + case MCCFIInstruction::OpAdjustCfaOffset: + OS << "OfAdjustCfaOffset " << Inst.getOffset(); + break; + case MCCFIInstruction::OpEscape: + OS << "OpEscape"; + break; + case MCCFIInstruction::OpRestore: + OS << "OpRestore Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpUndefined: + OS << "OpUndefined Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpRegister: + OS << "OpRegister Reg" << Inst.getRegister() << " Reg" + << Inst.getRegister2(); + break; + case MCCFIInstruction::OpWindowSave: + OS << "OpWindowSave"; + break; + case MCCFIInstruction::OpGnuArgsSize: + OS << "OpGnuArgsSize"; + break; + default: + OS << "Op#" << Operation; + break; + } +} + +void BinaryContext::printInstruction(raw_ostream &OS, + const MCInst &Instruction, + uint64_t Offset, + const BinaryFunction* Function, + bool PrintMCInst, + bool PrintMemData, + bool PrintRelocations) const { + if (MIB->isEHLabel(Instruction)) { + OS << " EH_LABEL: " << *MIB->getTargetSymbol(Instruction) << '\n'; + return; + } + OS << format(" %08" PRIx64 ": ", Offset); + if (MIB->isCFI(Instruction)) { + uint32_t Offset = Instruction.getOperand(0).getImm(); + OS << "\t!CFI\t$" << Offset << "\t; "; + if (Function) + printCFI(OS, *Function->getCFIFor(Instruction)); + OS << "\n"; + return; + } + InstPrinter->printInst(&Instruction, 0, "", *STI, OS); + if (MIB->isCall(Instruction)) { + if (MIB->isTailCall(Instruction)) + OS << " # TAILCALL "; + if (MIB->isInvoke(Instruction)) { + const Optional EHInfo = MIB->getEHInfo(Instruction); + OS << " # handler: "; + if (EHInfo->first) + OS << *EHInfo->first; + else + OS << '0'; + OS << "; action: " << EHInfo->second; + const int64_t GnuArgsSize = MIB->getGnuArgsSize(Instruction); + if (GnuArgsSize >= 0) + OS << "; GNU_args_size = " << GnuArgsSize; + } + } else if 
(MIB->isIndirectBranch(Instruction)) { + if (uint64_t JTAddress = MIB->getJumpTable(Instruction)) { + OS << " # JUMPTABLE @0x" << Twine::utohexstr(JTAddress); + } else { + OS << " # UNKNOWN CONTROL FLOW"; + } + } + + MIB->printAnnotations(Instruction, OS); + + if (opts::PrintDebugInfo) { + DebugLineTableRowRef RowRef = + DebugLineTableRowRef::fromSMLoc(Instruction.getLoc()); + if (RowRef != DebugLineTableRowRef::NULL_ROW) { + const DWARFDebugLine::LineTable *LineTable; + if (Function && Function->getDWARFUnit() && + Function->getDWARFUnit()->getOffset() == RowRef.DwCompileUnitIndex) { + LineTable = Function->getDWARFLineTable(); + } else { + LineTable = DwCtx->getLineTableForUnit( + DwCtx->getCompileUnitForOffset(RowRef.DwCompileUnitIndex)); + } + assert(LineTable && + "line table expected for instruction with debug info"); + + const DWARFDebugLine::Row &Row = LineTable->Rows[RowRef.RowIndex - 1]; + StringRef FileName = ""; + if (Optional FName = + LineTable->Prologue.FileNames[Row.File - 1].Name.getAsCString()) + FileName = *FName; + OS << " # debug line " << FileName << ":" << Row.Line; + if (Row.Column) + OS << ":" << Row.Column; + if (Row.Discriminator) + OS << " discriminator:" << Row.Discriminator; + } + } + + if ((opts::PrintRelocations || PrintRelocations) && Function) { + const uint64_t Size = computeCodeSize(&Instruction, &Instruction + 1); + Function->printRelocations(OS, Offset, Size); + } + + OS << "\n"; + + if (PrintMCInst) { + Instruction.dump_pretty(OS, InstPrinter.get()); + OS << "\n"; + } +} + +ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) { + auto SI = AddressToSection.upper_bound(Address); + if (SI != AddressToSection.begin()) { + --SI; + uint64_t UpperBound = SI->first + SI->second->getSize(); + if (!SI->second->getSize()) + UpperBound += 1; + if (UpperBound > Address) + return *SI->second; + } + return std::make_error_code(std::errc::bad_address); +} + +ErrorOr +BinaryContext::getSectionNameForAddress(uint64_t Address) const { + if (ErrorOr Section = getSectionForAddress(Address)) { + return Section->getName(); + } + return std::make_error_code(std::errc::bad_address); +} + +BinarySection &BinaryContext::registerSection(BinarySection *Section) { + auto Res = Sections.insert(Section); + (void)Res; + assert(Res.second && "can't register the same section twice."); + + // Only register allocatable sections in the AddressToSection map. 
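The AddressToSection map populated below is what getSectionForAddress above searches, and its upper_bound-then-step-back idiom is worth spelling out. A minimal model with a plain std::map (the real code also treats zero-sized sections as one byte wide for this check):

```cpp
#include <cstdint>
#include <map>

struct SecModel { uint64_t Size; };

// Sections keyed by start address: take the first section starting after
// Address, step back one, and range-check, as getSectionForAddress does.
const SecModel *findSection(const std::map<uint64_t, SecModel> &Sections,
                            uint64_t Address) {
  auto SI = Sections.upper_bound(Address);
  if (SI == Sections.begin())
    return nullptr; // Address is below the first section
  --SI;             // candidate: greatest start address <= Address
  if (Address < SI->first + SI->second.Size)
    return &SI->second;
  return nullptr;
}
```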
+ if (Section->isAllocatable() && Section->getAddress()) + AddressToSection.insert(std::make_pair(Section->getAddress(), Section)); + NameToSection.insert( + std::make_pair(std::string(Section->getName()), Section)); + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: registering " << *Section << "\n"); + return *Section; +} + +BinarySection &BinaryContext::registerSection(SectionRef Section) { + return registerSection(new BinarySection(*this, Section)); +} + +BinarySection & +BinaryContext::registerSection(StringRef SectionName, + const BinarySection &OriginalSection) { + return registerSection(new BinarySection(*this, + SectionName, + OriginalSection)); +} + +BinarySection &BinaryContext::registerOrUpdateSection(StringRef Name, + unsigned ELFType, + unsigned ELFFlags, + uint8_t *Data, + uint64_t Size, + unsigned Alignment) { + auto NamedSections = getSectionByName(Name); + if (NamedSections.begin() != NamedSections.end()) { + assert(std::next(NamedSections.begin()) == NamedSections.end() && + "can only update unique sections"); + BinarySection *Section = NamedSections.begin()->second; + + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: updating " << *Section << " -> "); + const bool Flag = Section->isAllocatable(); + (void)Flag; + Section->update(Data, Size, Alignment, ELFType, ELFFlags); + LLVM_DEBUG(dbgs() << *Section << "\n"); + // FIXME: Fix section flags/attributes for MachO. + if (isELF()) + assert(Flag == Section->isAllocatable() && + "can't change section allocation status"); + return *Section; + } + + return registerSection(new BinarySection(*this, Name, Data, Size, Alignment, + ELFType, ELFFlags)); +} + +bool BinaryContext::deregisterSection(BinarySection &Section) { + BinarySection *SectionPtr = &Section; + auto Itr = Sections.find(SectionPtr); + if (Itr != Sections.end()) { + auto Range = AddressToSection.equal_range(SectionPtr->getAddress()); + while (Range.first != Range.second) { + if (Range.first->second == SectionPtr) { + AddressToSection.erase(Range.first); + break; + } + ++Range.first; + } + + auto NameRange = + NameToSection.equal_range(std::string(SectionPtr->getName())); + while (NameRange.first != NameRange.second) { + if (NameRange.first->second == SectionPtr) { + NameToSection.erase(NameRange.first); + break; + } + ++NameRange.first; + } + + Sections.erase(Itr); + delete SectionPtr; + return true; + } + return false; +} + +void BinaryContext::printSections(raw_ostream &OS) const { + for (BinarySection *const &Section : Sections) { + OS << "BOLT-INFO: " << *Section << "\n"; + } +} + +BinarySection &BinaryContext::absoluteSection() { + if (ErrorOr Section = getUniqueSectionByName("")) + return *Section; + return registerOrUpdateSection("", ELF::SHT_NULL, 0u); +} + +ErrorOr +BinaryContext::getUnsignedValueAtAddress(uint64_t Address, + size_t Size) const { + const ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return std::make_error_code(std::errc::bad_address); + + if (Section->isVirtual()) + return 0; + + DataExtractor DE(Section->getContents(), AsmInfo->isLittleEndian(), + AsmInfo->getCodePointerSize()); + auto ValueOffset = static_cast(Address - Section->getAddress()); + return DE.getUnsigned(&ValueOffset, Size); +} + +ErrorOr +BinaryContext::getSignedValueAtAddress(uint64_t Address, + size_t Size) const { + const ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return std::make_error_code(std::errc::bad_address); + + if (Section->isVirtual()) + return 0; + + DataExtractor DE(Section->getContents(), AsmInfo->isLittleEndian(), + 
AsmInfo->getCodePointerSize()); + auto ValueOffset = static_cast(Address - Section->getAddress()); + return DE.getSigned(&ValueOffset, Size); +} + +void BinaryContext::addRelocation(uint64_t Address, + MCSymbol *Symbol, + uint64_t Type, + uint64_t Addend, + uint64_t Value) { + ErrorOr Section = getSectionForAddress(Address); + assert(Section && "cannot find section for address"); + Section->addRelocation(Address - Section->getAddress(), + Symbol, + Type, + Addend, + Value); +} + +void BinaryContext::addDynamicRelocation(uint64_t Address, + MCSymbol *Symbol, + uint64_t Type, + uint64_t Addend, + uint64_t Value) { + ErrorOr Section = getSectionForAddress(Address); + assert(Section && "cannot find section for address"); + Section->addDynamicRelocation(Address - Section->getAddress(), + Symbol, + Type, + Addend, + Value); +} + +bool BinaryContext::removeRelocationAt(uint64_t Address) { + ErrorOr Section = getSectionForAddress(Address); + assert(Section && "cannot find section for address"); + return Section->removeRelocationAt(Address - Section->getAddress()); +} + +const Relocation *BinaryContext::getRelocationAt(uint64_t Address) { + ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return nullptr; + + return Section->getRelocationAt(Address - Section->getAddress()); +} + +const Relocation *BinaryContext::getDynamicRelocationAt(uint64_t Address) { + ErrorOr Section = getSectionForAddress(Address); + if (!Section) + return nullptr; + + return Section->getDynamicRelocationAt(Address - Section->getAddress()); +} + +void BinaryContext::markAmbiguousRelocations(BinaryData &BD, + const uint64_t Address) { + auto setImmovable = [&](BinaryData &BD) { + BinaryData *Root = BD.getAtomicRoot(); + LLVM_DEBUG(if (Root->isMoveable()) { + dbgs() << "BOLT-DEBUG: setting " << *Root << " as immovable " + << "due to ambiguous relocation referencing 0x" + << Twine::utohexstr(Address) << '\n'; + }); + Root->setIsMoveable(false); + }; + + if (Address == BD.getAddress()) { + setImmovable(BD); + + // Set previous symbol as immovable + BinaryData *Prev = getBinaryDataContainingAddress(Address - 1); + if (Prev && Prev->getEndAddress() == BD.getAddress()) + setImmovable(*Prev); + } + + if (Address == BD.getEndAddress()) { + setImmovable(BD); + + // Set next symbol as immovable + BinaryData *Next = getBinaryDataContainingAddress(BD.getEndAddress()); + if (Next && Next->getAddress() == BD.getEndAddress()) + setImmovable(*Next); + } +} + +BinaryFunction *BinaryContext::getFunctionForSymbol(const MCSymbol *Symbol, + uint64_t *EntryDesc) { + std::shared_lock Lock(SymbolToFunctionMapMutex); + auto BFI = SymbolToFunctionMap.find(Symbol); + if (BFI == SymbolToFunctionMap.end()) + return nullptr; + + BinaryFunction *BF = BFI->second; + if (EntryDesc) + *EntryDesc = BF->getEntryIDForSymbol(Symbol); + + return BF; +} + +void BinaryContext::exitWithBugReport(StringRef Message, + const BinaryFunction &Function) const { + errs() << "=======================================\n"; + errs() << "BOLT is unable to proceed because it couldn't properly understand " + "this function.\n"; + errs() << "If you are running the most recent version of BOLT, you may " + "want to " + "report this and paste this dump.\nPlease check that there is no " + "sensitive contents being shared in this dump.\n"; + errs() << "\nOffending function: " << Function.getPrintName() << "\n\n"; + ScopedPrinter SP(errs()); + SP.printBinaryBlock("Function contents", *Function.getData()); + errs() << "\n"; + Function.dump(); + errs() << "ERROR: " << 
+void BinaryContext::exitWithBugReport(StringRef Message,
+                                      const BinaryFunction &Function) const {
+  errs() << "=======================================\n";
+  errs() << "BOLT is unable to proceed because it couldn't properly understand "
+            "this function.\n";
+  errs() << "If you are running the most recent version of BOLT, you may "
+            "want to report this and paste this dump.\nPlease check that "
+            "there is no sensitive content being shared in this dump.\n";
+  errs() << "\nOffending function: " << Function.getPrintName() << "\n\n";
+  ScopedPrinter SP(errs());
+  SP.printBinaryBlock("Function contents", *Function.getData());
+  errs() << "\n";
+  Function.dump();
+  errs() << "ERROR: " << Message;
+  errs() << "\n=======================================\n";
+  exit(1);
+}
+
+BinaryFunction *
+BinaryContext::createInjectedBinaryFunction(const std::string &Name,
+                                            bool IsSimple) {
+  InjectedBinaryFunctions.push_back(new BinaryFunction(Name, *this, IsSimple));
+  BinaryFunction *BF = InjectedBinaryFunctions.back();
+  setSymbolToFunctionMap(BF->getSymbol(), BF);
+  BF->CurrentState = BinaryFunction::State::CFG;
+  return BF;
+}
+
+std::pair<size_t, size_t>
+BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
+  // Adjust branch instruction to match the current layout.
+  if (FixBranches)
+    BF.fixBranches();
+
+  // Create local MC context to isolate the effect of ephemeral code emission.
+  IndependentCodeEmitter MCEInstance = createIndependentMCCodeEmitter();
+  MCContext *LocalCtx = MCEInstance.LocalCtx.get();
+  MCAsmBackend *MAB =
+      TheTarget->createMCAsmBackend(*STI, *MRI, MCTargetOptions());
+
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code);
+
+  std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(VecOS);
+  std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
+      *TheTriple, *LocalCtx, std::unique_ptr<MCAsmBackend>(MAB), std::move(OW),
+      std::unique_ptr<MCCodeEmitter>(MCEInstance.MCE.release()), *STI,
+      /*RelaxAll=*/false,
+      /*IncrementalLinkerCompatible=*/false,
+      /*DWARFMustBeAtTheEnd=*/false));
+
+  Streamer->InitSections(false);
+
+  MCSection *Section = MCEInstance.LocalMOFI->getTextSection();
+  Section->setHasInstructions(true);
+
+  // Create symbols in the LocalCtx so that they get destroyed with it.
+  MCSymbol *StartLabel = LocalCtx->createTempSymbol();
+  MCSymbol *EndLabel = LocalCtx->createTempSymbol();
+  MCSymbol *ColdStartLabel = LocalCtx->createTempSymbol();
+  MCSymbol *ColdEndLabel = LocalCtx->createTempSymbol();
+
+  Streamer->SwitchSection(Section);
+  Streamer->emitLabel(StartLabel);
+  emitFunctionBody(*Streamer, BF, /*EmitColdPart=*/false,
+                   /*EmitCodeOnly=*/true);
+  Streamer->emitLabel(EndLabel);
+
+  if (BF.isSplit()) {
+    MCSectionELF *ColdSection =
+        LocalCtx->getELFSection(BF.getColdCodeSectionName(), ELF::SHT_PROGBITS,
+                                ELF::SHF_EXECINSTR | ELF::SHF_ALLOC);
+    ColdSection->setHasInstructions(true);
+
+    Streamer->SwitchSection(ColdSection);
+    Streamer->emitLabel(ColdStartLabel);
+    emitFunctionBody(*Streamer, BF, /*EmitColdPart=*/true,
+                     /*EmitCodeOnly=*/true);
+    Streamer->emitLabel(ColdEndLabel);
+    // To avoid calling MCObjectStreamer::flushPendingLabels(), which is
+    // private.
+    Streamer->emitBytes(StringRef(""));
+    Streamer->SwitchSection(Section);
+  }
+
+  // To avoid calling MCObjectStreamer::flushPendingLabels(), which is private,
+  // or MCStreamer::Finish(), which does more than we want.
+  Streamer->emitBytes(StringRef(""));
+
+  MCAssembler &Assembler =
+      static_cast<MCObjectStreamer *>(Streamer.get())->getAssembler();
+  MCAsmLayout Layout(Assembler);
+  Assembler.layout(Layout);
+
+  const uint64_t HotSize =
+      Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel);
+  const uint64_t ColdSize = BF.isSplit()
+                                ? Layout.getSymbolOffset(*ColdEndLabel) -
+                                      Layout.getSymbolOffset(*ColdStartLabel)
+                                : 0ULL;
+  // Clean-up the effect of the code emission.
+  for (const MCSymbol &Symbol : Assembler.symbols()) {
+    MCSymbol *MutableSymbol = const_cast<MCSymbol *>(&Symbol);
+    MutableSymbol->setUndefined();
+    MutableSymbol->setIsRegistered(false);
+  }
+
+  return std::make_pair(HotSize, ColdSize);
+}
+
+bool BinaryContext::validateEncoding(const MCInst &Inst,
+                                     ArrayRef<uint8_t> InputEncoding) const {
+  SmallString<256> Code;
+  SmallVector<MCFixup, 4> Fixups;
+  raw_svector_ostream VecOS(Code);
+
+  MCE->encodeInstruction(Inst, VecOS, Fixups, *STI);
+  auto EncodedData = ArrayRef<uint8_t>((uint8_t *)Code.data(), Code.size());
+  if (InputEncoding != EncodedData) {
+    if (opts::Verbosity > 1) {
+      errs() << "BOLT-WARNING: mismatched encoding detected\n"
+             << "  input: " << InputEncoding << '\n'
+             << "  output: " << EncodedData << '\n';
+    }
+    return false;
+  }
+
+  return true;
+}
+
+uint64_t BinaryContext::getHotThreshold() const {
+  static uint64_t Threshold = 0;
+  if (Threshold == 0) {
+    Threshold = std::max(
+        (uint64_t)opts::ExecutionCountThreshold,
+        NumProfiledFuncs ? SumExecutionCount / (2 * NumProfiledFuncs) : 1);
+  }
+  return Threshold;
+}
+
+BinaryFunction *
+BinaryContext::getBinaryFunctionContainingAddress(uint64_t Address,
+                                                  bool CheckPastEnd,
+                                                  bool UseMaxSize) {
+  auto FI = BinaryFunctions.upper_bound(Address);
+  if (FI == BinaryFunctions.begin())
+    return nullptr;
+  --FI;
+
+  const uint64_t UsedSize =
+      UseMaxSize ? FI->second.getMaxSize() : FI->second.getSize();
+
+  if (Address >= FI->first + UsedSize + (CheckPastEnd ? 1 : 0))
+    return nullptr;
+
+  return &FI->second;
+}
+
+BinaryFunction *
+BinaryContext::getBinaryFunctionAtAddress(uint64_t Address) {
+  // First, try to find a function starting at the given address. If the
+  // function was folded, this will get us the original folded function if it
+  // wasn't removed from the list, e.g. in non-relocation mode.
+  auto BFI = BinaryFunctions.find(Address);
+  if (BFI != BinaryFunctions.end()) {
+    return &BFI->second;
+  }
+
+  // We might have folded the function matching the object at the given
+  // address. In such a case, we look for a function matching the symbol
+  // registered at the original address. The new function (the one that the
+  // original was folded into) will hold the symbol.
+  if (const BinaryData *BD = getBinaryDataAtAddress(Address)) {
+    uint64_t EntryID = 0;
+    BinaryFunction *BF = getFunctionForSymbol(BD->getSymbol(), &EntryID);
+    if (BF && EntryID == 0)
+      return BF;
+  }
+  return nullptr;
+}
+
+DebugAddressRangesVector BinaryContext::translateModuleAddressRanges(
+    const DWARFAddressRangesVector &InputRanges) const {
+  DebugAddressRangesVector OutputRanges;
+
+  for (const DWARFAddressRange Range : InputRanges) {
+    auto BFI = BinaryFunctions.lower_bound(Range.LowPC);
+    while (BFI != BinaryFunctions.end()) {
+      const BinaryFunction &Function = BFI->second;
+      if (Function.getAddress() >= Range.HighPC)
+        break;
+      const DebugAddressRangesVector FunctionRanges =
+          Function.getOutputAddressRanges();
+      std::move(std::begin(FunctionRanges),
+                std::end(FunctionRanges),
+                std::back_inserter(OutputRanges));
+      std::advance(BFI, 1);
+    }
+  }
+
+  return OutputRanges;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h
new file mode 100644
index 000000000000..792b1a809208
--- /dev/null
+++ b/bolt/src/BinaryContext.h
@@ -0,0 +1,1260 @@
+//===--- BinaryContext.h - Interface for machine-level context -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Context for processing binary executables in files and/or memory.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H
+
+#include "BinaryData.h"
+#include "BinarySection.h"
+#include "DebugData.h"
+#include "JumpTable.h"
+#include "MCPlusBuilder.h"
+#include "RuntimeLibs/RuntimeLibrary.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCPseudoProbe.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <functional>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <system_error>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace llvm {
+class MCDisassembler;
+class MCInstPrinter;
+
+using namespace object;
+
+namespace bolt {
+
+class BinaryFunction;
+class ExecutableFileMemoryManager;
+
+/// Information on loadable part of the file.
+struct SegmentInfo {
+  uint64_t Address;    /// Address of the segment in memory.
+  uint64_t Size;       /// Size of the segment in memory.
+  uint64_t FileOffset; /// Offset in the file.
+  uint64_t FileSize;   /// Size in file.
+  uint64_t Alignment;  /// Alignment of the segment.
+
+  void print(raw_ostream &OS) const {
+    OS << "SegmentInfo { Address: 0x"
+       << Twine::utohexstr(Address) << ", Size: 0x"
+       << Twine::utohexstr(Size) << ", FileOffset: 0x"
+       << Twine::utohexstr(FileOffset) << ", FileSize: 0x"
+       << Twine::utohexstr(FileSize) << ", Alignment: 0x"
+       << Twine::utohexstr(Alignment) << "}";
+  };
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const SegmentInfo &SegInfo) {
+  SegInfo.print(OS);
+  return OS;
+}
+
+enum class MemoryContentsType : char {
+  UNKNOWN = 0,             /// Unknown contents.
+  POSSIBLE_JUMP_TABLE,     /// Possibly a non-PIC jump table.
+  POSSIBLE_PIC_JUMP_TABLE, /// Possibly a PIC jump table.
+};
+
+/// Helper function to truncate a \p Value to given size in \p Bytes.
+inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
+  return Value & ((uint64_t)(int64_t)-1 >> (64 - Bytes * 8));
+}
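The mask arithmetic is easier to verify with concrete values; a few illustrative checks (editorial, not part of the patch):

    // (uint64_t)(int64_t)-1 is 0xFFFFFFFFFFFFFFFF; shifting it right by
    // (64 - Bytes * 8) leaves exactly the low Bytes bytes of the mask set.
    assert(truncateToSize(0x1234, /*Bytes=*/1) == 0x34);   // mask 0xFF
    assert(truncateToSize(0x1234, /*Bytes=*/2) == 0x1234); // mask 0xFFFF
    assert(truncateToSize(-1, /*Bytes=*/4) == 0xFFFFFFFF); // high bits dropped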
+/// Filter iterator.
+template <typename ItrType,
+          typename PredType = std::function<bool(const ItrType &)>>
+class FilterIterator
+    : public std::iterator<std::bidirectional_iterator_tag,
+                           typename std::iterator_traits<ItrType>::value_type> {
+  using Iterator = FilterIterator;
+  using T = typename std::iterator_traits<ItrType>::reference;
+  using PointerT = typename std::iterator_traits<ItrType>::pointer;
+
+  PredType Pred;
+  ItrType Itr, End;
+
+  void prev() {
+    while (!Pred(--Itr))
+      ;
+  }
+  void next() {
+    ++Itr;
+    nextMatching();
+  }
+  void nextMatching() {
+    while (Itr != End && !Pred(Itr))
+      ++Itr;
+  }
+
+public:
+  Iterator &operator++() { next(); return *this; }
+  Iterator &operator--() { prev(); return *this; }
+  Iterator operator++(int) { auto Tmp(*this); next(); return Tmp; }
+  Iterator operator--(int) { auto Tmp(*this); prev(); return Tmp; }
+  bool operator==(const Iterator &Other) const { return Itr == Other.Itr; }
+  bool operator!=(const Iterator &Other) const { return !operator==(Other); }
+  T operator*() { return *Itr; }
+  PointerT operator->() { return &operator*(); }
+  FilterIterator(PredType Pred, ItrType Itr, ItrType End)
+      : Pred(Pred), Itr(Itr), End(End) {
+    nextMatching();
+  }
+};
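FilterIterator is the machinery behind the filtered ranges declared later in this header (sections(), textSections(), getBinaryDataForSection(), and friends). A standalone sketch of the pattern, with a hypothetical container and predicate:

    // Iterate only the even elements of a vector through FilterIterator.
    std::vector<int> V = {1, 2, 3, 4, 5, 6};
    auto IsEven = [](const std::vector<int>::iterator &I) {
      return *I % 2 == 0;
    };
    using FI = FilterIterator<std::vector<int>::iterator>;
    for (auto I = FI(IsEven, V.begin(), V.end()),
              E = FI(IsEven, V.end(), V.end());
         I != E; ++I)
      outs() << *I << ' '; // prints: 2 4 6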
+class BinaryContext {
+  BinaryContext() = delete;
+
+  /// Name of the binary file the context originated from.
+  std::string Filename;
+
+  /// Unique build ID if available for the binary.
+  Optional<std::string> FileBuildID;
+
+  /// Set of all sections.
+  struct CompareSections {
+    bool operator()(const BinarySection *A, const BinarySection *B) const {
+      return *A < *B;
+    }
+  };
+  using SectionSetType = std::set<BinarySection *, CompareSections>;
+  SectionSetType Sections;
+
+  using SectionIterator = pointee_iterator<SectionSetType::iterator>;
+  using SectionConstIterator = pointee_iterator<SectionSetType::const_iterator>;
+
+  using FilteredSectionIterator = FilterIterator<SectionIterator>;
+  using FilteredSectionConstIterator = FilterIterator<SectionConstIterator>;
+
+  /// Map virtual address to a section. It is possible to have more than one
+  /// section mapped to the same address, e.g. non-allocatable sections.
+  using AddressToSectionMapType = std::multimap<uint64_t, BinarySection *>;
+  AddressToSectionMapType AddressToSection;
+
+  /// Multimap of section name to BinarySection object. Some binaries
+  /// have multiple sections with the same name.
+  using NameToSectionMapType = std::multimap<std::string, BinarySection *>;
+  NameToSectionMapType NameToSection;
+
+  /// Low level section registration.
+  BinarySection &registerSection(BinarySection *Section);
+
+  /// Store all functions in the binary, sorted by original address.
+  std::map<uint64_t, BinaryFunction> BinaryFunctions;
+
+  /// A mutex that is used to control parallel accesses to BinaryFunctions.
+  mutable std::shared_timed_mutex BinaryFunctionsMutex;
+
+  /// Functions injected by BOLT.
+  std::vector<BinaryFunction *> InjectedBinaryFunctions;
+
+  /// Jump tables for all functions mapped by address.
+  std::map<uint64_t, JumpTable *> JumpTables;
+
+  /// Locations of PC-relative relocations in data objects.
+  std::unordered_set<uint64_t> DataPCRelocations;
+
+  /// Used in duplicateJumpTable() to uniquely identify a JT clone.
+  /// Start our IDs with a high number so getJumpTableContainingAddress checks
+  /// with size won't overflow.
+  uint32_t DuplicatedJumpTables{0x10000000};
+
+  /// The runtime library.
+  std::unique_ptr<RuntimeLibrary> RtLibrary;
+
+  /// DWP Context.
+  std::shared_ptr<DWARFContext> DWPContext;
+
+  /// A map of DWO Ids to CUs.
+  using DWOIdToCUMapType = std::unordered_map<uint64_t, DWARFUnit *>;
+  DWOIdToCUMapType DWOCUs;
+
+  /// Preprocess DWO debug information.
+  void preprocessDWODebugInfo();
+
+public:
+  static std::unique_ptr<BinaryContext>
+  createBinaryContext(const ObjectFile *File, bool IsPIC,
+                      std::unique_ptr<DWARFContext> DwCtx);
+
+  /// Given DWOId returns CU if it exists in DWOCUs.
+  Optional<DWARFUnit *> getDWOCU(uint64_t DWOId);
+
+  /// Returns DWOContext if it exists.
+  DWARFContext *getDWOContext();
+
+  /// Get the number of DWO CUs in the map.
+  uint32_t getNumDWOCUs() { return DWOCUs.size(); }
+
+  /// [start memory address] -> [segment info] mapping.
+  std::map<uint64_t, SegmentInfo> SegmentMapInfo;
+
+  /// Symbols that are expected to be undefined in MCContext during emission.
+  std::unordered_set<MCSymbol *> UndefinedSymbols;
+
+  /// [name] -> [BinaryData*] map used for global symbol resolution.
+  using SymbolMapType = StringMap<BinaryData *>;
+  SymbolMapType GlobalSymbols;
+
+  /// [address] -> [BinaryData], ...
+  /// Addresses never change.
+  /// Note: it is important that clients do not hold on to instances of
+  /// BinaryData* while the map is still being modified during BinaryFunction
+  /// disassembly. This is because of the possibility that a regular
+  /// BinaryData is later discovered to be a JumpTable.
+  using BinaryDataMapType = std::map<uint64_t, BinaryData *>;
+  using binary_data_iterator = BinaryDataMapType::iterator;
+  using binary_data_const_iterator = BinaryDataMapType::const_iterator;
+  BinaryDataMapType BinaryDataMap;
+
+  using FilteredBinaryDataConstIterator =
+      FilterIterator<binary_data_const_iterator>;
+  using FilteredBinaryDataIterator = FilterIterator<binary_data_iterator>;
+
+  /// Memory manager for sections and segments. Used to communicate with ORC
+  /// among other things.
+  std::shared_ptr<ExecutableFileMemoryManager> EFMM;
+
+  StringRef getFilename() const { return Filename; }
+  void setFilename(StringRef Name) { Filename = std::string(Name); }
+
+  Optional<StringRef> getFileBuildID() const {
+    if (FileBuildID) {
+      return StringRef(*FileBuildID);
+    }
+
+    return NoneType();
+  }
+  void setFileBuildID(StringRef ID) { FileBuildID = std::string(ID); }
+
+  bool hasSymbolsWithFileName() const {
+    return HasSymbolsWithFileName;
+  }
+  void setHasSymbolsWithFileName(bool Value) {
+    HasSymbolsWithFileName = Value;
+  }
+
+  /// Return true if relocations against symbol with a given name
+  /// must be created.
+  bool forceSymbolRelocations(StringRef SymbolName) const;
+
+  uint64_t getNumUnusedProfiledObjects() const {
+    return NumUnusedProfiledObjects;
+  }
+  void setNumUnusedProfiledObjects(uint64_t N) {
+    NumUnusedProfiledObjects = N;
+  }
+
+  RuntimeLibrary *getRuntimeLibrary() { return RtLibrary.get(); }
+  void setRuntimeLibrary(std::unique_ptr<RuntimeLibrary> Lib) {
+    assert(!RtLibrary && "Cannot set runtime library twice.");
+    RtLibrary = std::move(Lib);
+  }
+
+  /// Return BinaryFunction containing a given \p Address or nullptr if
+  /// no registered function contains the \p Address.
+  ///
+  /// In a binary, a function has somewhat vague boundaries. E.g. a function
+  /// can refer to the first byte past the end of the function, and it will
+  /// still be referring to this function, not the function following it in
+  /// the address space. Thus we have the following flags that allow a lookup
+  /// where the caller has more context for the search.
+  ///
+  /// If \p CheckPastEnd is true and the \p Address falls on a byte
+  /// immediately following the last byte of some function and there's no
+  /// other function that starts there, then return the function as the one
+  /// containing the \p Address. This is useful when we need to locate
+  /// functions for references pointing immediately past a function body.
+  ///
+  /// If \p UseMaxSize is true, then include the space between this function
+  /// body and the next object in address ranges that we check.
+  BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address,
+                                                     bool CheckPastEnd = false,
+                                                     bool UseMaxSize = false);
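The two flags are easiest to see on a concrete layout. Assume a hypothetical function foo occupying [0x1000, 0x1020) with max size 0x40 (i.e. 0x20 bytes of padding before the next object) and no function starting at 0x1020:

    BC.getBinaryFunctionContainingAddress(0x101f);                  // foo
    BC.getBinaryFunctionContainingAddress(0x1020);                  // nullptr
    BC.getBinaryFunctionContainingAddress(0x1020,
                                          /*CheckPastEnd=*/true);   // foo
    BC.getBinaryFunctionContainingAddress(0x1030,
                                          /*CheckPastEnd=*/false,
                                          /*UseMaxSize=*/true);     // foo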
+  /// Return a BinaryFunction that starts at a given \p Address.
+  BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address);
+
+  const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) const {
+    return const_cast<BinaryContext *>(this)->
+        getBinaryFunctionAtAddress(Address);
+  }
+
+  /// Return size of an entry for the given jump table \p Type.
+  uint64_t getJumpTableEntrySize(JumpTable::JumpTableType Type) const {
+    return Type == JumpTable::JTT_PIC ? 4 : AsmInfo->getCodePointerSize();
+  }
+
+  /// Return JumpTable containing a given \p Address.
+  JumpTable *getJumpTableContainingAddress(uint64_t Address) {
+    auto JTI = JumpTables.upper_bound(Address);
+    if (JTI == JumpTables.begin())
+      return nullptr;
+    --JTI;
+    if (JTI->first + JTI->second->getSize() > Address)
+      return JTI->second;
+    if (JTI->second->getSize() == 0 && JTI->first == Address)
+      return JTI->second;
+    return nullptr;
+  }
+
+  unsigned getDWARFEncodingSize(unsigned Encoding) {
+    switch (Encoding & 0x0f) {
+    default: llvm_unreachable("unknown encoding");
+    case dwarf::DW_EH_PE_absptr:
+    case dwarf::DW_EH_PE_signed:
+      return AsmInfo->getCodePointerSize();
+    case dwarf::DW_EH_PE_udata2:
+    case dwarf::DW_EH_PE_sdata2:
+      return 2;
+    case dwarf::DW_EH_PE_udata4:
+    case dwarf::DW_EH_PE_sdata4:
+      return 4;
+    case dwarf::DW_EH_PE_udata8:
+    case dwarf::DW_EH_PE_sdata8:
+      return 8;
+    }
+  }
+
+  /// [MCSymbol] -> [BinaryFunction]
+  ///
+  /// As we fold identical functions, multiple symbols can point
+  /// to the same BinaryFunction.
+  std::unordered_map<const MCSymbol *, BinaryFunction *> SymbolToFunctionMap;
+
+  /// A mutex that is used to control parallel accesses to SymbolToFunctionMap.
+  mutable std::shared_timed_mutex SymbolToFunctionMapMutex;
+
+  /// Look up the symbol entry that contains the given \p Address (based on
+  /// the start address and size for each symbol). Returns a pointer to
+  /// the BinaryData for that symbol. If no data is found, nullptr is returned.
+  const BinaryData *getBinaryDataContainingAddressImpl(uint64_t Address) const;
+
+  /// Update the Parent fields in BinaryDatas after adding a new entry into
+  /// \p BinaryDataMap.
+  void updateObjectNesting(BinaryDataMapType::iterator GAI);
+
+  /// Validate that if object address ranges overlap that the object with
+  /// the larger range is a parent of the object with the smaller range.
+  bool validateObjectNesting() const;
+
+  /// Validate that there are no top level "holes" in each section
+  /// and that all relocations with a section are mapped to a valid
+  /// top level BinaryData.
+  bool validateHoles() const;
+
+  /// Produce output address ranges based on input ranges for some module.
+  DebugAddressRangesVector translateModuleAddressRanges(
+      const DWARFAddressRangesVector &InputRanges) const;
+
+  /// Get a bogus "absolute" section that will be associated with all
+  /// absolute BinaryDatas.
+  BinarySection &absoluteSection();
+
+  /// Process "holes" in between known BinaryData objects. For now,
+  /// symbols are padded with the space before the next BinaryData object.
+  void fixBinaryDataHoles();
+
+  /// Generate names based on data hashes for unknown symbols.
+  void generateSymbolHashes();
+
+  /// Construct BinaryFunction object and add it to internal maps.
+  BinaryFunction *createBinaryFunction(const std::string &Name,
+                                       BinarySection &Section,
+                                       uint64_t Address,
+                                       uint64_t Size,
+                                       uint64_t SymbolSize = 0,
+                                       uint16_t Alignment = 0);
+
+  /// Return all functions for this rewrite instance.
+  std::map<uint64_t, BinaryFunction> &getBinaryFunctions() {
+    return BinaryFunctions;
+  }
+
+  /// Return all functions for this rewrite instance.
+  const std::map<uint64_t, BinaryFunction> &getBinaryFunctions() const {
+    return BinaryFunctions;
+  }
+
+  /// Create BOLT-injected function.
+  BinaryFunction *createInjectedBinaryFunction(const std::string &Name,
+                                               bool IsSimple = true);
+
+  std::vector<BinaryFunction *> &getInjectedBinaryFunctions() {
+    return InjectedBinaryFunctions;
+  }
+
+  /// Return vector with all functions, i.e. include functions from the input
+  /// binary and functions created by BOLT.
+  std::vector<BinaryFunction *> getAllBinaryFunctions();
+
+  /// Construct a jump table for \p Function at \p Address or return an
+  /// existing one at that location.
+  ///
+  /// May create an embedded jump table and return its label as the second
+  /// element of the pair.
+  const MCSymbol *getOrCreateJumpTable(BinaryFunction &Function,
+                                       uint64_t Address,
+                                       JumpTable::JumpTableType Type);
+
+  /// Analyze a possible jump table of type \p Type at a given \p Address.
+  /// \p BF is a function referencing the jump table.
+  /// Return true if the jump table was detected at \p Address, and false
+  /// otherwise.
+  ///
+  /// If \p NextJTAddress is different from zero, it is used as an upper
+  /// bound for jump table memory layout.
+  ///
+  /// Optionally, populate \p Offsets with jump table entries. The entries
+  /// could be partially populated if the jump table detection fails.
+  bool analyzeJumpTable(const uint64_t Address,
+                        const JumpTable::JumpTableType Type,
+                        BinaryFunction &BF,
+                        const uint64_t NextJTAddress = 0,
+                        JumpTable::OffsetsType *Offsets = nullptr);
+
+  /// After jump table locations are established, this function will populate
+  /// their OffsetEntries based on memory contents.
+  void populateJumpTables();
+
+  /// Returns a jump table ID and label pointing to the duplicated jump table.
+  /// Ordinarily, jump tables are identified by their address in the input
+  /// binary. We return an ID with the high bit set to differentiate it from
+  /// regular addresses, avoiding conflicts with standard jump tables.
+  std::pair<uint64_t, const MCSymbol *>
+  duplicateJumpTable(BinaryFunction &Function, JumpTable *JT,
+                     const MCSymbol *OldLabel);
+
+  /// Generate a unique name for jump table at a given \p Address belonging
+  /// to function \p BF.
+  std::string generateJumpTableName(const BinaryFunction &BF,
+                                    uint64_t Address);
+
+  /// Return true if the array of bytes represents a valid code padding.
+  bool hasValidCodePadding(const BinaryFunction &BF);
+
+  /// Verify padding area between functions, and adjust max function size
+  /// accordingly.
+  void adjustCodePadding();
+
+  /// Regular page size.
+  static constexpr unsigned RegularPageSize = 0x1000;
+
+  /// Huge page size to use.
+  static constexpr unsigned HugePageSize = 0x200000;
+
+  /// Map address to a constant island owner (constant data in code section).
+  std::map<uint64_t, BinaryFunction *> AddressToConstantIslandMap;
+
+  /// A map from jump table address to insertion order. Used for generating
+  /// jump table names.
+  std::map<uint64_t, uint64_t> JumpTableIds;
+
+  std::unique_ptr<MCContext> Ctx;
+
+  /// A mutex that is used to control parallel accesses to Ctx.
+  mutable std::shared_timed_mutex CtxMutex;
+  std::unique_lock<std::shared_timed_mutex> scopeLock() const {
+    return std::unique_lock<std::shared_timed_mutex>(CtxMutex);
+  }
+
+  std::unique_ptr<DWARFContext> DwCtx;
+
+  std::unique_ptr<Triple> TheTriple;
+
+  const Target *TheTarget;
+
+  std::string TripleName;
+
+  std::unique_ptr<MCCodeEmitter> MCE;
+
+  std::unique_ptr<MCObjectFileInfo> MOFI;
+
+  std::unique_ptr<const MCAsmInfo> AsmInfo;
+
+  std::unique_ptr<const MCInstrInfo> MII;
+
+  std::unique_ptr<const MCSubtargetInfo> STI;
+
+  std::unique_ptr<MCInstPrinter> InstPrinter;
+
+  std::unique_ptr<const MCInstrAnalysis> MIA;
+
+  std::unique_ptr<MCPlusBuilder> MIB;
+
+  std::unique_ptr<const MCRegisterInfo> MRI;
+
+  std::unique_ptr<MCDisassembler> DisAsm;
+
+  std::unique_ptr<MCAsmBackend> MAB;
+
+  /// Indicates if relocations are available for usage.
+  bool HasRelocations{false};
+
+  /// Is the binary always loaded at a fixed address. Shared objects and
+  /// position-independent executables (PIEs) are examples of binaries that
+  /// will have HasFixedLoadAddress set to false.
+  bool HasFixedLoadAddress{true};
+
+  /// Set to true if the binary contains PT_DYNAMIC header.
+  bool hasDynamicHeader{false};
+
+  /// Set to true if the binary contains PT_INTERP header.
+  bool hasInterpHeader{false};
+
+  /// Indicates if any of local symbols used for functions or data objects
+  /// have an origin file name available.
+  bool HasSymbolsWithFileName{false};
+
+  /// Sum of execution count of all functions.
+  uint64_t SumExecutionCount{0};
+
+  /// Number of functions with profile information.
+  uint64_t NumProfiledFuncs{0};
+
+  /// Number of objects in profile whose profile was ignored.
+  uint64_t NumUnusedProfiledObjects{0};
+
+  /// Total hotness score according to profiling data for this binary.
+  uint64_t TotalScore{0};
+
+  /// Binary-wide stats for macro-fusion.
+  uint64_t MissedMacroFusionPairs{0};
+  uint64_t MissedMacroFusionExecCount{0};
+
+  // Address of the first allocated segment.
+  uint64_t FirstAllocAddress{std::numeric_limits<uint64_t>::max()};
+
+  /// Track next available address for new allocatable sections.
+  /// RewriteInstance sets this prior to running BOLT passes, so layout passes
+  /// are aware of the final addresses functions will have.
+  uint64_t LayoutStartAddress{0};
+
+  /// Old .text info.
+  uint64_t OldTextSectionAddress{0};
+  uint64_t OldTextSectionOffset{0};
+  uint64_t OldTextSectionSize{0};
+
+  /// Address of the code/function that is executed before any other code in
+  /// the binary.
+  Optional<uint64_t> StartFunctionAddress;
+
+  /// Address of the code/function that is going to be executed right before
+  /// the execution of the binary is completed.
+  Optional<uint64_t> FiniFunctionAddress;
+
+  /// Page alignment used for code layout.
+  uint64_t PageAlign{HugePageSize};
+
+  /// True if the binary requires immediate relocation processing.
+  bool RequiresZNow{false};
+
+  /// List of functions that always trap.
+  std::vector<const BinaryFunction *> TrappedFunctions;
+
+  /// Map SDT locations to SDT markers info.
+  std::unordered_map<uint64_t, SDTMarkerInfo> SDTMarkers;
+
+  /// Map linux kernel program locations/instructions to their pointers in
+  /// special linux kernel sections.
+  std::unordered_map<uint64_t, std::vector<LKInstructionMarkerInfo>> LKMarkers;
+
+  /// PseudoProbe decoder.
+  MCPseudoProbeDecoder ProbeDecoder;
+
+  /// DWARF encoding. Available encoding types defined in BinaryFormat/Dwarf.h
+  /// enum Constants, e.g. DW_EH_PE_omit.
+  unsigned TTypeEncoding = dwarf::DW_EH_PE_omit;
+  unsigned LSDAEncoding = dwarf::DW_EH_PE_omit;
+
+  BinaryContext(std::unique_ptr<MCContext> Ctx,
+                std::unique_ptr<DWARFContext> DwCtx,
+                std::unique_ptr<Triple> TheTriple,
+                const Target *TheTarget,
+                std::string TripleName,
+                std::unique_ptr<MCCodeEmitter> MCE,
+                std::unique_ptr<MCObjectFileInfo> MOFI,
+                std::unique_ptr<const MCAsmInfo> AsmInfo,
+                std::unique_ptr<const MCInstrInfo> MII,
+                std::unique_ptr<const MCSubtargetInfo> STI,
+                std::unique_ptr<MCInstPrinter> InstPrinter,
+                std::unique_ptr<const MCInstrAnalysis> MIA,
+                std::unique_ptr<MCPlusBuilder> MIB,
+                std::unique_ptr<const MCRegisterInfo> MRI,
+                std::unique_ptr<MCDisassembler> DisAsm);
+
+  ~BinaryContext();
+
+  std::unique_ptr<MCObjectWriter> createObjectWriter(raw_pwrite_stream &OS);
+
+  bool isELF() const {
+    return TheTriple->isOSBinFormatELF();
+  }
+
+  bool isMachO() const {
+    return TheTriple->isOSBinFormatMachO();
+  }
+
+  bool isAArch64() const {
+    return TheTriple->getArch() == llvm::Triple::aarch64;
+  }
+
+  bool isX86() const {
+    return TheTriple->getArch() == llvm::Triple::x86 ||
+           TheTriple->getArch() == llvm::Triple::x86_64;
+  }
+
+  bool isExecutable() const {
+    // TODO: handle static-pie binaries case
+    return hasInterpHeader;
+  }
+
+  /// Iterate over all BinaryData.
+  iterator_range<binary_data_const_iterator> getBinaryData() const {
+    return make_range(BinaryDataMap.begin(), BinaryDataMap.end());
+  }
+
+  /// Iterate over all BinaryData.
+  iterator_range<binary_data_iterator> getBinaryData() {
+    return make_range(BinaryDataMap.begin(), BinaryDataMap.end());
+  }
+
+  /// Iterate over all BinaryData associated with the given \p Section.
+  iterator_range<FilteredBinaryDataConstIterator>
+  getBinaryDataForSection(const BinarySection &Section) const {
+    auto Begin = BinaryDataMap.lower_bound(Section.getAddress());
+    if (Begin != BinaryDataMap.begin()) {
+      --Begin;
+    }
+    auto End = BinaryDataMap.upper_bound(Section.getEndAddress());
+    auto pred =
+        [&Section](const binary_data_const_iterator &Itr) -> bool {
+      return Itr->second->getSection() == Section;
+    };
+    return make_range(FilteredBinaryDataConstIterator(pred, Begin, End),
+                      FilteredBinaryDataConstIterator(pred, End, End));
+  }
+
+  /// Iterate over all BinaryData associated with the given \p Section.
+  iterator_range<FilteredBinaryDataIterator>
+  getBinaryDataForSection(BinarySection &Section) {
+    auto Begin = BinaryDataMap.lower_bound(Section.getAddress());
+    if (Begin != BinaryDataMap.begin()) {
+      --Begin;
+    }
+    auto End = BinaryDataMap.upper_bound(Section.getEndAddress());
+    auto pred = [&Section](const binary_data_iterator &Itr) -> bool {
+      return Itr->second->getSection() == Section;
+    };
+    return make_range(FilteredBinaryDataIterator(pred, Begin, End),
+                      FilteredBinaryDataIterator(pred, End, End));
+  }
+
+  /// Iterate over all the sub-symbols of \p BD (if any).
+  iterator_range<binary_data_iterator> getSubBinaryData(BinaryData *BD);
+
+  /// Clear the global symbol address -> name(s) map.
+  void clearBinaryData() {
+    GlobalSymbols.clear();
+    for (auto &Entry : BinaryDataMap) {
+      delete Entry.second;
+    }
+    BinaryDataMap.clear();
+  }
+
+  /// Process \p Address reference from code in function \p BF.
+  /// \p IsPCRel indicates if the reference is PC-relative.
+  /// Return <Symbol, Addend> pair corresponding to the \p Address.
+  std::pair<const MCSymbol *, uint64_t> handleAddressRef(uint64_t Address,
+                                                         BinaryFunction &BF,
+                                                         bool IsPCRel);
+
+  /// Analyze memory contents at the given \p Address and return the type of
+  /// memory contents (such as a possible jump table).
+  MemoryContentsType analyzeMemoryAt(uint64_t Address, BinaryFunction &BF);
+
+  /// Return a value of the global \p Symbol or an error if the value
+  /// was not set.
+  ErrorOr<uint64_t> getSymbolValue(const MCSymbol &Symbol) const {
+    const BinaryData *BD = getBinaryDataByName(Symbol.getName());
+    if (!BD)
+      return std::make_error_code(std::errc::bad_address);
+    return BD->getAddress();
+  }
+
+  /// Return a global symbol registered at a given \p Address and \p Size.
+  /// If no symbol exists, create one with unique name using \p Prefix.
+  /// If there are multiple symbols registered at the \p Address, then
+  /// return the first one.
+  MCSymbol *getOrCreateGlobalSymbol(uint64_t Address,
+                                    Twine Prefix,
+                                    uint64_t Size = 0,
+                                    uint16_t Alignment = 0,
+                                    unsigned Flags = 0);
+
+  /// Create a global symbol without registering an address.
+  MCSymbol *getOrCreateUndefinedGlobalSymbol(StringRef Name);
+
+  /// Register a symbol with \p Name at a given \p Address using \p Size,
+  /// \p Alignment, and \p Flags. See llvm::SymbolRef::Flags for the definition
+  /// of \p Flags.
+  MCSymbol *registerNameAtAddress(StringRef Name,
+                                  uint64_t Address,
+                                  uint64_t Size,
+                                  uint16_t Alignment,
+                                  unsigned Flags = 0);
+
+  /// Return BinaryData registered at a given \p Address or nullptr if no
+  /// global symbol was registered at the location.
+  const BinaryData *getBinaryDataAtAddress(uint64_t Address) const {
+    auto NI = BinaryDataMap.find(Address);
+    return NI != BinaryDataMap.end() ? NI->second : nullptr;
+  }
+
+  BinaryData *getBinaryDataAtAddress(uint64_t Address) {
+    auto NI = BinaryDataMap.find(Address);
+    return NI != BinaryDataMap.end() ? NI->second : nullptr;
+  }
+
+  /// Look up the symbol entry that contains the given \p Address (based on
+  /// the start address and size for each symbol). Returns a pointer to
+  /// the BinaryData for that symbol. If no data is found, nullptr is returned.
+  const BinaryData *
+  getBinaryDataContainingAddress(uint64_t Address) const {
+    return getBinaryDataContainingAddressImpl(Address);
+  }
+
+  BinaryData *getBinaryDataContainingAddress(uint64_t Address) {
+    return const_cast<BinaryData *>(
+        getBinaryDataContainingAddressImpl(Address));
+  }
+
+  /// Return BinaryData for the given \p Name or nullptr if no
+  /// global symbol with that name exists.
+  const BinaryData *getBinaryDataByName(StringRef Name) const {
+    auto Itr = GlobalSymbols.find(Name);
+    return Itr != GlobalSymbols.end() ? Itr->second : nullptr;
+  }
+
+  BinaryData *getBinaryDataByName(StringRef Name) {
+    auto Itr = GlobalSymbols.find(Name);
+    return Itr != GlobalSymbols.end() ? Itr->second : nullptr;
+  }
+
+  /// Return true if \p SymbolName was generated internally and was not present
+  /// in the input binary.
+  bool isInternalSymbolName(const StringRef Name) {
+    return Name.startswith("SYMBOLat") ||
+           Name.startswith("DATAat") ||
+           Name.startswith("HOLEat");
+  }
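isInternalSymbolName() relies on the convention that every symbol BOLT invents embeds one of these prefixes followed by its location. A sketch of the idea (the exact generated spelling comes from getOrCreateGlobalSymbol() and the hash naming in generateSymbolHashes(); the name shown is illustrative):

    // A BOLT-generated symbol for an anonymous object at a hypothetical
    // address is named along the lines of "SYMBOLat0x400200", so:
    MCSymbol *S = BC.getOrCreateGlobalSymbol(0x400200, "SYMBOLat");
    assert(BC.isInternalSymbolName(S->getName()));
    assert(!BC.isInternalSymbolName("main"));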
+  MCSymbol *getHotTextStartSymbol() const {
+    return Ctx->getOrCreateSymbol("__hot_start");
+  }
+
+  MCSymbol *getHotTextEndSymbol() const {
+    return Ctx->getOrCreateSymbol("__hot_end");
+  }
+
+  MCSection *getTextSection() const {
+    return MOFI->getTextSection();
+  }
+
+  /// Return code section with a given name.
+  MCSection *getCodeSection(StringRef SectionName) const {
+    if (isELF())
+      return Ctx->getELFSection(SectionName, ELF::SHT_PROGBITS,
+                                ELF::SHF_EXECINSTR | ELF::SHF_ALLOC);
+    else
+      return Ctx->getMachOSection("__TEXT", SectionName,
+                                  MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                  SectionKind::getText());
+  }
+
+  /// Return data section with a given name.
+  MCSection *getDataSection(StringRef SectionName) const {
+    return Ctx->getELFSection(SectionName, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+  }
+
+  /// \name Pre-assigned Section Names
+  /// @{
+
+  const char *getMainCodeSectionName() const {
+    return ".text";
+  }
+
+  const char *getColdCodeSectionName() const {
+    return ".text.cold";
+  }
+
+  const char *getHotTextMoverSectionName() const {
+    return ".text.mover";
+  }
+
+  const char *getInjectedCodeSectionName() const {
+    return ".text.injected";
+  }
+
+  const char *getInjectedColdCodeSectionName() const {
+    return ".text.injected.cold";
+  }
+
+  ErrorOr<BinarySection &> getGdbIndexSection() const {
+    return getUniqueSectionByName(".gdb_index");
+  }
+
+  /// @}
+
+  /// Register \p TargetFunction as fragment of \p Function.
+  void registerFragment(BinaryFunction &TargetFunction,
+                        BinaryFunction &Function) const;
+
+  /// Resolve inter-procedural dependencies from \p Function.
+  void processInterproceduralReferences(BinaryFunction &Function);
+
+  /// Perform any necessary post processing on the symbol table after
+  /// function disassembly is complete. This processing fixes top
+  /// level data holes and makes sure the symbol table is valid.
+  /// It also assigns all memory profiling info to the appropriate
+  /// BinaryData objects.
+  void postProcessSymbolTable();
+
+  /// Set the size of the global symbol located at \p Address. Return
+  /// false if no symbol exists, true otherwise.
+  bool setBinaryDataSize(uint64_t Address, uint64_t Size);
+
+  /// Print the global symbol table.
+  void printGlobalSymbols(raw_ostream &OS) const;
+
+  /// Register information about the given \p Section so we can look up
+  /// sections by address.
+  BinarySection &registerSection(SectionRef Section);
+
+  /// Register a copy of \p OriginalSection under a different name.
+  BinarySection &registerSection(StringRef SectionName,
+                                 const BinarySection &OriginalSection);
+
+  /// Register or update the information for the section with the given
+  /// \p Name. If the section already exists, the information in the
+  /// section will be updated with the new data.
+  BinarySection &registerOrUpdateSection(StringRef Name,
+                                         unsigned ELFType,
+                                         unsigned ELFFlags,
+                                         uint8_t *Data = nullptr,
+                                         uint64_t Size = 0,
+                                         unsigned Alignment = 1);
+
+  /// Register the information for the note (non-allocatable) section
+  /// with the given \p Name. If the section already exists, the
+  /// information in the section will be updated with the new data.
+  BinarySection &
+  registerOrUpdateNoteSection(StringRef Name,
+                              uint8_t *Data = nullptr,
+                              uint64_t Size = 0,
+                              unsigned Alignment = 1,
+                              bool IsReadOnly = true,
+                              unsigned ELFType = ELF::SHT_PROGBITS) {
+    return registerOrUpdateSection(Name, ELFType,
+                                   BinarySection::getFlags(IsReadOnly),
+                                   Data, Size, Alignment);
+  }
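A hypothetical sketch of the note-section path, registering a small non-allocatable blob (the section name and contents are made up; this mirrors the way BOLT registers new metadata sections):

    // Register a read-only, non-allocatable section holding a copied buffer.
    StringRef Contents = "example-metadata";
    auto *Data = new uint8_t[Contents.size()];
    memcpy(Data, Contents.data(), Contents.size());
    BinarySection &Note = BC.registerOrUpdateNoteSection(
        ".note.bolt_example", Data, Contents.size(), /*Alignment=*/1);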
+  /// Remove the given \p Section from the set of all sections. Return
+  /// true if the section was removed (and deleted), otherwise false.
+  bool deregisterSection(BinarySection &Section);
+
+  /// Iterate over all registered sections.
+  iterator_range<FilteredSectionIterator> sections() {
+    auto notNull = [](const SectionIterator &Itr) {
+      return (bool)*Itr;
+    };
+    return make_range(FilteredSectionIterator(notNull,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(notNull,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Iterate over all registered sections.
+  iterator_range<FilteredSectionConstIterator> sections() const {
+    return const_cast<BinaryContext *>(this)->sections();
+  }
+
+  /// Iterate over all registered allocatable sections.
+  iterator_range<FilteredSectionIterator> allocatableSections() {
+    auto isAllocatable = [](const SectionIterator &Itr) {
+      return *Itr && Itr->isAllocatable();
+    };
+    return make_range(FilteredSectionIterator(isAllocatable,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(isAllocatable,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Iterate over all registered code sections.
+  iterator_range<FilteredSectionIterator> textSections() {
+    auto isText = [](const SectionIterator &Itr) {
+      return *Itr && Itr->isAllocatable() && Itr->isText();
+    };
+    return make_range(FilteredSectionIterator(isText,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(isText,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Iterate over all registered allocatable sections.
+  iterator_range<FilteredSectionConstIterator> allocatableSections() const {
+    return const_cast<BinaryContext *>(this)->allocatableSections();
+  }
+
+  /// Iterate over all registered non-allocatable sections.
+  iterator_range<FilteredSectionIterator> nonAllocatableSections() {
+    auto notAllocated = [](const SectionIterator &Itr) {
+      return *Itr && !Itr->isAllocatable();
+    };
+    return make_range(FilteredSectionIterator(notAllocated,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(notAllocated,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Iterate over all registered non-allocatable sections.
+  iterator_range<FilteredSectionConstIterator> nonAllocatableSections() const {
+    return const_cast<BinaryContext *>(this)->nonAllocatableSections();
+  }
+
+  /// Iterate over all allocatable relocation sections.
+  iterator_range<FilteredSectionIterator> allocatableRelaSections() {
+    auto isAllocatableRela = [](const SectionIterator &Itr) {
+      return *Itr && Itr->isAllocatable() && Itr->isRela();
+    };
+    return make_range(FilteredSectionIterator(isAllocatableRela,
+                                              Sections.begin(),
+                                              Sections.end()),
+                      FilteredSectionIterator(isAllocatableRela,
+                                              Sections.end(),
+                                              Sections.end()));
+  }
+
+  /// Check if the address belongs to this binary's static allocation space.
+  bool containsAddress(uint64_t Address) const {
+    return Address >= FirstAllocAddress && Address < LayoutStartAddress;
+  }
+
+  /// Return section name containing the given \p Address.
+  ErrorOr<StringRef> getSectionNameForAddress(uint64_t Address) const;
+
+  /// Print all sections.
+  void printSections(raw_ostream &OS) const;
+
+  /// Return largest section containing the given \p Address. These
+  /// functions only work for allocatable sections, i.e. ones with non-zero
+  /// addresses.
+  ErrorOr<BinarySection &> getSectionForAddress(uint64_t Address);
+  ErrorOr<const BinarySection &> getSectionForAddress(uint64_t Address) const {
+    return const_cast<BinaryContext *>(this)->getSectionForAddress(Address);
+  }
+
+  /// Return section(s) associated with given \p Name.
+  iterator_range<NameToSectionMapType::iterator>
+  getSectionByName(StringRef Name) {
+    return make_range(NameToSection.equal_range(std::string(Name)));
+  }
+  iterator_range<NameToSectionMapType::const_iterator>
+  getSectionByName(StringRef Name) const {
+    return make_range(NameToSection.equal_range(std::string(Name)));
+  }
+
+  /// Return the unique section associated with given \p Name.
+  /// If there is more than one section with the same name, return an error
+  /// object.
+  ErrorOr<BinarySection &> getUniqueSectionByName(StringRef SectionName) const {
+    auto Sections = getSectionByName(SectionName);
+    if (Sections.begin() != Sections.end() &&
+        std::next(Sections.begin()) == Sections.end())
+      return *Sections.begin()->second;
+    return std::make_error_code(std::errc::bad_address);
+  }
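All of these ranges are the FilterIterator from earlier in this header instantiated with a different predicate, so typical use is a plain range-for. A sketch against a populated BinaryContext `BC`:

    // Dump the names of all executable sections.
    for (BinarySection &Section : BC.textSections())
      outs() << Section.getName() << '\n';

    // Unique-name lookup returns ErrorOr, so a miss is testable:
    if (ErrorOr<BinarySection &> Text = BC.getUniqueSectionByName(".text"))
      outs() << ".text at 0x" << Twine::utohexstr(Text->getAddress()) << '\n';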
+  /// Return an unsigned value of \p Size stored at \p Address. The address
+  /// has to be a valid statically allocated address for the binary.
+  ErrorOr<uint64_t> getUnsignedValueAtAddress(uint64_t Address,
+                                              size_t Size) const;
+
+  /// Return a signed value of \p Size stored at \p Address. The address has
+  /// to be a valid statically allocated address for the binary.
+  ErrorOr<int64_t> getSignedValueAtAddress(uint64_t Address,
+                                           size_t Size) const;
+
+  /// Special case of getUnsignedValueAtAddress() that uses a pointer size.
+  ErrorOr<uint64_t> getPointerAtAddress(uint64_t Address) const {
+    return getUnsignedValueAtAddress(Address, AsmInfo->getCodePointerSize());
+  }
+
+  /// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is
+  /// then removed from the list of functions \p BFs. The profile data of
+  /// \p ChildBF is merged into that of \p ParentBF. This function is thread
+  /// safe.
+  void foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF);
+
+  /// Add a Section relocation at a given \p Address.
+  void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type,
+                     uint64_t Addend = 0, uint64_t Value = 0);
+
+  /// Return a relocation registered at a given \p Address, or nullptr if
+  /// there is no relocation at such address.
+  const Relocation *getRelocationAt(uint64_t Address);
+
+  /// Register a presence of PC-relative relocation at the given \p Address.
+  void addPCRelativeDataRelocation(uint64_t Address) {
+    DataPCRelocations.emplace(Address);
+  }
+
+  /// Register dynamic relocation at \p Address.
+  void addDynamicRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type,
+                            uint64_t Addend, uint64_t Value = 0);
+
+  /// Return a dynamic relocation registered at a given \p Address, or nullptr
+  /// if there is no dynamic relocation at such address.
+  const Relocation *getDynamicRelocationAt(uint64_t Address);
+
+  /// Remove registered relocation at a given \p Address.
+  bool removeRelocationAt(uint64_t Address);
+
+  /// This function makes sure that symbols referenced by ambiguous relocations
+  /// are marked as immovable. For now, if a section relocation points at the
+  /// boundary between two symbols then those symbols are marked as immovable.
+  void markAmbiguousRelocations(BinaryData &BD, const uint64_t Address);
+
+  /// Return BinaryFunction corresponding to \p Symbol. If \p EntryDesc is not
+  /// nullptr, set it to the entry discriminator corresponding to \p Symbol
+  /// (0 for single-entry functions). This function is thread safe.
+  BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol,
+                                       uint64_t *EntryDesc = nullptr);
+
+  const BinaryFunction *getFunctionForSymbol(
+      const MCSymbol *Symbol, uint64_t *EntryDesc = nullptr) const {
+    return const_cast<BinaryContext *>(this)->
+        getFunctionForSymbol(Symbol, EntryDesc);
+  }
+
+  /// Associate the symbol \p Sym with the function \p BF for lookups with
+  /// getFunctionForSymbol().
+  void setSymbolToFunctionMap(const MCSymbol *Sym, BinaryFunction *BF) {
+    SymbolToFunctionMap[Sym] = BF;
+  }
+
+  /// Populate some internal data structures with debug info.
+  void preprocessDebugInfo();
+
+  /// Add a filename entry from SrcCUID to DestCUID.
+  unsigned addDebugFilenameToUnit(const uint32_t DestCUID,
+                                  const uint32_t SrcCUID,
+                                  unsigned FileIndex);
+
+  /// Return functions in output layout order.
+  std::vector<BinaryFunction *> getSortedFunctions();
+
+  /// Do the best effort to calculate the size of the function by emitting
+  /// its code, and relaxing branch instructions. By default, branch
+  /// instructions are updated to match the layout. Pass \p FixBranches set to
+  /// false if the branches are known to be up to date with the code layout.
+  ///
+  /// Return the pair where the first size is for the main part, and the
+  /// second size is for the cold one.
+  std::pair<size_t, size_t>
+  calculateEmittedSize(BinaryFunction &BF, bool FixBranches = true);
+
+  /// Calculate the size of the instruction \p Inst optionally using a
+  /// user-supplied emitter for lock-free multi-thread work. MCCodeEmitter is
+  /// not thread-safe and each thread should operate with its own copy of it.
+  uint64_t
+  computeInstructionSize(const MCInst &Inst,
+                         const MCCodeEmitter *Emitter = nullptr) const {
+    if (!Emitter)
+      Emitter = this->MCE.get();
+    SmallString<256> Code;
+    SmallVector<MCFixup, 4> Fixups;
+    raw_svector_ostream VecOS(Code);
+    Emitter->encodeInstruction(Inst, VecOS, Fixups, *STI);
+    return Code.size();
+  }
+
+  /// Compute the native code size for a range of instructions.
+  /// Note: this can be imprecise with respect to the final binary since it
+  /// happens prior to relaxation, and with respect to the original binary
+  /// because of opcode shortening. MCCodeEmitter is not thread-safe and each
+  /// thread should operate with its own copy of it.
+  template <typename Itr>
+  uint64_t computeCodeSize(Itr Beg, Itr End,
+                           const MCCodeEmitter *Emitter = nullptr) const {
+    uint64_t Size = 0;
+    while (Beg != End) {
+      if (!MIB->isPseudo(*Beg))
+        Size += computeInstructionSize(*Beg, Emitter);
+      ++Beg;
+    }
+    return Size;
+  }
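Since the shared MCE cannot be used concurrently, a worker thread is expected to pair computeCodeSize() with its own emitter from createIndependentMCCodeEmitter(), declared further down in this class. A sketch, with `Instructions` standing in for any container of MCInst:

    // Per-thread sizing without touching the shared code emitter.
    BinaryContext::IndependentCodeEmitter Local =
        BC.createIndependentMCCodeEmitter();
    uint64_t Bytes = BC.computeCodeSize(Instructions.begin(),
                                        Instructions.end(), Local.MCE.get());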
+  /// Verify that assembling instruction \p Inst results in the same sequence
+  /// of bytes as \p Encoding.
+  bool validateEncoding(const MCInst &Instruction,
+                        ArrayRef<uint8_t> Encoding) const;
+
+  /// Return a function execution count threshold for determining whether
+  /// the function is 'hot'. Consider it hot if count is above the average exec
+  /// count of profiled functions.
+  uint64_t getHotThreshold() const;
+
+  /// Return true if instruction \p Inst requires an offset for further
+  /// processing (e.g. assigning a profile).
+  bool keepOffsetForInstruction(const MCInst &Inst) const {
+    if (MIB->isCall(Inst) || MIB->isBranch(Inst) || MIB->isReturn(Inst) ||
+        MIB->isPrefix(Inst) || MIB->isIndirectBranch(Inst)) {
+      return true;
+    }
+    return false;
+  }
+
+  /// Return true if the function should be emitted to the output file.
+  bool shouldEmit(const BinaryFunction &Function) const;
+
+  /// Print the string name for a CFI operation.
+  static void printCFI(raw_ostream &OS, const MCCFIInstruction &Inst);
+
+  /// Print a single MCInst in native format. If Function is non-null,
+  /// the instruction will be annotated with CFI and possibly DWARF line table
+  /// info.
+  /// If PrintMCInst is true, the instruction is also printed in the
+  /// architecture independent format.
+  void printInstruction(raw_ostream &OS,
+                        const MCInst &Instruction,
+                        uint64_t Offset = 0,
+                        const BinaryFunction *Function = nullptr,
+                        bool PrintMCInst = false,
+                        bool PrintMemData = false,
+                        bool PrintRelocations = false) const;
+
+  /// Print a range of instructions.
+  template <typename Itr>
+  uint64_t printInstructions(raw_ostream &OS,
+                             Itr Begin,
+                             Itr End,
+                             uint64_t Offset = 0,
+                             const BinaryFunction *Function = nullptr,
+                             bool PrintMCInst = false,
+                             bool PrintMemData = false,
+                             bool PrintRelocations = false) const {
+    while (Begin != End) {
+      printInstruction(OS, *Begin, Offset, Function, PrintMCInst,
+                       PrintMemData, PrintRelocations);
+      Offset += computeCodeSize(Begin, Begin + 1);
+      ++Begin;
+    }
+    return Offset;
+  }
+
+  void exitWithBugReport(StringRef Message,
+                         const BinaryFunction &Function) const;
+
+  struct IndependentCodeEmitter {
+    std::unique_ptr<MCObjectFileInfo> LocalMOFI;
+    std::unique_ptr<MCContext> LocalCtx;
+    std::unique_ptr<MCCodeEmitter> MCE;
+  };
+
+  /// Encapsulates an independent MCCodeEmitter that doesn't share resources
+  /// with the main one available through BinaryContext::MCE, managed by
+  /// BinaryContext.
+  /// This is intended to create a lock-free environment for an auxiliary
+  /// thread that needs to perform work with an MCCodeEmitter that can be
+  /// transient or won't be used in the main code emitter.
+  IndependentCodeEmitter createIndependentMCCodeEmitter() const {
+    IndependentCodeEmitter MCEInstance;
+    MCEInstance.LocalCtx.reset(
+        new MCContext(*TheTriple, AsmInfo.get(), MRI.get(), STI.get()));
+    MCEInstance.LocalMOFI.reset(
+        TheTarget->createMCObjectFileInfo(*MCEInstance.LocalCtx.get(),
+                                          /*PIC=*/!HasFixedLoadAddress));
+    MCEInstance.LocalCtx->setObjectFileInfo(MCEInstance.LocalMOFI.get());
+    MCEInstance.MCE.reset(
+        TheTarget->createMCCodeEmitter(*MII, *MRI, *MCEInstance.LocalCtx));
+    return MCEInstance;
+  }
+
+  /// Create an MCStreamer instance.
+  std::unique_ptr<MCStreamer>
+  createStreamer(llvm::raw_pwrite_stream &OS) const {
+    MCCodeEmitter *MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *Ctx);
+    MCAsmBackend *MAB =
+        TheTarget->createMCAsmBackend(*STI, *MRI, MCTargetOptions());
+    std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(OS);
+    std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
+        *TheTriple, *Ctx, std::unique_ptr<MCAsmBackend>(MAB), std::move(OW),
+        std::unique_ptr<MCCodeEmitter>(MCE), *STI,
+        /* RelaxAll */ false,
+        /* IncrementalLinkerCompatible */ false,
+        /* DWARFMustBeAtTheEnd */ false));
+    return Streamer;
+  }
+};
+
+template <typename T, typename = std::enable_if_t<sizeof(T) == 1>>
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const ArrayRef<T> &ByteArray) {
+  const char *Sep = "";
+  for (const auto Byte : ByteArray) {
+    OS << Sep << format("%.2x", Byte);
+    Sep = " ";
+  }
+  return OS;
+}
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinaryData.cpp b/bolt/src/BinaryData.cpp
new file mode 100644
index 000000000000..8a2fbe41ef80
--- /dev/null
+++ b/bolt/src/BinaryData.cpp
@@ -0,0 +1,159 @@
+//===--- BinaryData.cpp - Representation of section data objects ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryData.h"
+#include "BinarySection.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Regex.h"
+
+using namespace llvm;
+using namespace bolt;
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+namespace opts {
+extern cl::OptionCategory BoltCategory;
+extern cl::opt<unsigned> Verbosity;
+
+cl::opt<bool>
+PrintSymbolAliases("print-aliases",
+  cl::desc("print aliases when printing objects"),
+  cl::Hidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+}
+
+bool BinaryData::isAbsolute() const {
+  return Flags & SymbolRef::SF_Absolute;
+}
+
+bool BinaryData::isMoveable() const {
+  return (!isAbsolute() &&
+          (IsMoveable &&
+           (!Parent || isTopLevelJumpTable())));
+}
+
+void BinaryData::merge(const BinaryData *Other) {
+  assert(!Size || !Other->Size || Size == Other->Size);
+  assert(Address == Other->Address);
+  assert(*Section == *Other->Section);
+  assert(OutputOffset == Other->OutputOffset);
+  assert(OutputSection == Other->OutputSection);
+  Symbols.insert(Symbols.end(), Other->Symbols.begin(), Other->Symbols.end());
+  Flags |= Other->Flags;
+  if (!Size)
+    Size = Other->Size;
+}
+
+bool BinaryData::hasName(StringRef Name) const {
+  for (const MCSymbol *Symbol : Symbols) {
+    if (Name == Symbol->getName())
+      return true;
+  }
+  return false;
+}
+
+bool BinaryData::hasNameRegex(StringRef NameRegex) const {
+  Regex MatchName(NameRegex);
+  for (const MCSymbol *Symbol : Symbols) {
+    if (MatchName.match(Symbol->getName()))
+      return true;
+  }
+  return false;
+}
+
+bool BinaryData::nameStartsWith(StringRef Prefix) const {
+  for (const MCSymbol *Symbol : Symbols) {
+    if (Symbol->getName().startswith(Prefix))
+      return true;
+  }
+  return false;
+}
+
+StringRef BinaryData::getSectionName() const {
+  return getSection().getName();
+}
+
+StringRef BinaryData::getOutputSectionName() const {
+  return getOutputSection().getName();
+}
+
+uint64_t BinaryData::getOutputAddress() const {
+  assert(OutputSection->getOutputAddress());
+  return OutputSection->getOutputAddress() + OutputOffset;
+}
+
+uint64_t BinaryData::getOffset() const {
+  return Address - getSection().getAddress();
+}
+
+void BinaryData::setSection(BinarySection &NewSection) {
+  if (OutputSection == Section)
+    OutputSection = &NewSection;
+  Section = &NewSection;
+}
+
+bool BinaryData::isMoved() const {
+  return (getOffset() != OutputOffset || OutputSection != Section);
+}
+
+void BinaryData::print(raw_ostream &OS) const {
+  printBrief(OS);
+}
+
+void BinaryData::printBrief(raw_ostream &OS) const {
+  OS << "(";
+
+  if (isJumpTable())
+    OS << "jump-table: ";
+  else
+    OS << "object: ";
+
+  OS << getName();
+
+  if ((opts::PrintSymbolAliases || opts::Verbosity > 1) && Symbols.size() > 1) {
+    OS << ", aliases:";
+    for (unsigned I = 1u; I < Symbols.size(); ++I) {
" (" : ", ") << Symbols[I]->getName(); + } + OS << ")"; + } + + if (Parent) { + OS << " (parent: "; + Parent->printBrief(OS); + OS << ")"; + } + + OS << ", 0x" << Twine::utohexstr(getAddress()) + << ":0x" << Twine::utohexstr(getEndAddress()) + << "/" << getSize() << "/" << getAlignment() + << "/0x" << Twine::utohexstr(Flags); + + OS << ")"; +} + +BinaryData::BinaryData(MCSymbol &Symbol, + uint64_t Address, + uint64_t Size, + uint16_t Alignment, + BinarySection &Section, + unsigned Flags) +: Section(&Section), + Address(Address), + Size(Size), + Alignment(Alignment), + Flags(Flags), + OutputSection(&Section), + OutputOffset(getOffset()) +{ + Symbols.push_back(&Symbol); +} diff --git a/bolt/src/BinaryData.h b/bolt/src/BinaryData.h new file mode 100644 index 000000000000..563d30fc33c6 --- /dev/null +++ b/bolt/src/BinaryData.h @@ -0,0 +1,246 @@ +//===--- BinaryData.h - Representation of section data objects -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H + +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +namespace llvm { +namespace bolt { + +class BinarySection; + +/// \p BinaryData represents an indivisible part of a data section section. +/// BinaryData's may contain sub-components, e.g. jump tables but they are +/// considered to be part of the parent symbol in terms of divisibility and +/// reordering. +class BinaryData { + friend class BinaryContext; + /// Non-null if this BinaryData is contained in a larger BinaryData object, + /// i.e. the start and end addresses are contained within another object. + BinaryData *Parent{nullptr}; + + // non-copyable + BinaryData() = delete; + BinaryData(const BinaryData &) = delete; + BinaryData &operator=(const BinaryData &) = delete; + +protected: + /// All symbols associated with this data. + std::vector Symbols; + + /// Section this data belongs to. + BinarySection *Section{nullptr}; + + /// Start address of this symbol. + uint64_t Address{0}; + /// Size of this data (can be 0). + uint64_t Size{0}; + /// Alignment of this data. + uint16_t Alignment{1}; + + bool IsMoveable{true}; + + /// Symbol flags (same as llvm::SymbolRef::Flags) + unsigned Flags{0}; + + /// Output section for this data if it has been moved from the original + /// section. + BinarySection *OutputSection{nullptr}; + + /// The offset of this symbol in the output section. This is different + /// from \p Address - Section.getAddress() when the data has been reordered. 
diff --git a/bolt/src/BinaryData.h b/bolt/src/BinaryData.h
new file mode 100644
index 000000000000..563d30fc33c6
--- /dev/null
+++ b/bolt/src/BinaryData.h
@@ -0,0 +1,246 @@
+//===--- BinaryData.h - Representation of section data objects -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+namespace llvm {
+namespace bolt {
+
+class BinarySection;
+
+/// \p BinaryData represents an indivisible part of a data section. BinaryData
+/// objects may contain sub-components, e.g. jump tables, but they are
+/// considered to be part of the parent symbol in terms of divisibility and
+/// reordering.
+class BinaryData {
+  friend class BinaryContext;
+  /// Non-null if this BinaryData is contained in a larger BinaryData object,
+  /// i.e. the start and end addresses are contained within another object.
+  BinaryData *Parent{nullptr};
+
+  // non-copyable
+  BinaryData() = delete;
+  BinaryData(const BinaryData &) = delete;
+  BinaryData &operator=(const BinaryData &) = delete;
+
+protected:
+  /// All symbols associated with this data.
+  std::vector<MCSymbol *> Symbols;
+
+  /// Section this data belongs to.
+  BinarySection *Section{nullptr};
+
+  /// Start address of this symbol.
+  uint64_t Address{0};
+  /// Size of this data (can be 0).
+  uint64_t Size{0};
+  /// Alignment of this data.
+  uint16_t Alignment{1};
+
+  bool IsMoveable{true};
+
+  /// Symbol flags (same as llvm::SymbolRef::Flags).
+  unsigned Flags{0};
+
+  /// Output section for this data if it has been moved from the original
+  /// section.
+  BinarySection *OutputSection{nullptr};
+
+  /// The offset of this symbol in the output section. This is different
+  /// from \p Address - Section.getAddress() when the data has been reordered.
+  uint64_t OutputOffset{0};
+
+  BinaryData *getRootData() {
+    BinaryData *BD = this;
+    while (BD->Parent)
+      BD = BD->Parent;
+    return BD;
+  }
+
+public:
+  BinaryData(BinaryData &&) = default;
+  BinaryData(MCSymbol &Symbol,
+             uint64_t Address,
+             uint64_t Size,
+             uint16_t Alignment,
+             BinarySection &Section,
+             unsigned Flags = 0);
+  virtual ~BinaryData() { }
+
+  virtual bool isJumpTable() const { return false; }
+  virtual bool isObject() const { return !isJumpTable(); }
+  virtual void merge(const BinaryData *Other);
+
+  bool isTopLevelJumpTable() const {
+    return (isJumpTable() &&
+            (!Parent || (!Parent->Parent && Parent->isObject())));
+  }
+
+  // BinaryData that is considered atomic and potentially moveable. All
+  // MemInfo data and relocations should be wrt. to atomic data.
+  bool isAtomic() const {
+    return isTopLevelJumpTable() || !Parent;
+  }
+
+  iterator_range<std::vector<MCSymbol *>::const_iterator> symbols() const {
+    return make_range(Symbols.begin(), Symbols.end());
+  }
+
+  StringRef getName() const { return getSymbol()->getName(); }
+
+  MCSymbol *getSymbol() { return Symbols.front(); }
+  const MCSymbol *getSymbol() const { return Symbols.front(); }
+
+  const std::vector<MCSymbol *> &getSymbols() const { return Symbols; }
+  std::vector<MCSymbol *> &getSymbols() { return Symbols; }
+
+  bool hasName(StringRef Name) const;
+  bool hasNameRegex(StringRef Name) const;
+  bool nameStartsWith(StringRef Prefix) const;
+
+  bool hasSymbol(const MCSymbol *Symbol) const {
+    return std::find(Symbols.begin(), Symbols.end(), Symbol) != Symbols.end();
+  }
+
+  bool isAbsolute() const;
+  bool isMoveable() const;
+
+  uint64_t getAddress() const { return Address; }
+  uint64_t getEndAddress() const { return Address + Size; }
+  uint64_t getOffset() const;
+  uint64_t getSize() const { return Size; }
+  uint16_t getAlignment() const { return Alignment; }
+
+  BinarySection &getSection() { return *Section; }
+  const BinarySection &getSection() const { return *Section; }
+  StringRef getSectionName() const;
+
+  BinarySection &getOutputSection() { return *OutputSection; }
+  const BinarySection &getOutputSection() const { return *OutputSection; }
+  StringRef getOutputSectionName() const;
+  uint64_t getOutputAddress() const;
+  uint64_t getOutputOffset() const { return OutputOffset; }
+  uint64_t getOutputSize() const { return Size; }
+
+  bool isMoved() const;
+  bool containsAddress(uint64_t Address) const {
+    return ((getAddress() <= Address && Address < getEndAddress()) ||
+            (getAddress() == Address && !getSize()));
+  }
+  bool containsRange(uint64_t Address, uint64_t Size) const {
+    return containsAddress(Address) && Address + Size <= getEndAddress();
+  }
+
+  const BinaryData *getParent() const {
+    return Parent;
+  }
+
+  const BinaryData *getRootData() const {
+    const BinaryData *BD = this;
+    while (BD->Parent)
+      BD = BD->Parent;
+    return BD;
+  }
+
+  BinaryData *getAtomicRoot() {
+    BinaryData *BD = this;
+    while (!BD->isAtomic() && BD->Parent)
+      BD = BD->Parent;
+    return BD;
+  }
+
+  const BinaryData *getAtomicRoot() const {
+    const BinaryData *BD = this;
+    while (!BD->isAtomic() && BD->Parent)
+      BD = BD->Parent;
+    return BD;
+  }
+
+  bool isAncestorOf(const BinaryData *BD) const {
+    return Parent && (Parent == BD || Parent->isAncestorOf(BD));
+  }
+
+  void setIsMoveable(bool Flag) { IsMoveable = Flag; }
+  void setSection(BinarySection &NewSection);
+  void setOutputSection(BinarySection &NewSection) {
+    OutputSection = &NewSection;
+  }
+  void setOutputOffset(uint64_t Offset) { OutputOffset = Offset; }
+
+  void setIsMoveable(bool Flag) { IsMoveable = Flag; }
+  void setSection(BinarySection &NewSection);
+  void setOutputSection(BinarySection &NewSection) {
+    OutputSection = &NewSection;
+  }
+  void setOutputOffset(uint64_t Offset) { OutputOffset = Offset; }
+  void setOutputLocation(BinarySection &NewSection, uint64_t NewOffset) {
+    setOutputSection(NewSection);
+    setOutputOffset(NewOffset);
+  }
+
+  virtual void printBrief(raw_ostream &OS) const;
+  virtual void print(raw_ostream &OS) const;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const BinaryData &BD) {
+  BD.printBrief(OS);
+  return OS;
+}
+
+/// Address access info used for memory profiling.
+struct AddressAccess {
+  BinaryData *MemoryObject; /// Object accessed or nullptr
+  uint64_t Offset;          /// Offset within the object or absolute address
+  uint64_t Count;           /// Number of accesses
+  bool operator==(const AddressAccess &Other) const {
+    return MemoryObject == Other.MemoryObject &&
+           Offset == Other.Offset &&
+           Count == Other.Count;
+  }
+};
+
+/// Aggregated memory access info per instruction.
+struct MemoryAccessProfile {
+  uint64_t NextInstrOffset;
+  SmallVector<AddressAccess, 8> AddressAccessInfo;
+  bool operator==(const MemoryAccessProfile &Other) const {
+    return NextInstrOffset == Other.NextInstrOffset &&
+           AddressAccessInfo == Other.AddressAccessInfo;
+  }
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const bolt::MemoryAccessProfile &MAP) {
+  std::string TempString;
+  raw_string_ostream SS(TempString);
+
+  const char *Sep = "\n    ";
+  uint64_t TotalCount = 0;
+  for (const AddressAccess &AccessInfo : MAP.AddressAccessInfo) {
+    SS << Sep << "{ ";
+    if (AccessInfo.MemoryObject)
+      SS << AccessInfo.MemoryObject->getName() << " + ";
+    SS << "0x" << Twine::utohexstr(AccessInfo.Offset) << ": "
+       << AccessInfo.Count << " }";
+    Sep = ",\n    ";
+    TotalCount += AccessInfo.Count;
+  }
+  SS.flush();
+
+  OS << TotalCount << " total counts : " << TempString;
+  return OS;
+}
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
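As a quick reference for the printer defined above, here is a self-contained use with made-up offsets and counts; a null MemoryObject stands for an absolute address:

    #include "BinaryData.h"
    using namespace llvm;
    using namespace bolt;

    void printSampleProfile(raw_ostream &OS) {
      MemoryAccessProfile MAP;
      MAP.NextInstrOffset = 0x10;
      MAP.AddressAccessInfo.push_back({/*MemoryObject=*/nullptr,
                                       /*Offset=*/0x400200, /*Count=*/40});
      MAP.AddressAccessInfo.push_back({/*MemoryObject=*/nullptr,
                                       /*Offset=*/0x400208, /*Count=*/2});
      // Prints: 42 total counts :
      //             { 0x400200: 40 },
      //             { 0x400208: 2 }
      OS << MAP;
    }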
diff --git a/bolt/src/BinaryEmitter.cpp b/bolt/src/BinaryEmitter.cpp
new file mode 100644
index 000000000000..76ef4b8159bc
--- /dev/null
+++ b/bolt/src/BinaryEmitter.cpp
@@ -0,0 +1,1027 @@
+//===--- BinaryEmitter.cpp - collection of functions to emit code and data ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryEmitter.h"
+#include "BinaryContext.h"
+#include "BinaryFunction.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/SMLoc.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory BoltCategory;
+extern cl::OptionCategory BoltOptCategory;
+extern cl::OptionCategory BoltRelocCategory;
+
+extern cl::opt<unsigned> AlignText;
+extern cl::opt<bool> HotText;
+extern cl::opt<JumpTableSupportLevel> JumpTables;
+extern cl::opt<bool> PreserveBlocksAlignment;
+extern cl::opt<bool> PrintCacheMetrics;
+extern cl::opt<bool> UpdateDebugSections;
+extern cl::opt<unsigned> Verbosity;
+
+cl::opt<bool>
+AlignBlocks("align-blocks",
+  cl::desc("align basic blocks"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<MacroFusionType>
+AlignMacroOpFusion("align-macro-fusion",
+  cl::desc("fix instruction alignment for macro-fusion (x86 relocation mode)"),
+  cl::init(MFT_HOT),
+  cl::values(clEnumValN(MFT_NONE, "none",
+               "do not insert alignment no-ops for macro-fusion"),
+             clEnumValN(MFT_HOT, "hot",
+               "only insert alignment no-ops on hot execution paths (default)"),
+             clEnumValN(MFT_ALL, "all",
+               "always align instructions to allow macro-fusion")),
+  cl::ZeroOrMore,
+  cl::cat(BoltRelocCategory));
+
+static cl::list<std::string>
+BreakFunctionNames("break-funcs",
+  cl::CommaSeparated,
+  cl::desc("list of functions to core dump on (debugging)"),
+  cl::value_desc("func1,func2,func3,..."),
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::list<std::string>
+FunctionPadSpec("pad-funcs",
+  cl::CommaSeparated,
+  cl::desc("list of functions to pad with amount of bytes"),
+  cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."),
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+MarkFuncs("mark-funcs",
+  cl::desc("mark function boundaries with break instruction to make "
+           "sure we don't accidentally cross them"),
+  cl::ReallyHidden,
+  cl::ZeroOrMore,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+PrintJumpTables("print-jump-tables",
+  cl::desc("print jump tables"),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+X86AlignBranchBoundaryHotOnly("x86-align-branch-boundary-hot-only",
+  cl::desc("only apply branch boundary alignment in hot code"),
+  cl::init(true),
+  cl::cat(BoltOptCategory));
+
+size_t padFunction(const BinaryFunction &Function) {
+  static std::map<std::string, size_t> FunctionPadding;
+
+  if (FunctionPadding.empty() && !FunctionPadSpec.empty()) {
+    for (std::string &Spec : FunctionPadSpec) {
+      size_t N = Spec.find(':');
+      if (N == std::string::npos)
+        continue;
+      std::string Name = Spec.substr(0, N);
+      size_t Padding = std::stoull(Spec.substr(N+1));
+      FunctionPadding[Name] = Padding;
+    }
+  }
+
+  for (auto &FPI : FunctionPadding) {
+    std::string Name = FPI.first;
+    size_t Padding = FPI.second;
+    if (Function.hasNameRegex(Name)) {
+      return Padding;
+    }
+  }
+
+  return 0;
+}
+
+} // namespace opts
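The `-pad-funcs` spec parsed by `padFunction()` above maps a function-name regex to a byte count, e.g. `-pad-funcs=foo:256,bar.*:128`. A standalone mirror of the parsing step (hypothetical helper name, same logic as above):

    #include <cstddef>
    #include <map>
    #include <string>
    #include <vector>

    std::map<std::string, size_t>
    parsePadSpec(const std::vector<std::string> &Specs) {
      std::map<std::string, size_t> Padding;
      for (const std::string &Spec : Specs) {
        size_t N = Spec.find(':');
        if (N == std::string::npos)
          continue;  // malformed entry: silently skipped, as above
        Padding[Spec.substr(0, N)] = std::stoull(Spec.substr(N + 1));
      }
      return Padding;  // {"foo" -> 256, "bar.*" -> 128}
    }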
+
+namespace {
+using JumpTable = bolt::JumpTable;
+
+class BinaryEmitter {
+private:
+  BinaryEmitter(const BinaryEmitter &) = delete;
+  BinaryEmitter &operator=(const BinaryEmitter &) = delete;
+
+  MCStreamer &Streamer;
+  BinaryContext &BC;
+
+public:
+  BinaryEmitter(MCStreamer &Streamer, BinaryContext &BC)
+    : Streamer(Streamer),
+      BC(BC) {}
+
+  /// Emit all code and data.
+  void emitAll(StringRef OrgSecPrefix);
+
+  /// Emit function code. The caller is responsible for emitting function
+  /// symbol(s) and setting the section to emit the code to.
+  void emitFunctionBody(BinaryFunction &BF, bool EmitColdPart,
+                        bool EmitCodeOnly = false);
+
+private:
+  /// Emit function code.
+  void emitFunctions();
+
+  /// Emit a single function.
+  bool emitFunction(BinaryFunction &BF, bool EmitColdPart);
+
+  /// Helper for emitFunctionBody to write data inside a function
+  /// (used for AArch64)
+  void emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
+                           BinaryFunction *OnBehalfOf = nullptr);
+
+  /// Emit jump tables for the function.
+  void emitJumpTables(const BinaryFunction &BF);
+
+  /// Emit jump table data. Callee supplies sections for the data.
+  void emitJumpTable(const JumpTable &JT, MCSection *HotSection,
+                     MCSection *ColdSection);
+
+  /// Emit exception handling ranges for the function.
+  void emitLSDA(BinaryFunction &BF, bool EmitColdPart);
+
+  /// Emit line number information corresponding to \p NewLoc. \p PrevLoc
+  /// provides a context for de-duplication of line number info.
+  /// \p FirstInstr indicates if \p NewLoc represents the first instruction
+  /// in a sequence, such as a function fragment.
+  ///
+  /// Return new current location which is either \p NewLoc or \p PrevLoc.
+  SMLoc emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc, SMLoc PrevLoc,
+                     bool FirstInstr);
+
+  /// Emit debug line information for functions that were not emitted.
+  void emitDebugLineInfoForOriginalFunctions();
+
+  /// Emit data sections that have code references in them.
+  void emitDataSections(StringRef OrgSecPrefix);
+};
+
+} // anonymous namespace
+
+void BinaryEmitter::emitAll(StringRef OrgSecPrefix) {
+  Streamer.InitSections(false);
+
+  if (opts::UpdateDebugSections && BC.isELF()) {
+    // Force the emission of debug line info into an allocatable section to
+    // ensure RuntimeDyld will process it without the ProcessAllSections flag.
+    //
+    // NB: on MachO all sections are required for execution, hence no need
+    // to change flags/attributes.
+    MCSectionELF *ELFDwarfLineSection =
+        static_cast<MCSectionELF *>(BC.MOFI->getDwarfLineSection());
+    ELFDwarfLineSection->setFlags(ELF::SHF_ALLOC);
+  }
+
+  if (RuntimeLibrary *RtLibrary = BC.getRuntimeLibrary()) {
+    RtLibrary->emitBinary(BC, Streamer);
+  }
+
+  BC.getTextSection()->setAlignment(Align(opts::AlignText));
+
+  emitFunctions();
+
+  if (opts::UpdateDebugSections)
+    emitDebugLineInfoForOriginalFunctions();
+
+  emitDataSections(OrgSecPrefix);
+
+  Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("_end"));
+}
+
+void BinaryEmitter::emitFunctions() {
+  auto emit = [&](const std::vector<BinaryFunction *> &Functions) {
+    const bool HasProfile = BC.NumProfiledFuncs > 0;
+    const bool OriginalAllowAutoPadding = Streamer.getAllowAutoPadding();
+    for (BinaryFunction *Function : Functions) {
+      if (!BC.shouldEmit(*Function)) {
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "BOLT: generating code for function \"" << *Function
+                        << "\" : " << Function->getFunctionNumber() << '\n');
+
+      // Was any part of the function emitted?
+      bool Emitted = false;
+
+      // Turn off Intel JCC Erratum mitigation for cold code if requested
+      if (HasProfile && opts::X86AlignBranchBoundaryHotOnly &&
+          !Function->hasValidProfile())
+        Streamer.setAllowAutoPadding(false);
+
+      Emitted |= emitFunction(*Function, /*EmitColdPart=*/false);
+
+      if (Function->isSplit()) {
+        if (opts::X86AlignBranchBoundaryHotOnly)
+          Streamer.setAllowAutoPadding(false);
+        Emitted |= emitFunction(*Function, /*EmitColdPart=*/true);
+      }
+      Streamer.setAllowAutoPadding(OriginalAllowAutoPadding);
+
+      if (Emitted)
+        Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics);
+    }
+  };
+
+  // Mark the start of hot text.
+  if (opts::HotText) {
+    Streamer.SwitchSection(BC.getTextSection());
+    Streamer.emitLabel(BC.getHotTextStartSymbol());
+  }
+
+  // Emit functions in sorted order.
+  std::vector<BinaryFunction *> SortedFunctions = BC.getSortedFunctions();
+  emit(SortedFunctions);
+
+  // Emit functions added by BOLT.
+  emit(BC.getInjectedBinaryFunctions());
+
+  // Mark the end of hot text.
+  if (opts::HotText) {
+    Streamer.SwitchSection(BC.getTextSection());
+    Streamer.emitLabel(BC.getHotTextEndSymbol());
+  }
+}
+
+bool BinaryEmitter::emitFunction(BinaryFunction &Function, bool EmitColdPart) {
+  if (Function.size() == 0)
+    return false;
+
+  if (Function.getState() == BinaryFunction::State::Empty)
+    return false;
+
+  MCSection *Section =
+      BC.getCodeSection(EmitColdPart ? Function.getColdCodeSectionName()
+                                     : Function.getCodeSectionName());
+  Streamer.SwitchSection(Section);
+  Section->setHasInstructions(true);
+  BC.Ctx->addGenDwarfSection(Section);
+
+  if (BC.HasRelocations) {
+    Streamer.emitCodeAlignment(BinaryFunction::MinAlign);
+    uint16_t MaxAlignBytes = EmitColdPart
+      ? Function.getMaxColdAlignmentBytes()
+      : Function.getMaxAlignmentBytes();
+    if (MaxAlignBytes > 0)
+      Streamer.emitCodeAlignment(Function.getAlignment(), MaxAlignBytes);
+  } else {
+    Streamer.emitCodeAlignment(Function.getAlignment());
+  }
+
+  MCContext &Context = Streamer.getContext();
+  const MCAsmInfo *MAI = Context.getAsmInfo();
+
+  MCSymbol *StartSymbol = nullptr;
+
+  // Emit all symbols associated with the main function entry.
+  if (!EmitColdPart) {
+    StartSymbol = Function.getSymbol();
+    for (MCSymbol *Symbol : Function.getSymbols()) {
+      Streamer.emitSymbolAttribute(Symbol, MCSA_ELF_TypeFunction);
+      Streamer.emitLabel(Symbol);
+    }
+  } else {
+    StartSymbol = Function.getColdSymbol();
+    Streamer.emitSymbolAttribute(StartSymbol, MCSA_ELF_TypeFunction);
+    Streamer.emitLabel(StartSymbol);
+  }
+
+  // Emit CFI start
+  if (Function.hasCFI()) {
+    Streamer.emitCFIStartProc(/*IsSimple=*/false);
+    if (Function.getPersonalityFunction() != nullptr) {
+      Streamer.emitCFIPersonality(Function.getPersonalityFunction(),
+                                  Function.getPersonalityEncoding());
+    }
+    MCSymbol *LSDASymbol =
+        EmitColdPart ? Function.getColdLSDASymbol() : Function.getLSDASymbol();
+    if (LSDASymbol) {
+      Streamer.emitCFILsda(LSDASymbol, BC.LSDAEncoding);
+    } else {
+      Streamer.emitCFILsda(0, dwarf::DW_EH_PE_omit);
+    }
+    // Emit CFI instructions relative to the CIE
+    for (const MCCFIInstruction &CFIInstr : Function.cie()) {
+      // Only write CIE CFI insns that LLVM will not already emit
+      const std::vector<MCCFIInstruction> &FrameInstrs =
+          MAI->getInitialFrameState();
+      if (std::find(FrameInstrs.begin(), FrameInstrs.end(), CFIInstr) ==
+          FrameInstrs.end())
+        Streamer.emitCFIInstruction(CFIInstr);
+    }
+  }
+
+  assert((Function.empty() || !(*Function.begin()).isCold()) &&
+         "first basic block should never be cold");
+
+  // Emit UD2 at the beginning if requested by user.
+  if (!opts::BreakFunctionNames.empty()) {
+    for (std::string &Name : opts::BreakFunctionNames) {
+      if (Function.hasNameRegex(Name)) {
+        Streamer.emitIntValue(0x0B0F, 2); // UD2: 0F 0B
+        break;
+      }
+    }
+  }
+
+  // Emit code.
+  emitFunctionBody(Function, EmitColdPart, /*EmitCodeOnly=*/false);
+
+  // Emit padding if requested.
+  if (size_t Padding = opts::padFunction(Function)) {
+    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: padding function " << Function
+                      << " with " << Padding << " bytes\n");
+    Streamer.emitFill(Padding, MAI->getTextAlignFillValue());
+  }
+
+  if (opts::MarkFuncs) {
+    Streamer.emitIntValue(MAI->getTrapFillValue(), 1);
+  }
+
+  // Emit CFI end
+  if (Function.hasCFI())
+    Streamer.emitCFIEndProc();
+
+  MCSymbol *EndSymbol = EmitColdPart ? Function.getFunctionColdEndLabel()
+                                     : Function.getFunctionEndLabel();
+  Streamer.emitLabel(EndSymbol);
+
+  if (MAI->hasDotTypeDotSizeDirective()) {
+    const MCExpr *SizeExpr = MCBinaryExpr::createSub(
+        MCSymbolRefExpr::create(EndSymbol, Context),
+        MCSymbolRefExpr::create(StartSymbol, Context), Context);
+    Streamer.emitELFSize(StartSymbol, SizeExpr);
+  }
+
+  // Exception handling info for the function.
+  emitLSDA(Function, EmitColdPart);
+
+  if (!EmitColdPart && opts::JumpTables > JTS_NONE)
+    emitJumpTables(Function);
+
+  return true;
+}
+
+void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, bool EmitColdPart,
+                                     bool EmitCodeOnly) {
+  if (!EmitCodeOnly && EmitColdPart && BF.hasConstantIsland())
+    BF.duplicateConstantIslands();
+
+  // Track the first emitted instruction with debug info.
+  bool FirstInstr = true;
+  for (BinaryBasicBlock *BB : BF.layout()) {
+    if (EmitColdPart != BB->isCold())
+      continue;
+
+    if ((opts::AlignBlocks || opts::PreserveBlocksAlignment)
+        && BB->getAlignment() > 1) {
+      Streamer.emitCodeAlignment(BB->getAlignment(),
+                                 BB->getAlignmentMaxBytes());
+    }
+    Streamer.emitLabel(BB->getLabel());
+    if (!EmitCodeOnly) {
+      if (MCSymbol *EntrySymbol = BF.getSecondaryEntryPointSymbol(*BB)) {
+        Streamer.emitLabel(EntrySymbol);
+      }
+    }
+
+    // Check if special alignment for macro-fusion is needed.
+    bool MayNeedMacroFusionAlignment =
+      (opts::AlignMacroOpFusion == MFT_ALL) ||
+      (opts::AlignMacroOpFusion == MFT_HOT &&
+       BB->getKnownExecutionCount());
+    BinaryBasicBlock::const_iterator MacroFusionPair;
+    if (MayNeedMacroFusionAlignment) {
+      MacroFusionPair = BB->getMacroOpFusionPair();
+      if (MacroFusionPair == BB->end())
+        MayNeedMacroFusionAlignment = false;
+    }
+
+    SMLoc LastLocSeen;
+    // Remember if the last instruction emitted was a prefix.
+    bool LastIsPrefix = false;
+    for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+      MCInst &Instr = *I;
+
+      if (EmitCodeOnly && BC.MIB->isPseudo(Instr))
+        continue;
+
+      // Handle pseudo instructions.
+      if (BC.MIB->isEHLabel(Instr)) {
+        const MCSymbol *Label = BC.MIB->getTargetSymbol(Instr);
+        assert(Instr.getNumOperands() >= 1 && Label &&
+               "bad EH_LABEL instruction");
+        Streamer.emitLabel(const_cast<MCSymbol *>(Label));
+        continue;
+      }
+      if (BC.MIB->isCFI(Instr)) {
+        Streamer.emitCFIInstruction(*BF.getCFIFor(Instr));
+        continue;
+      }
+
+      // Handle macro-fusion alignment. If we emitted a prefix as
+      // the last instruction, we should've already emitted the associated
+      // alignment hint, so don't emit it twice.
+      if (MayNeedMacroFusionAlignment && !LastIsPrefix &&
+          I == MacroFusionPair) {
+        // This assumes the second instruction in the macro-op pair will get
+        // assigned to its own MCRelaxableFragment. Since all JCC instructions
+        // are relaxable, we should be safe.
+        Streamer.emitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64);
+      }
+
+      if (!EmitCodeOnly && opts::UpdateDebugSections && BF.getDWARFUnit()) {
+        LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen,
+                                   FirstInstr);
+        FirstInstr = false;
+      }
+
+      // Prepare to tag this location with a label if we need to keep track of
+      // the location of calls/returns for BOLT address translation maps
+      if (!EmitCodeOnly && BF.requiresAddressTranslation() &&
+          BC.MIB->hasAnnotation(Instr, "Offset")) {
+        const auto Offset = BC.MIB->getAnnotationAs<uint32_t>(Instr, "Offset");
+        MCSymbol *LocSym = BC.Ctx->createTempSymbol();
+        Streamer.emitLabel(LocSym);
+        BB->getLocSyms().emplace_back(Offset, LocSym);
+      }
+
+      Streamer.emitInstruction(Instr, *BC.STI);
+      LastIsPrefix = BC.MIB->isPrefix(Instr);
+    }
+  }
+
+  if (!EmitCodeOnly)
+    emitConstantIslands(BF, EmitColdPart);
+}
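The macro-fusion alignment handled above exists because an x86 cmp/test+jcc pair only macro-fuses when both instructions fall within the same 64-byte window; `emitNeverAlignCodeAtEnd(64)` asks the assembler to pad so the pair is not split across that boundary. A simplified model of the constraint (my formulation, not BOLT code):

    #include <cstdint>

    // True if a macro-op pair starting at PairStart with total size PairSize
    // stays within a single 64-byte window and therefore remains fusible.
    bool pairStaysFusible(uint64_t PairStart, uint64_t PairSize) {
      uint64_t LastByte = PairStart + PairSize - 1;
      return (PairStart / 64) == (LastByte / 64);
    }
    // e.g. pairStaysFusible(0x3e, 5) == false: the pair crosses offset 0x40.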
+
+void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
+                                        BinaryFunction *OnBehalfOf) {
+  BinaryFunction::IslandInfo &Islands = BF.getIslandInfo();
+  if (Islands.DataOffsets.empty() && Islands.Dependency.empty())
+    return;
+
+  if (!OnBehalfOf) {
+    if (!EmitColdPart)
+      Streamer.emitLabel(BF.getFunctionConstantIslandLabel());
+    else
+      Streamer.emitLabel(BF.getFunctionColdConstantIslandLabel());
+  }
+
+  assert((!OnBehalfOf || Islands.Proxies[OnBehalfOf].size() > 0) &&
+         "spurious OnBehalfOf constant island emission");
+
+  assert(!BF.isInjected() &&
+         "injected functions should not have constant islands");
+
+  // Raw contents of the section containing the function.
+  StringRef SectionContents = BF.getOriginSection()->getContents();
+
+  // Raw contents of the function.
+  StringRef FunctionContents =
+      SectionContents.substr(
+          BF.getAddress() - BF.getOriginSection()->getAddress(),
+          BF.getMaxSize());
+
+  if (opts::Verbosity && !OnBehalfOf)
+    outs() << "BOLT-INFO: emitting constant island for function " << BF
+           << "\n";
+
+  // We split the island into smaller blocks and output labels between them.
+  auto IS = Islands.Offsets.begin();
+  for (auto DataIter = Islands.DataOffsets.begin();
+       DataIter != Islands.DataOffsets.end();
+       ++DataIter) {
+    uint64_t FunctionOffset = *DataIter;
+    uint64_t EndOffset = 0ULL;
+
+    // Determine size of this data chunk
+    auto NextData = std::next(DataIter);
+    auto CodeIter = Islands.CodeOffsets.lower_bound(*DataIter);
+    if (CodeIter == Islands.CodeOffsets.end() &&
+        NextData == Islands.DataOffsets.end()) {
+      EndOffset = BF.getMaxSize();
+    } else if (CodeIter == Islands.CodeOffsets.end()) {
+      EndOffset = *NextData;
+    } else if (NextData == Islands.DataOffsets.end()) {
+      EndOffset = *CodeIter;
+    } else {
+      EndOffset = (*CodeIter > *NextData) ? *NextData : *CodeIter;
+    }
+
+    if (FunctionOffset == EndOffset)
+      continue; // Size is zero, nothing to emit
+
+    // Emit labels, relocs and data
+    while (IS != Islands.Offsets.end() && IS->first < EndOffset) {
+      auto NextLabelOffset =
+          IS == Islands.Offsets.end() ? EndOffset : IS->first;
+      auto NextRelOffset = EndOffset;
+      auto NextStop = std::min(NextLabelOffset, NextRelOffset);
+      assert(NextStop <= EndOffset && "internal overflow error");
+      if (FunctionOffset < NextStop) {
+        Streamer.emitBytes(FunctionContents.slice(FunctionOffset, NextStop));
+        FunctionOffset = NextStop;
+      }
+      if (IS != Islands.Offsets.end() && FunctionOffset == IS->first) {
+        // This is slightly complex code to decide which label to emit. We
+        // have 4 cases to handle: regular symbol, cold symbol, and a regular
+        // or cold symbol being emitted on behalf of an external function.
+        if (!OnBehalfOf) {
+          if (!EmitColdPart) {
+            LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label "
+                              << IS->second->getName() << " at offset 0x"
+                              << Twine::utohexstr(IS->first) << '\n');
+            if (IS->second->isUndefined())
+              Streamer.emitLabel(IS->second);
+            else
+              assert(BF.hasName(std::string(IS->second->getName())));
+          } else if (Islands.ColdSymbols.count(IS->second) != 0) {
+            LLVM_DEBUG(dbgs()
+                       << "BOLT-DEBUG: emitted label "
+                       << Islands.ColdSymbols[IS->second]->getName() << '\n');
+            if (Islands.ColdSymbols[IS->second]->isUndefined())
+              Streamer.emitLabel(Islands.ColdSymbols[IS->second]);
+          }
+        } else {
+          if (!EmitColdPart) {
+            if (MCSymbol *Sym = Islands.Proxies[OnBehalfOf][IS->second]) {
+              LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label "
+                                << Sym->getName() << '\n');
+              Streamer.emitLabel(Sym);
+            }
+          } else if (MCSymbol *Sym =
+                         Islands.ColdProxies[OnBehalfOf][IS->second]) {
+            LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << Sym->getName()
+                              << '\n');
+            Streamer.emitLabel(Sym);
+          }
+        }
+        ++IS;
+      }
+    }
+    assert(FunctionOffset <= EndOffset && "overflow error");
+    if (FunctionOffset < EndOffset) {
+      Streamer.emitBytes(FunctionContents.slice(FunctionOffset, EndOffset));
+    }
+  }
+  assert(IS == Islands.Offsets.end() && "some symbols were not emitted!");
+
+  if (OnBehalfOf)
+    return;
+  // Now emit constant islands from other functions that we may have used in
+  // this function.
+  for (BinaryFunction *ExternalFunc : Islands.Dependency) {
+    emitConstantIslands(*ExternalFunc, EmitColdPart, &BF);
+  }
+}
+
+SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc,
+                                  SMLoc PrevLoc, bool FirstInstr) {
+  DWARFUnit *FunctionCU = BF.getDWARFUnit();
+  const DWARFDebugLine::LineTable *FunctionLineTable = BF.getDWARFLineTable();
+  assert(FunctionCU && "cannot emit line info for function without CU");
+
+  DebugLineTableRowRef RowReference = DebugLineTableRowRef::fromSMLoc(NewLoc);
+
+  // Check if no new line info needs to be emitted.
+  if (RowReference == DebugLineTableRowRef::NULL_ROW ||
+      NewLoc.getPointer() == PrevLoc.getPointer())
+    return PrevLoc;
+
+  unsigned CurrentFilenum = 0;
+  const DWARFDebugLine::LineTable *CurrentLineTable = FunctionLineTable;
+
+  // If the CU id from the current instruction location does not
+  // match the CU id from the current function, it means that we
+  // have come across some inlined code. We must look up the CU
+  // for the instruction's original function and get the line table
+  // from that.
+  const uint64_t FunctionUnitIndex = FunctionCU->getOffset();
+  const uint32_t CurrentUnitIndex = RowReference.DwCompileUnitIndex;
+  if (CurrentUnitIndex != FunctionUnitIndex) {
+    CurrentLineTable = BC.DwCtx->getLineTableForUnit(
+        BC.DwCtx->getCompileUnitForOffset(CurrentUnitIndex));
+    // Add filename from the inlined function to the current CU.
+    CurrentFilenum =
+        BC.addDebugFilenameToUnit(FunctionUnitIndex, CurrentUnitIndex,
+            CurrentLineTable->Rows[RowReference.RowIndex - 1].File);
+  }
+
+  const DWARFDebugLine::Row &CurrentRow =
+      CurrentLineTable->Rows[RowReference.RowIndex - 1];
+  if (!CurrentFilenum)
+    CurrentFilenum = CurrentRow.File;
+
+  unsigned Flags = (DWARF2_FLAG_IS_STMT * CurrentRow.IsStmt) |
+                   (DWARF2_FLAG_BASIC_BLOCK * CurrentRow.BasicBlock) |
+                   (DWARF2_FLAG_PROLOGUE_END * CurrentRow.PrologueEnd) |
+                   (DWARF2_FLAG_EPILOGUE_BEGIN * CurrentRow.EpilogueBegin);
+
+  // Always emit is_stmt at the beginning of a function fragment.
+  if (FirstInstr)
+    Flags |= DWARF2_FLAG_IS_STMT;
+
+  BC.Ctx->setCurrentDwarfLoc(
+      CurrentFilenum,
+      CurrentRow.Line,
+      CurrentRow.Column,
+      Flags,
+      CurrentRow.Isa,
+      CurrentRow.Discriminator);
+  BC.Ctx->setDwarfCompileUnitID(FunctionUnitIndex);
+
+  return NewLoc;
+}
+
+void BinaryEmitter::emitJumpTables(const BinaryFunction &BF) {
+  MCSection *ReadOnlySection = BC.MOFI->getReadOnlySection();
+  MCSection *ReadOnlyColdSection = BC.MOFI->getContext().getELFSection(
+      ".rodata.cold", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+  if (!BF.hasJumpTables())
+    return;
+
+  if (opts::PrintJumpTables) {
+    outs() << "BOLT-INFO: jump tables for function " << BF << ":\n";
+  }
+
+  for (auto &JTI : BF.jumpTables()) {
+    JumpTable &JT = *JTI.second;
+    if (opts::PrintJumpTables)
+      JT.print(outs());
+    if ((opts::JumpTables == JTS_BASIC || !BF.isSimple()) &&
+        BC.HasRelocations) {
+      JT.updateOriginal();
+    } else {
+      MCSection *HotSection, *ColdSection;
+      if (opts::JumpTables == JTS_BASIC) {
+        // In non-relocation mode we have to emit jump tables in local
+        // sections. This way we only overwrite them when the corresponding
+        // function is overwritten.
+        std::string Name = ".local." + JT.Labels[0]->getName().str();
+        std::replace(Name.begin(), Name.end(), '/', '.');
+        BinarySection &Section =
+            BC.registerOrUpdateSection(Name, ELF::SHT_PROGBITS,
+                                       ELF::SHF_ALLOC);
+        Section.setAnonymous(true);
+        JT.setOutputSection(Section);
+        HotSection = BC.getDataSection(Name);
+        ColdSection = HotSection;
+      } else {
+        if (BF.isSimple()) {
+          HotSection = ReadOnlySection;
+          ColdSection = ReadOnlyColdSection;
+        } else {
+          HotSection = BF.hasProfile() ? ReadOnlySection
+                                       : ReadOnlyColdSection;
+          ColdSection = HotSection;
+        }
+      }
+      emitJumpTable(JT, HotSection, ColdSection);
+    }
+  }
+}
+
+void BinaryEmitter::emitJumpTable(const JumpTable &JT, MCSection *HotSection,
+                                  MCSection *ColdSection) {
+  // Pre-process entries for aggressive splitting.
+  // Each label represents a separate switch table and gets its own count
+  // determining its destination.
+  std::map<MCSymbol *, uint64_t> LabelCounts;
+  if (opts::JumpTables > JTS_SPLIT && !JT.Counts.empty()) {
+    MCSymbol *CurrentLabel = JT.Labels.at(0);
+    uint64_t CurrentLabelCount = 0;
+    for (unsigned Index = 0; Index < JT.Entries.size(); ++Index) {
+      auto LI = JT.Labels.find(Index * JT.EntrySize);
+      if (LI != JT.Labels.end()) {
+        LabelCounts[CurrentLabel] = CurrentLabelCount;
+        CurrentLabel = LI->second;
+        CurrentLabelCount = 0;
+      }
+      CurrentLabelCount += JT.Counts[Index].Count;
+    }
+    LabelCounts[CurrentLabel] = CurrentLabelCount;
+  } else {
+    Streamer.SwitchSection(JT.Count > 0 ? HotSection : ColdSection);
+    Streamer.emitValueToAlignment(JT.EntrySize);
+  }
+  MCSymbol *LastLabel = nullptr;
+  uint64_t Offset = 0;
"as part of larger jump table\n" : "\n")); + if (!LabelCounts.empty()) { + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: jump table count: " + << LabelCounts[LI->second] << '\n'); + if (LabelCounts[LI->second] > 0) { + Streamer.SwitchSection(HotSection); + } else { + Streamer.SwitchSection(ColdSection); + } + Streamer.emitValueToAlignment(JT.EntrySize); + } + Streamer.emitLabel(LI->second); + LastLabel = LI->second; + } + if (JT.Type == JumpTable::JTT_NORMAL) { + Streamer.emitSymbolValue(Entry, JT.OutputEntrySize); + } else { // JTT_PIC + const MCSymbolRefExpr *JTExpr = + MCSymbolRefExpr::create(LastLabel, Streamer.getContext()); + const MCSymbolRefExpr *E = + MCSymbolRefExpr::create(Entry, Streamer.getContext()); + const MCBinaryExpr *Value = + MCBinaryExpr::createSub(E, JTExpr, Streamer.getContext()); + Streamer.emitValue(Value, JT.EntrySize); + } + Offset += JT.EntrySize; + } +} + +// The code is based on EHStreamer::emitExceptionTable(). +void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) { + const std::vector *Sites = + EmitColdPart ? &BF.getColdCallSites() : &BF.getCallSites(); + if (Sites->empty()) { + return; + } + + // Calculate callsite table size. Size of each callsite entry is: + // + // sizeof(start) + sizeof(length) + sizeof(LP) + sizeof(uleb128(action)) + // + // or + // + // sizeof(dwarf::DW_EH_PE_data4) * 3 + sizeof(uleb128(action)) + uint64_t CallSiteTableLength = Sites->size() * 4 * 3; + for (const BinaryFunction::CallSite &CallSite : *Sites) { + CallSiteTableLength += getULEB128Size(CallSite.Action); + } + + Streamer.SwitchSection(BC.MOFI->getLSDASection()); + + const unsigned TTypeEncoding = BC.TTypeEncoding; + const unsigned TTypeEncodingSize = BC.getDWARFEncodingSize(TTypeEncoding); + const uint16_t TTypeAlignment = 4; + + // Type tables have to be aligned at 4 bytes. + Streamer.emitValueToAlignment(TTypeAlignment); + + // Emit the LSDA label. + MCSymbol *LSDASymbol = + EmitColdPart ? BF.getColdLSDASymbol() : BF.getLSDASymbol(); + assert(LSDASymbol && "no LSDA symbol set"); + Streamer.emitLabel(LSDASymbol); + + // Corresponding FDE start. + const MCSymbol *StartSymbol = EmitColdPart ? BF.getColdSymbol() + : BF.getSymbol(); + + // Emit the LSDA header. + + // If LPStart is omitted, then the start of the FDE is used as a base for + // landing pad displacements. Then if a cold fragment starts with + // a landing pad, this means that the first landing pad offset will be 0. + // As a result, the exception handling runtime will ignore this landing pad + // because zero offset denotes the absence of a landing pad. + // For this reason, when the binary has fixed starting address we emit LPStart + // as 0 and output the absolute value of the landing pad in the table. + // + // If the base address can change, we cannot use absolute addresses for + // landing pads (at least not without runtime relocations). Hence, we fall + // back to emitting landing pads relative to the FDE start. + // As we are emitting label differences, we have to guarantee both labels are + // defined in the same section and hence cannot place the landing pad into a + // cold fragment when the corresponding call site is in the hot fragment. + // Because of this issue and the previously described issue of possible + // zero-offset landing pad we disable splitting of exception-handling + // code for shared objects. 
+
+// The code is based on EHStreamer::emitExceptionTable().
+void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) {
+  const std::vector<BinaryFunction::CallSite> *Sites =
+      EmitColdPart ? &BF.getColdCallSites() : &BF.getCallSites();
+  if (Sites->empty()) {
+    return;
+  }
+
+  // Calculate callsite table size. Size of each callsite entry is:
+  //
+  //  sizeof(start) + sizeof(length) + sizeof(LP) + sizeof(uleb128(action))
+  //
+  // or
+  //
+  //  sizeof(dwarf::DW_EH_PE_data4) * 3 + sizeof(uleb128(action))
+  uint64_t CallSiteTableLength = Sites->size() * 4 * 3;
+  for (const BinaryFunction::CallSite &CallSite : *Sites) {
+    CallSiteTableLength += getULEB128Size(CallSite.Action);
+  }
+
+  Streamer.SwitchSection(BC.MOFI->getLSDASection());
+
+  const unsigned TTypeEncoding = BC.TTypeEncoding;
+  const unsigned TTypeEncodingSize = BC.getDWARFEncodingSize(TTypeEncoding);
+  const uint16_t TTypeAlignment = 4;
+
+  // Type tables have to be aligned at 4 bytes.
+  Streamer.emitValueToAlignment(TTypeAlignment);
+
+  // Emit the LSDA label.
+  MCSymbol *LSDASymbol =
+      EmitColdPart ? BF.getColdLSDASymbol() : BF.getLSDASymbol();
+  assert(LSDASymbol && "no LSDA symbol set");
+  Streamer.emitLabel(LSDASymbol);
+
+  // Corresponding FDE start.
+  const MCSymbol *StartSymbol = EmitColdPart ? BF.getColdSymbol()
+                                             : BF.getSymbol();
+
+  // Emit the LSDA header.
+
+  // If LPStart is omitted, then the start of the FDE is used as a base for
+  // landing pad displacements. Then if a cold fragment starts with
+  // a landing pad, this means that the first landing pad offset will be 0.
+  // As a result, the exception handling runtime will ignore this landing pad
+  // because a zero offset denotes the absence of a landing pad.
+  // For this reason, when the binary has a fixed starting address we emit
+  // LPStart as 0 and output the absolute value of the landing pad in the
+  // table.
+  //
+  // If the base address can change, we cannot use absolute addresses for
+  // landing pads (at least not without runtime relocations). Hence, we fall
+  // back to emitting landing pads relative to the FDE start.
+  // As we are emitting label differences, we have to guarantee both labels
+  // are defined in the same section and hence cannot place the landing pad
+  // into a cold fragment when the corresponding call site is in the hot
+  // fragment. Because of this issue and the previously described issue of a
+  // possible zero-offset landing pad, we disable splitting of
+  // exception-handling code for shared objects.
+  std::function<void(const MCSymbol *)> emitLandingPad;
+  if (BC.HasFixedLoadAddress) {
+    Streamer.emitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
+    Streamer.emitIntValue(0, 4);                      // LPStart
+    emitLandingPad = [&](const MCSymbol *LPSymbol) {
+      if (!LPSymbol)
+        Streamer.emitIntValue(0, 4);
+      else
+        Streamer.emitSymbolValue(LPSymbol, 4);
+    };
+  } else {
+    assert(!EmitColdPart &&
+           "cannot have exceptions in cold fragment for shared object");
+    Streamer.emitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format
+    emitLandingPad = [&](const MCSymbol *LPSymbol) {
+      if (!LPSymbol)
+        Streamer.emitIntValue(0, 4);
+      else
+        Streamer.emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4);
+    };
+  }
+
+  Streamer.emitIntValue(TTypeEncoding, 1); // TType format
+
+  // See the comment in EHStreamer::emitExceptionTable() on why we use
+  // uleb128 encoding (which can use a variable number of bytes to encode
+  // the same value) to ensure the type info table is properly aligned at
+  // 4 bytes without iteratively fixing sizes of the tables.
+  unsigned CallSiteTableLengthSize = getULEB128Size(CallSiteTableLength);
+  unsigned TTypeBaseOffset =
+      sizeof(int8_t) +                      // Call site format
+      CallSiteTableLengthSize +             // Call site table length size
+      CallSiteTableLength +                 // Call site table length
+      BF.getLSDAActionTable().size() +      // Actions table size
+      BF.getLSDATypeTable().size() * TTypeEncodingSize; // Types table size
+  unsigned TTypeBaseOffsetSize = getULEB128Size(TTypeBaseOffset);
+  unsigned TotalSize =
+      sizeof(int8_t) +                      // LPStart format
+      sizeof(int8_t) +                      // TType format
+      TTypeBaseOffsetSize +                 // TType base offset size
+      TTypeBaseOffset;                      // TType base offset
+  unsigned SizeAlign = (4 - TotalSize) & 3;
+
+  // Account for any extra padding that will be added to the call site table
+  // length.
+  Streamer.emitULEB128IntValue(TTypeBaseOffset,
+                               /*PadTo=*/TTypeBaseOffsetSize + SizeAlign);
+
+  // Emit the landing pad call site table. We use signed data4 since we can
+  // emit a landing pad in a different part of the split function that could
+  // appear earlier in the address space than LPStart.
+  Streamer.emitIntValue(dwarf::DW_EH_PE_sdata4, 1);
+  Streamer.emitULEB128IntValue(CallSiteTableLength);
+
+  for (const BinaryFunction::CallSite &CallSite : *Sites) {
+    const MCSymbol *BeginLabel = CallSite.Start;
+    const MCSymbol *EndLabel = CallSite.End;
+
+    assert(BeginLabel && "start EH label expected");
+    assert(EndLabel && "end EH label expected");
+
+    // Start of the range is emitted relative to the start of current
+    // function split part.
+    Streamer.emitAbsoluteSymbolDiff(BeginLabel, StartSymbol, 4);
+    Streamer.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4);
+    emitLandingPad(CallSite.LP);
+    Streamer.emitULEB128IntValue(CallSite.Action);
+  }
+
+  // Write out action, type, and type index tables at the end.
+  //
+  // For action and type index tables there's no need to change the original
+  // table format unless we are doing function splitting, in which case we
+  // can split and optimize the tables.
+  //
+  // For the type table we (re-)encode the table using TTypeEncoding matching
+  // the current assembler mode.
+  for (uint8_t const &Byte : BF.getLSDAActionTable()) {
+    Streamer.emitIntValue(Byte, 1);
+  }
+
+  const BinaryFunction::LSDATypeTableTy &TypeTable =
+      (TTypeEncoding & dwarf::DW_EH_PE_indirect) ? BF.getLSDATypeAddressTable()
+                                                 : BF.getLSDATypeTable();
+  assert(TypeTable.size() == BF.getLSDATypeTable().size() &&
+         "indirect type table size mismatch");
+
+  for (int Index = TypeTable.size() - 1; Index >= 0; --Index) {
+    const uint64_t TypeAddress = TypeTable[Index];
+    switch (TTypeEncoding & 0x70) {
+    default:
+      llvm_unreachable("unsupported TTypeEncoding");
+    case dwarf::DW_EH_PE_absptr:
+      Streamer.emitIntValue(TypeAddress, TTypeEncodingSize);
+      break;
+    case dwarf::DW_EH_PE_pcrel: {
+      if (TypeAddress) {
+        const MCSymbol *TypeSymbol =
+            BC.getOrCreateGlobalSymbol(TypeAddress, "TI", 0, TTypeAlignment);
+        MCSymbol *DotSymbol = BC.Ctx->createNamedTempSymbol();
+        Streamer.emitLabel(DotSymbol);
+        const MCBinaryExpr *SubDotExpr = MCBinaryExpr::createSub(
+            MCSymbolRefExpr::create(TypeSymbol, *BC.Ctx),
+            MCSymbolRefExpr::create(DotSymbol, *BC.Ctx), *BC.Ctx);
+        Streamer.emitValue(SubDotExpr, TTypeEncodingSize);
+      } else {
+        Streamer.emitIntValue(0, TTypeEncodingSize);
+      }
+      break;
+    }
+    }
+  }
+  for (uint8_t const &Byte : BF.getLSDATypeIndexTable()) {
+    Streamer.emitIntValue(Byte, 1);
+  }
+}
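A worked instance of the size bookkeeping above, with made-up numbers: three call sites with actions {0, 1, 3} give a call-site table of 3 * 12 + 3 = 39 bytes, since each action value below 128 encodes as a single ULEB128 byte. The helper mirrors `llvm::getULEB128Size()`:

    #include <cstdint>
    #include <vector>

    static unsigned ulebSize(uint64_t Value) {   // mirrors llvm::getULEB128Size
      unsigned Size = 0;
      do {
        Value >>= 7;
        ++Size;
      } while (Value);
      return Size;
    }

    uint64_t callSiteTableLength(const std::vector<uint64_t> &Actions) {
      uint64_t Length = Actions.size() * 4 * 3;  // start/length/LP: 4 bytes each
      for (uint64_t Action : Actions)
        Length += ulebSize(Action);
      return Length;                             // {0, 1, 3} -> 36 + 3 = 39
    }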
+
+void BinaryEmitter::emitDebugLineInfoForOriginalFunctions() {
+  for (auto &It : BC.getBinaryFunctions()) {
+    const BinaryFunction &Function = It.second;
+
+    // If the function was emitted, its line info was emitted with it.
+    if (Function.isEmitted())
+      continue;
+
+    DWARFUnit *Unit = Function.getDWARFUnit();
+    const DWARFDebugLine::LineTable *LineTable = Function.getDWARFLineTable();
+
+    if (!LineTable)
+      continue; // nothing to update for this function
+
+    std::vector<uint32_t> Results;
+    MCSection *FunctionSection =
+        BC.getCodeSection(Function.getCodeSectionName());
+
+    uint64_t Address = It.first;
+    if (LineTable->lookupAddressRange({Address, 0}, Function.getMaxSize(),
+                                      Results)) {
+      MCLineSection &OutputLineTable =
+          BC.Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections();
+      for (uint32_t RowIndex : Results) {
+        const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
+        BC.Ctx->setCurrentDwarfLoc(
+            Row.File,
+            Row.Line,
+            Row.Column,
+            (DWARF2_FLAG_IS_STMT * Row.IsStmt) |
+            (DWARF2_FLAG_BASIC_BLOCK * Row.BasicBlock) |
+            (DWARF2_FLAG_PROLOGUE_END * Row.PrologueEnd) |
+            (DWARF2_FLAG_EPILOGUE_BEGIN * Row.EpilogueBegin),
+            Row.Isa,
+            Row.Discriminator,
+            Row.Address.Address);
+        MCDwarfLoc Loc = BC.Ctx->getCurrentDwarfLoc();
+        BC.Ctx->clearDwarfLocSeen();
+        OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
+                                     FunctionSection);
+      }
+      // Add an empty entry past the end of the function
+      // for the end_sequence mark.
+      BC.Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0,
+                                 Address + Function.getMaxSize());
+      MCDwarfLoc Loc = BC.Ctx->getCurrentDwarfLoc();
+      BC.Ctx->clearDwarfLocSeen();
+      OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
+                                   FunctionSection);
+    } else {
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: function " << Function
+                        << " has no associated line number information\n");
+    }
+  }
+}
+
+void BinaryEmitter::emitDataSections(StringRef OrgSecPrefix) {
+  for (BinarySection &Section : BC.sections()) {
+    if (!Section.hasRelocations() || !Section.hasSectionRef())
+      continue;
+
+    StringRef SectionName = Section.getName();
+    std::string EmitName = Section.isReordered()
+      ? std::string(Section.getOutputName())
+      : OrgSecPrefix.str() + std::string(SectionName);
+    Section.emitAsData(Streamer, EmitName);
+    Section.clearRelocations();
+  }
+}
+
+namespace llvm {
+namespace bolt {
+
+void emitBinaryContext(MCStreamer &Streamer, BinaryContext &BC,
+                       StringRef OrgSecPrefix) {
+  BinaryEmitter(Streamer, BC).emitAll(OrgSecPrefix);
+}
+
+void emitFunctionBody(MCStreamer &Streamer, BinaryFunction &BF,
+                      bool EmitColdPart, bool EmitCodeOnly) {
+  BinaryEmitter(Streamer, BF.getBinaryContext()).
+      emitFunctionBody(BF, EmitColdPart, EmitCodeOnly);
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/BinaryEmitter.h b/bolt/src/BinaryEmitter.h
new file mode 100644
index 000000000000..d6bc6077e764
--- /dev/null
+++ b/bolt/src/BinaryEmitter.h
@@ -0,0 +1,39 @@
+//===--- BinaryEmitter.h - collection of functions to emit code and data --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_EMITTER_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_EMITTER_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+class MCStreamer;
+
+namespace bolt {
+class BinaryContext;
+class BinaryFunction;
+
+/// Emit all code and data in the BinaryContext \p BC.
+///
+/// \p OrgSecPrefix is used to modify name of emitted original sections
+/// contained in \p BC. This is done to distinguish them from sections emitted
+/// by LLVM backend.
+void emitBinaryContext(MCStreamer &Streamer, BinaryContext &BC,
+                       StringRef OrgSecPrefix = "");
+
+/// Emit \p BF function code. The caller is responsible for emitting function
+/// symbol(s) and setting the section to emit the code to.
+void emitFunctionBody(MCStreamer &Streamer, BinaryFunction &BF,
+                      bool EmitColdPart, bool EmitCodeOnly = false);
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
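A hypothetical driver-side call into the interface above; the `.bolt.org` prefix is illustrative of how original input sections can be renamed so they do not clash with sections produced by the LLVM backend:

    #include "BinaryEmitter.h"

    void emitEverything(llvm::MCStreamer &Streamer,
                        llvm::bolt::BinaryContext &BC) {
      // Original (input) sections are re-emitted under ".bolt.org<name>".
      llvm::bolt::emitBinaryContext(Streamer, BC, /*OrgSecPrefix=*/".bolt.org");
    }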
diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp
new file mode 100644
index 000000000000..4eea2a247905
--- /dev/null
+++ b/bolt/src/BinaryFunction.cpp
@@ -0,0 +1,4474 @@
+//===--- BinaryFunction.cpp - Interface for machine-level function --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryFunction.h"
+#include "BinaryBasicBlock.h"
+#include "DynoStats.h"
+#include "MCPlusBuilder.h"
+#include "NameResolver.h"
+#include "NameShortener.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/edit_distance.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cxxabi.h>
+#include <functional>
+#include <numeric>
+#include <stack>
+#include <string>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory BoltCategory;
+extern cl::OptionCategory BoltOptCategory;
+extern cl::OptionCategory BoltRelocCategory;
+
+extern cl::opt<bool> EnableBAT;
+extern cl::opt<bool> Instrument;
+extern cl::opt<bool> StrictMode;
+extern cl::opt<bool> UpdateDebugSections;
+extern cl::opt<unsigned> Verbosity;
+
+extern bool processAllFunctions();
+
+cl::opt<bool>
+CheckEncoding("check-encoding",
+  cl::desc("perform verification of LLVM instruction encoding/decoding. "
+           "Every instruction in the input is decoded and re-encoded. "
+           "If the resulting bytes do not match the input, a warning message "
+           "is printed."),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+DotToolTipCode("dot-tooltip-code",
+  cl::desc("add basic block instructions as tool tips on nodes"),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+cl::opt<JumpTableSupportLevel>
+JumpTables("jump-tables",
+  cl::desc("jump tables support (default=basic)"),
+  cl::init(JTS_BASIC),
+  cl::values(
+      clEnumValN(JTS_NONE, "none",
+                 "do not optimize functions with jump tables"),
+      clEnumValN(JTS_BASIC, "basic",
+                 "optimize functions with jump tables"),
+      clEnumValN(JTS_MOVE, "move",
+                 "move jump tables to a separate section"),
+      clEnumValN(JTS_SPLIT, "split",
+                 "split jump tables section into hot and cold based on "
+                 "function execution frequency"),
+      clEnumValN(JTS_AGGRESSIVE, "aggressive",
+                 "aggressively split jump tables section based on usage "
+                 "of the tables")),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+static cl::opt<bool>
+NoScan("no-scan",
+  cl::desc("do not scan cold functions for external references (may result "
+           "in slower binary)"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+PreserveBlocksAlignment("preserve-blocks-alignment",
+  cl::desc("try to preserve basic block alignment"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+PrintDynoStats("dyno-stats",
+  cl::desc("print execution info based on profile"),
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+PrintDynoStatsOnly("print-dyno-stats-only",
+  cl::desc("while printing functions output dyno-stats and skip "
+           "instructions"),
+  cl::init(false),
+  cl::Hidden,
+  cl::cat(BoltCategory));
cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltCategory)); + +cl::opt +TimeBuild("time-build", + cl::desc("print time spent constructing binary functions"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +cl::opt +TrapOnAVX512("trap-avx512", + cl::desc("in relocation mode trap upon entry to any function that uses " + "AVX-512 instructions"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +bool shouldPrint(const BinaryFunction &Function) { + if (Function.isIgnored()) + return false; + + if (PrintOnly.empty()) + return true; + + for (std::string &Name : opts::PrintOnly) { + if (Function.hasNameRegex(Name)) { + return true; + } + } + + return false; +} + +} // namespace opts + +namespace llvm { +namespace bolt { + +constexpr unsigned BinaryFunction::MinAlign; + +namespace { + +template +bool emptyRange(const R &Range) { + return Range.begin() == Range.end(); +} + +/// Gets debug line information for the instruction located at the given +/// address in the original binary. The SMLoc's pointer is used +/// to point to this information, which is represented by a +/// DebugLineTableRowRef. The returned pointer is null if no debug line +/// information for this instruction was found. +SMLoc findDebugLineInformationForInstructionAt(uint64_t Address, + DWARFUnit *Unit, const DWARFDebugLine::LineTable *LineTable) { + // We use the pointer in SMLoc to store an instance of DebugLineTableRowRef, + // which occupies 64 bits. Thus, we can only proceed if the struct fits into + // the pointer itself. + assert( + sizeof(decltype(SMLoc().getPointer())) >= sizeof(DebugLineTableRowRef) && + "Cannot fit instruction debug line information into SMLoc's pointer"); + + SMLoc NullResult = DebugLineTableRowRef::NULL_ROW.toSMLoc(); + uint32_t RowIndex = LineTable->lookupAddress( + {Address, object::SectionedAddress::UndefSection}); + if (RowIndex == LineTable->UnknownRowIndex) + return NullResult; + + assert(RowIndex < LineTable->Rows.size() && + "Line Table lookup returned invalid index."); + + decltype(SMLoc().getPointer()) Ptr; + DebugLineTableRowRef *InstructionLocation = + reinterpret_cast(&Ptr); + + InstructionLocation->DwCompileUnitIndex = Unit->getOffset(); + InstructionLocation->RowIndex = RowIndex + 1; + + return SMLoc::getFromPointer(Ptr); +} + +std::string buildSectionName(StringRef Prefix, StringRef Name, + const BinaryContext &BC) { + if (BC.isELF()) + return (Prefix + Name).str(); + static NameShortener NS; + return (Prefix + Twine(NS.getID(Name))).str(); +} + +} // namespace + +std::string BinaryFunction::buildCodeSectionName(StringRef Name, + const BinaryContext &BC) { + return buildSectionName(BC.isELF() ? ".local.text." : ".l.text.", Name, BC); +} + +std::string BinaryFunction::buildColdCodeSectionName(StringRef Name, + const BinaryContext &BC) { + return buildSectionName(BC.isELF() ? ".local.cold.text." 
: ".l.c.text.", Name, + BC); +} + +uint64_t BinaryFunction::Count = 0; + +Optional +BinaryFunction::hasNameRegex(const StringRef Name) const { + const std::string RegexName = (Twine("^") + StringRef(Name) + "$").str(); + Regex MatchName(RegexName); + Optional Match = forEachName( + [&MatchName](StringRef Name) { return MatchName.match(Name); }); + + return Match; +} + +Optional +BinaryFunction::hasRestoredNameRegex(const StringRef Name) const { + const std::string RegexName = (Twine("^") + StringRef(Name) + "$").str(); + Regex MatchName(RegexName); + Optional Match = forEachName([&MatchName](StringRef Name) { + return MatchName.match(NameResolver::restore(Name)); + }); + + return Match; +} + +std::string BinaryFunction::getDemangledName() const { + StringRef MangledName = NameResolver::restore(getOneName()); + int Status = 0; + char *const Name = + abi::__cxa_demangle(MangledName.str().c_str(), 0, 0, &Status); + const std::string NameStr(Status == 0 ? Name : MangledName); + ::free(Name); + return NameStr; +} + +BinaryBasicBlock * +BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { + if (Offset > Size) + return nullptr; + + if (BasicBlockOffsets.empty()) + return nullptr; + + /* + * This is commented out because it makes BOLT too slow. + * assert(std::is_sorted(BasicBlockOffsets.begin(), + * BasicBlockOffsets.end(), + * CompareBasicBlockOffsets()))); + */ + auto I = std::upper_bound(BasicBlockOffsets.begin(), + BasicBlockOffsets.end(), + BasicBlockOffset(Offset, nullptr), + CompareBasicBlockOffsets()); + assert(I != BasicBlockOffsets.begin() && "first basic block not at offset 0"); + --I; + BinaryBasicBlock *BB = I->second; + return (Offset < BB->getOffset() + BB->getOriginalSize()) ? BB : nullptr; +} + +void BinaryFunction::markUnreachableBlocks() { + std::stack Stack; + + for (BinaryBasicBlock *BB : layout()) { + BB->markValid(false); + } + + // Add all entries and landing pads as roots. + for (BinaryBasicBlock *BB : BasicBlocks) { + if (isEntryPoint(*BB) || BB->isLandingPad()) { + Stack.push(BB); + BB->markValid(true); + continue; + } + // FIXME: + // Also mark BBs with indirect jumps as reachable, since we do not + // support removing unused jump tables yet (T29418024 / GH-issue20) + for (const MCInst &Inst : *BB) { + if (BC.MIB->getJumpTable(Inst)) { + Stack.push(BB); + BB->markValid(true); + break; + } + } + } + + // Determine reachable BBs from the entry point + while (!Stack.empty()) { + BinaryBasicBlock *BB = Stack.top(); + Stack.pop(); + for (BinaryBasicBlock *Succ : BB->successors()) { + if (Succ->isValid()) + continue; + Succ->markValid(true); + Stack.push(Succ); + } + } +} + +// Any unnecessary fallthrough jumps revealed after calling eraseInvalidBBs +// will be cleaned up by fixBranches(). +std::pair BinaryFunction::eraseInvalidBBs() { + BasicBlockOrderType NewLayout; + unsigned Count = 0; + uint64_t Bytes = 0; + for (BinaryBasicBlock *BB : layout()) { + if (BB->isValid()) { + NewLayout.push_back(BB); + } else { + assert(!isEntryPoint(*BB) && "all entry blocks must be valid"); + ++Count; + Bytes += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + BasicBlocksLayout = std::move(NewLayout); + + BasicBlockListType NewBasicBlocks; + for (auto I = BasicBlocks.begin(), E = BasicBlocks.end(); I != E; ++I) { + BinaryBasicBlock *BB = *I; + if (BB->isValid()) { + NewBasicBlocks.push_back(BB); + } else { + // Make sure the block is removed from the list of predecessors. 
+
+void BinaryFunction::markUnreachableBlocks() {
+  std::stack<BinaryBasicBlock *> Stack;
+
+  for (BinaryBasicBlock *BB : layout()) {
+    BB->markValid(false);
+  }
+
+  // Add all entries and landing pads as roots.
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    if (isEntryPoint(*BB) || BB->isLandingPad()) {
+      Stack.push(BB);
+      BB->markValid(true);
+      continue;
+    }
+    // FIXME:
+    // Also mark BBs with indirect jumps as reachable, since we do not
+    // support removing unused jump tables yet (T29418024 / GH-issue20)
+    for (const MCInst &Inst : *BB) {
+      if (BC.MIB->getJumpTable(Inst)) {
+        Stack.push(BB);
+        BB->markValid(true);
+        break;
+      }
+    }
+  }
+
+  // Determine reachable BBs from the entry point
+  while (!Stack.empty()) {
+    BinaryBasicBlock *BB = Stack.top();
+    Stack.pop();
+    for (BinaryBasicBlock *Succ : BB->successors()) {
+      if (Succ->isValid())
+        continue;
+      Succ->markValid(true);
+      Stack.push(Succ);
+    }
+  }
+}
+
+// Any unnecessary fallthrough jumps revealed after calling eraseInvalidBBs
+// will be cleaned up by fixBranches().
+std::pair<unsigned, uint64_t> BinaryFunction::eraseInvalidBBs() {
+  BasicBlockOrderType NewLayout;
+  unsigned Count = 0;
+  uint64_t Bytes = 0;
+  for (BinaryBasicBlock *BB : layout()) {
+    if (BB->isValid()) {
+      NewLayout.push_back(BB);
+    } else {
+      assert(!isEntryPoint(*BB) && "all entry blocks must be valid");
+      ++Count;
+      Bytes += BC.computeCodeSize(BB->begin(), BB->end());
+    }
+  }
+  BasicBlocksLayout = std::move(NewLayout);
+
+  BasicBlockListType NewBasicBlocks;
+  for (auto I = BasicBlocks.begin(), E = BasicBlocks.end(); I != E; ++I) {
+    BinaryBasicBlock *BB = *I;
+    if (BB->isValid()) {
+      NewBasicBlocks.push_back(BB);
+    } else {
+      // Make sure the block is removed from the list of predecessors.
+      BB->removeAllSuccessors();
+      DeletedBasicBlocks.push_back(BB);
+    }
+  }
+  BasicBlocks = std::move(NewBasicBlocks);
+
+  assert(BasicBlocks.size() == BasicBlocksLayout.size());
+
+  // Update CFG state if needed
+  if (Count > 0)
+    recomputeLandingPads();
+
+  return std::make_pair(Count, Bytes);
+}
+
+bool BinaryFunction::isForwardCall(const MCSymbol *CalleeSymbol) const {
+  // This function should work properly before and after function reordering.
+  // In order to accomplish this, we use the function index (if it is valid).
+  // If the function indices are not valid, we fall back to the original
+  // addresses. This should be ok because the functions without valid indices
+  // should have been ordered with a stable sort.
+  const BinaryFunction *CalleeBF = BC.getFunctionForSymbol(CalleeSymbol);
+  if (CalleeBF) {
+    if (CalleeBF->isInjected())
+      return true;
+
+    if (hasValidIndex() && CalleeBF->hasValidIndex()) {
+      return getIndex() < CalleeBF->getIndex();
+    } else if (hasValidIndex() && !CalleeBF->hasValidIndex()) {
+      return true;
+    } else if (!hasValidIndex() && CalleeBF->hasValidIndex()) {
+      return false;
+    } else {
+      return getAddress() < CalleeBF->getAddress();
+    }
+  } else {
+    // Absolute symbol.
+    ErrorOr<uint64_t> CalleeAddressOrError = BC.getSymbolValue(*CalleeSymbol);
+    assert(CalleeAddressOrError && "unregistered symbol found");
+    return *CalleeAddressOrError > getAddress();
+  }
+}
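Distilled form of the ordering rule above (a hypothetical standalone helper, not BOLT API): indexed functions compare by index, an indexed function always precedes an unindexed one, and two unindexed functions fall back to their original addresses:

    #include <cstdint>

    bool callsForward(bool CallerHasIndex, uint64_t CallerIndex,
                      uint64_t CallerAddr, bool CalleeHasIndex,
                      uint64_t CalleeIndex, uint64_t CalleeAddr) {
      if (CallerHasIndex && CalleeHasIndex)
        return CallerIndex < CalleeIndex;
      if (CallerHasIndex)
        return true;   // indexed caller precedes unindexed callee
      if (CalleeHasIndex)
        return false;  // unindexed caller follows indexed callee
      return CallerAddr < CalleeAddr;
    }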
+
+void BinaryFunction::dump(bool PrintInstructions) const {
+  print(dbgs(), "", PrintInstructions);
+}
+
+void BinaryFunction::print(raw_ostream &OS, std::string Annotation,
+                           bool PrintInstructions) const {
+  if (!opts::shouldPrint(*this))
+    return;
+
+  StringRef SectionName =
+      OriginSection ? OriginSection->getName() : "<no section>";
+  OS << "Binary Function \"" << *this << "\" " << Annotation << " {";
+  std::vector<StringRef> AllNames = getNames();
+  if (AllNames.size() > 1) {
+    OS << "\n  All names   : ";
+    const char *Sep = "";
+    for (const StringRef Name : AllNames) {
+      OS << Sep << Name;
+      Sep = "\n                ";
+    }
+  }
+  OS << "\n  Number      : "   << FunctionNumber
+     << "\n  State       : "   << CurrentState
+     << "\n  Address     : 0x" << Twine::utohexstr(Address)
+     << "\n  Size        : 0x" << Twine::utohexstr(Size)
+     << "\n  MaxSize     : 0x" << Twine::utohexstr(MaxSize)
+     << "\n  Offset      : 0x" << Twine::utohexstr(FileOffset)
+     << "\n  Section     : "   << SectionName
+     << "\n  Orc Section : "   << getCodeSectionName()
+     << "\n  LSDA        : 0x" << Twine::utohexstr(getLSDAAddress())
+     << "\n  IsSimple    : "   << IsSimple
+     << "\n  IsMultiEntry: "   << isMultiEntry()
+     << "\n  IsSplit     : "   << isSplit()
+     << "\n  BB Count    : "   << size();
+
+  if (HasFixedIndirectBranch) {
+    OS << "\n  HasFixedIndirectBranch : true";
+  }
+  if (HasUnknownControlFlow) {
+    OS << "\n  Unknown CF  : true";
+  }
+  if (getPersonalityFunction()) {
+    OS << "\n  Personality : " << getPersonalityFunction()->getName();
+  }
+  if (IsFragment) {
+    OS << "\n  IsFragment  : true";
+  }
+  if (isFolded()) {
+    OS << "\n  FoldedInto  : " << *getFoldedIntoFunction();
+  }
+  if (ParentFragment) {
+    OS << "\n  Parent      : " << *ParentFragment;
+  }
+  if (!Fragments.empty()) {
+    OS << "\n  Fragments   : ";
+    const char *Sep = "";
+    for (BinaryFunction *Frag : Fragments) {
+      OS << Sep << *Frag;
+      Sep = ", ";
+    }
+  }
+  if (hasCFG()) {
+    OS << "\n  Hash        : " << Twine::utohexstr(computeHash());
+  }
+  if (isMultiEntry()) {
+    OS << "\n  Secondary Entry Points : ";
+    const char *Sep = "";
+    for (const std::pair<const MCSymbol *, MCSymbol *> &KV :
+         SecondaryEntryPoints) {
+      OS << Sep << KV.second->getName();
+      Sep = ", ";
+    }
+  }
+  if (FrameInstructions.size()) {
+    OS << "\n  CFI Instrs  : " << FrameInstructions.size();
+  }
+  if (BasicBlocksLayout.size()) {
+    OS << "\n  BB Layout   : ";
+    const char *Sep = "";
+    for (BinaryBasicBlock *BB : BasicBlocksLayout) {
+      OS << Sep << BB->getName();
+      Sep = ", ";
+    }
+  }
+  if (ImageAddress)
+    OS << "\n  Image       : 0x" << Twine::utohexstr(ImageAddress);
+  if (ExecutionCount != COUNT_NO_PROFILE) {
+    OS << "\n  Exec Count  : " << ExecutionCount;
+    OS << "\n  Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f);
+  }
+
+  if (opts::PrintDynoStats && !BasicBlocksLayout.empty()) {
+    OS << '\n';
+    DynoStats dynoStats = getDynoStats(*this);
+    OS << dynoStats;
+  }
+
+  OS << "\n}\n";
+
+  if (opts::PrintDynoStatsOnly || !PrintInstructions || !BC.InstPrinter)
+    return;
+
+  // Offset of the instruction in function.
+  uint64_t Offset = 0;
+
+  if (BasicBlocks.empty() && !Instructions.empty()) {
+    // Print before CFG was built.
+    for (const std::pair<const uint32_t, MCInst> &II : Instructions) {
+      Offset = II.first;
+
+      // Print label if exists at this offset.
+      auto LI = Labels.find(Offset);
+      if (LI != Labels.end()) {
+        if (const MCSymbol *EntrySymbol =
+                getSecondaryEntryPointSymbol(LI->second))
+          OS << EntrySymbol->getName() << " (Entry Point):\n";
+        OS << LI->second->getName() << ":\n";
+      }
+
+      BC.printInstruction(OS, II.second, Offset, this);
+    }
+  }
+
+  for (uint32_t I = 0, E = BasicBlocksLayout.size(); I != E; ++I) {
+    BinaryBasicBlock *BB = BasicBlocksLayout[I];
+    if (I != 0 &&
+        BB->isCold() != BasicBlocksLayout[I - 1]->isCold())
+      OS << "-------   HOT-COLD SPLIT POINT   -------\n\n";
+
+    OS << BB->getName() << " ("
+       << BB->size() << " instructions, align : " << BB->getAlignment()
+       << ")\n";
+
+    if (isEntryPoint(*BB)) {
+      if (MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB))
+        OS << "  Secondary Entry Point: " << EntrySymbol->getName() << '\n';
+      else
+        OS << "  Entry Point\n";
+    }
+
+    if (BB->isLandingPad())
+      OS << "  Landing Pad\n";
+
+    uint64_t BBExecCount = BB->getExecutionCount();
+    if (hasValidProfile()) {
+      OS << "  Exec Count : ";
+      if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE)
+        OS << BBExecCount << '\n';
+      else
+        OS << "<unknown>\n";
+    }
+    if (BB->getCFIState() >= 0) {
+      OS << "  CFI State : " << BB->getCFIState() << '\n';
+    }
+    if (opts::EnableBAT) {
+      OS << "  Input offset: " << Twine::utohexstr(BB->getInputOffset())
+         << "\n";
+    }
+    if (!BB->pred_empty()) {
+      OS << "  Predecessors: ";
+      const char *Sep = "";
+      for (BinaryBasicBlock *Pred : BB->predecessors()) {
+        OS << Sep << Pred->getName();
+        Sep = ", ";
+      }
+      OS << '\n';
+    }
+    if (!BB->throw_empty()) {
+      OS << "  Throwers: ";
+      const char *Sep = "";
+      for (BinaryBasicBlock *Throw : BB->throwers()) {
+        OS << Sep << Throw->getName();
+        Sep = ", ";
+      }
+      OS << '\n';
+    }
+
+    Offset = alignTo(Offset, BB->getAlignment());
+
+    // Note: offsets are imprecise since this is happening prior to
+    // relaxation.
+    Offset = BC.printInstructions(OS, BB->begin(), BB->end(), Offset, this);
+
+    if (!BB->succ_empty()) {
+      OS << "  Successors: ";
+      // For more than 2 successors, sort them based on frequency.
+      std::vector<uint64_t> Indices(BB->succ_size());
+      std::iota(Indices.begin(), Indices.end(), 0);
+      if (BB->succ_size() > 2 && BB->getKnownExecutionCount()) {
+        std::stable_sort(Indices.begin(), Indices.end(),
+                         [&](const uint64_t A, const uint64_t B) {
+                           return BB->BranchInfo[B] < BB->BranchInfo[A];
+                         });
+      }
+      const char *Sep = "";
+      for (unsigned I = 0; I < Indices.size(); ++I) {
+        BinaryBasicBlock *Succ = BB->Successors[Indices[I]];
+        BinaryBasicBlock::BinaryBranchInfo &BI = BB->BranchInfo[Indices[I]];
+        OS << Sep << Succ->getName();
+        if (ExecutionCount != COUNT_NO_PROFILE &&
+            BI.MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) {
+          OS << " (mispreds: " << BI.MispredictedCount
+             << ", count: " << BI.Count << ")";
+        } else if (ExecutionCount != COUNT_NO_PROFILE &&
+                   BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
+          OS << " (inferred count: " << BI.Count << ")";
+        }
+        Sep = ", ";
+      }
+      OS << '\n';
+    }
+
+    if (!BB->lp_empty()) {
+      OS << "  Landing Pads: ";
+      const char *Sep = "";
+      for (BinaryBasicBlock *LP : BB->landing_pads()) {
+        OS << Sep << LP->getName();
+        if (ExecutionCount != COUNT_NO_PROFILE) {
+          OS << " (count: " << LP->getExecutionCount() << ")";
+        }
+        Sep = ", ";
+      }
+      OS << '\n';
+    }
+
+    // In CFG_Finalized state we can miscalculate CFI state at exit.
+    if (CurrentState == State::CFG) {
+      const int32_t CFIStateAtExit = BB->getCFIStateAtExit();
+      if (CFIStateAtExit >= 0)
+        OS << "  CFI State: " << CFIStateAtExit << '\n';
+    }
+
+    OS << '\n';
+  }
+
+  // Dump new exception ranges for the function.
+  if (!CallSites.empty()) {
+    OS << "EH table:\n";
+    for (const CallSite &CSI : CallSites) {
+      OS << "  [" << *CSI.Start << ", " << *CSI.End << ") landing pad : ";
+      if (CSI.LP)
+        OS << *CSI.LP;
+      else
+        OS << "0";
+      OS << ", action : " << CSI.Action << '\n';
+    }
+    OS << '\n';
+  }
+
+  // Print all jump tables.
+  for (const std::pair<const uint64_t, JumpTable *> &JTI : JumpTables) {
+    JTI.second->print(OS);
+  }
+
+  OS << "DWARF CFI Instructions:\n";
+  if (OffsetToCFI.size()) {
+    // Pre-buildCFG information
+    for (const std::pair<const uint32_t, uint32_t> &Elmt : OffsetToCFI) {
+      OS << format("    %08x:\t", Elmt.first);
+      assert(Elmt.second < FrameInstructions.size() &&
+             "Incorrect CFI offset");
+      BinaryContext::printCFI(OS, FrameInstructions[Elmt.second]);
+      OS << "\n";
+    }
+  } else {
+    // Post-buildCFG information
+    for (uint32_t I = 0, E = FrameInstructions.size(); I != E; ++I) {
+      const MCCFIInstruction &CFI = FrameInstructions[I];
+      OS << format("    %d:\t", I);
+      BinaryContext::printCFI(OS, CFI);
+      OS << "\n";
+    }
+  }
+  if (FrameInstructions.empty())
+    OS << "    <empty>\n";
+
+  OS << "End of Function \"" << *this << "\"\n\n";
+}
+
+void BinaryFunction::printRelocations(raw_ostream &OS,
+                                      uint64_t Offset,
+                                      uint64_t Size) const {
+  const char *Sep = " # Relocs: ";
+
+  auto RI = Relocations.lower_bound(Offset);
+  while (RI != Relocations.end() && RI->first < Offset + Size) {
+    OS << Sep << "(R: " << RI->second << ")";
+    Sep = ", ";
+    ++RI;
+  }
+}
+
+IndirectBranchType
+BinaryFunction::processIndirectBranch(MCInst &Instruction,
+                                      unsigned Size,
+                                      uint64_t Offset,
+                                      uint64_t &TargetAddress) {
+  const unsigned PtrSize = BC.AsmInfo->getCodePointerSize();
+
+  // The instruction referencing memory used by the branch instruction.
+  // It could be the branch instruction itself or one of the instructions
+  // setting the value of the register used by the branch.
+  MCInst *MemLocInstr;
+
+  // Address of the table referenced by MemLocInstr. Could be either an
+  // array of function pointers, or a jump table.
+  uint64_t ArrayStart = 0;
+
+  unsigned BaseRegNum, IndexRegNum;
+  int64_t DispValue;
+  const MCExpr *DispExpr;
+
+  // In AArch64, identify the instruction adding the PC-relative offset to
+  // jump table entries to correctly decode it.
+  MCInst *PCRelBaseInstr;
+  uint64_t PCRelAddr = 0;
+
+  auto Begin = Instructions.begin();
+  if (BC.isAArch64()) {
+    PreserveNops = BC.HasRelocations;
+    // Start at the last label as an approximation of the current basic block.
+    // This is a heuristic, since the full set of labels has yet to be
+    // determined.
+    for (auto LI = Labels.rbegin(); LI != Labels.rend(); ++LI) {
+      auto II = Instructions.find(LI->first);
+      if (II != Instructions.end()) {
+        Begin = II;
+        break;
+      }
+    }
+  }
+
+  IndirectBranchType BranchType =
+      BC.MIB->analyzeIndirectBranch(Instruction,
+                                    Begin,
+                                    Instructions.end(),
+                                    PtrSize,
+                                    MemLocInstr,
+                                    BaseRegNum,
+                                    IndexRegNum,
+                                    DispValue,
+                                    DispExpr,
+                                    PCRelBaseInstr);
+
+  if (BranchType == IndirectBranchType::UNKNOWN && !MemLocInstr)
+    return BranchType;
+
+  if (MemLocInstr != &Instruction)
+    IndexRegNum = BC.MIB->getNoRegister();
+
+  if (BC.isAArch64()) {
+    const MCSymbol *Sym = BC.MIB->getTargetSymbol(*PCRelBaseInstr, 1);
+    assert(Sym && "Symbol extraction failed");
+    ErrorOr<uint64_t> SymValueOrError = BC.getSymbolValue(*Sym);
+    if (SymValueOrError) {
+      PCRelAddr = *SymValueOrError;
+    } else {
+      for (std::pair<const uint32_t, MCSymbol *> &Elmt : Labels) {
+        if (Elmt.second == Sym) {
+          PCRelAddr = Elmt.first + getAddress();
+          break;
+        }
+      }
+    }
+    uint64_t InstrAddr = 0;
+    for (auto II = Instructions.rbegin(); II != Instructions.rend(); ++II) {
+      if (&II->second == PCRelBaseInstr) {
+        InstrAddr = II->first + getAddress();
+        break;
+      }
+    }
+    assert(InstrAddr != 0 && "instruction not found");
+    // We do this to avoid spurious references to code locations outside this
+    // function (for example, if the indirect jump lives in the last basic
+    // block of the function, it will create a reference to the next
+    // function). This replaces a symbol reference with an immediate.
+    BC.MIB->replaceMemOperandDisp(*PCRelBaseInstr,
+                                  MCOperand::createImm(PCRelAddr - InstrAddr));
+    // FIXME: Disable full jump table processing for AArch64 until we have a
+    // proper way of determining the jump table limits.
+    return IndirectBranchType::UNKNOWN;
+  }
+
+  // RIP-relative addressing should be converted to symbol form by now
+  // in processed instructions (but not in jump).
+  if (DispExpr) {
+    const MCSymbol *TargetSym;
+    uint64_t TargetOffset;
+    std::tie(TargetSym, TargetOffset) = BC.MIB->getTargetSymbolInfo(DispExpr);
+    ErrorOr<uint64_t> SymValueOrError = BC.getSymbolValue(*TargetSym);
+    assert(SymValueOrError && "global symbol needs a value");
+    ArrayStart = *SymValueOrError + TargetOffset;
+    BaseRegNum = BC.MIB->getNoRegister();
+    if (BC.isAArch64()) {
+      ArrayStart &= ~0xFFFULL;
+      ArrayStart += DispValue & 0xFFFULL;
+    }
+  } else {
+    ArrayStart = static_cast<uint64_t>(DispValue);
+  }
+
+  if (BaseRegNum == BC.MRI->getProgramCounter())
+    ArrayStart += getAddress() + Offset + Size;
+
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: addressed memory is 0x"
+                    << Twine::utohexstr(ArrayStart) << '\n');
+
+  ErrorOr<BinarySection &> Section = BC.getSectionForAddress(ArrayStart);
+  if (!Section) {
+    // No section - possibly an absolute address. Since we don't allow
+    // internal function addresses to escape the function scope - we
+    // consider it a tail call.
+    if (opts::Verbosity >= 1) {
+      errs() << "BOLT-WARNING: no section for address 0x"
+             << Twine::utohexstr(ArrayStart) << " referenced from function "
+             << *this << '\n';
+    }
+    return IndirectBranchType::POSSIBLE_TAIL_CALL;
+  }
+  if (Section->isVirtual()) {
+    // The contents are filled at runtime.
+    return IndirectBranchType::POSSIBLE_TAIL_CALL;
+  }
+
+  if (BranchType == IndirectBranchType::POSSIBLE_FIXED_BRANCH) {
+    ErrorOr<uint64_t> Value = BC.getPointerAtAddress(ArrayStart);
+    if (!Value)
+      return IndirectBranchType::UNKNOWN;
+
+    if (!BC.getSectionForAddress(ArrayStart)->isReadOnly())
+      return IndirectBranchType::UNKNOWN;
+
+    outs() << "BOLT-INFO: fixed indirect branch detected in " << *this
+           << " at 0x" << Twine::utohexstr(getAddress() + Offset)
+           << " referencing data at 0x" << Twine::utohexstr(ArrayStart)
+           << " the destination value is 0x" << Twine::utohexstr(*Value)
+           << '\n';
+
+    TargetAddress = *Value;
+    return BranchType;
+  }
+
+  // Check if there's already a jump table registered at this address.
+  MemoryContentsType MemType;
+  if (JumpTable *JT = BC.getJumpTableContainingAddress(ArrayStart)) {
+    switch (JT->Type) {
+    case JumpTable::JTT_NORMAL:
+      MemType = MemoryContentsType::POSSIBLE_JUMP_TABLE;
+      break;
+    case JumpTable::JTT_PIC:
+      MemType = MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE;
+      break;
+    }
+  } else {
+    MemType = BC.analyzeMemoryAt(ArrayStart, *this);
+  }
+
+  // Check that jump table type in instruction pattern matches memory contents.
+  JumpTable::JumpTableType JTType;
+  if (BranchType == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE) {
+    if (MemType != MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE)
+      return IndirectBranchType::UNKNOWN;
+    JTType = JumpTable::JTT_PIC;
+  } else {
+    if (MemType == MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE)
+      return IndirectBranchType::UNKNOWN;
+
+    if (MemType == MemoryContentsType::UNKNOWN)
+      return IndirectBranchType::POSSIBLE_TAIL_CALL;
+
+    BranchType = IndirectBranchType::POSSIBLE_JUMP_TABLE;
+    JTType = JumpTable::JTT_NORMAL;
+  }
+
+  // Convert the instruction into jump table branch.
+  const MCSymbol *JTLabel = BC.getOrCreateJumpTable(*this, ArrayStart, JTType);
+  BC.MIB->replaceMemOperandDisp(*MemLocInstr, JTLabel, BC.Ctx.get());
+  BC.MIB->setJumpTable(Instruction, ArrayStart, IndexRegNum);
+
+  JTSites.emplace_back(Offset, ArrayStart);
+
+  return BranchType;
+}
+
+MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address,
+                                                bool CreatePastEnd) {
+  const uint64_t Offset = Address - getAddress();
+
+  if ((Offset == getSize()) && CreatePastEnd)
+    return getFunctionEndLabel();
+
+  auto LI = Labels.find(Offset);
+  if (LI != Labels.end())
+    return LI->second;
+
+  // For AArch64, check if this address is part of a constant island.
+  if (BC.isAArch64()) {
+    if (MCSymbol *IslandSym = getOrCreateIslandAccess(Address)) {
+      return IslandSym;
+    }
+  }
+
+  MCSymbol *Label = BC.Ctx->createNamedTempSymbol();
+  Labels[Offset] = Label;
+
+  return Label;
+}
+
+ErrorOr<ArrayRef<uint8_t>> BinaryFunction::getData() const {
+  BinarySection &Section = *getOriginSection();
+  assert(Section.containsRange(getAddress(), getMaxSize()) &&
+         "wrong section for function");
+
+  if (!Section.isText() || Section.isVirtual() || !Section.getSize()) {
+    return std::make_error_code(std::errc::bad_address);
+  }
+
+  StringRef SectionContents = Section.getContents();
+
+  assert(SectionContents.size() == Section.getSize() &&
+         "section size mismatch");
+
+  // Function offset from the section start.
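+  // That is, the raw bytes returned below live at
+  //   SectionContents.data() + (getAddress() - Section.getAddress())
+  // and span getMaxSize() bytes (the code plus any trailing padding).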
+  uint64_t Offset = getAddress() - Section.getAddress();
+  auto *Bytes = reinterpret_cast<const uint8_t *>(SectionContents.data());
+  return ArrayRef<uint8_t>(Bytes + Offset, getMaxSize());
+}
+
+size_t BinaryFunction::getSizeOfDataInCodeAt(uint64_t Offset) const {
+  if (Islands.DataOffsets.find(Offset) == Islands.DataOffsets.end())
+    return 0;
+
+  auto Iter = Islands.CodeOffsets.upper_bound(Offset);
+  if (Iter != Islands.CodeOffsets.end()) {
+    return *Iter - Offset;
+  }
+  return getSize() - Offset;
+}
+
+bool BinaryFunction::isZeroPaddingAt(uint64_t Offset) const {
+  ArrayRef<uint8_t> FunctionData = *getData();
+  uint64_t EndOfCode = getSize();
+  auto Iter = Islands.DataOffsets.upper_bound(Offset);
+  if (Iter != Islands.DataOffsets.end())
+    EndOfCode = *Iter;
+  for (uint64_t I = Offset; I < EndOfCode; ++I) {
+    if (FunctionData[I] != 0) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool BinaryFunction::disassemble() {
+  NamedRegionTimer T("disassemble", "Disassemble function", "buildfuncs",
+                     "Build Binary Functions", opts::TimeBuild);
+  ErrorOr<ArrayRef<uint8_t>> ErrorOrFunctionData = getData();
+  assert(ErrorOrFunctionData && "function data is not available");
+  ArrayRef<uint8_t> FunctionData = *ErrorOrFunctionData;
+  assert(FunctionData.size() == getMaxSize() &&
+         "function size does not match raw data size");
+
+  auto &Ctx = BC.Ctx;
+  auto &MIB = BC.MIB;
+
+  // Insert a label at the beginning of the function. This will be our first
+  // basic block.
+  Labels[0] = Ctx->createNamedTempSymbol("BB0");
+
+  auto handlePCRelOperand =
+      [&](MCInst &Instruction, uint64_t Address, uint64_t Size) {
+    uint64_t TargetAddress = 0;
+    if (!MIB->evaluateMemOperandTarget(Instruction, TargetAddress, Address,
+                                       Size)) {
+      errs() << "BOLT-ERROR: PC-relative operand can't be evaluated:\n";
+      BC.InstPrinter->printInst(&Instruction, 0, "", *BC.STI, errs());
+      errs() << '\n';
+      Instruction.dump_pretty(errs(), BC.InstPrinter.get());
+      errs() << '\n';
+      errs() << "BOLT-ERROR: cannot handle PC-relative operand at 0x"
+             << Twine::utohexstr(Address)
+             << ". Skipping function " << *this << ".\n";
+      if (BC.HasRelocations)
+        exit(1);
+      IsSimple = false;
+      return;
+    }
+    if (TargetAddress == 0 && opts::Verbosity >= 1) {
+      outs() << "BOLT-INFO: PC-relative operand is zero in function " << *this
+             << '\n';
+    }
+
+    const MCSymbol *TargetSymbol;
+    uint64_t TargetOffset;
+    std::tie(TargetSymbol, TargetOffset) =
+        BC.handleAddressRef(TargetAddress, *this, /*IsPCRel*/ true);
+    const MCExpr *Expr = MCSymbolRefExpr::create(TargetSymbol,
+                                                 MCSymbolRefExpr::VK_None,
+                                                 *BC.Ctx);
+    if (TargetOffset) {
+      const MCConstantExpr *Offset =
+          MCConstantExpr::create(TargetOffset, *BC.Ctx);
+      Expr = MCBinaryExpr::createAdd(Expr, Offset, *BC.Ctx);
+    }
+    MIB->replaceMemOperandDisp(
+        Instruction, MCOperand::createExpr(BC.MIB->getTargetExprFor(
+                         Instruction,
+                         Expr,
+                         *BC.Ctx, 0)));
+  };
+
+  // Used to fix the target of linker-generated AArch64 stubs with no
+  // relocation info.
+  auto fixStubTarget = [&](MCInst &LoadLowBits, MCInst &LoadHiBits,
+                           uint64_t Target) {
+    const MCSymbol *TargetSymbol;
+    uint64_t Addend = 0;
+    std::tie(TargetSymbol, Addend) = BC.handleAddressRef(Target, *this, true);
+
+    int64_t Val;
+    MIB->replaceImmWithSymbolRef(LoadHiBits, TargetSymbol, Addend, Ctx.get(),
+                                 Val, ELF::R_AARCH64_ADR_PREL_PG_HI21);
+    MIB->replaceImmWithSymbolRef(LoadLowBits, TargetSymbol, Addend, Ctx.get(),
+                                 Val, ELF::R_AARCH64_ADD_ABS_LO12_NC);
+  };
+
+  auto handleExternalReference = [&](MCInst &Instruction, uint64_t Size,
+                                     uint64_t Offset, uint64_t TargetAddress,
+                                     bool &IsCall) -> MCSymbol * {
+    const bool IsCondBranch = MIB->isConditionalBranch(Instruction);
+    const uint64_t AbsoluteInstrAddr = getAddress() + Offset;
+    MCSymbol *TargetSymbol = nullptr;
+    InterproceduralReferences.insert(TargetAddress);
+    if (opts::Verbosity >= 2 && !IsCall && Size == 2 && !BC.HasRelocations) {
+      errs() << "BOLT-WARNING: relaxed tail call detected at 0x"
+             << Twine::utohexstr(AbsoluteInstrAddr) << " in function " << *this
+             << ". Code size will be increased.\n";
+    }
+
+    assert(!MIB->isTailCall(Instruction) &&
+           "synthetic tail call instruction found");
+
+    // This is a call regardless of the opcode.
+    // Assign proper opcode for tail calls, so that they could be
+    // treated as calls.
+    if (!IsCall) {
+      if (!MIB->convertJmpToTailCall(Instruction)) {
+        assert(IsCondBranch && "unknown tail call instruction");
+        if (opts::Verbosity >= 2) {
+          errs() << "BOLT-WARNING: conditional tail call detected in "
+                 << "function " << *this << " at 0x"
+                 << Twine::utohexstr(AbsoluteInstrAddr) << ".\n";
+        }
+      }
+      IsCall = true;
+    }
+
+    TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "FUNCat");
+    if (opts::Verbosity >= 2 && TargetAddress == 0) {
+      // We actually see calls to address 0 in presence of weak
+      // symbols originating from libraries. This code is never meant
+      // to be executed.
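+      // (Illustrative scenario: a call to an undefined weak external
+      // resolves to address 0 at static link time; the call site is guarded
+      // by a symbol-presence check at run time and never executes.)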
+ outs() << "BOLT-INFO: Function " << *this + << " has a call to address zero.\n"; + } + + return TargetSymbol; + }; + + auto handleIndirectBranch = [&](MCInst &Instruction, uint64_t Size, + uint64_t Offset) { + uint64_t IndirectTarget = 0; + IndirectBranchType Result = + processIndirectBranch(Instruction, Size, Offset, IndirectTarget); + switch (Result) { + default: + llvm_unreachable("unexpected result"); + case IndirectBranchType::POSSIBLE_TAIL_CALL: { + bool Result = MIB->convertJmpToTailCall(Instruction); + (void)Result; + assert(Result); + break; + } + case IndirectBranchType::POSSIBLE_JUMP_TABLE: + case IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE: + if (opts::JumpTables == JTS_NONE) + IsSimple = false; + break; + case IndirectBranchType::POSSIBLE_FIXED_BRANCH: { + if (containsAddress(IndirectTarget)) { + const MCSymbol *TargetSymbol = getOrCreateLocalLabel(IndirectTarget); + Instruction.clear(); + MIB->createUncondBranch(Instruction, TargetSymbol, BC.Ctx.get()); + TakenBranches.emplace_back(Offset, IndirectTarget - getAddress()); + HasFixedIndirectBranch = true; + } else { + MIB->convertJmpToTailCall(Instruction); + InterproceduralReferences.insert(IndirectTarget); + } + break; + } + case IndirectBranchType::UNKNOWN: + // Keep processing. We'll do more checks and fixes in + // postProcessIndirectBranches(). + UnknownIndirectBranchOffsets.emplace(Offset); + break; + } + }; + + // Check for linker veneers, which lack relocations and need manual + // adjustments. + auto handleAArch64IndirectCall = [&](MCInst &Instruction, uint64_t Offset) { + const uint64_t AbsoluteInstrAddr = getAddress() + Offset; + MCInst *TargetHiBits, *TargetLowBits; + uint64_t TargetAddress; + if (MIB->matchLinkerVeneer(Instructions.begin(), Instructions.end(), + AbsoluteInstrAddr, Instruction, TargetHiBits, + TargetLowBits, TargetAddress)) { + MIB->addAnnotation(Instruction, "AArch64Veneer", true); + + uint8_t Counter = 0; + for (auto It = std::prev(Instructions.end()); Counter != 2; + --It, ++Counter) { + MIB->addAnnotation(It->second, "AArch64Veneer", true); + } + + fixStubTarget(*TargetLowBits, *TargetHiBits, TargetAddress); + } + }; + + uint64_t Size = 0; // instruction size + for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) { + MCInst Instruction; + const uint64_t AbsoluteInstrAddr = getAddress() + Offset; + + // Check for data inside code and ignore it + if (const size_t DataInCodeSize = getSizeOfDataInCodeAt(Offset)) { + Size = DataInCodeSize; + continue; + } + + if (!BC.DisAsm->getInstruction(Instruction, + Size, + FunctionData.slice(Offset), + AbsoluteInstrAddr, + nulls())) { + // Functions with "soft" boundaries, e.g. coming from assembly source, + // can have 0-byte padding at the end. + if (isZeroPaddingAt(Offset)) + break; + + errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x" + << Twine::utohexstr(Offset) << " (address 0x" + << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " + << *this << '\n'; + // Some AVX-512 instructions could not be disassembled at all. + if (BC.HasRelocations && opts::TrapOnAVX512 && BC.isX86()) { + setTrapOnEntry(); + BC.TrappedFunctions.push_back(this); + } else { + setIgnored(); + } + + break; + } + + // Check integrity of LLVM assembler/disassembler. 
+    if (opts::CheckEncoding && !BC.MIB->isBranch(Instruction) &&
+        !BC.MIB->isCall(Instruction) && !BC.MIB->isNoop(Instruction)) {
+      if (!BC.validateEncoding(Instruction, FunctionData.slice(Offset, Size))) {
+        errs() << "BOLT-WARNING: mismatching LLVM encoding detected in "
+               << "function " << *this << " for instruction :\n";
+        BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
+        errs() << '\n';
+      }
+    }
+
+    // Special handling for AVX-512 instructions.
+    if (MIB->hasEVEXEncoding(Instruction)) {
+      if (BC.HasRelocations && opts::TrapOnAVX512) {
+        setTrapOnEntry();
+        BC.TrappedFunctions.push_back(this);
+        break;
+      }
+
+      // Check if our disassembly is correct and matches the assembler output.
+      if (!BC.validateEncoding(Instruction, FunctionData.slice(Offset, Size))) {
+        if (opts::Verbosity >= 1) {
+          errs() << "BOLT-WARNING: internal assembler/disassembler error "
+                    "detected for AVX512 instruction:\n";
+          BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
+          errs() << " in function " << *this << '\n';
+        }
+
+        setIgnored();
+        break;
+      }
+    }
+
+    // Check if there's a relocation associated with this instruction.
+    bool UsedReloc = false;
+    for (auto Itr = Relocations.lower_bound(Offset),
+              ItrE = Relocations.lower_bound(Offset + Size);
+         Itr != ItrE; ++Itr) {
+      const Relocation &Relocation = Itr->second;
+
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: replacing immediate 0x"
+                        << Twine::utohexstr(Relocation.Value)
+                        << " with relocation"
+                           " against "
+                        << Relocation.Symbol << "+" << Relocation.Addend
+                        << " in function " << *this
+                        << " for instruction at offset 0x"
+                        << Twine::utohexstr(Offset) << '\n');
+
+      // Process reference to the primary symbol.
+      if (!Relocation.isPCRelative())
+        BC.handleAddressRef(Relocation.Value - Relocation.Addend,
+                            *this,
+                            /*IsPCRel*/ false);
+
+      int64_t Value = Relocation.Value;
+      const bool Result = BC.MIB->replaceImmWithSymbolRef(
+          Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value,
+          Relocation.Type);
+      (void)Result;
+      assert(Result && "cannot replace immediate with relocation");
+
+      // For AArch64, if we replaced an immediate with a symbol from a
+      // relocation, we mark it so we do not try to further process a
+      // PC-relative operand. All we need is the symbol.
+      if (BC.isAArch64())
+        UsedReloc = true;
+
+      // Make sure we replaced the correct immediate (instruction
+      // can have multiple immediate operands).
+      if (BC.isX86()) {
+        assert(truncateToSize(static_cast<uint64_t>(Value),
+                              Relocation::getSizeForType(Relocation.Type)) ==
+                   truncateToSize(Relocation.Value,
+                                  Relocation::getSizeForType(Relocation.Type)) &&
+               "immediate value mismatch in function");
+      }
+    }
+
+    // Convert instruction to a shorter version that could be relaxed if
+    // needed.
+    MIB->shortenInstruction(Instruction);
+
+    if (MIB->isBranch(Instruction) || MIB->isCall(Instruction)) {
+      uint64_t TargetAddress = 0;
+      if (MIB->evaluateBranch(Instruction, AbsoluteInstrAddr, Size,
+                              TargetAddress)) {
+        // Check if the target is within the same function. Otherwise it's
+        // a call, possibly a tail call.
+        //
+        // If the target *is* the function address it could be either a branch
+        // or a recursive call.
+        bool IsCall = MIB->isCall(Instruction);
+        const bool IsCondBranch = MIB->isConditionalBranch(Instruction);
+        MCSymbol *TargetSymbol = nullptr;
+
+        if (BC.MIB->isUnsupportedBranch(Instruction.getOpcode())) {
+          setIgnored();
+          if (BinaryFunction *TargetFunc =
+                  BC.getBinaryFunctionContainingAddress(TargetAddress))
+            TargetFunc->setIgnored();
+        }
+
+        if (IsCall && containsAddress(TargetAddress)) {
+          if (TargetAddress == getAddress()) {
+            // Recursive call.
+            TargetSymbol = getSymbol();
+          } else {
+            if (BC.isX86()) {
+              // Dangerous old-style x86 PIC code. We may need to freeze this
+              // function, so preserve the function as is for now.
+              PreserveNops = true;
+            } else {
+              errs() << "BOLT-WARNING: internal call detected at 0x"
+                     << Twine::utohexstr(AbsoluteInstrAddr) << " in function "
+                     << *this << ". Skipping.\n";
+              IsSimple = false;
+            }
+          }
+        }
+
+        if (!TargetSymbol) {
+          // Create either local label or external symbol.
+          if (containsAddress(TargetAddress)) {
+            TargetSymbol = getOrCreateLocalLabel(TargetAddress);
+          } else {
+            if (TargetAddress == getAddress() + getSize() &&
+                TargetAddress < getAddress() + getMaxSize()) {
+              // Result of __builtin_unreachable().
+              LLVM_DEBUG(dbgs() << "BOLT-DEBUG: jump past end detected at 0x"
+                                << Twine::utohexstr(AbsoluteInstrAddr)
+                                << " in function " << *this
+                                << " : replacing with nop.\n");
+              BC.MIB->createNoop(Instruction);
+              if (IsCondBranch) {
+                // Register branch offset for profile validation.
+                IgnoredBranches.emplace_back(Offset, Offset + Size);
+              }
+              goto add_instruction;
+            }
+            // May update Instruction and IsCall
+            TargetSymbol = handleExternalReference(Instruction, Size, Offset,
+                                                   TargetAddress, IsCall);
+          }
+        }
+
+        if (!IsCall) {
+          // Add taken branch info.
+          TakenBranches.emplace_back(Offset, TargetAddress - getAddress());
+        }
+        BC.MIB->replaceBranchTarget(Instruction, TargetSymbol, &*Ctx);
+
+        // Mark CTC.
+        if (IsCondBranch && IsCall) {
+          MIB->setConditionalTailCall(Instruction, TargetAddress);
+        }
+      } else {
+        // Could not evaluate branch. Should be an indirect call or an
+        // indirect branch. Bail out on the latter case.
+        if (MIB->isIndirectBranch(Instruction))
+          handleIndirectBranch(Instruction, Size, Offset);
+        // Indirect call. We only need to fix it if the operand is RIP-relative.
+        if (IsSimple && MIB->hasPCRelOperand(Instruction))
+          handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size);
+
+        if (BC.isAArch64())
+          handleAArch64IndirectCall(Instruction, Offset);
+      }
+    } else if (MIB->hasPCRelOperand(Instruction) && !UsedReloc)
+      handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size);
+
+add_instruction:
+    if (getDWARFLineTable()) {
+      Instruction.setLoc(
+          findDebugLineInformationForInstructionAt(AbsoluteInstrAddr,
+                                                   getDWARFUnit(),
+                                                   getDWARFLineTable()));
+    }
+
+    // Record offset of the instruction for profile matching.
+    if (BC.keepOffsetForInstruction(Instruction)) {
+      MIB->addAnnotation(Instruction, "Offset", static_cast<uint32_t>(Offset));
+    }
+
+    addInstruction(Offset, std::move(Instruction));
+  }
+
+  clearList(Relocations);
+
+  if (!IsSimple) {
+    clearList(Instructions);
+    return false;
+  }
+
+  updateState(State::Disassembled);
+
+  return true;
+}
+
+bool BinaryFunction::scanExternalRefs() {
+  bool Success = true;
+  bool DisassemblyFailed = false;
+
+  // Ignore pseudo functions.
+  if (isPseudo())
+    return Success;
+
+  if (opts::NoScan) {
+    clearList(Relocations);
+    clearList(ExternallyReferencedOffsets);
+
+    return false;
+  }
+
+  // List of external references for this function.
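+  // They are accumulated here first and only committed to the origin section
+  // once the whole function scans successfully (see the DisassemblyFailed
+  // check near the end of this function), so a late failure leaves the
+  // section's pending relocations untouched.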
+  std::vector<Relocation> FunctionRelocations;
+
+  static BinaryContext::IndependentCodeEmitter Emitter =
+      BC.createIndependentMCCodeEmitter();
+
+  ErrorOr<ArrayRef<uint8_t>> ErrorOrFunctionData = getData();
+  assert(ErrorOrFunctionData && "function data is not available");
+  ArrayRef<uint8_t> FunctionData = *ErrorOrFunctionData;
+  assert(FunctionData.size() == getMaxSize() &&
+         "function size does not match raw data size");
+
+  uint64_t Size = 0; // instruction size
+  for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) {
+    // Check for data inside code and ignore it
+    if (const size_t DataInCodeSize = getSizeOfDataInCodeAt(Offset)) {
+      Size = DataInCodeSize;
+      continue;
+    }
+
+    const uint64_t AbsoluteInstrAddr = getAddress() + Offset;
+    MCInst Instruction;
+    if (!BC.DisAsm->getInstruction(Instruction,
+                                   Size,
+                                   FunctionData.slice(Offset),
+                                   AbsoluteInstrAddr,
+                                   nulls())) {
+      if (opts::Verbosity >= 1 && !isZeroPaddingAt(Offset)) {
+        errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x"
+               << Twine::utohexstr(Offset) << " (address 0x"
+               << Twine::utohexstr(AbsoluteInstrAddr) << ") in function "
+               << *this << '\n';
+      }
+      Success = false;
+      DisassemblyFailed = true;
+      break;
+    }
+
+    // Return true if we can skip handling the Target function reference.
+    auto ignoreFunctionRef = [&](const BinaryFunction &Target) {
+      if (&Target == this)
+        return true;
+
+      // Note that later we may decide not to emit Target function. In that
+      // case, we conservatively create references that will be ignored or
+      // resolved to the same function.
+      if (!BC.shouldEmit(Target))
+        return true;
+
+      return false;
+    };
+
+    // Return true if we can ignore reference to the symbol.
+    auto ignoreReference = [&](const MCSymbol *TargetSymbol) {
+      if (!TargetSymbol)
+        return true;
+
+      if (BC.forceSymbolRelocations(TargetSymbol->getName()))
+        return false;
+
+      BinaryFunction *TargetFunction = BC.getFunctionForSymbol(TargetSymbol);
+      if (!TargetFunction)
+        return true;
+
+      return ignoreFunctionRef(*TargetFunction);
+    };
+
+    // Detect if the instruction references an address.
+    // Without relocations, we can only trust PC-relative address modes.
+    uint64_t TargetAddress = 0;
+    bool IsPCRel = false;
+    bool IsBranch = false;
+    if (BC.MIB->hasPCRelOperand(Instruction)) {
+      if (BC.MIB->evaluateMemOperandTarget(Instruction, TargetAddress,
+                                           AbsoluteInstrAddr, Size)) {
+        IsPCRel = true;
+      }
+    } else if (BC.MIB->isCall(Instruction) || BC.MIB->isBranch(Instruction)) {
+      if (BC.MIB->evaluateBranch(Instruction, AbsoluteInstrAddr, Size,
+                                 TargetAddress)) {
+        IsBranch = true;
+      }
+    }
+
+    MCSymbol *TargetSymbol = nullptr;
+
+    // Create an entry point at reference address if needed.
+    BinaryFunction *TargetFunction =
+        BC.getBinaryFunctionContainingAddress(TargetAddress);
+    if (TargetFunction && !ignoreFunctionRef(*TargetFunction)) {
+      const uint64_t FunctionOffset =
+          TargetAddress - TargetFunction->getAddress();
+      TargetSymbol = FunctionOffset
+                         ? TargetFunction->addEntryPointAtOffset(FunctionOffset)
+                         : TargetFunction->getSymbol();
+    }
+
+    // Can't find more references and not creating relocations.
+    if (!BC.HasRelocations)
+      continue;
+
+    // Create a relocation against the TargetSymbol as the symbol might get
+    // moved.
+    if (TargetSymbol) {
+      if (IsBranch) {
+        BC.MIB->replaceBranchTarget(Instruction, TargetSymbol,
+                                    Emitter.LocalCtx.get());
+      } else if (IsPCRel) {
+        const MCExpr *Expr = MCSymbolRefExpr::create(TargetSymbol,
+                                                     MCSymbolRefExpr::VK_None,
+                                                     *Emitter.LocalCtx.get());
+        BC.MIB->replaceMemOperandDisp(
+            Instruction, MCOperand::createExpr(BC.MIB->getTargetExprFor(
+                             Instruction,
+                             Expr,
+                             *Emitter.LocalCtx.get(), 0)));
+      }
+    }
+
+    // Create more relocations based on input file relocations.
+    bool HasRel = false;
+    for (auto Itr = Relocations.lower_bound(Offset),
+              ItrE = Relocations.lower_bound(Offset + Size);
+         Itr != ItrE; ++Itr) {
+      Relocation &Relocation = Itr->second;
+      if (ignoreReference(Relocation.Symbol))
+        continue;
+
+      int64_t Value = Relocation.Value;
+      const bool Result =
+          BC.MIB->replaceImmWithSymbolRef(Instruction,
+                                          Relocation.Symbol,
+                                          Relocation.Addend,
+                                          Emitter.LocalCtx.get(),
+                                          Value,
+                                          Relocation.Type);
+      (void)Result;
+      assert(Result && "cannot replace immediate with relocation");
+
+      HasRel = true;
+    }
+
+    if (!TargetSymbol && !HasRel)
+      continue;
+
+    // Emit the instruction using temp emitter and generate relocations.
+    SmallString<256> Code;
+    SmallVector<MCFixup, 4> Fixups;
+    raw_svector_ostream VecOS(Code);
+    Emitter.MCE->encodeInstruction(Instruction, VecOS, Fixups, *BC.STI);
+
+    // Create relocation for every fixup.
+    for (const MCFixup &Fixup : Fixups) {
+      Optional<Relocation> Rel = BC.MIB->createRelocation(Fixup, *BC.MAB);
+      if (!Rel) {
+        Success = false;
+        continue;
+      }
+
+      if (Relocation::getSizeForType(Rel->Type) < 4) {
+        // If the instruction uses a short form, then we might not be able
+        // to handle the rewrite without relaxation, and hence cannot reliably
+        // create an external reference relocation.
+        Success = false;
+        continue;
+      }
+      Rel->Offset += getAddress() - getOriginSection()->getAddress() + Offset;
+      FunctionRelocations.push_back(*Rel);
+    }
+
+    if (!Success)
+      break;
+  }
+
+  // Add relocations unless disassembly failed for this function.
+  if (!DisassemblyFailed) {
+    for (Relocation &Rel : FunctionRelocations) {
+      getOriginSection()->addPendingRelocation(Rel);
+    }
+  }
+
+  // Inform BinaryContext that this function's symbols will not be defined
+  // and relocations should not be created against them.
+  if (BC.HasRelocations) {
+    for (std::pair<const uint32_t, MCSymbol *> &LI : Labels) {
+      BC.UndefinedSymbols.insert(LI.second);
+    }
+    if (FunctionEndLabel) {
+      BC.UndefinedSymbols.insert(FunctionEndLabel);
+    }
+  }
+
+  clearList(Relocations);
+  clearList(ExternallyReferencedOffsets);
+
+  if (Success && BC.HasRelocations) {
+    HasExternalRefRelocations = true;
+  }
+
+  if (opts::Verbosity >= 1 && !Success) {
+    outs() << "BOLT-INFO: failed to scan refs for " << *this << '\n';
+  }
+
+  return Success;
+}
+
+void BinaryFunction::postProcessEntryPoints() {
+  if (!isSimple())
+    return;
+
+  for (auto &KV : Labels) {
+    MCSymbol *Label = KV.second;
+    if (!getSecondaryEntryPointSymbol(Label))
+      continue;
+
+    // In non-relocation mode there's potentially an external undetectable
+    // reference to the entry point and hence we cannot move this entry
+    // point. Optimizing without moving could be difficult.
+    if (!BC.HasRelocations)
+      setSimple(false);
+
+    const uint32_t Offset = KV.first;
+
+    // If we are at Offset 0 and there is no instruction associated with it,
+    // this means this is an empty function. Just ignore. If we find an
+    // instruction at this offset, this entry point is valid.
+    if (!Offset || getInstructionAtOffset(Offset)) {
+      continue;
+    }
+
+    // On AArch64 there are legitimate reasons to have references past the
+    // end of the function, e.g. jump tables.
+    if (BC.isAArch64() && Offset == getSize()) {
+      continue;
+    }
+
+    errs() << "BOLT-WARNING: reference in the middle of instruction "
+              "detected in function " << *this
+           << " at offset 0x" << Twine::utohexstr(Offset) << '\n';
+    if (BC.HasRelocations) {
+      setIgnored();
+    }
+    setSimple(false);
+    return;
+  }
+}
+
+void BinaryFunction::postProcessJumpTables() {
+  // Create labels for all entries.
+  for (auto &JTI : JumpTables) {
+    JumpTable &JT = *JTI.second;
+    if (JT.Type == JumpTable::JTT_PIC && opts::JumpTables == JTS_BASIC) {
+      opts::JumpTables = JTS_MOVE;
+      outs() << "BOLT-INFO: forcing -jump-tables=move as PIC jump table was "
+                "detected in function " << *this << '\n';
+    }
+    for (unsigned I = 0; I < JT.OffsetEntries.size(); ++I) {
+      MCSymbol *Label =
+          getOrCreateLocalLabel(getAddress() + JT.OffsetEntries[I],
+                                /*CreatePastEnd*/ true);
+      JT.Entries.push_back(Label);
+    }
+
+    const uint64_t BDSize =
+        BC.getBinaryDataAtAddress(JT.getAddress())->getSize();
+    if (!BDSize) {
+      BC.setBinaryDataSize(JT.getAddress(), JT.getSize());
+    } else {
+      assert(BDSize >= JT.getSize() &&
+             "jump table cannot be larger than the containing object");
+    }
+  }
+
+  // Add TakenBranches from JumpTables.
+  //
+  // We want to do it after initial processing since we don't know jump tables'
+  // boundaries until we process them all.
+  for (auto &JTSite : JTSites) {
+    const uint64_t JTSiteOffset = JTSite.first;
+    const uint64_t JTAddress = JTSite.second;
+    const JumpTable *JT = getJumpTableContainingAddress(JTAddress);
+    assert(JT && "cannot find jump table for address");
+
+    uint64_t EntryOffset = JTAddress - JT->getAddress();
+    while (EntryOffset < JT->getSize()) {
+      uint64_t TargetOffset = JT->OffsetEntries[EntryOffset / JT->EntrySize];
+      if (TargetOffset < getSize()) {
+        TakenBranches.emplace_back(JTSiteOffset, TargetOffset);
+
+        if (opts::StrictMode)
+          registerReferencedOffset(TargetOffset);
+      }
+
+      EntryOffset += JT->EntrySize;
+
+      // A label at the next entry means the end of this jump table.
+      if (JT->Labels.count(EntryOffset))
+        break;
+    }
+  }
+  clearList(JTSites);
+
+  // Free memory used by jump table offsets.
+  for (auto &JTI : JumpTables) {
+    JumpTable &JT = *JTI.second;
+    clearList(JT.OffsetEntries);
+  }
+
+  // Conservatively populate all possible destinations for unknown indirect
+  // branches.
+  if (opts::StrictMode && hasInternalReference()) {
+    for (uint64_t Offset : UnknownIndirectBranchOffsets) {
+      for (uint64_t PossibleDestination : ExternallyReferencedOffsets) {
+        // Ignore __builtin_unreachable().
+        if (PossibleDestination == getSize())
+          continue;
+        TakenBranches.emplace_back(Offset, PossibleDestination);
+      }
+    }
+  }
+
+  // Remove duplicate branches. We can get a bunch of them from jump tables.
+  // Without doing jump table value profiling we don't have a use for extra
+  // (duplicate) branches.
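+  // For example, a 16-entry jump table whose entries all point at one of two
+  // targets contributes 16 TakenBranches, but only two distinct
+  // (source, destination) pairs survive the sort/unique pass below.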
+  std::sort(TakenBranches.begin(), TakenBranches.end());
+  auto NewEnd = std::unique(TakenBranches.begin(), TakenBranches.end());
+  TakenBranches.erase(NewEnd, TakenBranches.end());
+}
+
+bool BinaryFunction::postProcessIndirectBranches(
+    MCPlusBuilder::AllocatorIdTy AllocId) {
+  auto addUnknownControlFlow = [&](BinaryBasicBlock &BB) {
+    HasUnknownControlFlow = true;
+    BB.removeAllSuccessors();
+    for (uint64_t PossibleDestination : ExternallyReferencedOffsets) {
+      if (BinaryBasicBlock *SuccBB = getBasicBlockAtOffset(PossibleDestination))
+        BB.addSuccessor(SuccBB);
+    }
+  };
+
+  uint64_t NumIndirectJumps = 0;
+  MCInst *LastIndirectJump = nullptr;
+  BinaryBasicBlock *LastIndirectJumpBB = nullptr;
+  uint64_t LastJT = 0;
+  uint16_t LastJTIndexReg = BC.MIB->getNoRegister();
+  for (BinaryBasicBlock *BB : layout()) {
+    for (MCInst &Instr : *BB) {
+      if (!BC.MIB->isIndirectBranch(Instr))
+        continue;
+
+      // If there's an indirect branch in a single-block function -
+      // it must be a tail call.
+      if (layout_size() == 1) {
+        BC.MIB->convertJmpToTailCall(Instr);
+        return true;
+      }
+
+      ++NumIndirectJumps;
+
+      if (opts::StrictMode && !hasInternalReference()) {
+        BC.MIB->convertJmpToTailCall(Instr);
+        break;
+      }
+
+      // Validate the tail call or jump table assumptions now that we know
+      // basic block boundaries.
+      if (BC.MIB->isTailCall(Instr) || BC.MIB->getJumpTable(Instr)) {
+        const unsigned PtrSize = BC.AsmInfo->getCodePointerSize();
+        MCInst *MemLocInstr;
+        unsigned BaseRegNum, IndexRegNum;
+        int64_t DispValue;
+        const MCExpr *DispExpr;
+        MCInst *PCRelBaseInstr;
+        IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
+            Instr, BB->begin(), BB->end(), PtrSize, MemLocInstr, BaseRegNum,
+            IndexRegNum, DispValue, DispExpr, PCRelBaseInstr);
+        if (Type != IndirectBranchType::UNKNOWN || MemLocInstr != nullptr)
+          continue;
+
+        if (!opts::StrictMode)
+          return false;
+
+        if (BC.MIB->isTailCall(Instr)) {
+          BC.MIB->convertTailCallToJmp(Instr);
+        } else {
+          LastIndirectJump = &Instr;
+          LastIndirectJumpBB = BB;
+          LastJT = BC.MIB->getJumpTable(Instr);
+          LastJTIndexReg = BC.MIB->getJumpTableIndexReg(Instr);
+          BC.MIB->unsetJumpTable(Instr);
+
+          JumpTable *JT = BC.getJumpTableContainingAddress(LastJT);
+          if (JT->Type == JumpTable::JTT_NORMAL) {
+            // Invalidating the jump table may also invalidate other jump table
+            // boundaries. Until we have/need a support for this, mark the
+            // function as non-simple.
+            LLVM_DEBUG(dbgs() << "BOLT-DEBUG: rejected jump table reference "
+                              << JT->getName() << " in " << *this << '\n');
+            return false;
+          }
+        }
+
+        addUnknownControlFlow(*BB);
+        continue;
+      }
+
+      // If this block contains an epilogue code and has an indirect branch,
+      // then most likely it's a tail call. Otherwise, we cannot tell for sure
+      // what it is and conservatively reject the function's CFG.
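+      // Illustrative x86 shape of the case handled below: an indirect jump
+      // preceded by stack-frame teardown, e.g.
+      //   leave            ; or: popq %rbp
+      //   jmpq *%rax       ; treated as a tail call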
+      bool IsEpilogue = false;
+      for (const MCInst &Instr : *BB) {
+        if (BC.MIB->isLeave(Instr) || BC.MIB->isPop(Instr)) {
+          IsEpilogue = true;
+          break;
+        }
+      }
+      if (IsEpilogue) {
+        BC.MIB->convertJmpToTailCall(Instr);
+        BB->removeAllSuccessors();
+        continue;
+      }
+
+      if (opts::Verbosity >= 2) {
+        outs() << "BOLT-INFO: rejected potential indirect tail call in "
+               << "function " << *this << " in basic block "
+               << BB->getName() << ".\n";
+        LLVM_DEBUG(BC.printInstructions(dbgs(), BB->begin(), BB->end(),
+                                        BB->getOffset(), this, true));
+      }
+
+      if (!opts::StrictMode)
+        return false;
+
+      addUnknownControlFlow(*BB);
+    }
+  }
+
+  if (HasInternalLabelReference)
+    return false;
+
+  // If there's only one jump table, and one indirect jump, and no other
+  // references, then we should be able to derive the jump table even if we
+  // fail to match the pattern.
+  if (HasUnknownControlFlow && NumIndirectJumps == 1 &&
+      JumpTables.size() == 1 && LastIndirectJump) {
+    BC.MIB->setJumpTable(*LastIndirectJump, LastJT, LastJTIndexReg, AllocId);
+    HasUnknownControlFlow = false;
+
+    // Re-populate successors based on the jump table.
+    std::set<const MCSymbol *> JTLabels;
+    LastIndirectJumpBB->removeAllSuccessors();
+    const JumpTable *JT = getJumpTableContainingAddress(LastJT);
+    for (const MCSymbol *Label : JT->Entries) {
+      JTLabels.emplace(Label);
+    }
+    for (const MCSymbol *Label : JTLabels) {
+      BinaryBasicBlock *BB = getBasicBlockForLabel(Label);
+      // Ignore __builtin_unreachable().
+      if (!BB) {
+        assert(Label == getFunctionEndLabel() && "if no BB found, must be end");
+        continue;
+      }
+      LastIndirectJumpBB->addSuccessor(BB);
+    }
+  }
+
+  if (HasFixedIndirectBranch)
+    return false;
+
+  if (HasUnknownControlFlow && !BC.HasRelocations)
+    return false;
+
+  return true;
+}
+
+void BinaryFunction::recomputeLandingPads() {
+  updateBBIndices(0);
+
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    BB->LandingPads.clear();
+    BB->Throwers.clear();
+  }
+
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    std::unordered_set<const BinaryBasicBlock *> BBLandingPads;
+    for (MCInst &Instr : *BB) {
+      if (!BC.MIB->isInvoke(Instr))
+        continue;
+
+      const Optional<MCPlus::MCLandingPad> EHInfo = BC.MIB->getEHInfo(Instr);
+      if (!EHInfo || !EHInfo->first)
+        continue;
+
+      BinaryBasicBlock *LPBlock = getBasicBlockForLabel(EHInfo->first);
+      if (!BBLandingPads.count(LPBlock)) {
+        BBLandingPads.insert(LPBlock);
+        BB->LandingPads.emplace_back(LPBlock);
+        LPBlock->Throwers.emplace_back(BB);
+      }
+    }
+  }
+}
+
+bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
+  auto &MIB = BC.MIB;
+
+  if (!isSimple()) {
+    assert(!BC.HasRelocations &&
+           "cannot process file with non-simple function in relocs mode");
+    return false;
+  }
+
+  if (CurrentState != State::Disassembled)
+    return false;
+
+  assert(BasicBlocks.empty() && "basic block list should be empty");
+  assert((Labels.find(0) != Labels.end()) &&
+         "first instruction should always have a label");
+
+  // Create basic blocks in the original layout order:
+  //
+  //  * Every instruction with associated label marks
+  //    the beginning of a basic block.
+  //  * Conditional instruction marks the end of a basic block,
+  //    except when the following instruction is an
+  //    unconditional branch, and the unconditional branch is not
+  //    a destination of another branch. In the latter case, the
+  //    basic block will consist of a single unconditional branch
+  //    (missed "double-jump" optimization).
+  //
+  // Created basic blocks are sorted in layout order since they are
+  // created in the same order as instructions, and instructions are
+  // sorted by offsets.
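+  // Illustrative x86 instance of the exception above: the `jmp` carries no
+  // label, so it stays in the same basic block as the preceding conditional
+  // branch instead of opening a new one.
+  //   jne  .Ltarget        ; conditional branch
+  //   jmp  .Lother         ; not a branch destination itself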
+  BinaryBasicBlock *InsertBB = nullptr;
+  BinaryBasicBlock *PrevBB = nullptr;
+  bool IsLastInstrNop = false;
+  uint64_t LastInstrOffset = 0;
+
+  auto addCFIPlaceholders =
+      [this](uint64_t CFIOffset, BinaryBasicBlock *InsertBB) {
+        for (auto FI = OffsetToCFI.lower_bound(CFIOffset),
+                  FE = OffsetToCFI.upper_bound(CFIOffset);
+             FI != FE; ++FI) {
+          addCFIPseudo(InsertBB, InsertBB->end(), FI->second);
+        }
+      };
+
+  // For profiling purposes we need to save the offset of the last instruction
+  // in the basic block. But in certain cases we don't know whether an
+  // instruction is the last one in the block until later, and we have to go
+  // back and update its offset.
+  auto updateOffset = [&](uint64_t Offset) {
+    assert(PrevBB && PrevBB != InsertBB && "invalid previous block");
+    MCInst *PrevInstr = PrevBB->getLastNonPseudoInstr();
+    if (PrevInstr && !MIB->hasAnnotation(*PrevInstr, "Offset"))
+      MIB->addAnnotation(*PrevInstr, "Offset", static_cast<uint32_t>(Offset),
+                         AllocatorId);
+  };
+
+  for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) {
+    const uint32_t Offset = I->first;
+    MCInst &Instr = I->second;
+
+    auto LI = Labels.find(Offset);
+    if (LI != Labels.end()) {
+      // Always create new BB at branch destination.
+      PrevBB = InsertBB;
+      InsertBB = addBasicBlock(LI->first, LI->second,
+                               opts::PreserveBlocksAlignment && IsLastInstrNop);
+      if (PrevBB)
+        updateOffset(LastInstrOffset);
+    }
+
+    const uint64_t InstrInputAddr = I->first + Address;
+    bool IsSDTMarker =
+        MIB->isNoop(Instr) && BC.SDTMarkers.count(InstrInputAddr);
+    bool IsLKMarker = BC.LKMarkers.count(InstrInputAddr);
+    if (IsSDTMarker || IsLKMarker) {
+      HasSDTMarker = true;
+      LLVM_DEBUG(dbgs() << "SDTMarker or LKMarker detected in the input at : "
+                        << utohexstr(InstrInputAddr) << "\n");
+      if (!MIB->hasAnnotation(Instr, "Offset")) {
+        MIB->addAnnotation(Instr, "Offset", static_cast<uint32_t>(Offset),
+                           AllocatorId);
+      }
+    }
+
+    // Ignore nops except SDT markers. We use nops to derive alignment of the
+    // next basic block. It will not always work, as some blocks are naturally
+    // aligned, but it's just part of heuristic for block alignment.
+    if (MIB->isNoop(Instr) && !PreserveNops && !IsSDTMarker && !IsLKMarker) {
+      IsLastInstrNop = true;
+      continue;
+    }
+    if (!InsertBB) {
+      // It must be a fallthrough or unreachable code. Create a new block unless
+      // we see an unconditional branch following a conditional one. The latter
+      // should not be a conditional tail call.
+      assert(PrevBB && "no previous basic block for a fall through");
+      MCInst *PrevInstr = PrevBB->getLastNonPseudoInstr();
+      assert(PrevInstr && "no previous instruction for a fall through");
+      if (MIB->isUnconditionalBranch(Instr) &&
+          !MIB->isUnconditionalBranch(*PrevInstr) &&
+          !MIB->getConditionalTailCall(*PrevInstr)) {
+        // Temporarily restore inserter basic block.
+        InsertBB = PrevBB;
+      } else {
+        MCSymbol *Label;
+        {
+          auto L = BC.scopeLock();
+          Label = BC.Ctx->createNamedTempSymbol("FT");
+        }
+        InsertBB = addBasicBlock(
+            Offset, Label, opts::PreserveBlocksAlignment && IsLastInstrNop);
+        updateOffset(LastInstrOffset);
+      }
+    }
+    if (Offset == 0) {
+      // Add associated CFI pseudos in the first offset (0).
+      addCFIPlaceholders(0, InsertBB);
+    }
+
+    const bool IsBlockEnd = MIB->isTerminator(Instr);
+    IsLastInstrNop = MIB->isNoop(Instr);
+    LastInstrOffset = Offset;
+    InsertBB->addInstruction(std::move(Instr));
+
+    // Add associated CFI instrs. We always add the CFI instruction that is
+    // located immediately after this instruction, since the next CFI
+    // instruction reflects the change in state caused by this instruction.
+    auto NextInstr = std::next(I);
+    uint64_t CFIOffset;
+    if (NextInstr != E)
+      CFIOffset = NextInstr->first;
+    else
+      CFIOffset = getSize();
+
+    // Note: this potentially invalidates instruction pointers/iterators.
+    addCFIPlaceholders(CFIOffset, InsertBB);
+
+    if (IsBlockEnd) {
+      PrevBB = InsertBB;
+      InsertBB = nullptr;
+    }
+  }
+
+  if (BasicBlocks.empty()) {
+    setSimple(false);
+    return false;
+  }
+
+  // Intermediate dump.
+  LLVM_DEBUG(print(dbgs(), "after creating basic blocks"));
+
+  // TODO: handle properly calls to no-return functions,
+  // e.g. exit(3), etc. Otherwise we'll see false fall-through
+  // blocks.
+
+  for (std::pair<uint32_t, uint32_t> &Branch : TakenBranches) {
+    LLVM_DEBUG(dbgs() << "registering branch [0x"
+                      << Twine::utohexstr(Branch.first) << "] -> [0x"
+                      << Twine::utohexstr(Branch.second) << "]\n");
+    BinaryBasicBlock *FromBB = getBasicBlockContainingOffset(Branch.first);
+    BinaryBasicBlock *ToBB = getBasicBlockAtOffset(Branch.second);
+    if (!FromBB || !ToBB) {
+      if (!FromBB)
+        errs() << "BOLT-ERROR: cannot find BB containing the branch.\n";
+      if (!ToBB)
+        errs() << "BOLT-ERROR: cannot find BB containing branch destination.\n";
+      BC.exitWithBugReport("disassembly failed - inconsistent branch found.",
+                           *this);
+    }
+
+    FromBB->addSuccessor(ToBB);
+  }
+
+  // Add fall-through branches.
+  PrevBB = nullptr;
+  bool IsPrevFT = false; // Is previous block a fall-through.
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    if (IsPrevFT) {
+      PrevBB->addSuccessor(BB);
+    }
+    if (BB->empty()) {
+      IsPrevFT = true;
+      PrevBB = BB;
+      continue;
+    }
+
+    MCInst *LastInstr = BB->getLastNonPseudoInstr();
+    assert(LastInstr &&
+           "should have non-pseudo instruction in non-empty block");
+
+    if (BB->succ_size() == 0) {
+      // Since there are no existing successors, we know the last instruction
+      // is not a conditional branch. Thus if it's a terminator, it shouldn't
+      // be a fall-through.
+      //
+      // Conditional tail call is a special case since we don't add a taken
+      // branch successor for it.
+      IsPrevFT = !MIB->isTerminator(*LastInstr) ||
+                 MIB->getConditionalTailCall(*LastInstr);
+    } else if (BB->succ_size() == 1) {
+      IsPrevFT = MIB->isConditionalBranch(*LastInstr);
+    } else {
+      IsPrevFT = false;
+    }
+
+    PrevBB = BB;
+  }
+
+  if (!IsPrevFT) {
+    // Possibly a call that does not return.
+    LLVM_DEBUG(dbgs() << "last block was marked as a fall-through in " << *this
+                      << '\n');
+  }
+
+  // Assign landing pads and throwers info.
+  recomputeLandingPads();
+
+  // Assign CFI information to each BB entry.
+  annotateCFIState();
+
+  // Annotate invoke instructions with GNU_args_size data.
+  propagateGnuArgsSizeInfo(AllocatorId);
+
+  // Set the basic block layout to the original order and set end offsets.
+  PrevBB = nullptr;
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    BasicBlocksLayout.emplace_back(BB);
+    if (PrevBB)
+      PrevBB->setEndOffset(BB->getOffset());
+    PrevBB = BB;
+  }
+  PrevBB->setEndOffset(getSize());
+
+  updateLayoutIndices();
+
+  normalizeCFIState();
+
+  // Clean-up memory taken by intermediate structures.
+  //
+  // NB: don't clear Labels list as we may need them if we mark the function
+  //     as non-simple later in the process of discovering extra entry points.
+  clearList(Instructions);
+  clearList(OffsetToCFI);
+  clearList(TakenBranches);
+
+  // Update the state.
+  CurrentState = State::CFG;
+
+  // Make any necessary adjustments for indirect branches.
+  if (!postProcessIndirectBranches(AllocatorId)) {
+    if (opts::Verbosity) {
+      errs() << "BOLT-WARNING: failed to post-process indirect branches for "
+             << *this << '\n';
+    }
+    // In relocation mode we want to keep processing the function but avoid
+    // optimizing it.
+    setSimple(false);
+  }
+
+  clearList(ExternallyReferencedOffsets);
+  clearList(UnknownIndirectBranchOffsets);
+
+  return true;
+}
+
+void BinaryFunction::postProcessCFG() {
+  if (isSimple() && !BasicBlocks.empty()) {
+    // Convert conditional tail call branches to conditional branches that jump
+    // to a tail call.
+    removeConditionalTailCalls();
+
+    postProcessProfile();
+
+    // Eliminate inconsistencies between branch instructions and CFG.
+    postProcessBranches();
+  }
+
+  calculateMacroOpFusionStats();
+
+  // The final cleanup of intermediate structures.
+  clearList(IgnoredBranches);
+
+  // Remove "Offset" annotations, unless we need an address-translation table
+  // later. This has no cost, since annotations are allocated by a bumpptr
+  // allocator and won't be released anyway until late in the pipeline.
+  if (!requiresAddressTranslation() && !opts::Instrument)
+    for (BinaryBasicBlock *BB : layout())
+      for (MCInst &Inst : *BB)
+        BC.MIB->removeAnnotation(Inst, "Offset");
+
+  assert((!isSimple() || validateCFG()) &&
+         "invalid CFG detected after post-processing");
+}
+
+void BinaryFunction::calculateMacroOpFusionStats() {
+  if (!getBinaryContext().isX86())
+    return;
+  for (BinaryBasicBlock *BB : layout()) {
+    auto II = BB->getMacroOpFusionPair();
+    if (II == BB->end())
+      continue;
+
+    // Check offset of the second instruction.
+    // FIXME: arch-specific.
+    const uint32_t Offset =
+        BC.MIB->getAnnotationWithDefault<uint32_t>(*std::next(II), "Offset", 0);
+    if (!Offset || (getAddress() + Offset) % 64)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "\nmissed macro-op fusion at address 0x"
+                      << Twine::utohexstr(getAddress() + Offset)
+                      << " in function " << *this << "; executed "
+                      << BB->getKnownExecutionCount() << " times.\n");
+    ++BC.MissedMacroFusionPairs;
+    BC.MissedMacroFusionExecCount += BB->getKnownExecutionCount();
+  }
+}
+
+void BinaryFunction::removeTagsFromProfile() {
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    if (BB->ExecutionCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      BB->ExecutionCount = 0;
+    for (BinaryBasicBlock::BinaryBranchInfo &BI : BB->branch_info()) {
+      if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+          BI.MispredictedCount != BinaryBasicBlock::COUNT_NO_PROFILE)
+        continue;
+      BI.Count = 0;
+      BI.MispredictedCount = 0;
+    }
+  }
+}
+
+void BinaryFunction::removeConditionalTailCalls() {
+  // Blocks to be appended at the end.
+  std::vector<std::unique_ptr<BinaryBasicBlock>> NewBlocks;
+
+  for (auto BBI = begin(); BBI != end(); ++BBI) {
+    BinaryBasicBlock &BB = *BBI;
+    MCInst *CTCInstr = BB.getLastNonPseudoInstr();
+    if (!CTCInstr)
+      continue;
+
+    Optional<uint64_t> TargetAddressOrNone =
+        BC.MIB->getConditionalTailCall(*CTCInstr);
+    if (!TargetAddressOrNone)
+      continue;
+
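+    // Illustrative effect of the rewrite performed below (x86; label name
+    // is an example of the "TC" temp symbols created here):
+    //   before:  jne  foo             ; conditional tail call to foo
+    //   after:   jne  .LTC0           ; plain conditional branch
+    //            ...
+    //   .LTC0:   jmp  foo             ; unconditional tail call in a new
+    //                                 ; block appended at the end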
+    // Gather all necessary information about CTC instruction before
+    // annotations are destroyed.
+    const int32_t CFIStateBeforeCTC = BB.getCFIStateAtInstr(CTCInstr);
+    uint64_t CTCTakenCount = BinaryBasicBlock::COUNT_NO_PROFILE;
+    uint64_t CTCMispredCount = BinaryBasicBlock::COUNT_NO_PROFILE;
+    if (hasValidProfile()) {
+      CTCTakenCount =
+          BC.MIB->getAnnotationWithDefault<uint64_t>(*CTCInstr,
+                                                     "CTCTakenCount");
+      CTCMispredCount =
+          BC.MIB->getAnnotationWithDefault<uint64_t>(*CTCInstr,
+                                                     "CTCMispredCount");
+    }
+
+    // Assert that the tail call does not throw.
+    assert(!BC.MIB->getEHInfo(*CTCInstr) &&
+           "found tail call with associated landing pad");
+
+    // Create a basic block with an unconditional tail call instruction using
+    // the same destination.
+    const MCSymbol *CTCTargetLabel = BC.MIB->getTargetSymbol(*CTCInstr);
+    assert(CTCTargetLabel && "symbol expected for conditional tail call");
+    MCInst TailCallInstr;
+    BC.MIB->createTailCall(TailCallInstr, CTCTargetLabel, BC.Ctx.get());
+    // Link new BBs to the original input offset of the BB where the CTC
+    // is, so we can map samples recorded in new BBs back to the original BB
+    // seen in the input binary (if using BAT).
+    std::unique_ptr<BinaryBasicBlock> TailCallBB = createBasicBlock(
+        BB.getInputOffset(), BC.Ctx->createNamedTempSymbol("TC"));
+    TailCallBB->addInstruction(TailCallInstr);
+    TailCallBB->setCFIState(CFIStateBeforeCTC);
+
+    // Add CFG edge with profile info from BB to TailCallBB.
+    BB.addSuccessor(TailCallBB.get(), CTCTakenCount, CTCMispredCount);
+
+    // Add execution count for the block.
+    TailCallBB->setExecutionCount(CTCTakenCount);
+
+    BC.MIB->convertTailCallToJmp(*CTCInstr);
+
+    BC.MIB->replaceBranchTarget(*CTCInstr, TailCallBB->getLabel(),
+                                BC.Ctx.get());
+
+    // Add basic block to the list that will be added to the end.
+    NewBlocks.emplace_back(std::move(TailCallBB));
+
+    // Swap edges as the TailCallBB corresponds to the taken branch.
+    BB.swapConditionalSuccessors();
+
+    // This branch is no longer a conditional tail call.
+    BC.MIB->unsetConditionalTailCall(*CTCInstr);
+  }
+
+  insertBasicBlocks(std::prev(end()),
+                    std::move(NewBlocks),
+                    /* UpdateLayout */ true,
+                    /* UpdateCFIState */ false);
+}
+
+uint64_t BinaryFunction::getFunctionScore() const {
+  if (FunctionScore != -1)
+    return FunctionScore;
+
+  if (!isSimple() || !hasValidProfile()) {
+    FunctionScore = 0;
+    return FunctionScore;
+  }
+
+  uint64_t TotalScore = 0ULL;
+  for (BinaryBasicBlock *BB : layout()) {
+    uint64_t BBExecCount = BB->getExecutionCount();
+    if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      continue;
+    TotalScore += BBExecCount;
+  }
+  FunctionScore = TotalScore;
+  return FunctionScore;
+}
+
+void BinaryFunction::annotateCFIState() {
+  assert(CurrentState == State::Disassembled && "unexpected function state");
+  assert(!BasicBlocks.empty() && "basic block list should not be empty");
+
+  // This is an index of the last processed CFI in FDE CFI program.
+  uint32_t State = 0;
+
+  // This is an index of RememberState CFI reflecting effective state right
+  // after execution of RestoreState CFI.
+  //
+  // It differs from State iff the CFI at (State-1)
+  // was RestoreState (modulo GNU_args_size CFIs, which are ignored).
+  //
+  // This allows us to generate shorter replay sequences when producing new
+  // CFI programs.
+  uint32_t EffectiveState = 0;
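+  // Illustrative trace of the loop below over a small FDE CFI program:
+  //   1: def_cfa_offset 16   -> State=1, EffectiveState=1
+  //   2: remember_state      -> State=2, push(1), EffectiveState=2
+  //   3: def_cfa_offset 32   -> State=3, EffectiveState=3
+  //   4: restore_state       -> State=4, EffectiveState=pop()=1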
+  // For tracking RememberState/RestoreState sequences.
+  std::stack<uint32_t> StateStack;
+
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    BB->setCFIState(EffectiveState);
+
+    for (const MCInst &Instr : *BB) {
+      const MCCFIInstruction *CFI = getCFIFor(Instr);
+      if (!CFI)
+        continue;
+
+      ++State;
+
+      switch (CFI->getOperation()) {
+      case MCCFIInstruction::OpRememberState:
+        StateStack.push(EffectiveState);
+        EffectiveState = State;
+        break;
+      case MCCFIInstruction::OpRestoreState:
+        assert(!StateStack.empty() && "corrupt CFI stack");
+        EffectiveState = StateStack.top();
+        StateStack.pop();
+        break;
+      case MCCFIInstruction::OpGnuArgsSize:
+        // OpGnuArgsSize CFIs do not affect the CFI state.
+        break;
+      default:
+        // Any other CFI updates the state.
+        EffectiveState = State;
+        break;
+      }
+    }
+  }
+
+  assert(StateStack.empty() && "corrupt CFI stack");
+}
+
+namespace {
+
+/// Our full interpretation of a DWARF CFI machine state at a given point
+struct CFISnapshot {
+  /// CFA register number and offset defining the canonical frame at this
+  /// point, or the number of a rule (CFI state) that computes it with a
+  /// DWARF expression. This number will be negative if it refers to a CFI
+  /// located in the CIE instead of the FDE.
+  uint32_t CFAReg;
+  int32_t CFAOffset;
+  int32_t CFARule;
+  /// Mapping of rules (CFI states) that define the location of each
+  /// register. If absent, no rule defining the location of such register
+  /// was ever read. This number will be negative if it refers to a CFI
+  /// located in the CIE instead of the FDE.
+  DenseMap<int32_t, int32_t> RegRule;
+
+  /// References to CIE, FDE and expanded instructions after a restore state
+  const std::vector<MCCFIInstruction> &CIE;
+  const std::vector<MCCFIInstruction> &FDE;
+  const DenseMap<int32_t, SmallVector<int32_t, 4>> &FrameRestoreEquivalents;
+
+  /// Current FDE CFI number representing the state where the snapshot is at
+  int32_t CurState;
+
+  /// Used when we don't have information about which state/rule to apply
+  /// to recover the location of either the CFA or a specific register
+  constexpr static int32_t UNKNOWN = std::numeric_limits<int32_t>::min();
+
+private:
+  /// Update our snapshot by executing a single CFI
+  void update(const MCCFIInstruction &Instr, int32_t RuleNumber) {
+    switch (Instr.getOperation()) {
+    case MCCFIInstruction::OpSameValue:
+    case MCCFIInstruction::OpRelOffset:
+    case MCCFIInstruction::OpOffset:
+    case MCCFIInstruction::OpRestore:
+    case MCCFIInstruction::OpUndefined:
+    case MCCFIInstruction::OpRegister:
+    case MCCFIInstruction::OpExpression:
+    case MCCFIInstruction::OpValExpression:
+      RegRule[Instr.getRegister()] = RuleNumber;
+      break;
+    case MCCFIInstruction::OpDefCfaRegister:
+      CFAReg = Instr.getRegister();
+      CFARule = UNKNOWN;
+      break;
+    case MCCFIInstruction::OpDefCfaOffset:
+      CFAOffset = Instr.getOffset();
+      CFARule = UNKNOWN;
+      break;
+    case MCCFIInstruction::OpDefCfa:
+      CFAReg = Instr.getRegister();
+      CFAOffset = Instr.getOffset();
+      CFARule = UNKNOWN;
+      break;
+    case MCCFIInstruction::OpDefCfaExpression:
+      CFARule = RuleNumber;
+      break;
+    case MCCFIInstruction::OpAdjustCfaOffset:
+    case MCCFIInstruction::OpWindowSave:
+    case MCCFIInstruction::OpEscape:
+    case MCCFIInstruction::OpNegateRAState:
+      llvm_unreachable("unsupported CFI opcode");
+      break;
+    case MCCFIInstruction::OpRememberState:
+    case MCCFIInstruction::OpRestoreState:
+    case MCCFIInstruction::OpGnuArgsSize:
+      // do not affect CFI state
+      break;
+    }
+  }
+
+public:
+  /// Advance state reading FDE CFI instructions up to State number
+  void advanceTo(int32_t State) {
+    for (int32_t I = CurState, E = State; I != E; ++I) {
+      const MCCFIInstruction &Instr = FDE[I];
+      if (Instr.getOperation() != MCCFIInstruction::OpRestoreState) {
+        update(Instr, I);
+        continue;
+      }
+      // If restore state instruction, fetch the equivalent CFIs that have
+      // the same effect of this restore. This is used to ensure remember-
+      // restore pairs are completely removed.
+      auto Iter = FrameRestoreEquivalents.find(I);
+      if (Iter == FrameRestoreEquivalents.end())
+        continue;
+      for (int32_t RuleNumber : Iter->second) {
+        update(FDE[RuleNumber], RuleNumber);
+      }
+    }
+
+    assert(((CFAReg != (uint32_t)UNKNOWN && CFAOffset != UNKNOWN) ||
+            CFARule != UNKNOWN) &&
+           "CIE did not define default CFA?");
+
+    CurState = State;
+  }
+
+  /// Interpret all CIE and FDE instructions up until CFI State number and
+  /// populate this snapshot
+  CFISnapshot(
+      const std::vector<MCCFIInstruction> &CIE,
+      const std::vector<MCCFIInstruction> &FDE,
+      const DenseMap<int32_t, SmallVector<int32_t, 4>> &FrameRestoreEquivalents,
+      int32_t State)
+      : CIE(CIE), FDE(FDE), FrameRestoreEquivalents(FrameRestoreEquivalents) {
+    CFAReg = UNKNOWN;
+    CFAOffset = UNKNOWN;
+    CFARule = UNKNOWN;
+    CurState = 0;
+
+    for (int32_t I = 0, E = CIE.size(); I != E; ++I) {
+      const MCCFIInstruction &Instr = CIE[I];
+      update(Instr, -I);
+    }
+
+    advanceTo(State);
+  }
+};
+
+/// A CFI snapshot with the capability of checking if incremental additions to
+/// it are redundant. This is used to ensure we do not emit two CFI
+/// instructions back-to-back that are doing the same state change, or to
+/// avoid emitting a CFI at all when the state at that point would not be
+/// modified after that CFI.
+struct CFISnapshotDiff : public CFISnapshot {
+  bool RestoredCFAReg{false};
+  bool RestoredCFAOffset{false};
+  DenseMap<int32_t, bool> RestoredRegs;
+
+  CFISnapshotDiff(const CFISnapshot &S) : CFISnapshot(S) {}
+
+  CFISnapshotDiff(
+      const std::vector<MCCFIInstruction> &CIE,
+      const std::vector<MCCFIInstruction> &FDE,
+      const DenseMap<int32_t, SmallVector<int32_t, 4>> &FrameRestoreEquivalents,
+      int32_t State)
+      : CFISnapshot(CIE, FDE, FrameRestoreEquivalents, State) {}
+
+  /// Return true if applying Instr to this state is redundant and can be
+  /// dismissed.
+  bool isRedundant(const MCCFIInstruction &Instr) {
+    switch (Instr.getOperation()) {
+    case MCCFIInstruction::OpSameValue:
+    case MCCFIInstruction::OpRelOffset:
+    case MCCFIInstruction::OpOffset:
+    case MCCFIInstruction::OpRestore:
+    case MCCFIInstruction::OpUndefined:
+    case MCCFIInstruction::OpRegister:
+    case MCCFIInstruction::OpExpression:
+    case MCCFIInstruction::OpValExpression: {
+      if (RestoredRegs[Instr.getRegister()])
+        return true;
+      RestoredRegs[Instr.getRegister()] = true;
+      const int32_t CurRegRule =
+          RegRule.find(Instr.getRegister()) != RegRule.end()
+              ? RegRule[Instr.getRegister()]
+              : UNKNOWN;
+      if (CurRegRule == UNKNOWN) {
+        if (Instr.getOperation() == MCCFIInstruction::OpRestore ||
+            Instr.getOperation() == MCCFIInstruction::OpSameValue)
+          return true;
+        return false;
+      }
+      const MCCFIInstruction &LastDef =
+/// A CFI snapshot with the capability of checking if incremental additions to
+/// it are redundant. This is used to ensure we do not emit two CFI
+/// instructions back-to-back that are doing the same state change, or to
+/// avoid emitting a CFI at all when the state at that point would not be
+/// modified after that CFI.
+struct CFISnapshotDiff : public CFISnapshot {
+  bool RestoredCFAReg{false};
+  bool RestoredCFAOffset{false};
+  DenseMap<uint32_t, bool> RestoredRegs;
+
+  CFISnapshotDiff(const CFISnapshot &S) : CFISnapshot(S) {}
+
+  CFISnapshotDiff(
+      const std::vector<MCCFIInstruction> &CIE,
+      const std::vector<MCCFIInstruction> &FDE,
+      const DenseMap<int32_t, SmallVector<int32_t, 4>> &FrameRestoreEquivalents,
+      int32_t State)
+      : CFISnapshot(CIE, FDE, FrameRestoreEquivalents, State) {}
+
+  /// Return true if applying Instr to this state is redundant and can be
+  /// dismissed.
+  bool isRedundant(const MCCFIInstruction &Instr) {
+    switch (Instr.getOperation()) {
+    case MCCFIInstruction::OpSameValue:
+    case MCCFIInstruction::OpRelOffset:
+    case MCCFIInstruction::OpOffset:
+    case MCCFIInstruction::OpRestore:
+    case MCCFIInstruction::OpUndefined:
+    case MCCFIInstruction::OpRegister:
+    case MCCFIInstruction::OpExpression:
+    case MCCFIInstruction::OpValExpression: {
+      if (RestoredRegs[Instr.getRegister()])
+        return true;
+      RestoredRegs[Instr.getRegister()] = true;
+      const int32_t CurRegRule =
+          RegRule.find(Instr.getRegister()) != RegRule.end()
+              ? RegRule[Instr.getRegister()]
+              : UNKNOWN;
+      if (CurRegRule == UNKNOWN) {
+        if (Instr.getOperation() == MCCFIInstruction::OpRestore ||
+            Instr.getOperation() == MCCFIInstruction::OpSameValue)
+          return true;
+        return false;
+      }
+      const MCCFIInstruction &LastDef =
+          CurRegRule < 0 ? CIE[-CurRegRule] : FDE[CurRegRule];
+      return LastDef == Instr;
+    }
+    case MCCFIInstruction::OpDefCfaRegister:
+      if (RestoredCFAReg)
+        return true;
+      RestoredCFAReg = true;
+      return CFAReg == Instr.getRegister();
+    case MCCFIInstruction::OpDefCfaOffset:
+      if (RestoredCFAOffset)
+        return true;
+      RestoredCFAOffset = true;
+      return CFAOffset == Instr.getOffset();
+    case MCCFIInstruction::OpDefCfa:
+      if (RestoredCFAReg && RestoredCFAOffset)
+        return true;
+      RestoredCFAReg = true;
+      RestoredCFAOffset = true;
+      return CFAReg == Instr.getRegister() && CFAOffset == Instr.getOffset();
+    case MCCFIInstruction::OpDefCfaExpression:
+      if (RestoredCFAReg && RestoredCFAOffset)
+        return true;
+      RestoredCFAReg = true;
+      RestoredCFAOffset = true;
+      return false;
+    case MCCFIInstruction::OpAdjustCfaOffset:
+    case MCCFIInstruction::OpWindowSave:
+    case MCCFIInstruction::OpEscape:
+    case MCCFIInstruction::OpNegateRAState:
+      llvm_unreachable("unsupported CFI opcode");
+      return false;
+    case MCCFIInstruction::OpRememberState:
+    case MCCFIInstruction::OpRestoreState:
+    case MCCFIInstruction::OpGnuArgsSize:
+      // do not affect CFI state
+      return true;
+    }
+    return false;
+  }
+};
+
+} // end anonymous namespace
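replayCFIInstrs below collects the CFIs to replay and then walks them newest-first, letting isRedundant suppress older updates that a newer one supersedes. A self-contained sketch of that reverse-replay dedup, with plain structs in place of CFI instructions (illustrative names, not from the patch):

```cpp
#include <cassert>
#include <map>
#include <vector>

int main() {
  struct CFI { int Reg, Rule; };
  // Two updates to register 7 (rules 1 then 2) and one to register 8.
  std::vector<CFI> Pending = {{7, 1}, {7, 2}, {8, 3}};

  std::map<int, bool> Restored; // mirrors CFISnapshotDiff::RestoredRegs
  std::vector<CFI> Emitted;
  for (auto I = Pending.rbegin(); I != Pending.rend(); ++I) {
    if (Restored[I->Reg])
      continue; // an older rule for this register is redundant
    Restored[I->Reg] = true;
    Emitted.push_back(*I);
  }
  // Only the newest rule per register survives.
  assert(Emitted.size() == 2);
  assert(Emitted[0].Rule == 3 && Emitted[1].Rule == 2);
}
```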
+bool BinaryFunction::replayCFIInstrs(int32_t FromState, int32_t ToState,
+                                     BinaryBasicBlock *InBB,
+                                     BinaryBasicBlock::iterator InsertIt) {
+  if (FromState == ToState)
+    return true;
+  assert(FromState < ToState && "can only replay CFIs forward");
+
+  CFISnapshotDiff CFIDiff(CIEFrameInstructions, FrameInstructions,
+                          FrameRestoreEquivalents, FromState);
+
+  std::vector<int32_t> NewCFIs;
+  for (int32_t CurState = FromState; CurState < ToState; ++CurState) {
+    MCCFIInstruction *Instr = &FrameInstructions[CurState];
+    if (Instr->getOperation() == MCCFIInstruction::OpRestoreState) {
+      auto Iter = FrameRestoreEquivalents.find(CurState);
+      assert(Iter != FrameRestoreEquivalents.end());
+      NewCFIs.insert(NewCFIs.end(), Iter->second.begin(), Iter->second.end());
+      // RestoreState / Remember will be filtered out later by
+      // CFISnapshotDiff, so we might as well fall-through here.
+    }
+    NewCFIs.push_back(CurState);
+    continue;
+  }
+
+  // Replay instructions while avoiding duplicates
+  for (auto I = NewCFIs.rbegin(), E = NewCFIs.rend(); I != E; ++I) {
+    if (CFIDiff.isRedundant(FrameInstructions[*I]))
+      continue;
+    InsertIt = addCFIPseudo(InBB, InsertIt, *I);
+  }
+
+  return true;
+}
+
+SmallVector<int32_t, 4>
+BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState,
+                               BinaryBasicBlock *InBB,
+                               BinaryBasicBlock::iterator &InsertIt) {
+  SmallVector<int32_t, 4> NewStates;
+
+  CFISnapshot ToCFITable(CIEFrameInstructions, FrameInstructions,
+                         FrameRestoreEquivalents, ToState);
+  CFISnapshotDiff FromCFITable(ToCFITable);
+  FromCFITable.advanceTo(FromState);
+
+  auto undoState = [&](const MCCFIInstruction &Instr) {
+    switch (Instr.getOperation()) {
+    case MCCFIInstruction::OpRememberState:
+    case MCCFIInstruction::OpRestoreState:
+      break;
+    case MCCFIInstruction::OpSameValue:
+    case MCCFIInstruction::OpRelOffset:
+    case MCCFIInstruction::OpOffset:
+    case MCCFIInstruction::OpRestore:
+    case MCCFIInstruction::OpUndefined:
+    case MCCFIInstruction::OpRegister:
+    case MCCFIInstruction::OpExpression:
+    case MCCFIInstruction::OpValExpression: {
+      if (ToCFITable.RegRule.find(Instr.getRegister()) ==
+          ToCFITable.RegRule.end()) {
+        FrameInstructions.emplace_back(
+            MCCFIInstruction::createRestore(nullptr, Instr.getRegister()));
+        if (FromCFITable.isRedundant(FrameInstructions.back())) {
+          FrameInstructions.pop_back();
+          break;
+        }
+        NewStates.push_back(FrameInstructions.size() - 1);
+        InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size() - 1);
+        ++InsertIt;
+        break;
+      }
+      const int32_t Rule = ToCFITable.RegRule[Instr.getRegister()];
+      if (Rule < 0) {
+        if (FromCFITable.isRedundant(CIEFrameInstructions[-Rule]))
+          break;
+        NewStates.push_back(FrameInstructions.size());
+        InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size());
+        ++InsertIt;
+        FrameInstructions.emplace_back(CIEFrameInstructions[-Rule]);
+        break;
+      }
+      if (FromCFITable.isRedundant(FrameInstructions[Rule]))
+        break;
+      NewStates.push_back(Rule);
+      InsertIt = addCFIPseudo(InBB, InsertIt, Rule);
+      ++InsertIt;
+      break;
+    }
+    case MCCFIInstruction::OpDefCfaRegister:
+    case MCCFIInstruction::OpDefCfaOffset:
+    case MCCFIInstruction::OpDefCfa:
+    case MCCFIInstruction::OpDefCfaExpression:
+      if (ToCFITable.CFARule == CFISnapshot::UNKNOWN) {
+        FrameInstructions.emplace_back(MCCFIInstruction::cfiDefCfa(
+            nullptr, ToCFITable.CFAReg, ToCFITable.CFAOffset));
+        if (FromCFITable.isRedundant(FrameInstructions.back())) {
+          FrameInstructions.pop_back();
+          break;
+        }
+        NewStates.push_back(FrameInstructions.size() - 1);
+        InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size() - 1);
+        ++InsertIt;
+      } else if (ToCFITable.CFARule < 0) {
+        if (FromCFITable.isRedundant(
+                CIEFrameInstructions[-ToCFITable.CFARule]))
+          break;
+        NewStates.push_back(FrameInstructions.size());
+        InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size());
+        ++InsertIt;
+        FrameInstructions.emplace_back(
+            CIEFrameInstructions[-ToCFITable.CFARule]);
+      } else if (!FromCFITable.isRedundant(
+                     FrameInstructions[ToCFITable.CFARule])) {
+        NewStates.push_back(ToCFITable.CFARule);
+        InsertIt = addCFIPseudo(InBB, InsertIt, ToCFITable.CFARule);
+        ++InsertIt;
+      }
+      break;
+    case MCCFIInstruction::OpAdjustCfaOffset:
+    case MCCFIInstruction::OpWindowSave:
+    case MCCFIInstruction::OpEscape:
+    case MCCFIInstruction::OpNegateRAState:
+      llvm_unreachable("unsupported CFI opcode");
+      break;
+    case MCCFIInstruction::OpGnuArgsSize:
+      // do not affect CFI state
+      break;
+    }
+  };
+
+  // Undo all modifications from ToState to FromState
+  for (int32_t I = ToState, E = FromState; I != E; ++I) {
+    const MCCFIInstruction &Instr = FrameInstructions[I];
+    if (Instr.getOperation() != MCCFIInstruction::OpRestoreState) {
+      undoState(Instr);
+      continue;
+    }
+    auto Iter = FrameRestoreEquivalents.find(I);
+    if (Iter == FrameRestoreEquivalents.end())
+      continue;
+    for (int32_t State : Iter->second)
+      undoState(FrameInstructions[State]);
+  }
+
+  return NewStates;
+}
+
+void BinaryFunction::normalizeCFIState() {
+  // Reordering blocks with remember-restore state instructions can be
+  // especially tricky. When rewriting the CFI, we omit remember-restore
+  // state instructions entirely. For each restore state, we build a map
+  // expanding the restore to the equivalent unwindCFIState sequence required
+  // at that point to achieve the same effect as the restore. All remember
+  // state instructions are then simply ignored.
+  std::stack<int32_t> Stack;
+  for (BinaryBasicBlock *CurBB : BasicBlocksLayout) {
+    for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
+      if (MCCFIInstruction *CFI = getCFIFor(*II)) {
+        if (CFI->getOperation() == MCCFIInstruction::OpRememberState) {
+          Stack.push(II->getOperand(0).getImm());
+          continue;
+        }
+        if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) {
+          const int32_t RememberState = Stack.top();
+          const int32_t CurState = II->getOperand(0).getImm();
+          FrameRestoreEquivalents[CurState] =
+              unwindCFIState(CurState, RememberState, CurBB, II);
+          Stack.pop();
+        }
+      }
+    }
+  }
+}
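A deliberately crude model of what normalizeCFIState buys: once every restore is expanded into an explicit equivalent sequence, the CFI stream no longer depends on a runtime remember/restore stack, so blocks can be reordered freely. The sketch below models only the bookkeeping (truncation standing in for the compensating CFIs that unwindCFIState emits), not the actual CFI semantics:

```cpp
#include <cassert>
#include <stack>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> In = {"def r1", "remember", "def r2", "restore"};
  std::stack<size_t> Remembered;
  std::vector<std::string> Out;
  for (const std::string &Op : In) {
    if (Op == "remember") {
      Remembered.push(Out.size()); // snapshot of what has been emitted
    } else if (Op == "restore") {
      Out.resize(Remembered.top()); // stand-in for explicit undo CFIs
      Remembered.pop();
    } else {
      Out.push_back(Op);
    }
  }
  // The output stream has no remember/restore left, only plain state.
  assert(Out.size() == 1 && Out[0] == "def r1");
}
```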
+bool BinaryFunction::finalizeCFIState() {
+  LLVM_DEBUG(
+      dbgs() << "Trying to fix CFI states for each BB after reordering.\n");
+  LLVM_DEBUG(dbgs() << "This is the list of CFI states for each BB of "
+                    << *this << ": ");
+
+  int32_t State = 0;
+  bool SeenCold = false;
+  const char *Sep = "";
+  (void)Sep;
+  for (BinaryBasicBlock *BB : BasicBlocksLayout) {
+    const int32_t CFIStateAtExit = BB->getCFIStateAtExit();
+
+    // Hot-cold border: check if this is the first BB to be allocated in a
+    // cold region (with a different FDE). If yes, we need to reset the CFI
+    // state.
+    if (!SeenCold && BB->isCold()) {
+      State = 0;
+      SeenCold = true;
+    }
+
+    // We need to recover the correct state if it doesn't match the expected
+    // state at the BB entry point.
+    if (BB->getCFIState() < State) {
+      // In this case, State is currently higher than what this BB expects it
+      // to be. To solve this, we need to insert CFI instructions to undo
+      // the effect of all CFIs from the BB's state to the current State.
+      auto InsertIt = BB->begin();
+      unwindCFIState(State, BB->getCFIState(), BB, InsertIt);
+    } else if (BB->getCFIState() > State) {
+      // If BB's CFI state is greater than State, it means we are behind in
+      // the state. Just emit all instructions to reach this state at the
+      // beginning of this BB. If this sequence of instructions involves
+      // remember state or restore state, bail out.
+      if (!replayCFIInstrs(State, BB->getCFIState(), BB, BB->begin()))
+        return false;
+    }
+
+    State = CFIStateAtExit;
+    LLVM_DEBUG(dbgs() << Sep << State; Sep = ", ");
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+
+  for (BinaryBasicBlock *BB : BasicBlocksLayout) {
+    for (auto II = BB->begin(); II != BB->end();) {
+      MCCFIInstruction *CFI = getCFIFor(*II);
+      if (CFI && (CFI->getOperation() == MCCFIInstruction::OpRememberState ||
+                  CFI->getOperation() == MCCFIInstruction::OpRestoreState)) {
+        II = BB->eraseInstruction(II);
+      } else {
+        ++II;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool BinaryFunction::requiresAddressTranslation() const {
+  return opts::EnableBAT || hasSDTMarker() || hasPseudoProbe();
+}
+
+uint64_t BinaryFunction::getInstructionCount() const {
+  uint64_t Count = 0;
+  for (BinaryBasicBlock *const &Block : BasicBlocksLayout) {
+    Count += Block->getNumNonPseudos();
+  }
+  return Count;
+}
+
+bool BinaryFunction::hasLayoutChanged() const {
+  return ModifiedLayout;
+}
+
+uint64_t BinaryFunction::getEditDistance() const {
+  return ComputeEditDistance<BinaryBasicBlock *>(BasicBlocksPreviousLayout,
+                                                 BasicBlocksLayout);
+}
+
+void BinaryFunction::clearDisasmState() {
+  clearList(Instructions);
+  clearList(IgnoredBranches);
+  clearList(TakenBranches);
+  clearList(InterproceduralReferences);
+
+  if (BC.HasRelocations) {
+    for (std::pair<const uint32_t, MCSymbol *> &LI : Labels) {
+      BC.UndefinedSymbols.insert(LI.second);
+    }
+    if (FunctionEndLabel) {
+      BC.UndefinedSymbols.insert(FunctionEndLabel);
+    }
+  }
+}
+
+void BinaryFunction::setTrapOnEntry() {
+  clearDisasmState();
+
+  auto addTrapAtOffset = [&](uint64_t Offset) {
+    MCInst TrapInstr;
+    BC.MIB->createTrap(TrapInstr);
+    addInstruction(Offset, std::move(TrapInstr));
+  };
+
+  addTrapAtOffset(0);
+  for (const std::pair<const uint32_t, MCSymbol *> &KV : getLabels()) {
+    if (getSecondaryEntryPointSymbol(KV.second)) {
+      addTrapAtOffset(KV.first);
+    }
+  }
+
+  TrapsOnEntry = true;
+}
+
+void BinaryFunction::setIgnored() {
+  if (opts::processAllFunctions()) {
+    // We can accept ignored functions before they've been disassembled.
+    // In that case, they would still get disassembled and emitted, but not
+    // optimized.
+    assert(CurrentState == State::Empty &&
+           "cannot ignore non-empty functions in current mode");
+    IsIgnored = true;
+    return;
+  }
+
+  clearDisasmState();
+
+  // Clear CFG state too.
+  if (hasCFG()) {
+    releaseCFG();
+
+    for (BinaryBasicBlock *BB : BasicBlocks) {
+      delete BB;
+    }
+    clearList(BasicBlocks);
+
+    for (BinaryBasicBlock *BB : DeletedBasicBlocks) {
+      delete BB;
+    }
+    clearList(DeletedBasicBlocks);
+
+    clearList(BasicBlocksLayout);
+    clearList(BasicBlocksPreviousLayout);
+  }
+
+  CurrentState = State::Empty;
+
+  IsIgnored = true;
+  IsSimple = false;
+  LLVM_DEBUG(dbgs() << "Ignoring " << getPrintName() << '\n');
+}
+
+void BinaryFunction::duplicateConstantIslands() {
+  for (BinaryBasicBlock *BB : layout()) {
+    if (!BB->isCold())
+      continue;
+
+    for (MCInst &Inst : *BB) {
+      int OpNum = 0;
+      for (MCOperand &Operand : Inst) {
+        if (!Operand.isExpr()) {
+          ++OpNum;
+          continue;
+        }
+        const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst, OpNum);
+        // Check if this is an island symbol
+        if (!Islands.Symbols.count(Symbol) &&
+            !Islands.ProxySymbols.count(Symbol))
+          continue;
+
+        // Create a cold symbol, if missing
+        auto ISym = Islands.ColdSymbols.find(Symbol);
+        MCSymbol *ColdSymbol;
+        if (ISym != Islands.ColdSymbols.end()) {
+          ColdSymbol = ISym->second;
+        } else {
+          ColdSymbol = BC.Ctx->getOrCreateSymbol(Symbol->getName() + ".cold");
+          Islands.ColdSymbols[Symbol] = ColdSymbol;
+          // Check if this is a proxy island symbol and update the owner's
+          // proxy map
+          if (Islands.ProxySymbols.count(Symbol)) {
+            BinaryFunction *Owner = Islands.ProxySymbols[Symbol];
+            auto IProxiedSym = Owner->Islands.Proxies[this].find(Symbol);
+            Owner->Islands.ColdProxies[this][IProxiedSym->second] = ColdSymbol;
+          }
+        }
+
+        // Update the instruction reference
+        Operand = MCOperand::createExpr(BC.MIB->getTargetExprFor(
+            Inst,
+            MCSymbolRefExpr::create(ColdSymbol, MCSymbolRefExpr::VK_None,
+                                    *BC.Ctx),
+            *BC.Ctx, 0));
+        ++OpNum;
+      }
+    }
+  }
+}
+
+namespace {
+
+#ifndef MAX_PATH
+#define MAX_PATH 255
+#endif
+
+std::string constructFilename(std::string Filename,
+                              std::string Annotation,
+                              std::string Suffix) {
+  std::replace(Filename.begin(), Filename.end(), '/', '-');
+  if (!Annotation.empty()) {
+    Annotation.insert(0, "-");
+  }
+  if (Filename.size() + Annotation.size() + Suffix.size() > MAX_PATH) {
+    assert(Suffix.size() + Annotation.size() <= MAX_PATH);
+    if (opts::Verbosity >= 1) {
+      errs() << "BOLT-WARNING: Filename \"" << Filename << Annotation << Suffix
+             << "\" exceeds the " << MAX_PATH << " size limit, truncating.\n";
+    }
+    Filename.resize(MAX_PATH - (Suffix.size() + Annotation.size()));
+  }
+  Filename += Annotation;
+  Filename += Suffix;
+  return Filename;
+}
+
+std::string formatEscapes(const std::string &Str) {
+  std::string Result;
+  for (unsigned I = 0; I < Str.size(); ++I) {
+    char C = Str[I];
+    switch (C) {
+    case '\n':
+      Result += "&#13;";
+      break;
+    case '"':
+      break;
+    default:
+      Result += C;
+      break;
+    }
+  }
+  return Result;
+}
+
+}
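The truncation logic in constructFilename always preserves the annotation and suffix and only cuts the base name. The same rule in a standalone, hypothetical helper (MaxLen plays the role of MAX_PATH):

```cpp
#include <algorithm>
#include <cassert>
#include <string>

// Sketch of constructFilename's truncation invariant: annotation and suffix
// are kept whole; only the base name is shortened.
std::string makeName(std::string Base, const std::string &Ann,
                     const std::string &Suffix, size_t MaxLen) {
  std::replace(Base.begin(), Base.end(), '/', '-');
  if (Base.size() + Ann.size() + Suffix.size() > MaxLen)
    Base.resize(MaxLen - (Ann.size() + Suffix.size()));
  return Base + Ann + Suffix;
}

int main() {
  const std::string Name =
      makeName("very/long/function/name", "-pass", ".dot", 16);
  assert(Name.size() <= 16);
  assert(Name.substr(Name.size() - 9) == "-pass.dot"); // tail intact
}
```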
+void BinaryFunction::dumpGraph(raw_ostream &OS) const {
+  OS << "strict digraph \"" << getPrintName() << "\" {\n";
+  uint64_t Offset = Address;
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    auto LayoutPos = std::find(BasicBlocksLayout.begin(),
+                               BasicBlocksLayout.end(), BB);
+    unsigned Layout = LayoutPos - BasicBlocksLayout.begin();
+    const char *ColdStr = BB->isCold() ? " (cold)" : "";
+    OS << format("\"%s\" [label=\"%s%s\\n(C:%lu,O:%lu,I:%u,L:%u:CFI:%u)\"]\n",
+                 BB->getName().data(),
+                 BB->getName().data(),
+                 ColdStr,
+                 (BB->ExecutionCount != BinaryBasicBlock::COUNT_NO_PROFILE
+                      ? BB->ExecutionCount
+                      : 0),
+                 BB->getOffset(),
+                 getIndex(BB),
+                 Layout,
+                 BB->getCFIState());
+    OS << format("\"%s\" [shape=box]\n", BB->getName().data());
+    if (opts::DotToolTipCode) {
+      std::string Str;
+      raw_string_ostream CS(Str);
+      Offset = BC.printInstructions(CS, BB->begin(), BB->end(), Offset, this);
+      const std::string Code = formatEscapes(CS.str());
+      OS << format("\"%s\" [tooltip=\"%s\"]\n",
+                   BB->getName().data(),
+                   Code.c_str());
+    }
+
+    // analyzeBranch is just used to get the names of the branch
+    // opcodes.
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    const bool Success = BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
+
+    const MCInst *LastInstr = BB->getLastNonPseudoInstr();
+    const bool IsJumpTable = LastInstr && BC.MIB->getJumpTable(*LastInstr);
+
+    auto BI = BB->branch_info_begin();
+    for (BinaryBasicBlock *Succ : BB->successors()) {
+      std::string Branch;
+      if (Success) {
+        if (Succ == BB->getConditionalSuccessor(true)) {
+          Branch = CondBranch ? std::string(BC.InstPrinter->getOpcodeName(
+                                    CondBranch->getOpcode()))
+                              : "TB";
+        } else if (Succ == BB->getConditionalSuccessor(false)) {
+          Branch = UncondBranch ? std::string(BC.InstPrinter->getOpcodeName(
+                                      UncondBranch->getOpcode()))
+                                : "FB";
+        } else {
+          Branch = "FT";
+        }
+      }
+      if (IsJumpTable) {
+        Branch = "JT";
+      }
+      OS << format("\"%s\" -> \"%s\" [label=\"%s",
+                   BB->getName().data(),
+                   Succ->getName().data(),
+                   Branch.c_str());
+
+      if (BB->getExecutionCount() != COUNT_NO_PROFILE &&
+          BI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) {
+        OS << "\\n(C:" << BI->Count << ",M:" << BI->MispredictedCount << ")";
+      } else if (ExecutionCount != COUNT_NO_PROFILE &&
+                 BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
+        OS << "\\n(IC:" << BI->Count << ")";
+      }
+      OS << "\"]\n";
+
+      ++BI;
+    }
+    for (BinaryBasicBlock *LP : BB->landing_pads()) {
+      OS << format("\"%s\" -> \"%s\" [constraint=false style=dashed]\n",
+                   BB->getName().data(),
+                   LP->getName().data());
+    }
+  }
+  OS << "}\n";
+}
+
+void BinaryFunction::viewGraph() const {
+  SmallString<MAX_PATH> Filename;
+  if (std::error_code EC =
+          sys::fs::createTemporaryFile("bolt-cfg", "dot", Filename)) {
+    errs() << "BOLT-ERROR: " << EC.message()
+           << ", unable to create bolt-cfg-XXXXX.dot temporary file.\n";
+    return;
+  }
+  dumpGraphToFile(std::string(Filename));
+  if (DisplayGraph(Filename)) {
+    errs() << "BOLT-ERROR: Can't display " << Filename << " with graphviz.\n";
+  }
+  if (std::error_code EC = sys::fs::remove(Filename)) {
+    errs() << "BOLT-WARNING: " << EC.message() << ", failed to remove "
+           << Filename << "\n";
+  }
+}
+
+void BinaryFunction::dumpGraphForPass(std::string Annotation) const {
+  std::string Filename = constructFilename(getPrintName(), Annotation, ".dot");
+  outs() << "BOLT-DEBUG: Dumping CFG to " << Filename << "\n";
+  dumpGraphToFile(Filename);
+}
+
+void BinaryFunction::dumpGraphToFile(std::string Filename) const {
+  std::error_code EC;
+  raw_fd_ostream of(Filename, EC, sys::fs::OF_None);
+  if (EC) {
+    if (opts::Verbosity >= 1) {
+      errs() << "BOLT-WARNING: " << EC.message() << ", unable to open "
+             << Filename << " for output.\n";
+    }
+    return;
+  }
+  dumpGraph(of);
+}
+
+bool BinaryFunction::validateCFG() const {
+  bool Valid = true;
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    Valid &= BB->validateSuccessorInvariants();
+  }
+
+  if (!Valid)
+    return Valid;
+
+  // Make sure all blocks in CFG are valid.
+  auto validateBlock = [this](const BinaryBasicBlock *BB, StringRef Desc) {
+    if (!BB->isValid()) {
+      errs() << "BOLT-ERROR: deleted " << Desc << " " << BB->getName()
+             << " detected in:\n";
+      this->dump();
+      return false;
+    }
+    return true;
+  };
+  for (const BinaryBasicBlock *BB : BasicBlocks) {
+    if (!validateBlock(BB, "block"))
+      return false;
+    for (const BinaryBasicBlock *PredBB : BB->predecessors())
+      if (!validateBlock(PredBB, "predecessor"))
+        return false;
+    for (const BinaryBasicBlock *SuccBB : BB->successors())
+      if (!validateBlock(SuccBB, "successor"))
+        return false;
+    for (const BinaryBasicBlock *LP : BB->landing_pads())
+      if (!validateBlock(LP, "landing pad"))
+        return false;
+    for (const BinaryBasicBlock *Thrower : BB->throwers())
+      if (!validateBlock(Thrower, "thrower"))
+        return false;
+  }
+
+  for (const BinaryBasicBlock *BB : BasicBlocks) {
+    std::unordered_set<const BinaryBasicBlock *> BBLandingPads;
+    for (const BinaryBasicBlock *LP : BB->landing_pads()) {
+      if (BBLandingPads.count(LP)) {
+        errs() << "BOLT-ERROR: duplicate landing pad detected in "
+               << BB->getName() << " in function " << *this << '\n';
+        return false;
+      }
+      BBLandingPads.insert(LP);
+    }
+
+    std::unordered_set<const BinaryBasicBlock *> BBThrowers;
+    for (const BinaryBasicBlock *Thrower : BB->throwers()) {
+      if (BBThrowers.count(Thrower)) {
+        errs() << "BOLT-ERROR: duplicate thrower detected in "
+               << BB->getName() << " in function " << *this << '\n';
+        return false;
+      }
+      BBThrowers.insert(Thrower);
+    }
+
+    for (const BinaryBasicBlock *LPBlock : BB->landing_pads()) {
+      if (std::find(LPBlock->throw_begin(), LPBlock->throw_end(), BB) ==
+          LPBlock->throw_end()) {
+        errs() << "BOLT-ERROR: inconsistent landing pad detected in " << *this
+               << ": " << BB->getName() << " is in LandingPads but not in "
+               << LPBlock->getName() << " Throwers\n";
+        return false;
+      }
+    }
+    for (const BinaryBasicBlock *Thrower : BB->throwers()) {
+      if (std::find(Thrower->lp_begin(), Thrower->lp_end(), BB) ==
+          Thrower->lp_end()) {
+        errs() << "BOLT-ERROR: inconsistent thrower detected in " << *this
+               << ": " << BB->getName() << " is in Throwers list but not in "
+               << Thrower->getName() << " LandingPads\n";
+        return false;
+      }
+    }
+  }
+
+  return Valid;
+}
+
+void BinaryFunction::fixBranches() {
+  auto &MIB = BC.MIB;
+  MCContext *Ctx = BC.Ctx.get();
+
+  for (unsigned I = 0, E = BasicBlocksLayout.size(); I != E; ++I) {
+    BinaryBasicBlock *BB = BasicBlocksLayout[I];
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch))
+      continue;
+
+    // We will create an unconditional branch with the correct destination if
+    // needed.
+    if (UncondBranch)
+      BB->eraseInstruction(BB->findInstruction(UncondBranch));
+
+    // Basic block that follows the current one in the final layout.
+    const BinaryBasicBlock *NextBB = nullptr;
+    if (I + 1 != E && BB->isCold() == BasicBlocksLayout[I + 1]->isCold())
+      NextBB = BasicBlocksLayout[I + 1];
+
+    if (BB->succ_size() == 1) {
+      // __builtin_unreachable() could create a conditional branch that
+      // falls through into the next function - hence the block will have
+      // only one valid successor. Since the behaviour is undefined, we
+      // replace the conditional branch with an unconditional one if
+      // required.
+      if (CondBranch)
+        BB->eraseInstruction(BB->findInstruction(CondBranch));
+      if (BB->getSuccessor() == NextBB)
+        continue;
+      BB->addBranchInstruction(BB->getSuccessor());
+    } else if (BB->succ_size() == 2) {
+      assert(CondBranch && "conditional branch expected");
+      const BinaryBasicBlock *TSuccessor = BB->getConditionalSuccessor(true);
+      const BinaryBasicBlock *FSuccessor = BB->getConditionalSuccessor(false);
+      // Check whether we support reversing this branch direction
+      const bool IsSupported =
+          !MIB->isUnsupportedBranch(CondBranch->getOpcode());
+      if (NextBB && NextBB == TSuccessor && IsSupported) {
+        std::swap(TSuccessor, FSuccessor);
+        {
+          auto L = BC.scopeLock();
+          MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(),
+                                      Ctx);
+        }
+        BB->swapConditionalSuccessors();
+      } else {
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(*CondBranch, TSuccessor->getLabel(), Ctx);
+      }
+      if (TSuccessor == FSuccessor) {
+        BB->removeDuplicateConditionalSuccessor(CondBranch);
+      }
+      if (!NextBB ||
+          ((NextBB != TSuccessor || !IsSupported) && NextBB != FSuccessor)) {
+        // If one of the branches is guaranteed to be "long" while the other
+        // could be "short", then prioritize short for "taken". This will
+        // generate a sequence 1 byte shorter on x86.
+        if (IsSupported && BC.isX86() &&
+            TSuccessor->isCold() != FSuccessor->isCold() &&
+            BB->isCold() != TSuccessor->isCold()) {
+          std::swap(TSuccessor, FSuccessor);
+          {
+            auto L = BC.scopeLock();
+            MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(),
+                                        Ctx);
+          }
+          BB->swapConditionalSuccessors();
+        }
+        BB->addBranchInstruction(FSuccessor);
+      }
+    }
+    // Cases where the number of successors is 0 (block ends with a
+    // terminator) or more than 2 (switch table) don't require branch
+    // instruction adjustments.
+  }
+  assert((!isSimple() || validateCFG()) &&
+         "Invalid CFG detected after fixing branches");
+}
+
+void BinaryFunction::propagateGnuArgsSizeInfo(
+    MCPlusBuilder::AllocatorIdTy AllocId) {
+  assert(CurrentState == State::Disassembled && "unexpected function state");
+
+  if (!hasEHRanges() || !usesGnuArgsSize())
+    return;
+
+  // The current value of DW_CFA_GNU_args_size affects all following
+  // invoke instructions until the next CFI overrides it.
+  // It is important to iterate basic blocks in the original order when
+  // assigning the value.
+  uint64_t CurrentGnuArgsSize = 0;
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    for (auto II = BB->begin(); II != BB->end();) {
+      MCInst &Instr = *II;
+      if (BC.MIB->isCFI(Instr)) {
+        MCCFIInstruction *CFI = getCFIFor(Instr);
+        if (CFI->getOperation() == MCCFIInstruction::OpGnuArgsSize) {
+          CurrentGnuArgsSize = CFI->getOffset();
+          // Delete DW_CFA_GNU_args_size instructions and only regenerate
+          // them during the final code emission. The information is embedded
+          // inside call instructions.
+          II = BB->erasePseudoInstruction(II);
+          continue;
+        }
+      } else if (BC.MIB->isInvoke(Instr)) {
+        // Add the value of GNU_args_size as an extra operand to invokes.
+        BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize, AllocId);
+      }
+      ++II;
+    }
+  }
+}
+
+void BinaryFunction::postProcessBranches() {
+  if (!isSimple())
+    return;
+  for (BinaryBasicBlock *BB : BasicBlocksLayout) {
+    auto LastInstrRI = BB->getLastNonPseudo();
+    if (BB->succ_size() == 1) {
+      if (LastInstrRI != BB->rend() &&
+          BC.MIB->isConditionalBranch(*LastInstrRI)) {
+        // __builtin_unreachable() could create a conditional branch that
+        // falls through into the next function - hence the block will have
+        // only one valid successor.
+        // Such behaviour is undefined and thus we remove the conditional
+        // branch while leaving a valid successor.
+        BB->eraseInstruction(std::prev(LastInstrRI.base()));
+        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: erasing conditional branch in "
+                          << BB->getName() << " in function " << *this
+                          << '\n');
+      }
+    } else if (BB->succ_size() == 0) {
+      // Ignore unreachable basic blocks.
+      if (BB->pred_size() == 0 || BB->isLandingPad())
+        continue;
+
+      // If it's a basic block that does not end up with a terminator, we
+      // insert a return instruction unless it's a call instruction.
+      if (LastInstrRI == BB->rend()) {
+        LLVM_DEBUG(
+            dbgs() << "BOLT-DEBUG: at least one instruction expected in BB "
+                   << BB->getName() << " in function " << *this << '\n');
+        continue;
+      }
+      if (!BC.MIB->isTerminator(*LastInstrRI) &&
+          !BC.MIB->isCall(*LastInstrRI)) {
+        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: adding return to basic block "
+                          << BB->getName() << " in function " << *this
+                          << '\n');
+        MCInst ReturnInstr;
+        BC.MIB->createReturn(ReturnInstr);
+        BB->addInstruction(ReturnInstr);
+      }
+    }
+  }
+  assert(validateCFG() && "invalid CFG");
+}
+
+MCSymbol *BinaryFunction::addEntryPointAtOffset(uint64_t Offset) {
+  assert(Offset && "cannot add primary entry point");
+  assert(CurrentState == State::Empty || CurrentState == State::Disassembled);
+
+  const uint64_t EntryPointAddress = getAddress() + Offset;
+  MCSymbol *LocalSymbol = getOrCreateLocalLabel(EntryPointAddress);
+
+  MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(LocalSymbol);
+  if (EntrySymbol)
+    return EntrySymbol;
+
+  if (BinaryData *EntryBD = BC.getBinaryDataAtAddress(EntryPointAddress)) {
+    EntrySymbol = EntryBD->getSymbol();
+  } else {
+    EntrySymbol = BC.getOrCreateGlobalSymbol(
+        EntryPointAddress, Twine("__ENTRY_") + getOneName() + "@");
+  }
+  SecondaryEntryPoints[LocalSymbol] = EntrySymbol;
+
+  BC.setSymbolToFunctionMap(EntrySymbol, this);
+
+  return EntrySymbol;
+}
+
+MCSymbol *BinaryFunction::addEntryPoint(const BinaryBasicBlock &BB) {
+  assert(CurrentState == State::CFG &&
+         "basic block can be added as an entry only in a function with CFG");
+
+  if (&BB == BasicBlocks.front())
+    return getSymbol();
+
+  MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(BB);
+  if (EntrySymbol)
+    return EntrySymbol;
+
+  EntrySymbol =
+      BC.Ctx->getOrCreateSymbol("__ENTRY_" + BB.getLabel()->getName());
+
+  SecondaryEntryPoints[BB.getLabel()] = EntrySymbol;
+
+  BC.setSymbolToFunctionMap(EntrySymbol, this);
+
+  return EntrySymbol;
+}
+
+MCSymbol *BinaryFunction::getSymbolForEntryID(uint64_t EntryID) {
+  if (EntryID == 0)
+    return getSymbol();
+
+  if (!isMultiEntry())
+    return nullptr;
+
+  uint64_t NumEntries = 0;
+  if (hasCFG()) {
+    for (BinaryBasicBlock *BB : BasicBlocks) {
+      MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB);
+      if (!EntrySymbol)
+        continue;
+      if (NumEntries == EntryID)
+        return EntrySymbol;
+      ++NumEntries;
+    }
+  } else {
+    for (std::pair<const uint32_t, MCSymbol *> &KV : Labels) {
+      MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(KV.second);
+      if (!EntrySymbol)
+        continue;
+      if (NumEntries == EntryID)
+        return EntrySymbol;
+      ++NumEntries;
+    }
+  }
+
+  return nullptr;
+}
+
+uint64_t BinaryFunction::getEntryIDForSymbol(const MCSymbol *Symbol) const {
+  if (!isMultiEntry())
+    return 0;
+
+  for (const MCSymbol *FunctionSymbol : getSymbols())
+    if (FunctionSymbol == Symbol)
+      return 0;
+
+  // Check all secondary entries available as either basic blocks or labels.
+  uint64_t NumEntries = 0;
+  for (const BinaryBasicBlock *BB : BasicBlocks) {
+    MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB);
+    if (!EntrySymbol)
+      continue;
+    if (EntrySymbol == Symbol)
+      return NumEntries;
+    ++NumEntries;
+  }
+  NumEntries = 0;
+  for (const std::pair<const uint32_t, MCSymbol *> &KV : Labels) {
+    MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(KV.second);
+    if (!EntrySymbol)
+      continue;
+    if (EntrySymbol == Symbol)
+      return NumEntries;
+    ++NumEntries;
+  }
+
+  llvm_unreachable("symbol not found");
+}
+
+bool BinaryFunction::forEachEntryPoint(EntryPointCallbackTy Callback) const {
+  bool Status = Callback(0, getSymbol());
+  if (!isMultiEntry())
+    return Status;
+
+  for (const std::pair<const uint32_t, MCSymbol *> &KV : Labels) {
+    if (!Status)
+      break;
+
+    MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(KV.second);
+    if (!EntrySymbol)
+      continue;
+
+    Status = Callback(KV.first, EntrySymbol);
+  }
+
+  return Status;
+}
+
+BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const {
+  BasicBlockOrderType DFS;
+  unsigned Index = 0;
+  std::stack<BinaryBasicBlock *> Stack;
+
+  // Push entry points to the stack in reverse order.
+  //
+  // NB: we rely on the original order of entries to match.
+  for (auto BBI = layout_rbegin(); BBI != layout_rend(); ++BBI) {
+    BinaryBasicBlock *BB = *BBI;
+    if (isEntryPoint(*BB))
+      Stack.push(BB);
+    BB->setLayoutIndex(BinaryBasicBlock::InvalidIndex);
+  }
+
+  while (!Stack.empty()) {
+    BinaryBasicBlock *BB = Stack.top();
+    Stack.pop();
+
+    if (BB->getLayoutIndex() != BinaryBasicBlock::InvalidIndex)
+      continue;
+
+    BB->setLayoutIndex(Index++);
+    DFS.push_back(BB);
+
+    for (BinaryBasicBlock *SuccBB : BB->landing_pads()) {
+      Stack.push(SuccBB);
+    }
+
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    if (BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch) && CondBranch &&
+        BB->succ_size() == 2) {
+      if (BC.MIB->getCanonicalBranchCondCode(BC.MIB->getCondCode(
+              *CondBranch)) == BC.MIB->getCondCode(*CondBranch)) {
+        Stack.push(BB->getConditionalSuccessor(true));
+        Stack.push(BB->getConditionalSuccessor(false));
+      } else {
+        Stack.push(BB->getConditionalSuccessor(false));
+        Stack.push(BB->getConditionalSuccessor(true));
+      }
+    } else {
+      for (BinaryBasicBlock *SuccBB : BB->successors()) {
+        Stack.push(SuccBB);
+      }
+    }
+  }
+
+  return DFS;
+}
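computeHash below serializes each opcode into the hash string one little-endian byte at a time. A standalone model of that serialization; note the byte stream is not self-delimiting, so distinct opcode sequences can collide and the result is a heuristic key, not an identity:

```cpp
#include <cassert>
#include <functional>
#include <string>

// Standalone model of the opcode serialization in computeHash: append each
// opcode little-endian byte by byte, then hash the resulting string.
size_t hashOpcodes(const unsigned *Opcodes, size_t N) {
  std::string S;
  for (size_t I = 0; I < N; ++I) {
    unsigned Op = Opcodes[I];
    if (Op == 0)
      S.push_back(0); // opcode 0 contributes a single zero byte
    while (Op) {
      S.push_back(static_cast<char>(Op & 0xff));
      Op >>= 8;
    }
  }
  return std::hash<std::string>{}(S);
}

int main() {
  const unsigned A[] = {0x11, 0x2233}; // bytes: 11 | 33 22
  const unsigned B[] = {0x3311, 0x22}; // bytes: 11 33 | 22
  // Different opcode sequences, same byte stream, same hash.
  assert(hashOpcodes(A, 2) == hashOpcodes(B, 2));
}
```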
+size_t BinaryFunction::computeHash(bool UseDFS,
+                                   OperandHashFuncTy OperandHashFunc) const {
+  if (size() == 0)
+    return 0;
+
+  assert(hasCFG() && "function is expected to have CFG");
+
+  const std::vector<BinaryBasicBlock *> &Order =
+      UseDFS ? dfs() : BasicBlocksLayout;
+
+  // The hash is computed by creating a string of all instruction opcodes and
+  // possibly their operands and then hashing that string with std::hash.
+  std::string HashString;
+  for (const BinaryBasicBlock *BB : Order) {
+    for (const MCInst &Inst : *BB) {
+      unsigned Opcode = Inst.getOpcode();
+
+      if (BC.MIB->isPseudo(Inst))
+        continue;
+
+      // Ignore unconditional jumps since we check CFG consistency by
+      // processing basic blocks in order and do not rely on branches to be
+      // in-sync with CFG. Note that we still use condition code of
+      // conditional jumps.
+      if (BC.MIB->isUnconditionalBranch(Inst))
+        continue;
+
+      if (Opcode == 0)
+        HashString.push_back(0);
+
+      while (Opcode) {
+        uint8_t LSB = Opcode & 0xff;
+        HashString.push_back(LSB);
+        Opcode = Opcode >> 8;
+      }
+
+      for (unsigned I = 0, E = MCPlus::getNumPrimeOperands(Inst); I != E;
+           ++I) {
+        HashString.append(OperandHashFunc(Inst.getOperand(I)));
+      }
+    }
+  }
+
+  return Hash = std::hash<std::string>{}(HashString);
+}
+
+void BinaryFunction::insertBasicBlocks(
+    BinaryBasicBlock *Start,
+    std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
+    const bool UpdateLayout,
+    const bool UpdateCFIState,
+    const bool RecomputeLandingPads) {
+  const auto StartIndex = Start ? getIndex(Start) : -1;
+  const size_t NumNewBlocks = NewBBs.size();
+
+  BasicBlocks.insert(BasicBlocks.begin() + (StartIndex + 1),
+                     NumNewBlocks,
+                     nullptr);
+
+  auto I = StartIndex + 1;
+  for (std::unique_ptr<BinaryBasicBlock> &BB : NewBBs) {
+    assert(!BasicBlocks[I]);
+    BasicBlocks[I++] = BB.release();
+  }
+
+  if (RecomputeLandingPads) {
+    recomputeLandingPads();
+  } else {
+    updateBBIndices(0);
+  }
+
+  if (UpdateLayout) {
+    updateLayout(Start, NumNewBlocks);
+  }
+
+  if (UpdateCFIState) {
+    updateCFIState(Start, NumNewBlocks);
+  }
+}
+
+BinaryFunction::iterator BinaryFunction::insertBasicBlocks(
+    BinaryFunction::iterator StartBB,
+    std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
+    const bool UpdateLayout,
+    const bool UpdateCFIState,
+    const bool RecomputeLandingPads) {
+  const unsigned StartIndex = getIndex(&*StartBB);
+  const size_t NumNewBlocks = NewBBs.size();
+
+  BasicBlocks.insert(BasicBlocks.begin() + StartIndex + 1, NumNewBlocks,
+                     nullptr);
+  auto RetIter = BasicBlocks.begin() + StartIndex + 1;
+
+  unsigned I = StartIndex + 1;
+  for (std::unique_ptr<BinaryBasicBlock> &BB : NewBBs) {
+    assert(!BasicBlocks[I]);
+    BasicBlocks[I++] = BB.release();
+  }
+
+  if (RecomputeLandingPads) {
+    recomputeLandingPads();
+  } else {
+    updateBBIndices(0);
+  }
+
+  if (UpdateLayout) {
+    updateLayout(*std::prev(RetIter), NumNewBlocks);
+  }
+
+  if (UpdateCFIState) {
+    updateCFIState(*std::prev(RetIter), NumNewBlocks);
+  }
+
+  return RetIter;
+}
+
+void BinaryFunction::updateBBIndices(const unsigned StartIndex) {
+  for (unsigned I = StartIndex; I < BasicBlocks.size(); ++I) {
+    BasicBlocks[I]->Index = I;
+  }
+}
+
+void BinaryFunction::updateCFIState(BinaryBasicBlock *Start,
+                                    const unsigned NumNewBlocks) {
+  const int32_t CFIState = Start->getCFIStateAtExit();
+  const unsigned StartIndex = getIndex(Start) + 1;
+  for (unsigned I = 0; I < NumNewBlocks; ++I) {
+    BasicBlocks[StartIndex + I]->setCFIState(CFIState);
+  }
+}
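updateCFIState above gives every newly inserted block the CFI state that is live at Start's exit. The invariant with plain integers standing in for blocks:

```cpp
#include <cassert>
#include <vector>

// Minimal model of updateCFIState: blocks spliced in after `Start` all begin
// in the CFI state that is live at Start's exit.
int main() {
  std::vector<int> StateAtEntry = {0, 2, 5}; // existing blocks' CFI states
  const int CFIStateAtExitOfStart = 2;       // Start == block at index 1
  const unsigned NumNewBlocks = 2;
  StateAtEntry.insert(StateAtEntry.begin() + 2, NumNewBlocks,
                      CFIStateAtExitOfStart);
  assert((StateAtEntry == std::vector<int>{0, 2, 2, 2, 5}));
}
```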
+void BinaryFunction::updateLayout(BinaryBasicBlock *Start,
+                                  const unsigned NumNewBlocks) {
+  // If Start is not provided, insert the new blocks at the beginning.
+  if (!Start) {
+    BasicBlocksLayout.insert(layout_begin(), BasicBlocks.begin(),
+                             BasicBlocks.begin() + NumNewBlocks);
+    updateLayoutIndices();
+    return;
+  }
+
+  // Insert new blocks in the layout immediately after Start.
+  auto Pos = std::find(layout_begin(), layout_end(), Start);
+  assert(Pos != layout_end());
+  BinaryBasicBlock **Begin = &BasicBlocks[getIndex(Start) + 1];
+  BinaryBasicBlock **End = &BasicBlocks[getIndex(Start) + NumNewBlocks + 1];
+  BasicBlocksLayout.insert(Pos + 1, Begin, End);
+  updateLayoutIndices();
+}
+
+bool BinaryFunction::checkForAmbiguousJumpTables() {
+  SmallSet<uint64_t, 4> JumpTables;
+  for (BinaryBasicBlock *&BB : BasicBlocks) {
+    for (MCInst &Inst : *BB) {
+      if (!BC.MIB->isIndirectBranch(Inst))
+        continue;
+      uint64_t JTAddress = BC.MIB->getJumpTable(Inst);
+      if (!JTAddress)
+        continue;
+      // This address can be inside another jump table, but we only consider
+      // it ambiguous when the same start address is used, not the same JT
+      // object.
+      if (!JumpTables.count(JTAddress)) {
+        JumpTables.insert(JTAddress);
+        continue;
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+void BinaryFunction::disambiguateJumpTables(
+    MCPlusBuilder::AllocatorIdTy AllocId) {
+  assert((opts::JumpTables != JTS_BASIC && isSimple()) || !BC.HasRelocations);
+  SmallPtrSet<JumpTable *, 4> JumpTables;
+  for (BinaryBasicBlock *&BB : BasicBlocks) {
+    for (MCInst &Inst : *BB) {
+      if (!BC.MIB->isIndirectBranch(Inst))
+        continue;
+      JumpTable *JT = getJumpTable(Inst);
+      if (!JT)
+        continue;
+      auto Iter = JumpTables.find(JT);
+      if (Iter == JumpTables.end()) {
+        JumpTables.insert(JT);
+        continue;
+      }
+      // This instruction is an indirect jump using a jump table, but it is
+      // using the same jump table as another jump. Try all our tricks to
+      // extract the jump table symbol and make it point to a new, duplicated
+      // JT.
+      MCPhysReg BaseReg1;
+      uint64_t Scale;
+      const MCSymbol *Target;
+      // In case we match with our first matcher, the first instruction is
+      // the one to patch.
+      MCInst *JTLoadInst = &Inst;
+      // Try a standard indirect jump matcher, scale 8
+      std::unique_ptr<MCPlusBuilder::MCInstMatcher> IndJmpMatcher =
+          BC.MIB->matchIndJmp(BC.MIB->matchReg(BaseReg1),
+                              BC.MIB->matchImm(Scale), BC.MIB->matchReg(),
+                              /*Offset=*/BC.MIB->matchSymbol(Target));
+      if (!IndJmpMatcher->match(
+              *BC.MRI, *BC.MIB,
+              MutableArrayRef<MCInst>(&*BB->begin(), &Inst + 1), -1) ||
+          BaseReg1 != BC.MIB->getNoRegister() || Scale != 8) {
+        MCPhysReg BaseReg2;
+        uint64_t Offset;
+        // Standard JT matching failed. Trying now:
+        //     movq  "jt.2397/1"(,%rax,8), %rax
+        //     jmpq  *%rax
+        std::unique_ptr<MCPlusBuilder::MCInstMatcher> LoadMatcherOwner =
+            BC.MIB->matchLoad(BC.MIB->matchReg(BaseReg1),
+                              BC.MIB->matchImm(Scale), BC.MIB->matchReg(),
+                              /*Offset=*/BC.MIB->matchSymbol(Target));
+        MCPlusBuilder::MCInstMatcher *LoadMatcher = LoadMatcherOwner.get();
+        std::unique_ptr<MCPlusBuilder::MCInstMatcher> IndJmpMatcher2 =
+            BC.MIB->matchIndJmp(std::move(LoadMatcherOwner));
+        if (!IndJmpMatcher2->match(
+                *BC.MRI, *BC.MIB,
+                MutableArrayRef<MCInst>(&*BB->begin(), &Inst + 1), -1) ||
+            BaseReg1 != BC.MIB->getNoRegister() || Scale != 8) {
+          // JT matching failed. Trying now:
+          // PIC-style matcher, scale 4:
+          //     addq    %rdx, %rsi
+          //     addq    %rdx, %rdi
+          //     leaq    DATAat0x402450(%rip), %r11
+          //     movslq  (%r11,%rdx,4), %rcx
+          //     addq    %r11, %rcx
+          //     jmpq    *%rcx # JUMPTABLE @0x402450
+          std::unique_ptr<MCPlusBuilder::MCInstMatcher> PICIndJmpMatcher =
+              BC.MIB->matchIndJmp(BC.MIB->matchAdd(
+                  BC.MIB->matchReg(BaseReg1),
+                  BC.MIB->matchLoad(BC.MIB->matchReg(BaseReg2),
+                                    BC.MIB->matchImm(Scale),
+                                    BC.MIB->matchReg(),
+                                    BC.MIB->matchImm(Offset))));
+          std::unique_ptr<MCPlusBuilder::MCInstMatcher> LEAMatcherOwner =
+              BC.MIB->matchLoadAddr(BC.MIB->matchSymbol(Target));
+          MCPlusBuilder::MCInstMatcher *LEAMatcher = LEAMatcherOwner.get();
+          std::unique_ptr<MCPlusBuilder::MCInstMatcher> PICBaseAddrMatcher =
+              BC.MIB->matchIndJmp(BC.MIB->matchAdd(std::move(LEAMatcherOwner),
+                                                   BC.MIB->matchAnyOperand()));
+          if (!PICIndJmpMatcher->match(
+                  *BC.MRI, *BC.MIB,
+                  MutableArrayRef<MCInst>(&*BB->begin(), &Inst + 1), -1) ||
+              Scale != 4 || BaseReg1 != BaseReg2 || Offset != 0 ||
+              !PICBaseAddrMatcher->match(
+                  *BC.MRI, *BC.MIB,
+                  MutableArrayRef<MCInst>(&*BB->begin(), &Inst + 1), -1)) {
+            llvm_unreachable("Failed to extract jump table base");
+            continue;
+          }
+          // Matched PIC, identify the instruction with the reference to the
+          // JT
+          JTLoadInst = LEAMatcher->CurInst;
+        } else {
+          // Matched non-PIC
+          JTLoadInst = LoadMatcher->CurInst;
+        }
+      }
+
+      uint64_t NewJumpTableID = 0;
+      const MCSymbol *NewJTLabel;
+      std::tie(NewJumpTableID, NewJTLabel) =
+          BC.duplicateJumpTable(*this, JT, Target);
+      {
+        auto L = BC.scopeLock();
+        BC.MIB->replaceMemOperandDisp(*JTLoadInst, NewJTLabel, BC.Ctx.get());
+      }
+      // We use a unique ID with the high bit set as the address for this
+      // "injected" jump table (not originally in the input binary).
+      BC.MIB->setJumpTable(Inst, NewJumpTableID, 0, AllocId);
+    }
+  }
+}
+
+bool BinaryFunction::replaceJumpTableEntryIn(BinaryBasicBlock *BB,
+                                             BinaryBasicBlock *OldDest,
+                                             BinaryBasicBlock *NewDest) {
+  MCInst *Instr = BB->getLastNonPseudoInstr();
+  if (!Instr || !BC.MIB->isIndirectBranch(*Instr))
+    return false;
+  uint64_t JTAddress = BC.MIB->getJumpTable(*Instr);
+  assert(JTAddress && "Invalid jump table address");
+  JumpTable *JT = getJumpTableContainingAddress(JTAddress);
+  assert(JT && "No jump table structure for this indirect branch");
+  bool Patched = JT->replaceDestination(JTAddress, OldDest->getLabel(),
+                                        NewDest->getLabel());
+  (void)Patched;
+  assert(Patched && "Invalid entry to be replaced in jump table");
+  return true;
+}
+
+BinaryBasicBlock *BinaryFunction::splitEdge(BinaryBasicBlock *From,
+                                            BinaryBasicBlock *To) {
+  // Create an intermediate BB
+  MCSymbol *Tmp;
+  {
+    auto L = BC.scopeLock();
+    Tmp = BC.Ctx->createNamedTempSymbol("SplitEdge");
+  }
+  // Link the new BB to the original input offset of the From BB, so we can
+  // map samples recorded in new BBs back to the original BB seen in the
+  // input binary (if using BAT)
+  std::unique_ptr<BinaryBasicBlock> NewBB =
+      createBasicBlock(From->getInputOffset(), Tmp);
+  BinaryBasicBlock *NewBBPtr = NewBB.get();
+
+  // Update "From" BB
+  auto I = From->succ_begin();
+  auto BI = From->branch_info_begin();
+  for (; I != From->succ_end(); ++I) {
+    if (*I == To)
+      break;
+    ++BI;
+  }
+  assert(I != From->succ_end() && "Invalid CFG edge in splitEdge!");
+  uint64_t OrigCount = BI->Count;
+  uint64_t OrigMispreds = BI->MispredictedCount;
+  replaceJumpTableEntryIn(From, To, NewBBPtr);
+  From->replaceSuccessor(To, NewBBPtr, OrigCount, OrigMispreds);
+
+  NewBB->addSuccessor(To, OrigCount, OrigMispreds);
+  NewBB->setExecutionCount(OrigCount);
+  NewBB->setIsCold(From->isCold());
+
+  // Update CFI and BB layout with the new intermediate BB
+  std::vector<std::unique_ptr<BinaryBasicBlock>>
+      NewBBs;
+  NewBBs.emplace_back(std::move(NewBB));
+  insertBasicBlocks(From, std::move(NewBBs), true, true,
+                    /*RecomputeLandingPads=*/false);
+  return NewBBPtr;
+}
+
+void BinaryFunction::deleteConservativeEdges() {
+  // Our goal is to aggressively remove edges from the CFG that we believe are
+  // wrong. This is used for instrumentation, where it is safe to remove
+  // fallthrough edges because we won't reorder blocks.
+  for (auto I = BasicBlocks.begin(), E = BasicBlocks.end(); I != E; ++I) {
+    BinaryBasicBlock *BB = *I;
+    if (BB->succ_size() != 1 || BB->size() == 0)
+      continue;
+
+    auto NextBB = std::next(I);
+    MCInst *Last = BB->getLastNonPseudoInstr();
+    // Fallthrough is a landing pad? Delete this edge (as long as we don't
+    // have a direct jump to it)
+    if ((*BB->succ_begin())->isLandingPad() && NextBB != E &&
+        *BB->succ_begin() == *NextBB && Last && !BC.MIB->isBranch(*Last)) {
+      BB->removeAllSuccessors();
+      continue;
+    }
+
+    // Look for suspicious calls at the end of BB where gcc may optimize it
+    // and remove the jump to the epilogue when it knows the call won't
+    // return.
+    if (!Last || !BC.MIB->isCall(*Last))
+      continue;
+
+    const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(*Last);
+    if (!CalleeSymbol)
+      continue;
+
+    StringRef CalleeName = CalleeSymbol->getName();
+    if (CalleeName != "__cxa_throw@PLT" &&
+        CalleeName != "_Unwind_Resume@PLT" &&
+        CalleeName != "__cxa_rethrow@PLT" &&
+        CalleeName != "exit@PLT" &&
+        CalleeName != "abort@PLT")
+      continue;
+
+    BB->removeAllSuccessors();
+  }
+}
+
+bool BinaryFunction::isDataMarker(const SymbolRef &Symbol,
+                                  uint64_t SymbolSize) const {
+  // For aarch64, the ABI defines mapping symbols so we identify data in the
+  // code section (see IHI0056B). $d identifies a symbol starting data
+  // contents.
+  if (BC.isAArch64() && Symbol.getType() &&
+      cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 &&
+      Symbol.getName() && cantFail(Symbol.getName()) == "$d")
+    return true;
+  return false;
+}
+
+bool BinaryFunction::isCodeMarker(const SymbolRef &Symbol,
+                                  uint64_t SymbolSize) const {
+  // For aarch64, the ABI defines mapping symbols so we identify data in the
+  // code section (see IHI0056B). $x identifies a symbol starting code or the
+  // end of a data chunk inside code.
+  if (BC.isAArch64() && Symbol.getType() &&
+      cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 &&
+      Symbol.getName() && cantFail(Symbol.getName()) == "$x")
+    return true;
+  return false;
+}
+
+bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
+                                          uint64_t SymbolSize) const {
+  // If this symbol is in a different section from the one where the
+  // function symbol is, don't consider it as valid.
+  if (!getOriginSection()->containsAddress(
+          cantFail(Symbol.getAddress(), "cannot get symbol address")))
+    return false;
+
+  // Some symbols are tolerated inside function bodies, others are not.
+  // The real function boundaries may not be known at this point.
+  if (isDataMarker(Symbol, SymbolSize) || isCodeMarker(Symbol, SymbolSize))
+    return true;
+
+  // It's okay to have a zero-sized symbol in the middle of a non-zero-sized
+  // function.
+  if (SymbolSize == 0 && containsAddress(cantFail(Symbol.getAddress())))
+    return true;
+
+  if (cantFail(Symbol.getType()) != SymbolRef::ST_Unknown)
+    return false;
+
+  if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Global)
+    return false;
+
+  return true;
+}
+
+void BinaryFunction::adjustExecutionCount(uint64_t Count) {
+  if (getKnownExecutionCount() == 0 || Count == 0)
+    return;
+
+  if (ExecutionCount < Count)
+    Count = ExecutionCount;
+
+  double AdjustmentRatio = ((double)ExecutionCount - Count) / ExecutionCount;
+  if (AdjustmentRatio < 0.0)
+    AdjustmentRatio = 0.0;
+
+  for (BinaryBasicBlock *&BB : layout())
+    BB->adjustExecutionCount(AdjustmentRatio);
+
+  ExecutionCount -= Count;
+}
+
+BinaryFunction::~BinaryFunction() {
+  for (BinaryBasicBlock *BB : BasicBlocks) {
+    delete BB;
+  }
+  for (BinaryBasicBlock *BB : DeletedBasicBlocks) {
+    delete BB;
+  }
+}
+
+void BinaryFunction::calculateLoopInfo() {
+  // Discover loops.
+  BinaryDominatorTree DomTree;
+  DomTree.recalculate(*this);
+  BLI.reset(new BinaryLoopInfo());
+  BLI->analyze(DomTree);
+
+  // Traverse discovered loops and add depth and profile information.
+  std::stack<BinaryLoop *> St;
+  for (auto I = BLI->begin(), E = BLI->end(); I != E; ++I) {
+    St.push(*I);
+    ++BLI->OuterLoops;
+  }
+
+  while (!St.empty()) {
+    BinaryLoop *L = St.top();
+    St.pop();
+    ++BLI->TotalLoops;
+    BLI->MaximumDepth = std::max(L->getLoopDepth(), BLI->MaximumDepth);
+
+    // Add nested loops in the stack.
+    for (BinaryLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+      St.push(*I);
+    }
+
+    // Skip if no valid profile is found.
+    if (!hasValidProfile()) {
+      L->EntryCount = COUNT_NO_PROFILE;
+      L->ExitCount = COUNT_NO_PROFILE;
+      L->TotalBackEdgeCount = COUNT_NO_PROFILE;
+      continue;
+    }
+
+    // Compute back edge count.
+    SmallVector<BinaryBasicBlock *, 1> Latches;
+    L->getLoopLatches(Latches);
+
+    for (BinaryBasicBlock *Latch : Latches) {
+      auto BI = Latch->branch_info_begin();
+      for (BinaryBasicBlock *Succ : Latch->successors()) {
+        if (Succ == L->getHeader()) {
+          assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+                 "profile data not found");
+          L->TotalBackEdgeCount += BI->Count;
+        }
+        ++BI;
+      }
+    }
+
+    // Compute entry count.
+    L->EntryCount =
+        L->getHeader()->getExecutionCount() - L->TotalBackEdgeCount;
+
+    // Compute exit count.
+    SmallVector<BinaryLoop::Edge, 1> ExitEdges;
+    L->getExitEdges(ExitEdges);
+    for (BinaryLoop::Edge &Exit : ExitEdges) {
+      const BinaryBasicBlock *Exiting = Exit.first;
+      const BinaryBasicBlock *ExitTarget = Exit.second;
+      auto BI = Exiting->branch_info_begin();
+      for (BinaryBasicBlock *Succ : Exiting->successors()) {
+        if (Succ == ExitTarget) {
+          assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+                 "profile data not found");
+          L->ExitCount += BI->Count;
+        }
+        ++BI;
+      }
+    }
+  }
+}
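The profile arithmetic in calculateLoopInfo rests on a single identity: every execution of the loop header arrives either from outside the loop (an entry) or over a latch edge (a back edge). Worked in isolation:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t HeaderCount = 120;   // times the header block executed
  const uint64_t BackEdgeCount = 100; // sum over all latch->header edges
  // EntryCount = header executions that did not come over a back edge.
  const uint64_t EntryCount = HeaderCount - BackEdgeCount;
  assert(EntryCount == 20);
  // Average iterations per entry, as printLoopInfo reports it later:
  const double ItersPerEntry = (double)BackEdgeCount / EntryCount;
  assert(ItersPerEntry == 5.0);
}
```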
+void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) {
+  if (!isEmitted()) {
+    assert(!isInjected() && "injected function should be emitted");
+    setOutputAddress(getAddress());
+    setOutputSize(getSize());
+    return;
+  }
+
+  const uint64_t BaseAddress = getCodeSection()->getOutputAddress();
+  ErrorOr<BinarySection &> ColdSection = getColdCodeSection();
+  const uint64_t ColdBaseAddress =
+      isSplit() ? ColdSection->getOutputAddress() : 0;
+  if (BC.HasRelocations || isInjected()) {
+    const uint64_t StartOffset = Layout.getSymbolOffset(*getSymbol());
+    const uint64_t EndOffset = Layout.getSymbolOffset(*getFunctionEndLabel());
+    setOutputAddress(BaseAddress + StartOffset);
+    setOutputSize(EndOffset - StartOffset);
+    if (hasConstantIsland()) {
+      const uint64_t DataOffset =
+          Layout.getSymbolOffset(*getFunctionConstantIslandLabel());
+      setOutputDataAddress(BaseAddress + DataOffset);
+    }
+    if (isSplit()) {
+      const MCSymbol *ColdStartSymbol = getColdSymbol();
+      assert(ColdStartSymbol && ColdStartSymbol->isDefined() &&
+             "split function should have defined cold symbol");
+      const MCSymbol *ColdEndSymbol = getFunctionColdEndLabel();
+      assert(ColdEndSymbol && ColdEndSymbol->isDefined() &&
+             "split function should have defined cold end symbol");
+      const uint64_t ColdStartOffset =
+          Layout.getSymbolOffset(*ColdStartSymbol);
+      const uint64_t ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol);
+      cold().setAddress(ColdBaseAddress + ColdStartOffset);
+      cold().setImageSize(ColdEndOffset - ColdStartOffset);
+      if (hasConstantIsland()) {
+        const uint64_t DataOffset =
+            Layout.getSymbolOffset(*getFunctionColdConstantIslandLabel());
+        setOutputColdDataAddress(ColdBaseAddress + DataOffset);
+      }
+    }
+  } else {
+    setOutputAddress(getAddress());
+    setOutputSize(Layout.getSymbolOffset(*getFunctionEndLabel()));
+  }
+
+  // Update basic block output ranges for the debug info, if we have
+  // secondary entry points in the symbol table to update, or if writing BAT.
+  if (!opts::UpdateDebugSections && !isMultiEntry() &&
+      !requiresAddressTranslation())
+    return;
+
+  // Output ranges should match the input if the body hasn't changed.
+  if (!isSimple() && !BC.HasRelocations)
+    return;
+
+  // AArch64 may have functions that only contain a constant island (no
+  // code).
+  if (layout_begin() == layout_end())
+    return;
+
+  BinaryBasicBlock *PrevBB = nullptr;
+  for (auto BBI = layout_begin(), BBE = layout_end(); BBI != BBE; ++BBI) {
+    BinaryBasicBlock *BB = *BBI;
+    assert(BB->getLabel()->isDefined() && "symbol should be defined");
+    const uint64_t BBBaseAddress =
+        BB->isCold() ? ColdBaseAddress : BaseAddress;
+    if (!BC.HasRelocations) {
+      if (BB->isCold()) {
+        assert(BBBaseAddress == cold().getAddress());
+      } else {
+        assert(BBBaseAddress == getOutputAddress());
+      }
+    }
+    const uint64_t BBOffset = Layout.getSymbolOffset(*BB->getLabel());
+    const uint64_t BBAddress = BBBaseAddress + BBOffset;
+    BB->setOutputStartAddress(BBAddress);
+
+    if (PrevBB) {
+      uint64_t PrevBBEndAddress = BBAddress;
+      if (BB->isCold() != PrevBB->isCold()) {
+        PrevBBEndAddress = getOutputAddress() + getOutputSize();
+      }
+      PrevBB->setOutputEndAddress(PrevBBEndAddress);
+    }
+    PrevBB = BB;
+
+    BB->updateOutputValues(Layout);
+  }
+  PrevBB->setOutputEndAddress(PrevBB->isCold()
+                                  ? cold().getAddress() + cold().getImageSize()
+                                  : getOutputAddress() + getOutputSize());
+}
+DebugAddressRangesVector BinaryFunction::getOutputAddressRanges() const {
+  DebugAddressRangesVector OutputRanges;
+
+  if (isFolded())
+    return OutputRanges;
+
+  if (IsFragment)
+    return OutputRanges;
+
+  OutputRanges.emplace_back(getOutputAddress(),
+                            getOutputAddress() + getOutputSize());
+  if (isSplit()) {
+    assert(isEmitted() && "split function should be emitted");
+    OutputRanges.emplace_back(cold().getAddress(),
+                              cold().getAddress() + cold().getImageSize());
+  }
+
+  if (isSimple())
+    return OutputRanges;
+
+  for (BinaryFunction *Frag : Fragments) {
+    assert(!Frag->isSimple() &&
+           "fragment of non-simple function should also be non-simple");
+    OutputRanges.emplace_back(Frag->getOutputAddress(),
+                              Frag->getOutputAddress() +
+                                  Frag->getOutputSize());
+  }
+
+  return OutputRanges;
+}
+
+uint64_t BinaryFunction::translateInputToOutputAddress(
+    uint64_t Address) const {
+  if (isFolded())
+    return 0;
+
+  // If the function hasn't changed, return the same address.
+  if (!isEmitted())
+    return Address;
+
+  if (Address < getAddress())
+    return 0;
+
+  // Check if the address is associated with an instruction that is tracked
+  // by address translation.
+  auto KV = InputOffsetToAddressMap.find(Address - getAddress());
+  if (KV != InputOffsetToAddressMap.end()) {
+    return KV->second;
+  }
+
+  // FIXME: #18950828 - we rely on relative offsets inside basic blocks to
+  // stay intact. Instead we can use pseudo instructions and/or annotations.
+  const uint64_t Offset = Address - getAddress();
+  const BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset);
+  if (!BB) {
+    // Special case for an address immediately past the end of the function.
+    if (Offset == getSize())
+      return getOutputAddress() + getOutputSize();
+
+    return 0;
+  }
+
+  return std::min(BB->getOutputAddressRange().first + Offset - BB->getOffset(),
+                  BB->getOutputAddressRange().second);
+}
+
+DebugAddressRangesVector BinaryFunction::translateInputToOutputRanges(
+    const DWARFAddressRangesVector &InputRanges) const {
+  DebugAddressRangesVector OutputRanges;
+
+  if (isFolded())
+    return OutputRanges;
+
+  // If the function hasn't changed, return the same ranges.
+  if (!isEmitted()) {
+    OutputRanges.resize(InputRanges.size());
+    std::transform(InputRanges.begin(), InputRanges.end(),
+                   OutputRanges.begin(),
+                   [](const DWARFAddressRange &Range) {
+                     return DebugAddressRange(Range.LowPC, Range.HighPC);
+                   });
+    return OutputRanges;
+  }
+
+  // Even though we will merge ranges in a post-processing pass, we attempt to
+  // merge them in the main processing loop as it improves the processing
+  // time.
+  uint64_t PrevEndAddress = 0;
+  for (const DWARFAddressRange &Range : InputRanges) {
+    if (!containsAddress(Range.LowPC)) {
+      LLVM_DEBUG(
+          dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
+                 << *this << " : [0x" << Twine::utohexstr(Range.LowPC)
+                 << ", 0x" << Twine::utohexstr(Range.HighPC) << "]\n");
+      PrevEndAddress = 0;
+      continue;
+    }
+    uint64_t InputOffset = Range.LowPC - getAddress();
+    const uint64_t InputEndOffset =
+        std::min(Range.HighPC - getAddress(), getSize());
+
+    auto BBI = std::upper_bound(BasicBlockOffsets.begin(),
+                                BasicBlockOffsets.end(),
+                                BasicBlockOffset(InputOffset, nullptr),
+                                CompareBasicBlockOffsets());
+    --BBI;
+    do {
+      const BinaryBasicBlock *BB = BBI->second;
+      if (InputOffset < BB->getOffset() ||
+          InputOffset >= BB->getEndOffset()) {
+        LLVM_DEBUG(
+            dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
+                   << *this << " : [0x" << Twine::utohexstr(Range.LowPC)
+                   << ", 0x" << Twine::utohexstr(Range.HighPC) << "]\n");
+        PrevEndAddress = 0;
+        break;
+      }
+
+      // Skip the range if the block was deleted.
+      if (const uint64_t OutputStart = BB->getOutputAddressRange().first) {
+        const uint64_t StartAddress =
+            OutputStart + InputOffset - BB->getOffset();
+        uint64_t EndAddress = BB->getOutputAddressRange().second;
+        if (InputEndOffset < BB->getEndOffset())
+          EndAddress = StartAddress + InputEndOffset - InputOffset;
+
+        if (StartAddress == PrevEndAddress) {
+          OutputRanges.back().HighPC =
+              std::max(OutputRanges.back().HighPC, EndAddress);
+        } else {
+          OutputRanges.emplace_back(StartAddress,
+                                    std::max(StartAddress, EndAddress));
+        }
+        PrevEndAddress = OutputRanges.back().HighPC;
+      }
+
+      InputOffset = BB->getEndOffset();
+      ++BBI;
+    } while (InputOffset < InputEndOffset);
+  }
+
+  // Post-processing pass to sort and merge ranges.
+  std::sort(OutputRanges.begin(), OutputRanges.end());
+  DebugAddressRangesVector MergedRanges;
+  PrevEndAddress = 0;
+  for (const DebugAddressRange &Range : OutputRanges) {
+    if (Range.LowPC <= PrevEndAddress) {
+      MergedRanges.back().HighPC =
+          std::max(MergedRanges.back().HighPC, Range.HighPC);
+    } else {
+      MergedRanges.emplace_back(Range.LowPC, Range.HighPC);
+    }
+    PrevEndAddress = MergedRanges.back().HighPC;
+  }
+
+  return MergedRanges;
+}
+
+MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) {
+  if (CurrentState == State::Disassembled) {
+    auto II = Instructions.find(Offset);
+    return (II == Instructions.end()) ? nullptr : &II->second;
+  } else if (CurrentState == State::CFG) {
+    BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset);
+    if (!BB)
+      return nullptr;
+
+    for (MCInst &Inst : *BB) {
+      constexpr uint32_t InvalidOffset =
+          std::numeric_limits<uint32_t>::max();
+      if (Offset == BC.MIB->getAnnotationWithDefault<uint32_t>(
+                        Inst, "Offset", InvalidOffset))
+        return &Inst;
+    }
+
+    return nullptr;
+  } else {
+    llvm_unreachable("invalid CFG state to use getInstructionAtOffset()");
+  }
+}
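translateInputToOutputRanges above and the location-list variant below share the same final sort-and-merge pass. A self-contained sketch of that merge over half-open [LowPC, HighPC) pairs (the real code handles the degenerate first-range case slightly differently):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

int main() {
  using Range = std::pair<uint64_t, uint64_t>; // [LowPC, HighPC)
  std::vector<Range> R = {{30, 40}, {10, 20}, {20, 25}};
  std::sort(R.begin(), R.end());

  // Coalesce any range that touches or overlaps the previous end.
  std::vector<Range> Merged;
  uint64_t PrevEnd = 0;
  for (const Range &X : R) {
    if (!Merged.empty() && X.first <= PrevEnd)
      Merged.back().second = std::max(Merged.back().second, X.second);
    else
      Merged.push_back(X);
    PrevEnd = Merged.back().second;
  }
  assert((Merged == std::vector<Range>{{10, 25}, {30, 40}}));
}
```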
+DebugLocationsVector BinaryFunction::translateInputToOutputLocationList(
+    const DebugLocationsVector &InputLL) const {
+  DebugLocationsVector OutputLL;
+
+  if (isFolded()) {
+    return OutputLL;
+  }
+
+  // If the function hasn't changed - there's nothing to update.
+  if (!isEmitted()) {
+    return InputLL;
+  }
+
+  uint64_t PrevEndAddress = 0;
+  SmallVectorImpl<uint8_t> *PrevExpr = nullptr;
+  for (const DebugLocationEntry &Entry : InputLL) {
+    const uint64_t Start = Entry.LowPC;
+    const uint64_t End = Entry.HighPC;
+    if (!containsAddress(Start)) {
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected "
+                           "for "
+                        << *this << " : [0x" << Twine::utohexstr(Start)
+                        << ", 0x" << Twine::utohexstr(End) << "]\n");
+      continue;
+    }
+    uint64_t InputOffset = Start - getAddress();
+    const uint64_t InputEndOffset = std::min(End - getAddress(), getSize());
+    auto BBI = std::upper_bound(BasicBlockOffsets.begin(),
+                                BasicBlockOffsets.end(),
+                                BasicBlockOffset(InputOffset, nullptr),
+                                CompareBasicBlockOffsets());
+    --BBI;
+    do {
+      const BinaryBasicBlock *BB = BBI->second;
+      if (InputOffset < BB->getOffset() ||
+          InputOffset >= BB->getEndOffset()) {
+        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range "
+                             "detected for "
+                          << *this << " : [0x" << Twine::utohexstr(Start)
+                          << ", 0x" << Twine::utohexstr(End) << "]\n");
+        PrevEndAddress = 0;
+        break;
+      }
+
+      // Skip the range if the block was deleted.
+      if (const uint64_t OutputStart = BB->getOutputAddressRange().first) {
+        const uint64_t StartAddress =
+            OutputStart + InputOffset - BB->getOffset();
+        uint64_t EndAddress = BB->getOutputAddressRange().second;
+        if (InputEndOffset < BB->getEndOffset())
+          EndAddress = StartAddress + InputEndOffset - InputOffset;
+
+        if (StartAddress == PrevEndAddress && Entry.Expr == *PrevExpr) {
+          OutputLL.back().HighPC =
+              std::max(OutputLL.back().HighPC, EndAddress);
+        } else {
+          OutputLL.emplace_back(DebugLocationEntry{
+              StartAddress, std::max(StartAddress, EndAddress), Entry.Expr});
+        }
+        PrevEndAddress = OutputLL.back().HighPC;
+        PrevExpr = &OutputLL.back().Expr;
+      }
+
+      ++BBI;
+      InputOffset = BB->getEndOffset();
+    } while (InputOffset < InputEndOffset);
+  }
+
+  // Sort and merge adjacent entries with identical locations.
+  std::stable_sort(OutputLL.begin(), OutputLL.end(),
+                   [](const DebugLocationEntry &A,
+                      const DebugLocationEntry &B) {
+                     return A.LowPC < B.LowPC;
+                   });
+  DebugLocationsVector MergedLL;
+  PrevEndAddress = 0;
+  PrevExpr = nullptr;
+  for (const DebugLocationEntry &Entry : OutputLL) {
+    if (Entry.LowPC <= PrevEndAddress && *PrevExpr == Entry.Expr) {
+      MergedLL.back().HighPC = std::max(Entry.HighPC, MergedLL.back().HighPC);
+    } else {
+      const uint64_t Begin = std::max(Entry.LowPC, PrevEndAddress);
+      const uint64_t End = std::max(Begin, Entry.HighPC);
+      MergedLL.emplace_back(DebugLocationEntry{Begin, End, Entry.Expr});
+    }
+    PrevEndAddress = MergedLL.back().HighPC;
+    PrevExpr = &MergedLL.back().Expr;
+  }
+
+  return MergedLL;
+}
+
+void BinaryFunction::printLoopInfo(raw_ostream &OS) const {
+  OS << "Loop Info for Function \"" << *this << "\"";
+  if (hasValidProfile()) {
+    OS << " (count: " << getExecutionCount() << ")";
+  }
+  OS << "\n";
+
+  std::stack<BinaryLoop *> St;
+  for (auto I = BLI->begin(), E = BLI->end(); I != E; ++I) {
+    St.push(*I);
+  }
+  while (!St.empty()) {
+    BinaryLoop *L = St.top();
+    St.pop();
+
+    for (BinaryLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+      St.push(*I);
+    }
+
+    if (!hasValidProfile())
+      continue;
+
+    OS << (L->getLoopDepth() > 1 ? "Nested" : "Outer")
+       << " loop header: " << L->getHeader()->getName();
"Nested" : "Outer") << " loop header: " + << L->getHeader()->getName(); + OS << "\n"; + OS << "Loop basic blocks: "; + const char *Sep = ""; + for (auto BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) { + OS << Sep << (*BI)->getName(); + Sep = ", "; + } + OS << "\n"; + if (hasValidProfile()) { + OS << "Total back edge count: " << L->TotalBackEdgeCount << "\n"; + OS << "Loop entry count: " << L->EntryCount << "\n"; + OS << "Loop exit count: " << L->ExitCount << "\n"; + if (L->EntryCount > 0) { + OS << "Average iters per entry: " + << format("%.4lf", (double)L->TotalBackEdgeCount / L->EntryCount) + << "\n"; + } + } + OS << "----\n"; + } + + OS << "Total number of loops: "<< BLI->TotalLoops << "\n"; + OS << "Number of outer loops: " << BLI->OuterLoops << "\n"; + OS << "Maximum nested loop depth: " << BLI->MaximumDepth << "\n\n"; +} + +bool BinaryFunction::isAArch64Veneer() const { + if (BasicBlocks.size() != 1) + return false; + + BinaryBasicBlock &BB = **BasicBlocks.begin(); + if (BB.size() != 3) + return false; + + for (MCInst &Inst : BB) { + if (!BC.MIB->hasAnnotation(Inst, "AArch64Veneer")) + return false; + } + + return true; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h new file mode 100644 index 000000000000..1695e16fd180 --- /dev/null +++ b/bolt/src/BinaryFunction.h @@ -0,0 +1,2591 @@ +//===--- BinaryFunction.h - Interface for machine-level function ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Interface to function in binary (machine) form. This is assembly-level +// code representation with the control flow. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_H + +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" +#include "BinaryLoop.h" +#include "BinarySection.h" +#include "DebugData.h" +#include "JumpTable.h" +#include "MCPlus.h" +#include "NameResolver.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include + +using namespace llvm::object; + +namespace llvm { + +class DWARFUnit; + +namespace bolt { + +using InputOffsetToAddressMapTy = std::unordered_map; + +/// Types of macro-fusion alignment corrections. +enum MacroFusionType { + MFT_NONE, + MFT_HOT, + MFT_ALL +}; + +enum IndirectCallPromotionType : char { + ICP_NONE, /// Don't perform ICP. + ICP_CALLS, /// Perform ICP on indirect calls. + ICP_JUMP_TABLES, /// Perform ICP on jump tables. + ICP_ALL /// Perform ICP on calls and jump tables. +}; + +/// Information on a single indirect call to a particular callee. 
+struct IndirectCallProfile {
+  MCSymbol *Symbol;
+  uint32_t Offset;
+  uint64_t Count;
+  uint64_t Mispreds;
+
+  IndirectCallProfile(MCSymbol *Symbol, uint64_t Count,
+                      uint64_t Mispreds, uint32_t Offset = 0)
+    : Symbol(Symbol), Offset(Offset), Count(Count), Mispreds(Mispreds) {}
+
+  bool operator==(const IndirectCallProfile &Other) const {
+    return Symbol == Other.Symbol &&
+           Offset == Other.Offset;
+  }
+};
+
+/// Aggregated information for an indirect call site.
+using IndirectCallSiteProfile = SmallVector<IndirectCallProfile, 4>;
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const bolt::IndirectCallSiteProfile &ICSP) {
+  std::string TempString;
+  raw_string_ostream SS(TempString);
+
+  const char *Sep = "\n ";
+  uint64_t TotalCount = 0;
+  uint64_t TotalMispreds = 0;
+  for (const IndirectCallProfile &CSP : ICSP) {
+    SS << Sep << "{ " << (CSP.Symbol ? CSP.Symbol->getName() : "<unknown>")
+       << ": " << CSP.Count << " (" << CSP.Mispreds << " misses) }";
+    Sep = ",\n ";
+    TotalCount += CSP.Count;
+    TotalMispreds += CSP.Mispreds;
+  }
+  SS.flush();
+
+  OS << TotalCount << " (" << TotalMispreds << " misses) :" << TempString;
+  return OS;
+}
+
+/// BinaryFunction is a representation of machine-level function.
+///
+/// In the input binary, an instance of BinaryFunction can represent a fragment
+/// of a function if the higher-level function was split, e.g. into hot and cold
+/// parts. The fragment containing the main entry point is called a parent
+/// or the main fragment.
+class BinaryFunction {
+public:
+  enum class State : char {
+    Empty = 0,     /// Function body is empty.
+    Disassembled,  /// Function has been disassembled.
+    CFG,           /// Control flow graph has been built.
+    CFG_Finalized, /// CFG is finalized. No optimizations allowed.
+    EmittedCFG,    /// Instructions have been emitted to output.
+    Emitted,       /// Same as above plus CFG is destroyed.
+  };
+
+  /// Types of profile the function can use. Could be a combination.
+  enum {
+    PF_NONE = 0,     /// No profile.
+    PF_LBR = 1,      /// Profile is based on last branch records.
+    PF_SAMPLE = 2,   /// Non-LBR sample-based profile.
+    PF_MEMEVENT = 4, /// Profile has mem events.
+  };
+
+  /// Struct for tracking exception handling ranges.
+  struct CallSite {
+    const MCSymbol *Start;
+    const MCSymbol *End;
+    const MCSymbol *LP;
+    uint64_t Action;
+  };
+
+  using IslandProxiesType =
+      std::map<BinaryFunction *, std::map<const MCSymbol *, MCSymbol *>>;
+
+  struct IslandInfo {
+    /// Temporary holder of offsets that are data markers (used in AArch)
+    /// It is possible to have data in code sections. To ease the identification
+    /// of data in code sections, the ABI requires the symbol table to have
+    /// symbols named "$d" identifying the start of data inside code and "$x"
+    /// identifying the end of a chunk of data inside code. DataOffsets contain
+    /// all offsets of $d symbols and CodeOffsets all offsets of $x symbols.
+    std::set<uint64_t> DataOffsets;
+    std::set<uint64_t> CodeOffsets;
+
+    /// Offsets in function that are data values in a constant island identified
+    /// after disassembling
+    std::map<uint64_t, MCSymbol *> Offsets;
+    SmallPtrSet<MCSymbol *, 4> Symbols;
+    std::map<const MCSymbol *, BinaryFunction *> ProxySymbols;
+    std::map<const MCSymbol *, MCSymbol *> ColdSymbols;
+    /// Keeps track of other functions we depend on because there is a reference
+    /// to the constant islands in them.
+    IslandProxiesType Proxies, ColdProxies;
+    std::set<BinaryFunction *> Dependency; // The other way around
+  };
+
+  static constexpr uint64_t COUNT_NO_PROFILE =
+      BinaryBasicBlock::COUNT_NO_PROFILE;
+
+  /// We have to use at least 2-byte alignment for functions because of C++ ABI.
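+  /// (In the Itanium C++ ABI, the low bit of a pointer-to-member-function is
+  /// used to distinguish virtual from non-virtual members, so function
+  /// addresses must keep that bit free.)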
+  static constexpr unsigned MinAlign = 2;
+
+  static const char TimerGroupName[];
+  static const char TimerGroupDesc[];
+
+  using BasicBlockOrderType = std::vector<BinaryBasicBlock *>;
+
+  /// Mark injected functions
+  bool IsInjected = false;
+
+  using LSDATypeTableTy = std::vector<uint64_t>;
+
+private:
+  /// Current state of the function.
+  State CurrentState{State::Empty};
+
+  /// A list of symbols associated with the function entry point.
+  ///
+  /// Multiple symbols would typically result from identical code-folding
+  /// optimization.
+  typedef SmallVector<MCSymbol *, 1> SymbolListTy;
+  SymbolListTy Symbols;
+
+  /// The list of names this function is known under. Used for fuzzy-matching
+  /// the function to its name in a profile, command line, etc.
+  std::vector<std::string> Aliases;
+
+  /// Containing section in the input file.
+  BinarySection *OriginSection = nullptr;
+
+  /// Address of the function in memory. Also could be an offset from
+  /// base address for position independent binaries.
+  uint64_t Address;
+
+  /// Original size of the function.
+  uint64_t Size;
+
+  /// Address of the function in output.
+  uint64_t OutputAddress{0};
+
+  /// Size of the function in the output file.
+  uint64_t OutputSize{0};
+
+  /// Offset in the file.
+  uint64_t FileOffset{0};
+
+  /// Maximum size this function is allowed to have.
+  uint64_t MaxSize{std::numeric_limits<uint64_t>::max()};
+
+  /// Alignment requirements for the function.
+  uint16_t Alignment{2};
+
+  /// Maximum number of bytes used for alignment of hot part of the function.
+  uint16_t MaxAlignmentBytes{0};
+
+  /// Maximum number of bytes used for alignment of cold part of the function.
+  uint16_t MaxColdAlignmentBytes{0};
+
+  const MCSymbol *PersonalityFunction{nullptr};
+  uint8_t PersonalityEncoding{dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_pcrel};
+
+  BinaryContext &BC;
+
+  std::unique_ptr<BinaryLoopInfo> BLI;
+
+  /// Set of external addresses in the code that are not a function start
+  /// and are referenced from this function.
+  std::set<uint64_t> InterproceduralReferences;
+
+  /// All labels in the function that are referenced via relocations from
+  /// data objects. Typically these are jump table destinations and computed
+  /// goto labels.
+  std::set<uint64_t> ExternallyReferencedOffsets;
+
+  /// Offsets of indirect branches with unknown destinations.
+  std::set<uint64_t> UnknownIndirectBranchOffsets;
+
+  /// A set of local and global symbols corresponding to secondary entry points.
+  /// Each additional function entry point has a corresponding entry in the map.
+  /// The key is a local symbol corresponding to a basic block and the value
+  /// is a global symbol corresponding to an external entry point.
+  std::unordered_map<const MCSymbol *, MCSymbol *> SecondaryEntryPoints;
+
+  /// False if the function is too complex to reconstruct its control
+  /// flow graph.
+  /// In relocation mode we still disassemble and re-assemble such functions.
+  bool IsSimple{true};
+
+  /// Indication that the function should be ignored for optimization purposes.
+  /// If we can skip emission of some functions, then ignored functions could
+  /// be not fully disassembled and will not be emitted.
+  bool IsIgnored{false};
+
+  /// Pseudo functions should not be disassembled or emitted.
+  bool IsPseudo{false};
+
+  /// True if the original function code has all necessary relocations to track
+  /// addresses of functions emitted to new locations. Typically set for
+  /// functions that we are not going to emit.
+  bool HasExternalRefRelocations{false};
+
+  /// True if the function has an indirect branch with unknown destination.
+  bool HasUnknownControlFlow{false};
+
+  /// The code from inside the function references one of the code locations
+  /// from the same function as data, i.e. it's possible the label is used
+  /// inside an address calculation or could be referenced from outside.
+  bool HasInternalLabelReference{false};
+
+  /// In AArch64, preserve nops to maintain code equal to input (assuming no
+  /// optimizations are done).
+  bool PreserveNops{false};
+
+  /// Indicate if this function has associated exception handling metadata.
+  bool HasEHRanges{false};
+
+  /// True if the function uses DW_CFA_GNU_args_size CFIs.
+  bool UsesGnuArgsSize{false};
+
+  /// True if the function might have a profile available externally.
+  /// Used to check if processing of the function is required under certain
+  /// conditions.
+  bool HasProfileAvailable{false};
+
+  bool HasMemoryProfile{false};
+
+  /// Execution halts whenever this function is entered.
+  bool TrapsOnEntry{false};
+
+  /// True if the function had an indirect branch with a fixed internal
+  /// destination.
+  bool HasFixedIndirectBranch{false};
+
+  /// True if the function is a fragment of another function. This means that
+  /// this function could only be entered via its parent or one of its sibling
+  /// fragments. It could be entered at any basic block. It can also return
+  /// the control to any basic block of its parent or its sibling.
+  bool IsFragment{false};
+
+  /// Indicate that the function body has SDT marker
+  bool HasSDTMarker{false};
+
+  /// Indicate that the function body has Pseudo Probe
+  bool HasPseudoProbe{BC.getUniqueSectionByName(".pseudo_probe_desc") &&
+                      BC.getUniqueSectionByName(".pseudo_probe")};
+
+  /// True if the original entry point was patched.
+  bool IsPatched{false};
+
+  /// True if the function contains jump table with entries pointing to
+  /// locations in fragments.
+  bool HasSplitJumpTable{false};
+
+  /// True if there are no control-flow edges with successors in other functions
+  /// (i.e. if tail calls have edges to function-local basic blocks).
+  /// Set to false by SCTC. Dynostats can't be reliably computed for
+  /// functions with non-canonical CFG.
+  /// This attribute is only valid when hasCFG() == true.
+  bool HasCanonicalCFG{true};
+
+  /// The address for the code for this function in codegen memory.
+  /// Used for functions that are emitted in a dedicated section with a fixed
+  /// address. E.g. for functions that are overwritten in-place.
+  uint64_t ImageAddress{0};
+
+  /// The size of the code in memory.
+  uint64_t ImageSize{0};
+
+  /// Name for the section this function code should reside in.
+  std::string CodeSectionName;
+
+  /// Name for the corresponding cold code section.
+  std::string ColdCodeSectionName;
+
+  /// Parent function fragment for split function fragments.
+  BinaryFunction *ParentFragment{nullptr};
+
+  /// Indicate if the function body was folded into another function.
+  /// Used by ICF optimization.
+  BinaryFunction *FoldedIntoFunction{nullptr};
+
+  /// All fragments for a parent function.
+  std::unordered_set<BinaryFunction *> Fragments;
+
+  /// The profile data for the number of times the function was executed.
+  uint64_t ExecutionCount{COUNT_NO_PROFILE};
+
+  /// Profile match ratio.
+  float ProfileMatchRatio{0.0f};
+
+  /// Raw branch count for this function in the profile
+  uint64_t RawBranchCount{0};
+
+  /// Indicates the type of profile the function is using.
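+  /// May combine several of the PF_* values above, e.g. (PF_LBR | PF_MEMEVENT).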
+  uint16_t ProfileFlags{PF_NONE};
+
+  /// For functions with mismatched profile we store all call profile
+  /// information at a function level (as opposed to tying it to
+  /// specific call sites).
+  IndirectCallSiteProfile AllCallSites;
+
+  /// Score of the function (estimated number of instructions executed,
+  /// according to profile data). -1 if the score has not been calculated yet.
+  mutable int64_t FunctionScore{-1};
+
+  /// Original LSDA address for the function.
+  uint64_t LSDAAddress{0};
+
+  /// Containing compilation unit for the function.
+  DWARFUnit *DwarfUnit{nullptr};
+
+  /// Last computed hash value. Note that the value could be recomputed using
+  /// different parameters by every pass.
+  mutable uint64_t Hash{0};
+
+  /// For PLT functions it contains a symbol associated with a function
+  /// reference. It is nullptr for non-PLT functions.
+  const MCSymbol *PLTSymbol{nullptr};
+
+  /// Function order for streaming into the destination binary.
+  uint32_t Index{-1U};
+
+  /// Get basic block index assuming it belongs to this function.
+  unsigned getIndex(const BinaryBasicBlock *BB) const {
+    assert(BB->getIndex() < BasicBlocks.size());
+    return BB->getIndex();
+  }
+
+  /// Return basic block that originally contained offset \p Offset
+  /// from the function start.
+  BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset);
+
+  const BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset) const {
+    return const_cast<BinaryFunction *>(this)
+        ->getBasicBlockContainingOffset(Offset);
+  }
+
+  /// Return basic block that started at offset \p Offset.
+  BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) {
+    BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset);
+    return BB && BB->getOffset() == Offset ? BB : nullptr;
+  }
+
+  /// Release memory taken by the list.
+  template <typename T> BinaryFunction &clearList(T &List) {
+    T TempList;
+    TempList.swap(List);
+    return *this;
+  }
+
+  /// Update the indices of all the basic blocks starting at StartIndex.
+  void updateBBIndices(const unsigned StartIndex);
+
+  /// Annotate each basic block entry with its current CFI state. This is
+  /// run right after the construction of CFG while basic blocks are in their
+  /// original order.
+  void annotateCFIState();
+
+  /// Associate DW_CFA_GNU_args_size info with invoke instructions
+  /// (call instructions with non-empty landing pad).
+  void propagateGnuArgsSizeInfo(MCPlusBuilder::AllocatorIdTy AllocId);
+
+  /// Synchronize branch instructions with CFG.
+  void postProcessBranches();
+
+  /// The address offset where we emitted the constant island, that is, the
+  /// chunk of data in the function code area (AArch only)
+  int64_t OutputDataOffset{0};
+  int64_t OutputColdDataOffset{0};
+
+  /// Map labels to corresponding basic blocks.
+  std::unordered_map<const MCSymbol *, BinaryBasicBlock *> LabelToBB;
+
+  using BranchListType = std::vector<std::pair<uint32_t, uint32_t>>;
+  BranchListType TakenBranches;   /// All local taken branches.
+  BranchListType IgnoredBranches; /// Branches ignored for CFG purposes.
+
+  /// Map offset in the function to a label.
+  /// Labels are used for building CFG for simple functions. For non-simple
+  /// function in relocation mode we need to emit them for relocations
+  /// referencing function internals to work (e.g. jump tables).
+  using LabelsMapType = std::map<uint32_t, MCSymbol *>;
+  LabelsMapType Labels;
+
+  /// Temporary holder of instructions before CFG is constructed.
+  /// Map offset in the function to MCInst.
+  using InstrMapType = std::map<uint32_t, MCInst>;
+  InstrMapType Instructions;
+
+  /// List of DWARF CFI instructions. Original CFI from the binary must be
+  /// sorted w.r.t. the offset at which it appears. We rely on this to replay
+  /// CFIs if needed (to fix state after reordering BBs).
+  using CFIInstrMapType = std::vector<MCCFIInstruction>;
+  using cfi_iterator = CFIInstrMapType::iterator;
+  using const_cfi_iterator = CFIInstrMapType::const_iterator;
+
+  /// We don't decode Call Frame Info encoded in DWARF program state
+  /// machine. Instead we define a "CFI State" - a frame information that
+  /// is a result of executing FDE CFI program up to a given point. The
+  /// program consists of opaque Call Frame Instructions:
+  ///
+  ///   CFI #0
+  ///   CFI #1
+  ///   ....
+  ///   CFI #N
+  ///
+  /// When we refer to "CFI State K" - it corresponds to a row in an abstract
+  /// Call Frame Info table. This row is reached right before executing CFI #K.
+  ///
+  /// At any point of execution in a function we are in any one of (N + 2)
+  /// states described in the original FDE program. We can't have more states
+  /// without intelligent processing of CFIs.
+  ///
+  /// When the final layout of basic blocks is known, and we finalize CFG,
+  /// we modify the original program to make sure the same state could be
+  /// reached even when basic blocks containing CFI instructions are executed
+  /// in a different order.
+  CFIInstrMapType FrameInstructions;
+
+  /// A map of restore state CFI instructions to their equivalent CFI
+  /// instructions that produce the same state, in order to eliminate
+  /// remember-restore CFI instructions when rewriting CFI.
+  DenseMap<int32_t, SmallVector<int32_t, 4>> FrameRestoreEquivalents;
+
+  // For tracking exception handling ranges.
+  std::vector<CallSite> CallSites;
+  std::vector<CallSite> ColdCallSites;
+
+  /// Binary blobs representing action, type, and type index tables for this
+  /// function's LSDA (exception handling).
+  ArrayRef<uint8_t> LSDAActionTable;
+  ArrayRef<uint8_t> LSDATypeIndexTable;
+
+  /// Vector of addresses of types referenced by LSDA.
+  LSDATypeTableTy LSDATypeTable;
+
+  /// Vector of addresses of entries in LSDATypeTable used for indirect
+  /// addressing.
+  LSDATypeTableTy LSDATypeAddressTable;
+
+  /// Marking for the beginning of language-specific data area for the function.
+  MCSymbol *LSDASymbol{nullptr};
+  MCSymbol *ColdLSDASymbol{nullptr};
+
+  /// Map to discover which CFIs are attached to a given instruction offset.
+  /// Maps an instruction offset into a FrameInstructions offset.
+  /// This is only relevant to the buildCFG phase and is discarded afterwards.
+  std::multimap<uint32_t, uint32_t> OffsetToCFI;
+
+  /// List of CFI instructions associated with the CIE (common to more than one
+  /// function and that apply before the entry basic block).
+  CFIInstrMapType CIEFrameInstructions;
+
+  /// All compound jump tables for this function. This duplicates what's stored
+  /// in the BinaryContext, but additionally it gives quick access for all
+  /// jump tables used by this function.
+  std::map<uint64_t, JumpTable *> JumpTables;
+
+  /// All jump table sites in the function before CFG is built.
+  std::vector<std::pair<uint64_t, uint64_t>> JTSites;
+
+  /// List of relocations in this function.
+  std::map<uint64_t, Relocation> Relocations;
+
+  /// Information on function constant islands.
+  IslandInfo Islands;
+
+  // Blocks are kept sorted in the layout order. If we need to change the
+  // layout (if BasicBlocksLayout stores a different order than BasicBlocks),
+  // the terminating instructions need to be modified.
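+  // For example (illustrative): with original order {BB0, BB1, BB2} where BB0
+  // falls through to BB1, a new layout {BB0, BB2, BB1} requires fixBranches()
+  // to insert or invert a branch in BB0 so control still reaches BB1.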
+  using BasicBlockListType = std::vector<BinaryBasicBlock *>;
+  BasicBlockListType BasicBlocks;
+  BasicBlockListType DeletedBasicBlocks;
+  BasicBlockOrderType BasicBlocksLayout;
+  /// Previous layout replaced by modifyLayout
+  BasicBlockOrderType BasicBlocksPreviousLayout;
+  bool ModifiedLayout{false};
+
+  /// BasicBlockOffsets are used during CFG construction to map from code
+  /// offsets to BinaryBasicBlocks. Any modifications made to the CFG
+  /// after initial construction are not reflected in this data structure.
+  using BasicBlockOffset = std::pair<uint64_t, BinaryBasicBlock *>;
+  struct CompareBasicBlockOffsets {
+    bool operator()(const BasicBlockOffset &A,
+                    const BasicBlockOffset &B) const {
+      return A.first < B.first;
+    }
+  };
+  std::vector<BasicBlockOffset> BasicBlockOffsets;
+
+  MCSymbol *ColdSymbol{nullptr};
+
+  /// Symbol at the end of the function.
+  mutable MCSymbol *FunctionEndLabel{nullptr};
+
+  /// Symbol at the end of the cold part of split function.
+  mutable MCSymbol *FunctionColdEndLabel{nullptr};
+
+  mutable MCSymbol *FunctionConstantIslandLabel{nullptr};
+  mutable MCSymbol *FunctionColdConstantIslandLabel{nullptr};
+
+  /// Unique number associated with the function.
+  uint64_t FunctionNumber;
+
+  /// Count the number of functions created.
+  static uint64_t Count;
+
+  /// Map offsets of special instructions to addresses in the output.
+  InputOffsetToAddressMapTy InputOffsetToAddressMap;
+
+  /// Register alternative function name.
+  void addAlternativeName(std::string NewName) {
+    Aliases.push_back(std::move(NewName));
+  }
+
+  /// Return a label at a given \p Address in the function. If the label does
+  /// not exist - create it. Assert if the \p Address does not belong to
+  /// the function. If \p CreatePastEnd is true, then return the function
+  /// end label when the \p Address points immediately past the last byte
+  /// of the function.
+  /// NOTE: the function always returns a local (temp) symbol, even if there's
+  /// a global symbol that corresponds to an entry at this address.
+  MCSymbol *getOrCreateLocalLabel(uint64_t Address, bool CreatePastEnd = false);
+
+  /// Register a data marker at a given \p Offset into the function.
+  void markDataAtOffset(uint64_t Offset) {
+    Islands.DataOffsets.emplace(Offset);
+  }
+
+  /// Register a code marker at a given \p Offset into the function.
+  void markCodeAtOffset(uint64_t Offset) {
+    Islands.CodeOffsets.emplace(Offset);
+  }
+
+  /// Register secondary entry point at a given \p Offset into the function.
+  /// Return global symbol for use by extern function references.
+  MCSymbol *addEntryPointAtOffset(uint64_t Offset);
+
+  /// Register an internal offset in a function referenced from outside.
+  void registerReferencedOffset(uint64_t Offset) {
+    ExternallyReferencedOffsets.emplace(Offset);
+  }
+
+  /// True if there are references to internals of this function from data,
+  /// e.g. from jump tables.
+  bool hasInternalReference() const {
+    return !ExternallyReferencedOffsets.empty();
+  }
+
+  /// Return an entry ID corresponding to a symbol known to belong to
+  /// the function.
+  ///
+  /// Prefer to use BinaryContext::getFunctionForSymbol(EntrySymbol, &ID)
+  /// instead of calling this function directly.
+  uint64_t getEntryIDForSymbol(const MCSymbol *EntrySymbol) const;
+
+  /// If the function represents a secondary split function fragment, set its
+  /// parent fragment to \p BF.
+  void setParentFragment(BinaryFunction &BF) {
+    assert(IsFragment && "function must be a fragment to have a parent");
+    assert((!ParentFragment || ParentFragment == &BF) &&
+           "cannot have more than one parent function");
+    ParentFragment = &BF;
+  }
+
+  /// Register a child fragment for the main fragment of a split function.
+  void addFragment(BinaryFunction &BF) {
+    Fragments.insert(&BF);
+  }
+
+  void addInstruction(uint64_t Offset, MCInst &&Instruction) {
+    Instructions.emplace(Offset, std::forward<MCInst>(Instruction));
+  }
+
+  /// Convert CFI instructions to a standard form (remove remember/restore).
+  void normalizeCFIState();
+
+  /// Analyze and process indirect branch \p Instruction before it is
+  /// added to Instructions list.
+  IndirectBranchType processIndirectBranch(MCInst &Instruction,
+                                           unsigned Size,
+                                           uint64_t Offset,
+                                           uint64_t &TargetAddress);
+
+  DenseMap<const MCInst *, SmallVector<MCInst *, 4>>
+  computeLocalUDChain(const MCInst *CurInstr);
+
+  BinaryFunction &operator=(const BinaryFunction &) = delete;
+  BinaryFunction(const BinaryFunction &) = delete;
+
+  friend class MachORewriteInstance;
+  friend class RewriteInstance;
+  friend class BinaryContext;
+  friend class DataReader;
+  friend class DataAggregator;
+
+  static std::string buildCodeSectionName(StringRef Name,
+                                          const BinaryContext &BC);
+  static std::string buildColdCodeSectionName(StringRef Name,
+                                              const BinaryContext &BC);
+
+  /// Creation should be handled by RewriteInstance or BinaryContext
+  BinaryFunction(const std::string &Name, BinarySection &Section,
+                 uint64_t Address, uint64_t Size, BinaryContext &BC)
+      : OriginSection(&Section), Address(Address), Size(Size), BC(BC),
+        CodeSectionName(buildCodeSectionName(Name, BC)),
+        ColdCodeSectionName(buildColdCodeSectionName(Name, BC)),
+        FunctionNumber(++Count) {
+    Symbols.push_back(BC.Ctx->getOrCreateSymbol(Name));
+  }
+
+  /// This constructor is used to create an injected function
+  BinaryFunction(const std::string &Name, BinaryContext &BC, bool IsSimple)
+      : Address(0), Size(0), BC(BC), IsSimple(IsSimple),
+        CodeSectionName(buildCodeSectionName(Name, BC)),
+        ColdCodeSectionName(buildColdCodeSectionName(Name, BC)),
+        FunctionNumber(++Count) {
+    Symbols.push_back(BC.Ctx->getOrCreateSymbol(Name));
+    IsInjected = true;
+  }
+
+  /// Clear state of the function that could not be disassembled or if its
+  /// disassembled state was later invalidated.
+  void clearDisasmState();
+
+  /// Release memory allocated for CFG and instructions.
+  /// We still keep basic blocks for address translation/mapping purposes.
+  void releaseCFG() {
+    for (BinaryBasicBlock *BB : BasicBlocks) {
+      BB->releaseCFG();
+    }
+    for (BinaryBasicBlock *BB : DeletedBasicBlocks) {
+      BB->releaseCFG();
+    }
+
+    clearList(CallSites);
+    clearList(ColdCallSites);
+    clearList(LSDATypeTable);
+    clearList(LSDATypeAddressTable);
+
+    clearList(LabelToBB);
+
+    if (!isMultiEntry())
+      clearList(Labels);
+
+    clearList(FrameInstructions);
+    clearList(FrameRestoreEquivalents);
+  }
+
+public:
+  BinaryFunction(BinaryFunction &&) = default;
+
+  using iterator = pointee_iterator<BasicBlockListType::iterator>;
+  using const_iterator = pointee_iterator<BasicBlockListType::const_iterator>;
+  using reverse_iterator =
+      pointee_iterator<BasicBlockListType::reverse_iterator>;
+  using const_reverse_iterator =
+      pointee_iterator<BasicBlockListType::const_reverse_iterator>;
+
+  typedef BasicBlockOrderType::iterator order_iterator;
+  typedef BasicBlockOrderType::const_iterator const_order_iterator;
+  typedef BasicBlockOrderType::reverse_iterator reverse_order_iterator;
+  typedef BasicBlockOrderType::const_reverse_iterator
+      const_reverse_order_iterator;
+
+  // CFG iterators.
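+  // The pointee iterators below let clients iterate over blocks by reference,
+  // e.g.: for (BinaryBasicBlock &BB : Function) { ... }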
+  iterator begin() { return BasicBlocks.begin(); }
+  const_iterator begin() const { return BasicBlocks.begin(); }
+  iterator end() { return BasicBlocks.end(); }
+  const_iterator end() const { return BasicBlocks.end(); }
+
+  reverse_iterator rbegin() { return BasicBlocks.rbegin(); }
+  const_reverse_iterator rbegin() const { return BasicBlocks.rbegin(); }
+  reverse_iterator rend() { return BasicBlocks.rend(); }
+  const_reverse_iterator rend() const { return BasicBlocks.rend(); }
+
+  size_t size() const { return BasicBlocks.size(); }
+  bool empty() const { return BasicBlocks.empty(); }
+  const BinaryBasicBlock &front() const { return *BasicBlocks.front(); }
+  BinaryBasicBlock &front() { return *BasicBlocks.front(); }
+  const BinaryBasicBlock &back() const { return *BasicBlocks.back(); }
+  BinaryBasicBlock &back() { return *BasicBlocks.back(); }
+  inline iterator_range<iterator> blocks() {
+    return iterator_range<iterator>(begin(), end());
+  }
+  inline iterator_range<const_iterator> blocks() const {
+    return iterator_range<const_iterator>(begin(), end());
+  }
+
+  // Iterators by pointer.
+  BasicBlockListType::iterator pbegin() { return BasicBlocks.begin(); }
+  BasicBlockListType::iterator pend() { return BasicBlocks.end(); }
+
+  order_iterator layout_begin() { return BasicBlocksLayout.begin(); }
+  const_order_iterator layout_begin() const {
+    return BasicBlocksLayout.begin();
+  }
+  order_iterator layout_end() { return BasicBlocksLayout.end(); }
+  const_order_iterator layout_end() const { return BasicBlocksLayout.end(); }
+  reverse_order_iterator layout_rbegin() { return BasicBlocksLayout.rbegin(); }
+  const_reverse_order_iterator layout_rbegin() const {
+    return BasicBlocksLayout.rbegin();
+  }
+  reverse_order_iterator layout_rend() { return BasicBlocksLayout.rend(); }
+  const_reverse_order_iterator layout_rend() const {
+    return BasicBlocksLayout.rend();
+  }
+  size_t layout_size() const { return BasicBlocksLayout.size(); }
+  bool layout_empty() const { return BasicBlocksLayout.empty(); }
+  const BinaryBasicBlock *layout_front() const {
+    return BasicBlocksLayout.front();
+  }
+  BinaryBasicBlock *layout_front() { return BasicBlocksLayout.front(); }
+  const BinaryBasicBlock *layout_back() const {
+    return BasicBlocksLayout.back();
+  }
+  BinaryBasicBlock *layout_back() { return BasicBlocksLayout.back(); }
+
+  inline iterator_range<order_iterator> layout() {
+    return iterator_range<order_iterator>(BasicBlocksLayout.begin(),
+                                          BasicBlocksLayout.end());
+  }
+
+  inline iterator_range<const_order_iterator> layout() const {
+    return iterator_range<const_order_iterator>(BasicBlocksLayout.begin(),
+                                                BasicBlocksLayout.end());
+  }
+
+  inline iterator_range<reverse_order_iterator> rlayout() {
+    return iterator_range<reverse_order_iterator>(BasicBlocksLayout.rbegin(),
+                                                  BasicBlocksLayout.rend());
+  }
+
+  inline iterator_range<const_reverse_order_iterator> rlayout() const {
+    return iterator_range<const_reverse_order_iterator>(
+        BasicBlocksLayout.rbegin(), BasicBlocksLayout.rend());
+  }
+
+  cfi_iterator cie_begin() { return CIEFrameInstructions.begin(); }
+  const_cfi_iterator cie_begin() const { return CIEFrameInstructions.begin(); }
+  cfi_iterator cie_end() { return CIEFrameInstructions.end(); }
+  const_cfi_iterator cie_end() const { return CIEFrameInstructions.end(); }
+  bool cie_empty() const { return CIEFrameInstructions.empty(); }
+
+  inline iterator_range<cfi_iterator> cie() {
+    return iterator_range<cfi_iterator>(cie_begin(), cie_end());
+  }
+  inline iterator_range<const_cfi_iterator> cie() const {
+    return iterator_range<const_cfi_iterator>(cie_begin(), cie_end());
+  }
+
+  /// Iterate over all jump tables associated with this function.
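+  /// E.g. (illustrative):
+  ///   for (const auto &KV : Function.jumpTables())
+  ///     outs() << "jump table at 0x" << Twine::utohexstr(KV.first) << "\n";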
+  iterator_range<std::map<uint64_t, JumpTable *>::const_iterator>
+  jumpTables() const {
+    return make_range(JumpTables.begin(), JumpTables.end());
+  }
+
+  /// Returns the raw binary encoding of this function.
+  ErrorOr<ArrayRef<uint8_t>> getData() const;
+
+  BinaryFunction &updateState(BinaryFunction::State State) {
+    CurrentState = State;
+    return *this;
+  }
+
+  /// Update layout of basic blocks used for output.
+  void updateBasicBlockLayout(BasicBlockOrderType &NewLayout) {
+    BasicBlocksPreviousLayout = BasicBlocksLayout;
+
+    if (NewLayout != BasicBlocksLayout) {
+      ModifiedLayout = true;
+      BasicBlocksLayout.clear();
+      BasicBlocksLayout.swap(NewLayout);
+    }
+  }
+
+  /// Recompute landing pad information for the function and all its blocks.
+  void recomputeLandingPads();
+
+  /// Return current basic block layout.
+  const BasicBlockOrderType &getLayout() const {
+    return BasicBlocksLayout;
+  }
+
+  /// Return a list of basic blocks sorted using DFS and update layout indices
+  /// using the same order. Does not modify the current layout.
+  BasicBlockOrderType dfs() const;
+
+  /// Find the loops in the CFG of the function and store information about
+  /// them.
+  void calculateLoopInfo();
+
+  /// Calculate missed macro-fusion opportunities and update BinaryContext
+  /// stats.
+  void calculateMacroOpFusionStats();
+
+  /// Returns if loop detection has been run for this function.
+  bool hasLoopInfo() const {
+    return BLI != nullptr;
+  }
+
+  const BinaryLoopInfo &getLoopInfo() {
+    return *BLI.get();
+  }
+
+  bool isLoopFree() {
+    if (!hasLoopInfo()) {
+      calculateLoopInfo();
+    }
+    return BLI->empty();
+  }
+
+  /// Print loop information about the function.
+  void printLoopInfo(raw_ostream &OS) const;
+
+  /// View CFG in graphviz program
+  void viewGraph() const;
+
+  /// Dump CFG in graphviz format
+  void dumpGraph(raw_ostream &OS) const;
+
+  /// Dump CFG in graphviz format to file.
+  void dumpGraphToFile(std::string Filename) const;
+
+  /// Dump CFG in graphviz format to a file with a filename that is derived
+  /// from the function name and Annotation strings. Useful for dumping the
+  /// CFG after an optimization pass.
+  void dumpGraphForPass(std::string Annotation = "") const;
+
+  /// Return BinaryContext for the function.
+  const BinaryContext &getBinaryContext() const {
+    return BC;
+  }
+
+  /// Return BinaryContext for the function.
+  BinaryContext &getBinaryContext() {
+    return BC;
+  }
+
+  /// Attempt to validate CFG invariants.
+  bool validateCFG() const;
+
+  BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) {
+    auto I = LabelToBB.find(Label);
+    return I == LabelToBB.end() ? nullptr : I->second;
+  }
+
+  const BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) const {
+    auto I = LabelToBB.find(Label);
+    return I == LabelToBB.end() ? nullptr : I->second;
+  }
+
+  /// Returns the basic block after the given basic block in the layout or
+  /// nullptr if the last basic block is given.
+  const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB,
+                                             bool IgnoreSplits = true) const {
+    return
+        const_cast<BinaryFunction *>(this)->getBasicBlockAfter(BB,
+                                                               IgnoreSplits);
+  }
+
+  BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB,
+                                       bool IgnoreSplits = true) {
+    for (auto I = layout_begin(), E = layout_end(); I != E; ++I) {
+      auto Next = std::next(I);
+      if (*I == BB && Next != E) {
+        return (IgnoreSplits || (*I)->isCold() == (*Next)->isCold())
+            ? *Next : nullptr;
+      }
+    }
+    return nullptr;
+  }
+
+  /// Retrieve the landing pad BB associated with invoke instruction \p Invoke
+  /// that is in \p BB. Return nullptr if none exists.
+  BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB,
+                                       const MCInst &InvokeInst) const {
+    assert(BC.MIB->isInvoke(InvokeInst) && "must be invoke instruction");
+    const Optional<MCPlus::MCLandingPad> LP = BC.MIB->getEHInfo(InvokeInst);
+    if (LP && LP->first) {
+      BinaryBasicBlock *LBB = BB.getLandingPad(LP->first);
+      assert(LBB && "Landing pad should be defined");
+      return LBB;
+    }
+    return nullptr;
+  }
+
+  /// Return instruction at a given offset in the function. Valid before
+  /// CFG is constructed or while instruction offsets are available in CFG.
+  MCInst *getInstructionAtOffset(uint64_t Offset);
+
+  const MCInst *getInstructionAtOffset(uint64_t Offset) const {
+    return const_cast<BinaryFunction *>(this)->getInstructionAtOffset(Offset);
+  }
+
+  /// Return jump table that covers a given \p Address in memory.
+  JumpTable *getJumpTableContainingAddress(uint64_t Address) {
+    auto JTI = JumpTables.upper_bound(Address);
+    if (JTI == JumpTables.begin())
+      return nullptr;
+    --JTI;
+    if (JTI->first + JTI->second->getSize() > Address)
+      return JTI->second;
+    if (JTI->second->getSize() == 0 && JTI->first == Address)
+      return JTI->second;
+    return nullptr;
+  }
+
+  const JumpTable *getJumpTableContainingAddress(uint64_t Address) const {
+    return const_cast<BinaryFunction *>(this)->
+        getJumpTableContainingAddress(Address);
+  }
+
+  /// Return the name of the function if the function has just one name.
+  /// If the function has multiple names - return one followed
+  /// by "(*#)".
+  ///
+  /// We should use getPrintName() for diagnostics and use
+  /// hasName() to match function name against a given string.
+  ///
+  /// NOTE: for disambiguating names of local symbols we use the following
+  ///       naming schemes:
+  ///           primary:     <function>/<id>
+  ///           alternative: <function>/<file>/<id2>
+  std::string getPrintName() const {
+    const size_t NumNames = Symbols.size() + Aliases.size();
+    return NumNames == 1
+        ? getOneName().str()
+        : (getOneName().str() + "(*" + std::to_string(NumNames) + ")");
+  }
+
+  /// The function may have many names. For that reason, we avoid having
+  /// getName() method as most of the time the user needs a different
+  /// interface, such as forEachName(), hasName(), hasNameRegex(), etc.
+  /// In some cases though, we need just a name uniquely identifying
+  /// the function, and that's what this method is for.
+  StringRef getOneName() const {
+    return Symbols[0]->getName();
+  }
+
+  /// Return the name of the function as getPrintName(), but also trying
+  /// to demangle it.
+  std::string getDemangledName() const;
+
+  /// Call \p Callback for every name of this function as long as the Callback
+  /// returns false. Stop if Callback returns true or all names have been used.
+  /// Return the name for which the Callback returned true if any.
+  template <typename FType>
+  Optional<StringRef> forEachName(FType Callback) const {
+    for (MCSymbol *Symbol : Symbols)
+      if (Callback(Symbol->getName()))
+        return Symbol->getName();
+
+    for (const std::string &Name : Aliases)
+      if (Callback(StringRef(Name)))
+        return StringRef(Name);
+
+    return NoneType();
+  }
+
+  /// Check if (possibly one out of many) function name matches the given
+  /// string. Use this member function instead of direct name comparison.
+  bool hasName(const std::string &FunctionName) const {
+    auto Res = forEachName([&](StringRef Name) {
+      return Name == FunctionName;
+    });
+    return Res.hasValue();
+  }
+
+  /// Check if any of function names matches the given regex.
+  Optional<StringRef> hasNameRegex(const StringRef NameRegex) const;
+
+  /// Check if any of restored function names matches the given regex.
+  /// Restored name means stripping BOLT-added suffixes like "/1",
+  Optional<StringRef> hasRestoredNameRegex(const StringRef NameRegex) const;
+
+  /// Return a vector of all possible names for the function.
+  const std::vector<StringRef> getNames() const {
+    std::vector<StringRef> AllNames;
+    forEachName([&AllNames](StringRef Name) {
+      AllNames.push_back(Name);
+      return false;
+    });
+
+    return AllNames;
+  }
+
+  /// Return a state the function is in (see BinaryFunction::State definition
+  /// for description).
+  State getState() const {
+    return CurrentState;
+  }
+
+  /// Return true if function has a control flow graph available.
+  bool hasCFG() const {
+    return getState() == State::CFG ||
+           getState() == State::CFG_Finalized ||
+           getState() == State::EmittedCFG;
+  }
+
+  /// Return true if the function state implies that it includes instructions.
+  bool hasInstructions() const {
+    return getState() == State::Disassembled ||
+           hasCFG();
+  }
+
+  bool isEmitted() const {
+    return getState() == State::EmittedCFG ||
+           getState() == State::Emitted;
+  }
+
+  /// Return the section in the input binary this function originated from or
+  /// nullptr if the function did not originate from the file.
+  BinarySection *getOriginSection() const {
+    return OriginSection;
+  }
+
+  void setOriginSection(BinarySection *Section) {
+    OriginSection = Section;
+  }
+
+  /// Return true if the function did not originate from the primary input file.
+  bool isInjected() const {
+    return IsInjected;
+  }
+
+  /// Return original address of the function (or offset from base for PIC).
+  uint64_t getAddress() const {
+    return Address;
+  }
+
+  uint64_t getOutputAddress() const {
+    return OutputAddress;
+  }
+
+  uint64_t getOutputSize() const {
+    return OutputSize;
+  }
+
+  /// Does this function have a valid streaming order index?
+  bool hasValidIndex() const {
+    return Index != -1U;
+  }
+
+  /// Get the streaming order index for this function.
+  uint32_t getIndex() const {
+    return Index;
+  }
+
+  /// Set the streaming order index for this function.
+  void setIndex(uint32_t Idx) {
+    assert(!hasValidIndex());
+    Index = Idx;
+  }
+
+  /// Get the original address for the given basic block within this function.
+  uint64_t getBasicBlockOriginalAddress(const BinaryBasicBlock *BB) const {
+    return Address + BB->getOffset();
+  }
+
+  /// Return offset of the function body in the binary file.
+  uint64_t getFileOffset() const {
+    return FileOffset;
+  }
+
+  /// Return (original) byte size of the function.
+  uint64_t getSize() const {
+    return Size;
+  }
+
+  /// Return the maximum size the body of the function could have.
+  uint64_t getMaxSize() const {
+    return MaxSize;
+  }
+
+  /// Return the number of emitted instructions for this function.
+  uint32_t getNumNonPseudos() const {
+    uint32_t N = 0;
+    for (BinaryBasicBlock *const &BB : layout()) {
+      N += BB->getNumNonPseudos();
+    }
+    return N;
+  }
+
+  /// Return MC symbol associated with the function.
+  /// All references to the function should use this symbol.
+  MCSymbol *getSymbol() {
+    return Symbols[0];
+  }
+
+  /// Return MC symbol associated with the function (const version).
+  /// All references to the function should use this symbol.
+  const MCSymbol *getSymbol() const {
+    return Symbols[0];
+  }
+
+  /// Return a list of symbols associated with the main entry of the function.
+  SymbolListTy &getSymbols() { return Symbols; }
+  const SymbolListTy &getSymbols() const { return Symbols; }
+
+  /// If a local symbol \p BBLabel corresponds to a basic block that is a
+  /// secondary entry point into the function, then return a global symbol
+  /// that represents the secondary entry point. Otherwise return nullptr.
+  MCSymbol *getSecondaryEntryPointSymbol(const MCSymbol *BBLabel) const {
+    auto I = SecondaryEntryPoints.find(BBLabel);
+    if (I == SecondaryEntryPoints.end())
+      return nullptr;
+
+    return I->second;
+  }
+
+  /// If the basic block serves as a secondary entry point to the function,
+  /// return a global symbol representing the entry. Otherwise return nullptr.
+  MCSymbol *getSecondaryEntryPointSymbol(const BinaryBasicBlock &BB) const {
+    return getSecondaryEntryPointSymbol(BB.getLabel());
+  }
+
+  /// Return true if the basic block is an entry point into the function
+  /// (either primary or secondary).
+  bool isEntryPoint(const BinaryBasicBlock &BB) const {
+    if (&BB == BasicBlocks.front())
+      return true;
+    return getSecondaryEntryPointSymbol(BB);
+  }
+
+  /// Return MC symbol corresponding to an enumerated entry for multiple-entry
+  /// functions.
+  MCSymbol *getSymbolForEntryID(uint64_t EntryNum);
+  const MCSymbol *getSymbolForEntryID(uint64_t EntryNum) const {
+    return const_cast<BinaryFunction *>(this)->getSymbolForEntryID(EntryNum);
+  }
+
+  using EntryPointCallbackTy = function_ref<bool(uint64_t, const MCSymbol *)>;
+
+  /// Invoke \p Callback function for every entry point in the function starting
+  /// with the main entry and using entries in the ascending address order.
+  /// Stop calling the function after false is returned by the callback.
+  ///
+  /// Pass an offset of the entry point in the input binary and a corresponding
+  /// global symbol to the callback function.
+  ///
+  /// Return true if all callbacks returned true, false otherwise.
+  bool forEachEntryPoint(EntryPointCallbackTy Callback) const;
+
+  MCSymbol *getColdSymbol() {
+    if (ColdSymbol)
+      return ColdSymbol;
+
+    ColdSymbol = BC.Ctx->getOrCreateSymbol(
+        NameResolver::append(getSymbol()->getName(), ".cold.0"));
+
+    return ColdSymbol;
+  }
+
+  /// Return MC symbol associated with the end of the function.
+  MCSymbol *getFunctionEndLabel() const {
+    assert(BC.Ctx && "cannot be called with empty context");
+    if (!FunctionEndLabel) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
+      FunctionEndLabel = BC.Ctx->createNamedTempSymbol("func_end");
+    }
+    return FunctionEndLabel;
+  }
+
+  /// Return MC symbol associated with the end of the cold part of the function.
+  MCSymbol *getFunctionColdEndLabel() const {
+    if (!FunctionColdEndLabel) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
+      FunctionColdEndLabel = BC.Ctx->createNamedTempSymbol("func_cold_end");
+    }
+    return FunctionColdEndLabel;
+  }
+
+  /// Return a label used to identify where the constant island was emitted
+  /// (AArch only). This is used to update the symbol table accordingly,
+  /// emitting data marker symbols as required by the ABI.
+  MCSymbol *getFunctionConstantIslandLabel() const {
+    if (!FunctionConstantIslandLabel) {
+      FunctionConstantIslandLabel =
+          BC.Ctx->createNamedTempSymbol("func_const_island");
+    }
+    return FunctionConstantIslandLabel;
+  }
+
+  MCSymbol *getFunctionColdConstantIslandLabel() const {
+    if (!FunctionColdConstantIslandLabel) {
+      FunctionColdConstantIslandLabel =
+          BC.Ctx->createNamedTempSymbol("func_cold_const_island");
+    }
+    return FunctionColdConstantIslandLabel;
+  }
+
+  /// Return true if this is a function representing a PLT entry.
+  bool isPLTFunction() const {
+    return PLTSymbol != nullptr;
+  }
+
+  /// Return PLT function reference symbol for PLT functions and nullptr for
+  /// non-PLT functions.
+  const MCSymbol *getPLTSymbol() const {
+    return PLTSymbol;
+  }
+
+  /// Set function PLT reference symbol for PLT functions.
+  void setPLTSymbol(const MCSymbol *Symbol) {
+    assert(Size == 0 && "function size should be 0 for PLT functions");
+    PLTSymbol = Symbol;
+    IsPseudo = true;
+  }
+
+  /// Update output values of the function based on the final \p Layout.
+  void updateOutputValues(const MCAsmLayout &Layout);
+
+  /// Return mapping of input to output addresses. Most users should call
+  /// translateInputToOutputAddress() for address translation.
+  InputOffsetToAddressMapTy &getInputOffsetToAddressMap() {
+    assert(isEmitted() && "cannot use address mapping before code emission");
+    return InputOffsetToAddressMap;
+  }
+
+  /// Register relocation type \p RelType at a given \p Address in the function
+  /// against \p Symbol.
+  /// Assert if the \p Address is not inside this function.
+  void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t RelType,
+                     uint64_t Addend, uint64_t Value) {
+    assert(Address >= getAddress() && Address < getAddress() + getMaxSize() &&
+           "address is outside of the function");
+    uint64_t Offset = Address - getAddress();
+    switch (RelType) {
+    case ELF::R_X86_64_8:
+    case ELF::R_X86_64_16:
+    case ELF::R_X86_64_32:
+    case ELF::R_X86_64_32S:
+    case ELF::R_X86_64_64:
+    case ELF::R_AARCH64_ABS64:
+    case ELF::R_AARCH64_LDST64_ABS_LO12_NC:
+    case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21:
+    case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
+    case ELF::R_AARCH64_TLSDESC_LD64_LO12:
+    case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12:
+    case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
+    case ELF::R_AARCH64_LD64_GOT_LO12_NC:
+    case ELF::R_AARCH64_TLSDESC_ADD_LO12:
+    case ELF::R_AARCH64_ADD_ABS_LO12_NC:
+    case ELF::R_AARCH64_LDST16_ABS_LO12_NC:
+    case ELF::R_AARCH64_LDST32_ABS_LO12_NC:
+    case ELF::R_AARCH64_LDST8_ABS_LO12_NC:
+    case ELF::R_AARCH64_LDST128_ABS_LO12_NC:
+    case ELF::R_AARCH64_ADR_GOT_PAGE:
+    case ELF::R_AARCH64_TLSDESC_ADR_PAGE21:
+    case ELF::R_AARCH64_ADR_PREL_PG_HI21:
+      Relocations[Offset] = Relocation{Offset, Symbol, RelType, Addend, Value};
+      break;
+    case ELF::R_X86_64_PC32:
+    case ELF::R_X86_64_PC8:
+    case ELF::R_X86_64_PLT32:
+    case ELF::R_X86_64_GOTPCRELX:
+    case ELF::R_X86_64_REX_GOTPCRELX:
+    case ELF::R_AARCH64_JUMP26:
+    case ELF::R_AARCH64_CALL26:
+    case ELF::R_AARCH64_TLSDESC_CALL:
+      break;
+
+    // The following relocations are ignored.
+    case ELF::R_X86_64_GOTPCREL:
+    case ELF::R_X86_64_TPOFF32:
+    case ELF::R_X86_64_GOTTPOFF:
+      return;
+    default:
+      llvm_unreachable("unexpected relocation type in code");
+    }
+  }
+
+  /// Return the name of the section this function originated from.
+  Optional<StringRef> getOriginSectionName() const {
+    if (!OriginSection)
+      return NoneType();
+    return OriginSection->getName();
+  }
+
+  /// Return internal section name for this function.
+  StringRef getCodeSectionName() const {
+    return StringRef(CodeSectionName);
+  }
+
+  /// Assign a code section name to the function.
+  void setCodeSectionName(StringRef Name) {
+    CodeSectionName = std::string(Name);
+  }
+
+  /// Get output code section.
+  ErrorOr<BinarySection &> getCodeSection() const {
+    return BC.getUniqueSectionByName(getCodeSectionName());
+  }
+
+  /// Return cold code section name for the function.
+  StringRef getColdCodeSectionName() const {
+    return StringRef(ColdCodeSectionName);
+  }
+
+  /// Assign a section name for the cold part of the function.
+  void setColdCodeSectionName(StringRef Name) {
+    ColdCodeSectionName = std::string(Name);
+  }
+
+  /// Get output code section for cold code of this function.
+  ErrorOr<BinarySection &> getColdCodeSection() const {
+    return BC.getUniqueSectionByName(getColdCodeSectionName());
+  }
+
+  /// Return true if the function will halt execution on entry.
+  bool trapsOnEntry() const {
+    return TrapsOnEntry;
+  }
+
+  /// Make the function always trap on entry. Other than the trap instruction,
+  /// the function body will be empty.
+  void setTrapOnEntry();
+
+  /// Return true if the function could be correctly processed.
+  bool isSimple() const {
+    return IsSimple;
+  }
+
+  /// Return true if the function should be ignored for optimization purposes.
+  bool isIgnored() const {
+    return IsIgnored;
+  }
+
+  /// Return true if the function should not be disassembled, emitted, or
+  /// otherwise processed.
+  bool isPseudo() const {
+    return IsPseudo;
+  }
+
+  /// Return true if the function contains a jump table with entries pointing
+  /// to split fragments.
+  bool hasSplitJumpTable() const {
+    return HasSplitJumpTable;
+  }
+
+  /// Return true if all CFG edges have local successors.
+  bool hasCanonicalCFG() const {
+    return HasCanonicalCFG;
+  }
+
+  /// Return true if the original function code has all necessary relocations
+  /// to track addresses of functions emitted to new locations.
+  bool hasExternalRefRelocations() const {
+    return HasExternalRefRelocations;
+  }
+
+  /// Return true if the function has instruction(s) with unknown control flow.
+  bool hasUnknownControlFlow() const {
+    return HasUnknownControlFlow;
+  }
+
+  /// Return true if the function body is non-contiguous.
+  bool isSplit() const {
+    return isSimple() &&
+           layout_size() &&
+           layout_front()->isCold() != layout_back()->isCold();
+  }
+
+  /// Return true if the function has exception handling tables.
+  bool hasEHRanges() const {
+    return HasEHRanges;
+  }
+
+  /// Return true if the function uses DW_CFA_GNU_args_size CFIs.
+  bool usesGnuArgsSize() const {
+    return UsesGnuArgsSize;
+  }
+
+  /// Return true if the function has more than one entry point.
+  bool isMultiEntry() const {
+    return !SecondaryEntryPoints.empty();
+  }
+
+  /// Return true if the function might have a profile available externally,
+  /// but not yet populated into the function.
+  bool hasProfileAvailable() const {
+    return HasProfileAvailable;
+  }
+
+  bool hasMemoryProfile() const {
+    return HasMemoryProfile;
+  }
+
+  /// Return true if the body of the function was merged into another function.
+  bool isFolded() const {
+    return FoldedIntoFunction != nullptr;
+  }
+
+  /// If this function was folded, return the function it was folded into.
+  BinaryFunction *getFoldedIntoFunction() const {
+    return FoldedIntoFunction;
+  }
+
+  /// Return true if the function uses jump tables.
+  bool hasJumpTables() const {
+    return !JumpTables.empty();
+  }
+
+  /// Return true if the function has SDT marker
+  bool hasSDTMarker() const { return HasSDTMarker; }
+
+  /// Return true if the function has Pseudo Probe
+  bool hasPseudoProbe() const { return HasPseudoProbe; }
+
+  /// Return true if the original entry point was patched.
+  bool isPatched() const {
+    return IsPatched;
+  }
+
+  const JumpTable *getJumpTable(const MCInst &Inst) const {
+    const uint64_t Address = BC.MIB->getJumpTable(Inst);
+    return getJumpTableContainingAddress(Address);
+  }
+
+  JumpTable *getJumpTable(const MCInst &Inst) {
+    const uint64_t Address = BC.MIB->getJumpTable(Inst);
+    return getJumpTableContainingAddress(Address);
+  }
+
+  const MCSymbol *getPersonalityFunction() const {
+    return PersonalityFunction;
+  }
+
+  uint8_t getPersonalityEncoding() const {
+    return PersonalityEncoding;
+  }
+
+  const std::vector<CallSite> &getCallSites() const {
+    return CallSites;
+  }
+
+  const std::vector<CallSite> &getColdCallSites() const {
+    return ColdCallSites;
+  }
+
+  const ArrayRef<uint8_t> getLSDAActionTable() const {
+    return LSDAActionTable;
+  }
+
+  const LSDATypeTableTy &getLSDATypeTable() const {
+    return LSDATypeTable;
+  }
+
+  const LSDATypeTableTy &getLSDATypeAddressTable() const {
+    return LSDATypeAddressTable;
+  }
+
+  const ArrayRef<uint8_t> getLSDATypeIndexTable() const {
+    return LSDATypeIndexTable;
+  }
+
+  const LabelsMapType &getLabels() const {
+    return Labels;
+  }
+
+  IslandInfo &getIslandInfo() {
+    return Islands;
+  }
+
+  const IslandInfo &getIslandInfo() const {
+    return Islands;
+  }
+
+  /// Return true if the function has CFI instructions
+  bool hasCFI() const {
+    return !FrameInstructions.empty() || !CIEFrameInstructions.empty();
+  }
+
+  /// Return unique number associated with the function.
+  uint64_t getFunctionNumber() const {
+    return FunctionNumber;
+  }
+
+  /// Return true if the given address \p PC is inside the function body.
+  bool containsAddress(uint64_t PC, bool UseMaxSize = false) const {
+    if (UseMaxSize)
+      return Address <= PC && PC < Address + MaxSize;
+    return Address <= PC && PC < Address + Size;
+  }
+
+  /// Create a basic block at a given \p Offset in the
+  /// function.
+  /// If \p DeriveAlignment is true, set the alignment of the block based
+  /// on the alignment of the existing offset.
+  /// The new block is not inserted into the CFG. The client must
+  /// use insertBasicBlocks to add any new blocks to the CFG.
+  std::unique_ptr<BinaryBasicBlock>
+  createBasicBlock(uint64_t Offset,
+                   MCSymbol *Label = nullptr,
+                   bool DeriveAlignment = false) {
+    assert(BC.Ctx && "cannot be called with empty context");
+    if (!Label) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
+      Label = BC.Ctx->createNamedTempSymbol("BB");
+    }
+    auto BB = std::unique_ptr<BinaryBasicBlock>(
+        new BinaryBasicBlock(this, Label, Offset));
+
+    if (DeriveAlignment) {
+      uint64_t DerivedAlignment = Offset & (1 + ~Offset);
+      BB->setAlignment(std::min(DerivedAlignment, uint64_t(32)));
+    }
+
+    LabelToBB.emplace(Label, BB.get());
+
+    return BB;
+  }
+
+  /// Create a basic block at a given \p Offset in the
+  /// function and append it to the end of list of blocks.
+  /// If \p DeriveAlignment is true, set the alignment of the block based
+  /// on the alignment of the existing offset.
+  ///
+  /// Returns NULL if basic block already exists at the \p Offset.
+  BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label = nullptr,
+                                  bool DeriveAlignment = false) {
+    assert((CurrentState == State::CFG || !getBasicBlockAtOffset(Offset)) &&
+           "basic block already exists in pre-CFG state");
+
+    if (!Label) {
+      std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
+      Label = BC.Ctx->createNamedTempSymbol("BB");
+    }
+    std::unique_ptr<BinaryBasicBlock> BBPtr =
+        createBasicBlock(Offset, Label, DeriveAlignment);
+    BasicBlocks.emplace_back(BBPtr.release());
+
+    BinaryBasicBlock *BB = BasicBlocks.back();
+    BB->setIndex(BasicBlocks.size() - 1);
+
+    if (CurrentState == State::Disassembled) {
+      BasicBlockOffsets.emplace_back(Offset, BB);
+    } else if (CurrentState == State::CFG) {
+      BB->setLayoutIndex(layout_size());
+      BasicBlocksLayout.emplace_back(BB);
+    }
+
+    assert(CurrentState == State::CFG ||
+           (std::is_sorted(BasicBlockOffsets.begin(),
+                           BasicBlockOffsets.end(),
+                           CompareBasicBlockOffsets()) &&
+            std::is_sorted(begin(), end())));
+
+    return BB;
+  }
+
+  /// Add basic block \p BB as an entry point to the function. Return global
+  /// symbol associated with the entry.
+  MCSymbol *addEntryPoint(const BinaryBasicBlock &BB);
+
+  /// Mark all blocks that are unreachable from a root (entry point
+  /// or landing pad) as invalid.
+  void markUnreachableBlocks();
+
+  /// Rebuilds BBs layout, ignoring dead BBs. Returns the number of removed
+  /// BBs and the removed number of bytes of code.
+  std::pair<unsigned, uint64_t> eraseInvalidBBs();
+
+  /// Get the relative order between two basic blocks in the original
+  /// layout. The result is > 0 if B occurs before A and < 0 if B
+  /// occurs after A. If A and B are the same block, the result is 0.
+  signed getOriginalLayoutRelativeOrder(const BinaryBasicBlock *A,
+                                        const BinaryBasicBlock *B) const {
+    return getIndex(A) - getIndex(B);
+  }
+
+  /// Return basic block range that originally contained offset \p Offset
+  /// from the function start to the function end.
+  iterator_range<iterator> getBasicBlockRangeFromOffsetToEnd(uint64_t Offset) {
+    BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset);
+    return BB
+        ? iterator_range<iterator>(BasicBlocks.begin() + getIndex(BB), end())
+        : iterator_range<iterator>(end(), end());
+  }
+
+  /// Insert the BBs contained in NewBBs into the basic blocks for this
+  /// function. Update the associated state of all blocks as needed, i.e.
+  /// BB offsets and BB indices. The new BBs are inserted after Start.
+  /// This operation could affect fallthrough branches for Start.
+  ///
+  void insertBasicBlocks(
+      BinaryBasicBlock *Start,
+      std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
+      const bool UpdateLayout = true,
+      const bool UpdateCFIState = true,
+      const bool RecomputeLandingPads = true);
+
+  iterator insertBasicBlocks(
+      iterator StartBB,
+      std::vector<std::unique_ptr<BinaryBasicBlock>> &&NewBBs,
+      const bool UpdateLayout = true,
+      const bool UpdateCFIState = true,
+      const bool RecomputeLandingPads = true);
+
+  /// Update the basic block layout for this function. The BBs from
+  /// [Start->Index, Start->Index + NumNewBlocks) are inserted into the
+  /// layout after the BB indicated by Start.
+  void updateLayout(BinaryBasicBlock *Start, const unsigned NumNewBlocks);
+
+  /// Make sure basic blocks' indices match the current layout.
+  void updateLayoutIndices() const {
+    unsigned Index = 0;
+    for (BinaryBasicBlock *BB : layout()) {
+      BB->setLayoutIndex(Index++);
+    }
+  }
+
+  /// Recompute the CFI state for NumNewBlocks following Start after inserting
+  /// new blocks into the CFG. This must be called after updateLayout.
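+  /// Typical sequence (illustrative): call insertBasicBlocks() with
+  /// UpdateLayout and UpdateCFIState set to false, then updateLayout(), then
+  /// updateCFIState().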
+  void updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks);
+
+  /// Return true if we detected ambiguous jump tables in this function, which
+  /// happens when one JT is used by more than one indirect jump. This
+  /// precludes us from splitting edges for this JT unless we duplicate the JT
+  /// (see disambiguateJumpTables).
+  bool checkForAmbiguousJumpTables();
+
+  /// Detect when two distinct indirect jumps are using the same jump table and
+  /// duplicate it, allocating a separate JT for each indirect branch. This is
+  /// necessary for code transformations on the CFG that change an edge induced
+  /// by an indirect branch, e.g. instrumentation or shrink wrapping. However,
+  /// this is only possible if we are not updating jump tables in place, but
+  /// are writing them to a new location (moving them).
+  void disambiguateJumpTables(MCPlusBuilder::AllocatorIdTy AllocId);
+
+  /// Change \p OldDest to \p NewDest in the jump table used at the end of
+  /// \p BB. Returns false if \p OldDest cannot be found as a valid target
+  /// and no replacement took place.
+  bool replaceJumpTableEntryIn(BinaryBasicBlock *BB,
+                               BinaryBasicBlock *OldDest,
+                               BinaryBasicBlock *NewDest);
+
+  /// Split the CFG edge by inserting an intermediate basic block.
+  /// Returns a pointer to this new intermediate basic block. BB "From" will be
+  /// updated to jump to the intermediate block, which in turn will have an
+  /// unconditional branch to BB "To".
+  /// The caller needs to call fixBranches() manually; this function only
+  /// creates the correct CFG edges.
+  BinaryBasicBlock *splitEdge(BinaryBasicBlock *From, BinaryBasicBlock *To);
+
+  /// We may have built an overly conservative CFG for functions with calls
+  /// to functions that the compiler knows will never return. In this case,
+  /// clear all successors from these blocks.
+  void deleteConservativeEdges();
+
+  /// Determine the direction of the branch based on the current layout.
+  /// The caller is responsible for updating basic block indices prior to using
+  /// this function (e.g. by calling BinaryFunction::updateLayoutIndices()).
+  static bool isForwardBranch(const BinaryBasicBlock *From,
+                              const BinaryBasicBlock *To) {
+    assert(From->getFunction() == To->getFunction() &&
+           "basic blocks should be in the same function");
+    return To->getLayoutIndex() > From->getLayoutIndex();
+  }
+
+  /// Determine the direction of the call to callee symbol relative to the
+  /// start of this function.
+  /// Note: this doesn't take function splitting into account.
+  bool isForwardCall(const MCSymbol *CalleeSymbol) const;
+
+  /// Dump function information to debug output. If \p PrintInstructions
+  /// is true, include instruction disassembly.
+  void dump(bool PrintInstructions = true) const;
+
+  /// Print function information to the \p OS stream.
+  void print(raw_ostream &OS, std::string Annotation = "",
+             bool PrintInstructions = true) const;
+
+  /// Print all relocations between \p Offset and \p Offset + \p Size in
+  /// this function.
+  void printRelocations(raw_ostream &OS, uint64_t Offset, uint64_t Size) const;
+
+  /// Return true if the function has a profile, even if the profile does not
+  /// match the CFG 100%.
+  bool hasProfile() const {
+    return ExecutionCount != COUNT_NO_PROFILE;
+  }
+
+  /// Return true if the function profile is present and accurate.
+  bool hasValidProfile() const {
+    return ExecutionCount != COUNT_NO_PROFILE &&
+           ProfileMatchRatio == 1.0f;
+  }
+
+  /// Mark this function as having a valid profile.
+ void markProfiled(uint16_t Flags) { + if (ExecutionCount == COUNT_NO_PROFILE) + ExecutionCount = 0; + ProfileFlags = Flags; + ProfileMatchRatio = 1.0f; + } + + /// Return flags describing a profile for this function. + uint16_t getProfileFlags() const { + return ProfileFlags; + } + + void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) { + assert(!Instructions.empty()); + + // Fix CFI instructions skipping NOPs. We need to fix this because changing + // CFI state after a NOP, besides being wrong and inaccurate, makes it + // harder for us to recover this information, since we can create empty BBs + // with NOPs and then reorder it away. + // We fix this by moving the CFI instruction just before any NOPs. + auto I = Instructions.lower_bound(Offset); + if (Offset == getSize()) { + assert(I == Instructions.end() && "unexpected iterator value"); + // Sometimes compiler issues restore_state after all instructions + // in the function (even after nop). + --I; + Offset = I->first; + } + assert(I->first == Offset && "CFI pointing to unknown instruction"); + if (I == Instructions.begin()) { + CIEFrameInstructions.emplace_back(std::forward(Inst)); + return; + } + + --I; + while (I != Instructions.begin() && BC.MIB->isNoop(I->second)) { + Offset = I->first; + --I; + } + OffsetToCFI.emplace(Offset, FrameInstructions.size()); + FrameInstructions.emplace_back(std::forward(Inst)); + return; + } + + BinaryBasicBlock::iterator addCFIInstruction(BinaryBasicBlock *BB, + BinaryBasicBlock::iterator Pos, + MCCFIInstruction &&Inst) { + size_t Idx = FrameInstructions.size(); + FrameInstructions.emplace_back(std::forward(Inst)); + return addCFIPseudo(BB, Pos, Idx); + } + + /// Insert a CFI pseudo instruction in a basic block. This pseudo instruction + /// is a placeholder that refers to a real MCCFIInstruction object kept by + /// this function that will be emitted at that position. + BinaryBasicBlock::iterator addCFIPseudo(BinaryBasicBlock *BB, + BinaryBasicBlock::iterator Pos, + uint32_t Offset) { + MCInst CFIPseudo; + BC.MIB->createCFI(CFIPseudo, Offset); + return BB->insertPseudoInstr(Pos, CFIPseudo); + } + + /// Retrieve the MCCFIInstruction object associated with a CFI pseudo. 
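+  ///
+  /// For example (illustrative): the pseudo's first operand is an index into
+  /// FrameInstructions, so a caller can recover the original CFI directive:
+  /// \code
+  ///   if (const MCCFIInstruction *CFI = BF.getCFIFor(Inst))
+  ///     processCFI(*CFI); // processCFI() is a hypothetical callback
+  /// \endcode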
+ MCCFIInstruction* getCFIFor(const MCInst &Instr) { + if (!BC.MIB->isCFI(Instr)) + return nullptr; + uint32_t Offset = Instr.getOperand(0).getImm(); + assert(Offset < FrameInstructions.size() && "Invalid CFI offset"); + return &FrameInstructions[Offset]; + } + + const MCCFIInstruction* getCFIFor(const MCInst &Instr) const { + if (!BC.MIB->isCFI(Instr)) + return nullptr; + uint32_t Offset = Instr.getOperand(0).getImm(); + assert(Offset < FrameInstructions.size() && "Invalid CFI offset"); + return &FrameInstructions[Offset]; + } + + BinaryFunction &setFileOffset(uint64_t Offset) { + FileOffset = Offset; + return *this; + } + + BinaryFunction &setSize(uint64_t S) { + Size = S; + return *this; + } + + BinaryFunction &setMaxSize(uint64_t Size) { + MaxSize = Size; + return *this; + } + + BinaryFunction &setOutputAddress(uint64_t Address) { + OutputAddress = Address; + return *this; + } + + BinaryFunction &setOutputSize(uint64_t Size) { + OutputSize = Size; + return *this; + } + + BinaryFunction &setSimple(bool Simple) { + IsSimple = Simple; + return *this; + } + + void setPseudo(bool Pseudo) { + IsPseudo = Pseudo; + } + + BinaryFunction &setUsesGnuArgsSize(bool Uses = true) { + UsesGnuArgsSize = Uses; + return *this; + } + + BinaryFunction &setHasProfileAvailable(bool V = true) { + HasProfileAvailable = V; + return *this; + } + + /// Mark function that should not be emitted. + void setIgnored(); + + void setIsPatched(bool V) { + IsPatched = V; + } + + void setHasSplitJumpTable(bool V) { + HasSplitJumpTable = V; + } + + void setHasCanonicalCFG(bool V) { + HasCanonicalCFG = V; + } + + void setFolded(BinaryFunction *BF) { + FoldedIntoFunction = BF; + } + + BinaryFunction &setPersonalityFunction(uint64_t Addr) { + assert(!PersonalityFunction && "can't set personality function twice"); + PersonalityFunction = BC.getOrCreateGlobalSymbol(Addr, "FUNCat"); + return *this; + } + + BinaryFunction &setPersonalityEncoding(uint8_t Encoding) { + PersonalityEncoding = Encoding; + return *this; + } + + BinaryFunction &setAlignment(uint16_t Align) { + Alignment = Align; + return *this; + } + + uint16_t getAlignment() const { + return Alignment; + } + + BinaryFunction &setMaxAlignmentBytes(uint16_t MaxAlignBytes) { + MaxAlignmentBytes = MaxAlignBytes; + return *this; + } + + uint16_t getMaxAlignmentBytes() const { + return MaxAlignmentBytes; + } + + BinaryFunction &setMaxColdAlignmentBytes(uint16_t MaxAlignBytes) { + MaxColdAlignmentBytes = MaxAlignBytes; + return *this; + } + + uint16_t getMaxColdAlignmentBytes() const { + return MaxColdAlignmentBytes; + } + + BinaryFunction &setImageAddress(uint64_t Address) { + ImageAddress = Address; + return *this; + } + + /// Return the address of this function' image in memory. + uint64_t getImageAddress() const { + return ImageAddress; + } + + BinaryFunction &setImageSize(uint64_t Size) { + ImageSize = Size; + return *this; + } + + /// Return the size of this function' image in memory. + uint64_t getImageSize() const { + return ImageSize; + } + + /// Return true if the function is a secondary fragment of another function. + bool isFragment() const { + return IsFragment; + } + + /// Return parent function fragment if this function is a secondary (child) + /// fragment of another function. + BinaryFunction *getParentFragment() const { + return ParentFragment; + } + + /// If the function is a nested child fragment of another function, return its + /// topmost parent fragment. 
+ const BinaryFunction *getTopmostFragment() const { + const BinaryFunction *BF = this; + while (BF->getParentFragment()) + BF = BF->getParentFragment(); + + return BF; + } + + /// Set the profile data for the number of times the function was called. + BinaryFunction &setExecutionCount(uint64_t Count) { + ExecutionCount = Count; + return *this; + } + + /// Adjust execution count for the function by a given \p Count. The value + /// \p Count will be subtracted from the current function count. + /// + /// The function will proportionally adjust execution count for all + /// basic blocks and edges in the control flow graph. + void adjustExecutionCount(uint64_t Count); + + /// Set LSDA address for the function. + BinaryFunction &setLSDAAddress(uint64_t Address) { + LSDAAddress = Address; + return *this; + } + + /// Set LSDA symbol for the function. + BinaryFunction &setLSDASymbol(MCSymbol *Symbol) { + LSDASymbol = Symbol; + return *this; + } + + /// Return the profile information about the number of times + /// the function was executed. + /// + /// Return COUNT_NO_PROFILE if there's no profile info. + uint64_t getExecutionCount() const { + return ExecutionCount; + } + + /// Return the raw profile information about the number of branch + /// executions corresponding to this function. + uint64_t getRawBranchCount() const { return RawBranchCount; } + + /// Return the execution count for functions with known profile. + /// Return 0 if the function has no profile. + uint64_t getKnownExecutionCount() const { + return ExecutionCount == COUNT_NO_PROFILE ? 0 : ExecutionCount; + } + + /// Return original LSDA address for the function or NULL. + uint64_t getLSDAAddress() const { + return LSDAAddress; + } + + /// Return symbol pointing to function's LSDA. + MCSymbol *getLSDASymbol() { + if (LSDASymbol) + return LSDASymbol; + if (CallSites.empty()) + return nullptr; + + LSDASymbol = + BC.Ctx->getOrCreateSymbol(Twine("GCC_except_table") + + Twine::utohexstr(getFunctionNumber())); + + return LSDASymbol; + } + + /// Return symbol pointing to function's LSDA for the cold part. + MCSymbol *getColdLSDASymbol() { + if (ColdLSDASymbol) + return ColdLSDASymbol; + if (ColdCallSites.empty()) + return nullptr; + + ColdLSDASymbol = + BC.Ctx->getOrCreateSymbol(Twine("GCC_cold_except_table") + + Twine::utohexstr(getFunctionNumber())); + + return ColdLSDASymbol; + } + + /// True if the symbol is a mapping symbol used in AArch64 to delimit + /// data inside code section. + bool isDataMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; + bool isCodeMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; + + void setOutputDataAddress(uint64_t Address) { + OutputDataOffset = Address; + } + + uint64_t getOutputDataAddress() const { + return OutputDataOffset; + } + + void setOutputColdDataAddress(uint64_t Address) { + OutputColdDataOffset = Address; + } + + uint64_t getOutputColdDataAddress() const { + return OutputColdDataOffset; + } + + /// If \p Address represents an access to a constant island managed by this + /// function, return a symbol so code can safely refer to it. Otherwise, + /// return nullptr. First return value is the symbol for reference in the + /// hot code area while the second return value is the symbol for reference + /// in the cold code area, as when the function is split the islands are + /// duplicated. 
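+  ///
+  /// Illustrative use (the address is hypothetical):
+  /// \code
+  ///   if (MCSymbol *IslandSym = BF.getOrCreateIslandAccess(0x400123))
+  ///     emitReferenceTo(IslandSym); // emitReferenceTo() is hypothetical
+  /// \endcode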
+ MCSymbol *getOrCreateIslandAccess(uint64_t Address) { + MCSymbol *Symbol; + if (!isInConstantIsland(Address)) + return nullptr; + + // Register our island at global namespace + Symbol = BC.getOrCreateGlobalSymbol(Address, "ISLANDat"); + + // Internal bookkeeping + const uint64_t Offset = Address - getAddress(); + assert((!Islands.Offsets.count(Offset) || + Islands.Offsets[Offset] == Symbol) && + "Inconsistent island symbol management"); + if (!Islands.Offsets.count(Offset)) { + Islands.Offsets[Offset] = Symbol; + Islands.Symbols.insert(Symbol); + } + return Symbol; + } + + /// Called by an external function which wishes to emit references to constant + /// island symbols of this function. We create a proxy for it, so we emit + /// separate symbols when emitting our constant island on behalf of this other + /// function. + MCSymbol * + getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction &Referrer) { + MCSymbol *Symbol = getOrCreateIslandAccess(Address); + if (!Symbol) + return nullptr; + + MCSymbol *Proxy; + if (!Islands.Proxies[&Referrer].count(Symbol)) { + Proxy = + BC.Ctx->getOrCreateSymbol(Symbol->getName() + + ".proxy.for." + Referrer.getPrintName()); + Islands.Proxies[&Referrer][Symbol] = Proxy; + Islands.Proxies[&Referrer][Proxy] = Symbol; + } + Proxy = Islands.Proxies[&Referrer][Symbol]; + return Proxy; + } + + /// Detects whether \p Address is inside a data region in this function + /// (constant islands). + bool isInConstantIsland(uint64_t Address) const { + if (Address < getAddress()) + return false; + + uint64_t Offset = Address - getAddress(); + + if (Offset >= getMaxSize()) + return false; + + auto DataIter = Islands.DataOffsets.upper_bound(Offset); + if (DataIter == Islands.DataOffsets.begin()) + return false; + DataIter = std::prev(DataIter); + + auto CodeIter = Islands.CodeOffsets.upper_bound(Offset); + if (CodeIter == Islands.CodeOffsets.begin()) + return true; + + return *std::prev(CodeIter) <= *DataIter; + } + + uint64_t + estimateConstantIslandSize(const BinaryFunction *OnBehalfOf = nullptr) const { + uint64_t Size = 0; + for (auto DataIter = Islands.DataOffsets.begin(); + DataIter != Islands.DataOffsets.end(); + ++DataIter) { + auto NextData = std::next(DataIter); + auto CodeIter = Islands.CodeOffsets.lower_bound(*DataIter); + if (CodeIter == Islands.CodeOffsets.end() && + NextData == Islands.DataOffsets.end()) { + Size += getMaxSize() - *DataIter; + continue; + } + + uint64_t NextMarker; + if (CodeIter == Islands.CodeOffsets.end()) + NextMarker = *NextData; + else if (NextData == Islands.DataOffsets.end()) + NextMarker = *CodeIter; + else + NextMarker = (*CodeIter > *NextData) ? *NextData : *CodeIter; + + Size += NextMarker - *DataIter; + } + + if (!OnBehalfOf) { + for (BinaryFunction *ExternalFunc : Islands.Dependency) + Size += ExternalFunc->estimateConstantIslandSize(this); + } + return Size; + } + + bool hasConstantIsland() const { + return !Islands.DataOffsets.empty(); + } + + /// Return true iff the symbol could be seen inside this function otherwise + /// it is probably another function. + bool isSymbolValidInScope(const SymbolRef &Symbol, uint64_t SymbolSize) const; + + /// Disassemble function from raw data. + /// If successful, this function will populate the list of instructions + /// for this function together with offsets from the function start + /// in the input. It will also populate Labels with destinations for + /// local branches, and TakenBranches with [from, to] info. 
+  ///
+  /// The function should be properly initialized before this method is
+  /// called, i.e. the function address and size should be set.
+  ///
+  /// Returns true on successful disassembly, and updates the current
+  /// state to State::Disassembled.
+  ///
+  /// Returns false if disassembly failed.
+  bool disassemble();
+
+  /// Scan the function for references to other functions. In relocation mode,
+  /// add relocations for external references.
+  ///
+  /// Return true on success.
+  bool scanExternalRefs();
+
+  /// Return the size of a data object located at \p Offset in the function.
+  /// Return 0 if there is no data object at \p Offset.
+  size_t getSizeOfDataInCodeAt(uint64_t Offset) const;
+
+  /// Verify that starting at \p Offset the function contents are filled with
+  /// zero-value bytes.
+  bool isZeroPaddingAt(uint64_t Offset) const;
+
+  /// Check that entry points have an associated instruction at their
+  /// offsets after disassembly.
+  void postProcessEntryPoints();
+
+  /// Post-processing for jump tables after disassembly. Since their
+  /// boundaries are not known until all call sites are seen, we need this
+  /// extra pass to perform any final adjustments.
+  void postProcessJumpTables();
+
+  /// Builds a list of basic blocks with successor and predecessor info.
+  ///
+  /// The function should be in the Disassembled state prior to the call.
+  ///
+  /// Returns true on success and updates the current function state to
+  /// State::CFG. Returns false if the CFG cannot be built.
+  bool buildCFG(MCPlusBuilder::AllocatorIdTy);
+
+  /// Perform post-processing of the CFG.
+  void postProcessCFG();
+
+  /// Verify that any assumptions we've made about indirect branches were
+  /// correct and also make any necessary changes to unknown indirect branches.
+  ///
+  /// Catch-22: we need to know indirect branch targets to build the CFG, and
+  /// in order to determine the targets of indirect branches we need to know
+  /// the CFG.
+  ///
+  /// As such, the process of decoding indirect branches is broken into 2
+  /// steps: first we make our best guess about a branch without knowing the
+  /// CFG, and later, after we have the CFG for the function, we verify our
+  /// earlier assumptions and also do our best at processing unknown indirect
+  /// branches.
+  ///
+  /// Return true upon successful processing, or false if the control flow
+  /// cannot be statically evaluated for any given indirect branch.
+  bool postProcessIndirectBranches(MCPlusBuilder::AllocatorIdTy AllocId);
+
+  /// Return all call site profile info for this function.
+  IndirectCallSiteProfile &getAllCallSites() {
+    return AllCallSites;
+  }
+
+  const IndirectCallSiteProfile &getAllCallSites() const {
+    return AllCallSites;
+  }
+
+  /// Walks the list of basic blocks filling in missing information about
+  /// edge frequency for fall-throughs.
+  ///
+  /// Assumes the CFG has been built and edge frequency for taken branches
+  /// has been filled with LBR data.
+  void inferFallThroughCounts();
+
+  /// Clear the execution profile of the function.
+  void clearProfile();
+
+  /// Converts conditional tail calls to unconditional tail calls. We do this
+  /// to handle conditional tail calls correctly and to give a chance to the
+  /// simplify conditional tail call pass to decide whether to re-optimize them
+  /// using profile information.
+  void removeConditionalTailCalls();
+
+  // Convert COUNT_NO_PROFILE to 0.
+  void removeTagsFromProfile();
+
+  /// Computes a function hotness score: the sum of the products of BB
+  /// frequency and size.
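+  ///
+  /// For example (made-up numbers): two blocks of 16 and 32 bytes executed
+  /// 100 and 10 times respectively give a score of 16*100 + 32*10 = 1920.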
+  uint64_t getFunctionScore() const;
+
+  /// Return true if the layout has been changed by basic block reordering,
+  /// false otherwise.
+  bool hasLayoutChanged() const;
+
+  /// Get the edit distance of the new layout with respect to the previous
+  /// layout after basic block reordering.
+  uint64_t getEditDistance() const;
+
+  /// Get the number of instructions within this function.
+  uint64_t getInstructionCount() const;
+
+  const CFIInstrMapType &getFDEProgram() const {
+    return FrameInstructions;
+  }
+
+  void moveRememberRestorePair(BinaryBasicBlock *BB);
+
+  bool replayCFIInstrs(int32_t FromState, int32_t ToState,
+                       BinaryBasicBlock *InBB,
+                       BinaryBasicBlock::iterator InsertIt);
+
+  /// unwindCFIState is used to unwind from a higher to a lower state number
+  /// without using remember-restore instructions. We do that by keeping track
+  /// of what values have been changed from state A to B and emitting
+  /// instructions that undo this change.
+  SmallVector<int32_t, 4> unwindCFIState(int32_t FromState, int32_t ToState,
+                                         BinaryBasicBlock *InBB,
+                                         BinaryBasicBlock::iterator &InsertIt);
+
+  /// After reordering, this function checks the state of CFI and fixes it if
+  /// it is corrupted. If it is unable to fix it, it returns false.
+  bool finalizeCFIState();
+
+  /// Return true if this function needs an address-translation table after
+  /// its code emission.
+  bool requiresAddressTranslation() const;
+
+  /// Adjust branch instructions to match the CFG.
+  ///
+  /// When it comes to internal branches, the CFG represents "the ultimate
+  /// source of truth". Transformations on functions and blocks have to update
+  /// the CFG and fixBranches() would make sure the correct branch instructions
+  /// are inserted at the end of basic blocks.
+  ///
+  /// We do require a conditional branch at the end of the basic block if
+  /// the block has 2 successors, as the CFG currently lacks conditional
+  /// code support (it will probably stay that way). We only use this
+  /// branch instruction for its conditional code; the destination is
+  /// determined by the CFG - the first successor represents the true/taken
+  /// branch, while the second successor represents the false/fall-through
+  /// branch.
+  ///
+  /// When we reverse the branch condition, the CFG is updated accordingly.
+  void fixBranches();
+
+  /// Mark function as finalized. No further optimizations are permitted.
+  void setFinalized() {
+    CurrentState = State::CFG_Finalized;
+  }
+
+  void setEmitted(bool KeepCFG = false) {
+    CurrentState = State::EmittedCFG;
+    if (!KeepCFG) {
+      releaseCFG();
+      CurrentState = State::Emitted;
+    }
+  }
+
+  /// Process LSDA information for the function.
+  void parseLSDA(ArrayRef<uint8_t> LSDAData, uint64_t LSDAAddress);
+
+  /// Update exception handling ranges for the function.
+  void updateEHRanges();
+
+  /// Traverse cold basic blocks and replace references to constants in islands
+  /// with a proxy symbol for the duplicated constant island that is going to
+  /// be emitted in the cold region.
+  void duplicateConstantIslands();
+
+  /// Merge profile data of this function into those of the given
+  /// function. The functions should have been proven identical with
+  /// isIdenticalWith.
+  void mergeProfileDataInto(BinaryFunction &BF) const;
+
+  /// Returns the last computed hash value of the function.
+  size_t getHash() const {
+    return Hash;
+  }
+
+  using OperandHashFuncTy =
+      function_ref<std::string(const MCOperand &)>;
+
+  /// Compute the hash value of the function based on its contents.
+  ///
+  /// If \p UseDFS is set, process basic blocks in DFS order.
Otherwise, use + /// the existing layout order. + /// + /// By default, instruction operands are ignored while calculating the hash. + /// The caller can change this via passing \p OperandHashFunc function. + /// The return result of this function will be mixed with internal hash. + size_t computeHash(bool UseDFS = false, + OperandHashFuncTy OperandHashFunc = + [](const MCOperand&) { return std::string(); }) const; + + void setDWARFUnit(DWARFUnit *Unit) { + DwarfUnit = Unit; + } + + /// Return DWARF compile unit for this function. + DWARFUnit *getDWARFUnit() const { + return DwarfUnit; + } + + /// Return line info table for this function. + const DWARFDebugLine::LineTable *getDWARFLineTable() const { + return getDWARFUnit() ? BC.DwCtx->getLineTableForUnit(getDWARFUnit()) + : nullptr; + } + + /// Finalize profile for the function. + void postProcessProfile(); + + /// Returns an estimate of the function's hot part after splitting. + /// This is a very rough estimate, as with C++ exceptions there are + /// blocks we don't move, and it makes no attempt at estimating the size + /// of the added/removed branch instructions. + /// Note that this size is optimistic and the actual size may increase + /// after relaxation. + size_t estimateHotSize(const bool UseSplitSize = true) const { + size_t Estimate = 0; + if (UseSplitSize && isSplit()) { + for (const BinaryBasicBlock *BB : BasicBlocksLayout) { + if (!BB->isCold()) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + } else { + for (const BinaryBasicBlock *BB : BasicBlocksLayout) { + if (BB->getKnownExecutionCount() != 0) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + } + return Estimate; + } + + size_t estimateColdSize() const { + if (!isSplit()) + return estimateSize(); + size_t Estimate = 0; + for (const BinaryBasicBlock *BB : BasicBlocksLayout) { + if (BB->isCold()) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + return Estimate; + } + + size_t estimateSize() const { + size_t Estimate = 0; + for (const BinaryBasicBlock *BB : BasicBlocksLayout) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + return Estimate; + } + + /// Return output address ranges for a function. + DebugAddressRangesVector getOutputAddressRanges() const; + + /// Given an address corresponding to an instruction in the input binary, + /// return an address of this instruction in output binary. + /// + /// Return 0 if no matching address could be found or the instruction was + /// removed. + uint64_t translateInputToOutputAddress(uint64_t Address) const; + + /// Take address ranges corresponding to the input binary and translate + /// them to address ranges in the output binary. + DebugAddressRangesVector translateInputToOutputRanges( + const DWARFAddressRangesVector &InputRanges) const; + + /// Similar to translateInputToOutputRanges() but operates on location lists + /// and moves associated data to output location lists. + DebugLocationsVector translateInputToOutputLocationList( + const DebugLocationsVector &InputLL) const; + + /// Return true if the function is an AArch64 linker inserted veneer + bool isAArch64Veneer() const; + + virtual ~BinaryFunction(); + + /// Info for fragmented functions. 
+  class FragmentInfo {
+  private:
+    uint64_t Address{0};
+    uint64_t ImageAddress{0};
+    uint64_t ImageSize{0};
+    uint64_t FileOffset{0};
+  public:
+    uint64_t getAddress() const { return Address; }
+    uint64_t getImageAddress() const { return ImageAddress; }
+    uint64_t getImageSize() const { return ImageSize; }
+    uint64_t getFileOffset() const { return FileOffset; }
+
+    void setAddress(uint64_t VAddress) { Address = VAddress; }
+    void setImageAddress(uint64_t Address) { ImageAddress = Address; }
+    void setImageSize(uint64_t Size) { ImageSize = Size; }
+    void setFileOffset(uint64_t Offset) { FileOffset = Offset; }
+  };
+
+  /// Cold fragment of the function.
+  FragmentInfo ColdFragment;
+
+  FragmentInfo &cold() { return ColdFragment; }
+
+  const FragmentInfo &cold() const { return ColdFragment; }
+
+  /// Mark child fragments as ignored.
+  void ignoreFragments() {
+    for (BinaryFunction *Fragment : Fragments)
+      Fragment->setIgnored();
+  }
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const BinaryFunction &Function) {
+  OS << Function.getPrintName();
+  return OS;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const BinaryFunction::State State) {
+  switch (State) {
+  case BinaryFunction::State::Empty:         OS << "empty"; break;
+  case BinaryFunction::State::Disassembled:  OS << "disassembled"; break;
+  case BinaryFunction::State::CFG:           OS << "CFG constructed"; break;
+  case BinaryFunction::State::CFG_Finalized: OS << "CFG finalized"; break;
+  case BinaryFunction::State::EmittedCFG:    OS << "emitted with CFG"; break;
+  case BinaryFunction::State::Emitted:       OS << "emitted"; break;
+  }
+
+  return OS;
+}
+
+} // namespace bolt
+
+
+// GraphTraits specializations for function basic block graphs (CFGs)
+template <> struct GraphTraits<bolt::BinaryFunction *> :
+    public GraphTraits<bolt::BinaryBasicBlock *> {
+  static NodeRef getEntryNode(bolt::BinaryFunction *F) {
+    return *F->layout_begin();
+  }
+
+  using nodes_iterator = pointer_iterator<bolt::BinaryFunction::iterator>;
+
+  static nodes_iterator nodes_begin(bolt::BinaryFunction *F) {
+    llvm_unreachable("Not implemented");
+    return nodes_iterator(F->begin());
+  }
+  static nodes_iterator nodes_end(bolt::BinaryFunction *F) {
+    llvm_unreachable("Not implemented");
+    return nodes_iterator(F->end());
+  }
+  static size_t size(bolt::BinaryFunction *F) {
+    return F->size();
+  }
+};
+
+template <> struct GraphTraits<const bolt::BinaryFunction *> :
+    public GraphTraits<const bolt::BinaryBasicBlock *> {
+  static NodeRef getEntryNode(const bolt::BinaryFunction *F) {
+    return *F->layout_begin();
+  }
+
+  using nodes_iterator =
+      pointer_iterator<bolt::BinaryFunction::const_iterator>;
+
+  static nodes_iterator nodes_begin(const bolt::BinaryFunction *F) {
+    llvm_unreachable("Not implemented");
+    return nodes_iterator(F->begin());
+  }
+  static nodes_iterator nodes_end(const bolt::BinaryFunction *F) {
+    llvm_unreachable("Not implemented");
+    return nodes_iterator(F->end());
+  }
+  static size_t size(const bolt::BinaryFunction *F) {
+    return F->size();
+  }
+};
+
+template <> struct GraphTraits<Inverse<bolt::BinaryFunction *>> :
+    public GraphTraits<Inverse<bolt::BinaryBasicBlock *>> {
+  static NodeRef getEntryNode(Inverse<bolt::BinaryFunction *> G) {
+    return *G.Graph->layout_begin();
+  }
+};
+
+template <> struct GraphTraits<Inverse<const bolt::BinaryFunction *>> :
+    public GraphTraits<Inverse<const bolt::BinaryBasicBlock *>> {
+  static NodeRef getEntryNode(Inverse<const bolt::BinaryFunction *> G) {
+    return *G.Graph->layout_begin();
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinaryFunctionProfile.cpp b/bolt/src/BinaryFunctionProfile.cpp
new file mode 100644
index 000000000000..d7aebe485026
--- /dev/null
+++ b/bolt/src/BinaryFunctionProfile.cpp
@@ -0,0 +1,372 @@
+//===--- BinaryFunctionProfile.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "Passes/MCF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-prof" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt IndirectCallPromotion; +extern cl::opt JumpTables; + +static cl::opt +DoMCF("mcf", + cl::desc("solve a min cost flow problem on the CFG to fix edge counts " + "(default=disable)"), + cl::init(MCF_DISABLE), + cl::values( + clEnumValN(MCF_DISABLE, "none", + "disable MCF"), + clEnumValN(MCF_LINEAR, "linear", + "cost function is inversely proportional to edge count"), + clEnumValN(MCF_QUADRATIC, "quadratic", + "cost function is inversely proportional to edge count squared"), + clEnumValN(MCF_LOG, "log", + "cost function is inversely proportional to log of edge count"), + clEnumValN(MCF_BLAMEFTS, "blamefts", + "tune cost to blame fall-through edges for surplus flow")), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +FixFuncCounts("fix-func-counts", + cl::desc("adjust function counts based on basic blocks execution count"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +FixBlockCounts("fix-block-counts", + cl::desc("adjust block counts based on outgoing branch counts"), + cl::init(true), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +InferFallThroughs("infer-fall-throughs", + cl::desc("infer execution count for fall-through blocks"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +void BinaryFunction::postProcessProfile() { + if (!hasValidProfile()) { + clearProfile(); + return; + } + + if (!(getProfileFlags() & PF_LBR)) { + // Check if MCF post-processing was requested. + if (opts::DoMCF != MCF_DISABLE) { + removeTagsFromProfile(); + solveMCF(*this, opts::DoMCF); + } + return; + } + + // If we have at least some branch data for the function indicate that it + // was executed. + if (opts::FixFuncCounts && ExecutionCount == 0) { + ExecutionCount = 1; + } + + // Compute preliminary execution count for each basic block. + for (BinaryBasicBlock *BB : BasicBlocks) { + if ((!BB->isEntryPoint() && !BB->isLandingPad()) || + BB->ExecutionCount == BinaryBasicBlock::COUNT_NO_PROFILE) + BB->ExecutionCount = 0; + } + for (BinaryBasicBlock *BB : BasicBlocks) { + auto SuccBIIter = BB->branch_info_begin(); + for (BinaryBasicBlock *Succ : BB->successors()) { + // All incoming edges to the primary entry have been accounted for, thus + // we skip the update here. + if (SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + Succ != BasicBlocks.front()) + Succ->setExecutionCount(Succ->getExecutionCount() + SuccBIIter->Count); + ++SuccBIIter; + } + } + + if (opts::FixBlockCounts) { + for (BinaryBasicBlock *BB : BasicBlocks) { + // Make sure that execution count of a block is at least the branch count + // of an incoming/outgoing jump. 
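+      // For example (illustrative counts): if a block's preliminary count is
+      // 5 but one of its outgoing edges was recorded as taken 20 times, both
+      // this block and the successor are raised to at least 20 below.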
+ auto SuccBIIter = BB->branch_info_begin(); + for (BinaryBasicBlock *Succ : BB->successors()) { + uint64_t Count = SuccBIIter->Count; + if (Count != BinaryBasicBlock::COUNT_NO_PROFILE && Count > 0) { + Succ->setExecutionCount(std::max(Succ->getExecutionCount(), Count)); + BB->setExecutionCount(std::max(BB->getExecutionCount(), Count)); + } + ++SuccBIIter; + } + // Make sure that execution count of a block is at least the number of + // function calls from the block. + for (MCInst &Inst : *BB) { + // Ignore non-call instruction + if (!BC.MIA->isCall(Inst)) + continue; + + auto CountAnnt = BC.MIB->tryGetAnnotationAs(Inst, "Count"); + if (CountAnnt) { + BB->setExecutionCount(std::max(BB->getExecutionCount(), *CountAnnt)); + } + } + } + } + + if (opts::InferFallThroughs) + inferFallThroughCounts(); + + // Check if MCF post-processing was requested. + if (opts::DoMCF != MCF_DISABLE) { + removeTagsFromProfile(); + solveMCF(*this, opts::DoMCF); + } + + // Update profile information for jump tables based on CFG branch data. + for (BinaryBasicBlock *BB : BasicBlocks) { + const MCInst *LastInstr = BB->getLastNonPseudoInstr(); + if (!LastInstr) + continue; + const uint64_t JTAddress = BC.MIB->getJumpTable(*LastInstr); + if (!JTAddress) + continue; + JumpTable *JT = getJumpTableContainingAddress(JTAddress); + if (!JT) + continue; + + uint64_t TotalBranchCount = 0; + for (const BinaryBasicBlock::BinaryBranchInfo &BranchInfo : + BB->branch_info()) { + TotalBranchCount += BranchInfo.Count; + } + JT->Count += TotalBranchCount; + + if (opts::IndirectCallPromotion < ICP_JUMP_TABLES && + opts::JumpTables < JTS_AGGRESSIVE) + continue; + + if (JT->Counts.empty()) + JT->Counts.resize(JT->Entries.size()); + auto EI = JT->Entries.begin(); + uint64_t Delta = (JTAddress - JT->getAddress()) / JT->EntrySize; + EI += Delta; + while (EI != JT->Entries.end()) { + const BinaryBasicBlock *TargetBB = getBasicBlockForLabel(*EI); + if (TargetBB) { + const BinaryBasicBlock::BinaryBranchInfo &BranchInfo = + BB->getBranchInfo(*TargetBB); + assert(Delta < JT->Counts.size()); + JT->Counts[Delta].Count += BranchInfo.Count; + JT->Counts[Delta].Mispreds += BranchInfo.MispredictedCount; + } + ++Delta; + ++EI; + // A label marks the start of another jump table. + if (JT->Labels.count(Delta * JT->EntrySize)) + break; + } + } +} + +void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { + // No reason to merge invalid or empty profiles into BF. + if (!hasValidProfile()) + return; + + // Update function execution count. + if (getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) { + BF.setExecutionCount(BF.getKnownExecutionCount() + getExecutionCount()); + } + + // Since we are merging a valid profile, the new profile should be valid too. + // It has either already been valid, or it has been cleaned up. + BF.ProfileMatchRatio = 1.0f; + + // Update basic block and edge counts. + auto BBMergeI = BF.begin(); + for (BinaryBasicBlock *BB : BasicBlocks) { + BinaryBasicBlock *BBMerge = &*BBMergeI; + assert(getIndex(BB) == BF.getIndex(BBMerge)); + + // Update basic block count. + if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) { + BBMerge->setExecutionCount( + BBMerge->getKnownExecutionCount() + BB->getExecutionCount()); + } + + // Update edge count for successors of this basic block. 
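+    // The two functions have identical CFGs, so this block's branch info and
+    // the corresponding merged block's branch info can be walked in lockstep,
+    // adding up counts entry by entry.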
+ auto BBMergeSI = BBMerge->succ_begin(); + auto BIMergeI = BBMerge->branch_info_begin(); + auto BII = BB->branch_info_begin(); + for (const BinaryBasicBlock *BBSucc : BB->successors()) { + (void)BBSucc; + assert(getIndex(BBSucc) == BF.getIndex(*BBMergeSI)); + + // At this point no branch count should be set to COUNT_NO_PROFILE. + assert(BII->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "unexpected unknown branch profile"); + assert(BIMergeI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "unexpected unknown branch profile"); + + BIMergeI->Count += BII->Count; + + // When we merge inferred and real fall-through branch data, the merged + // data is considered inferred. + if (BII->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED && + BIMergeI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) { + BIMergeI->MispredictedCount += BII->MispredictedCount; + } else { + BIMergeI->MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; + } + + ++BBMergeSI; + ++BII; + ++BIMergeI; + } + assert(BBMergeSI == BBMerge->succ_end()); + + ++BBMergeI; + } + assert(BBMergeI == BF.end()); + + // Merge jump tables profile info. + auto JTMergeI = BF.JumpTables.begin(); + for (const auto &JTEntry : JumpTables) { + if (JTMergeI->second->Counts.empty()) + JTMergeI->second->Counts.resize(JTEntry.second->Counts.size()); + auto CountMergeI = JTMergeI->second->Counts.begin(); + for (const JumpTable::JumpInfo &JI : JTEntry.second->Counts) { + CountMergeI->Count += JI.Count; + CountMergeI->Mispreds += JI.Mispreds; + ++CountMergeI; + } + assert(CountMergeI == JTMergeI->second->Counts.end()); + + ++JTMergeI; + } + assert(JTMergeI == BF.JumpTables.end()); +} + +void BinaryFunction::inferFallThroughCounts() { + // Work on a basic block at a time, propagating frequency information + // forwards. + // It is important to walk in the layout order. + for (BinaryBasicBlock *BB : BasicBlocks) { + const uint64_t BBExecCount = BB->getExecutionCount(); + + // Propagate this information to successors, filling in fall-through edges + // with frequency information + if (BB->succ_size() == 0) + continue; + + // Calculate frequency of outgoing branches from this node according to + // LBR data. + uint64_t ReportedBranches = 0; + for (const BinaryBasicBlock::BinaryBranchInfo &SuccBI : BB->branch_info()) { + if (SuccBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) + ReportedBranches += SuccBI.Count; + } + + // Get taken count of conditional tail call if the block ends with one. + uint64_t CTCTakenCount = 0; + const MCInst *CTCInstr = BB->getLastNonPseudoInstr(); + if (CTCInstr && BC.MIB->getConditionalTailCall(*CTCInstr)) { + CTCTakenCount = + BC.MIB->getAnnotationWithDefault(*CTCInstr, "CTCTakenCount"); + } + + // Calculate frequency of throws from this node according to LBR data + // for branching into associated landing pads. Since it is possible + // for a landing pad to be associated with more than one basic blocks, + // we may overestimate the frequency of throws for such blocks. + uint64_t ReportedThrows = 0; + for (const BinaryBasicBlock *LP : BB->landing_pads()) { + ReportedThrows += LP->getExecutionCount(); + } + + const uint64_t TotalReportedJumps = + ReportedBranches + CTCTakenCount + ReportedThrows; + + // Infer the frequency of the fall-through edge, representing not taking the + // branch. 
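+    // For example (made-up counts): a block executed 100 times whose taken
+    // branches, conditional tail call, and throws together account for 60
+    // executions gets an inferred fall-through count of 100 - 60 = 40.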
+ uint64_t Inferred = 0; + if (BBExecCount > TotalReportedJumps) + Inferred = BBExecCount - TotalReportedJumps; + + LLVM_DEBUG( + if (BBExecCount < TotalReportedJumps) + dbgs() + << "Fall-through inference is slightly inconsistent. " + "exec frequency is less than the outgoing edges frequency (" + << BBExecCount << " < " << ReportedBranches + << ") for BB at offset 0x" + << Twine::utohexstr(getAddress() + BB->getOffset()) << '\n'; + ); + + if (BB->succ_size() <= 2) { + // Skip if the last instruction is an unconditional jump. + const MCInst *LastInstr = BB->getLastNonPseudoInstr(); + if (LastInstr && + (BC.MIB->isUnconditionalBranch(*LastInstr) || + BC.MIB->isIndirectBranch(*LastInstr))) + continue; + // If there is an FT it will be the last successor. + auto &SuccBI = *BB->branch_info_rbegin(); + auto &Succ = *BB->succ_rbegin(); + if (SuccBI.Count == 0) { + SuccBI.Count = Inferred; + SuccBI.MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; + Succ->ExecutionCount += Inferred; + } + } + } + + return; +} + +void BinaryFunction::clearProfile() { + // Keep function execution profile the same. Only clear basic block and edge + // counts. + for (BinaryBasicBlock *BB : BasicBlocks) { + BB->ExecutionCount = 0; + for (BinaryBasicBlock::BinaryBranchInfo &BI : BB->branch_info()) { + BI.Count = 0; + BI.MispredictedCount = 0; + } + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/BinaryLoop.h b/bolt/src/BinaryLoop.h new file mode 100644 index 000000000000..a09d75ed8fda --- /dev/null +++ b/bolt/src/BinaryLoop.h @@ -0,0 +1,93 @@ +//===--- BinaryLoop.h - Interface for machine-level loop ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the BinaryLoop class, which represents a loop in the +// CFG of a binary function, and the BinaryLoopInfo class, which stores +// information about all the loops of a binary function. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_LOOP_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_LOOP_H + +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfoImpl.h" +#include "llvm/Support/GenericDomTreeConstruction.h" + +namespace llvm { +namespace bolt { + +class BinaryBasicBlock; + +using BinaryDomTreeNode = DomTreeNodeBase; +using BinaryDominatorTree = DomTreeBase; + +class BinaryLoop : public LoopBase { +public: + BinaryLoop() : LoopBase() { } + + // The total count of all the back edges of this loop. + uint64_t TotalBackEdgeCount{0}; + + // The times the loop is entered from outside. + uint64_t EntryCount{0}; + + // The times the loop is exited. + uint64_t ExitCount{0}; + + // Most of the public interface is provided by LoopBase. + +protected: + friend class LoopInfoBase; + explicit BinaryLoop(BinaryBasicBlock *BB) : + LoopBase(BB) { } +}; + +class BinaryLoopInfo : public LoopInfoBase { +public: + BinaryLoopInfo() { } + + unsigned OuterLoops{0}; + unsigned TotalLoops{0}; + unsigned MaximumDepth{0}; + + // Most of the public interface is provided by LoopInfoBase. +}; + +} // namespace bolt +} // namespace llvm + +namespace llvm { + +// BinaryDominatorTree GraphTraits specializations. 
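+//
+// These specializations let generic graph algorithms walk the dominator
+// tree. A sketch of the traversal they enable (visit() is hypothetical):
+//
+//   bolt::BinaryDominatorTree DT;
+//   for (auto *Node : depth_first(&DT))
+//     visit(Node->getBlock());
+//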
+template <>
+struct GraphTraits<bolt::BinaryDomTreeNode *>
+    : public DomTreeGraphTraitsBase<bolt::BinaryDomTreeNode,
+                                    bolt::BinaryDomTreeNode::iterator> {};
+
+template <>
+struct GraphTraits<const bolt::BinaryDomTreeNode *>
+    : public DomTreeGraphTraitsBase<const bolt::BinaryDomTreeNode,
+                                    bolt::BinaryDomTreeNode::const_iterator> {};
+
+template <> struct GraphTraits<bolt::BinaryDominatorTree *>
+    : public GraphTraits<bolt::BinaryDomTreeNode *> {
+  static NodeRef getEntryNode(bolt::BinaryDominatorTree *DT) {
+    return DT->getRootNode();
+  }
+
+  static nodes_iterator nodes_begin(bolt::BinaryDominatorTree *N) {
+    return df_begin(getEntryNode(N));
+  }
+
+  static nodes_iterator nodes_end(bolt::BinaryDominatorTree *N) {
+    return df_end(getEntryNode(N));
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp
new file mode 100644
index 000000000000..07d307279481
--- /dev/null
+++ b/bolt/src/BinaryPassManager.cpp
@@ -0,0 +1,539 @@
+//===--- BinaryPassManager.cpp - Binary-level analysis/optimization passes ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryPassManager.h"
+#include "Passes/Aligner.h"
+#include "Passes/AllocCombiner.h"
+#include "Passes/FrameOptimizer.h"
+#include "Passes/IdenticalCodeFolding.h"
+#include "Passes/IndirectCallPromotion.h"
+#include "Passes/Inliner.h"
+#include "Passes/Instrumentation.h"
+#include "Passes/JTFootprintReduction.h"
+#include "Passes/LongJmp.h"
+#include "Passes/LoopInversionPass.h"
+#include "Passes/PLTCall.h"
+#include "Passes/PatchEntries.h"
+#include "Passes/RegReAssign.h"
+#include "Passes/ReorderData.h"
+#include "Passes/ReorderFunctions.h"
+#include "Passes/RetpolineInsertion.h"
+#include "Passes/SplitFunctions.h"
+#include "Passes/StokeInfo.h"
+#include "Passes/ValidateInternalCalls.h"
+#include "Passes/VeneerElimination.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <numeric>
+
+using namespace llvm;
+
+namespace opts {
+
+extern cl::OptionCategory BoltOptCategory;
+extern cl::OptionCategory BoltCategory;
+
+extern cl::opt<bool> Instrument;
+
+extern cl::opt<unsigned> Verbosity;
+extern cl::opt<bool> PrintAll;
+extern cl::opt<bool> PrintDynoStats;
+extern cl::opt<bool> DumpDotAll;
+extern cl::opt<bolt::PLTCall::OptType> PLT;
+
+static cl::opt<bool>
+DynoStatsAll("dyno-stats-all",
+  cl::desc("print dyno stats after each stage"),
+  cl::ZeroOrMore,
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+static cl::opt<bool>
+EliminateUnreachable("eliminate-unreachable",
+  cl::desc("eliminate unreachable code"),
+  cl::init(true),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+ICF("icf",
+  cl::desc("fold functions with identical code"),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+static cl::opt<bool>
+JTFootprintReductionFlag("jt-footprint-reduction",
+  cl::desc("make jump tables size smaller at the cost of using more "
+           "instructions at jump sites"),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+static cl::opt<bool>
+PrintJTFootprintReduction("print-after-jt-footprint-reduction",
+  cl::desc("print function after jt-footprint-reduction pass"),
+  cl::ZeroOrMore,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+NeverPrint("never-print",
+  cl::desc("never print"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::ReallyHidden,
+  cl::cat(BoltOptCategory));
+
+cl::opt<bool>
+PrintAfterBranchFixup("print-after-branch-fixup",
+  cl::desc("print function after fixing local branches"),
+  cl::Hidden,
cl::cat(BoltOptCategory)); + +static cl::opt +PrintAfterLowering("print-after-lowering", + cl::desc("print function after instruction lowering"), + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintFOP("print-fop", + cl::desc("print functions after frame optimizer pass"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +cl::opt +PrintFinalized("print-finalized", + cl::desc("print function after CFG is finalized"), + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintLongJmp("print-longjmp", + cl::desc("print functions after longjmp pass"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintICF("print-icf", + cl::desc("print functions after ICF optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintICP("print-icp", + cl::desc("print functions after indirect call promotion"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintRegReAssign("print-regreassign", + cl::desc("print functions after regreassign pass"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintInline("print-inline", + cl::desc("print functions after inlining optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintOptimizeBodyless("print-optimize-bodyless", + cl::desc("print functions after bodyless optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintPLT("print-plt", + cl::desc("print functions after PLT optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintPeepholes("print-peepholes", + cl::desc("print functions after peephole optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +cl::opt +PrintReordered("print-reordered", + cl::desc("print functions after layout optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintReorderedFunctions("print-reordered-functions", + cl::desc("print functions after clustering"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintSCTC("print-sctc", + cl::desc("print functions after conditional tail call simplification"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintSimplifyROLoads("print-simplify-rodata-loads", + cl::desc("print functions after simplification of RO data loads"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintSplit("print-split", + cl::desc("print functions after code splitting"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintUCE("print-uce", + cl::desc("print functions after unreachable code elimination"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintProfileStats("print-profile-stats", + cl::desc("print profile quality/bias analysis"), + cl::ZeroOrMore, + cl::init(false), + cl::cat(BoltCategory)); + +static cl::opt +SimplifyConditionalTailCalls("simplify-conditional-tail-calls", + cl::desc("simplify conditional tail calls by removing unnecessary jumps"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +SimplifyRODataLoads("simplify-rodata-loads", + cl::desc("simplify loads from read-only sections by replacing the memory " + "operand with the constant found in the corresponding section"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt 
+RegReAssign("reg-reassign", + cl::desc("reassign registers so as to avoid using REX prefixes in hot code"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +StringOps("inline-memcpy", + cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::list +SpecializeMemcpy1("memcpy1-spec", + cl::desc("list of functions with call sites for which to specialize memcpy() " + "for size 1"), + cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +StripRepRet("strip-rep-ret", + cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +llvm::cl::opt +TimeOpts("time-opts", + cl::desc("print time spent in each optimization"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt +VerifyCFG("verify-cfg", + cl::desc("verify the CFG after every pass"), + cl::init(false), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt +Stoke("stoke", + cl::desc("turn on the stoke analysis"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt +PrintStoke("print-stoke", + cl::desc("print functions after stoke analysis"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt + PrintVeneerElimination("print-veneer-elimination", + cl::desc("print functions after veneer elimination pass"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt + PrintRetpolineInsertion("print-retpoline-insertion", + cl::desc("print functions after retpoline insertion pass"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +using namespace opts; + +const char BinaryFunctionPassManager::TimerGroupName[] = + "passman"; +const char BinaryFunctionPassManager::TimerGroupDesc[] = + "Binary Function Pass Manager"; + +void BinaryFunctionPassManager::runPasses() { + auto &BFs = BC.getBinaryFunctions(); + for (size_t PassIdx = 0; PassIdx < Passes.size(); PassIdx++) { + const std::pair> + &OptPassPair = Passes[PassIdx]; + if (!OptPassPair.first) + continue; + + const std::unique_ptr &Pass = OptPassPair.second; + std::string PassIdName = + formatv("{0:2}_{1}", PassIdx, Pass->getName()).str(); + + if (opts::Verbosity > 0) { + outs() << "BOLT-INFO: Starting pass: " << Pass->getName() << "\n"; + } + + NamedRegionTimer T(Pass->getName(), Pass->getName(), TimerGroupName, + TimerGroupDesc, TimeOpts); + + callWithDynoStats( + [this,&Pass] { + Pass->runOnFunctions(BC); + }, + BFs, + Pass->getName(), + opts::DynoStatsAll + ); + + if (opts::VerifyCFG && + !std::accumulate( + BFs.begin(), BFs.end(), + true, + [](const bool Valid, + const std::pair &It) { + return Valid && It.second.validateCFG(); + })) { + errs() << "BOLT-ERROR: Invalid CFG detected after pass " + << Pass->getName() << "\n"; + exit(1); + } + + if (opts::Verbosity > 0) { + outs() << "BOLT-INFO: Finished pass: " << Pass->getName() << "\n"; + } + + if (!opts::PrintAll && !opts::DumpDotAll && !Pass->printPass()) + continue; + + const std::string Message = std::string("after ") + Pass->getName(); + + for (auto &It : BFs) { + auto &Function = It.second; + + if (!Pass->shouldPrint(Function)) + continue; + + Function.print(outs(), Message, true); + + if (opts::DumpDotAll) + 
Function.dumpGraphForPass(PassIdName); + } + } +} + +void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { + BinaryFunctionPassManager Manager(BC); + + const DynoStats InitialDynoStats = getDynoStats(BC.getBinaryFunctions()); + + if (opts::Instrument) { + Manager.registerPass(std::make_unique(NeverPrint)); + } + + // Here we manage dependencies/order manually, since passes are run in the + // order they're registered. + + // Run this pass first to use stats for the original functions. + Manager.registerPass(std::make_unique(NeverPrint)); + + if (opts::PrintProfileStats) + Manager.registerPass(std::make_unique(NeverPrint)); + + Manager.registerPass(std::make_unique(NeverPrint)); + + Manager.registerPass(std::make_unique(NeverPrint), + opts::StripRepRet); + + Manager.registerPass(std::make_unique(PrintICF), + opts::ICF); + + if (BC.isAArch64()) + Manager.registerPass( + std::make_unique(PrintVeneerElimination)); + + Manager.registerPass( + std::make_unique(NeverPrint, opts::SpecializeMemcpy1), + !opts::SpecializeMemcpy1.empty()); + + Manager.registerPass(std::make_unique(NeverPrint), + opts::StringOps); + + Manager.registerPass(std::make_unique(PrintICP)); + + Manager.registerPass( + std::make_unique(PrintJTFootprintReduction), + opts::JTFootprintReductionFlag); + + Manager.registerPass( + std::make_unique(PrintSimplifyROLoads), + opts::SimplifyRODataLoads); + + Manager.registerPass(std::make_unique(PrintRegReAssign), + opts::RegReAssign); + + Manager.registerPass(std::make_unique(PrintInline)); + + Manager.registerPass(std::make_unique(PrintICF), + opts::ICF); + + Manager.registerPass(std::make_unique(PrintPLT)); + + Manager.registerPass(std::make_unique(PrintReordered)); + + Manager.registerPass( + std::make_unique(PrintUCE), + opts::EliminateUnreachable); + + Manager.registerPass(std::make_unique(PrintSplit)); + + Manager.registerPass(std::make_unique()); + + // This pass syncs local branches with CFG. If any of the following + // passes breaks the sync - they either need to re-run the pass or + // fix branches consistency internally. + Manager.registerPass(std::make_unique(PrintAfterBranchFixup)); + + // This pass should come close to last since it uses the estimated hot + // size of a function to determine the order. It should definitely + // also happen after any changes to the call graph are made, e.g. inlining. + Manager.registerPass( + std::make_unique(PrintReorderedFunctions)); + + // Print final dyno stats right while CFG and instruction analysis are intact. + Manager.registerPass( + std::make_unique( + InitialDynoStats, "after all optimizations before SCTC and FOP"), + opts::PrintDynoStats | opts::DynoStatsAll); + + // Add the StokeInfo pass, which extract functions for stoke optimization and + // get the liveness information for them + Manager.registerPass(std::make_unique(PrintStoke), opts::Stoke); + + // This pass introduces conditional jumps into external functions. + // Between extending CFG to support this and isolating this pass we chose + // the latter. Thus this pass will do double jump removal and unreachable + // code elimination if necessary and won't rely on peepholes/UCE for these + // optimizations. + // More generally this pass should be the last optimization pass that + // modifies branches/control flow. This pass is run after function + // reordering so that it can tell whether calls are forward/backward + // accurately. 
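+  // For example, a conditional tail call whose target lands earlier in the
+  // final layout can only be recognized as a backward branch once the
+  // function order is fixed.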
+  Manager.registerPass(
+      std::make_unique<SimplifyConditionalTailCalls>(PrintSCTC),
+      opts::SimplifyConditionalTailCalls);
+
+  Manager.registerPass(std::make_unique<Peepholes>(PrintPeepholes));
+
+  Manager.registerPass(std::make_unique<AlignerPass>());
+
+  // Perform reordering on data contained in one or more sections using
+  // memory profiling data.
+  Manager.registerPass(std::make_unique<ReorderData>());
+
+  // This pass should always run last.*
+  Manager.registerPass(std::make_unique<FinalizeFunctions>(PrintFinalized));
+
+  // FrameOptimizer has an implicit dependency on FinalizeFunctions.
+  // FrameOptimizer moves values around and needs to update CFIs. To do this,
+  // it must read CFI, interpret it and rewrite it, so CFIs need to be
+  // correctly placed according to the final layout.
+  Manager.registerPass(std::make_unique<FrameOptimizerPass>(PrintFOP));
+
+  Manager.registerPass(std::make_unique<AllocCombinerPass>(PrintFOP));
+
+  Manager.registerPass(
+      std::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));
+
+  // Assign each function an output section.
+  Manager.registerPass(std::make_unique<AssignSections>());
+
+  // Patch original function entries.
+  if (BC.HasRelocations)
+    Manager.registerPass(std::make_unique<PatchEntries>());
+
+  // Tighten branches according to offset differences between branch and
+  // targets. No extra instructions after this pass, otherwise we may have
+  // relocations out of range and crash during linking.
+  if (BC.isAArch64())
+    Manager.registerPass(std::make_unique<LongJmpPass>(PrintLongJmp));
+
+  // This pass turns tail calls into jumps which makes them invisible to
+  // function reordering. It's unsafe to use any CFG or instruction analysis
+  // after this point.
+  Manager.registerPass(
+      std::make_unique<InstructionLowering>(PrintAfterLowering));
+
+  // In non-relocation mode, mark functions that do not fit into their original
+  // space as non-simple if we have to (e.g. for correct debug info update).
+  // NOTE: this pass depends on finalized code.
+  if (!BC.HasRelocations)
+    Manager.registerPass(std::make_unique<CheckLargeFunctions>(NeverPrint));
+
+  Manager.registerPass(std::make_unique<LowerAnnotations>(NeverPrint));
+
+  Manager.runPasses();
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/BinaryPassManager.h b/bolt/src/BinaryPassManager.h
new file mode 100644
index 000000000000..511ddaad637a
--- /dev/null
+++ b/bolt/src/BinaryPassManager.h
@@ -0,0 +1,60 @@
+//===--- BinaryPassManager.h - Binary-level analysis/optimization passes --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A very simple binary-level analysis/optimization passes system.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H
+
+#include "Passes/BinaryPasses.h"
+#include <memory>
+#include <vector>
+
+namespace llvm {
+namespace bolt {
+class BinaryContext;
+
+/// Simple class for managing analyses and optimizations on BinaryFunctions.
+class BinaryFunctionPassManager {
+private:
+  BinaryContext &BC;
+  std::vector<std::pair<const bool, std::unique_ptr<BinaryFunctionPass>>>
+      Passes;
+
+public:
+  static const char TimerGroupName[];
+  static const char TimerGroupDesc[];
+
+  BinaryFunctionPassManager(BinaryContext &BC)
+    : BC(BC) {}
+
+  /// Adds a pass to this manager based on the value of its corresponding
+  /// command-line option.
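+  /// For illustration (editor's sketch, taken from runAllPasses() above), a
+  /// typical conditional registration pairs a pass with its flag:
+  ///   Manager.registerPass(std::make_unique<StripRepRet>(NeverPrint),
+  ///                        opts::StripRepRet);
+  /// Passes registered with Run == false are kept in the list but skipped
+  /// by runPasses().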
+  void registerPass(std::unique_ptr<BinaryFunctionPass> Pass,
+                    const bool Run) {
+    Passes.emplace_back(Run, std::move(Pass));
+  }
+
+  /// Adds an unconditionally run pass to this manager.
+  void registerPass(std::unique_ptr<BinaryFunctionPass> Pass) {
+    Passes.emplace_back(true, std::move(Pass));
+  }
+
+  /// Run all registered passes in the order they were added.
+  void runPasses();
+
+  /// Runs all enabled implemented passes on all functions.
+  static void runAllPasses(BinaryContext &BC);
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BinarySection.cpp b/bolt/src/BinarySection.cpp
new file mode 100644
index 000000000000..4b7d632d6962
--- /dev/null
+++ b/bolt/src/BinarySection.cpp
@@ -0,0 +1,319 @@
+//===--- BinarySection.cpp - Interface for object file section -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinarySection.h"
+#include "BinaryContext.h"
+#include "Utils.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+extern cl::opt<bool> PrintRelocations;
+extern cl::opt<bool> HotData;
+}
+
+bool BinarySection::isELF() const {
+  return BC.isELF();
+}
+
+bool BinarySection::isMachO() const {
+  return BC.isMachO();
+}
+
+uint64_t
+BinarySection::hash(const BinaryData &BD,
+                    std::map<const BinaryData *, uint64_t> &Cache) const {
+  auto Itr = Cache.find(&BD);
+  if (Itr != Cache.end())
+    return Itr->second;
+
+  Cache[&BD] = 0;
+
+  uint64_t Offset = BD.getAddress() - getAddress();
+  const uint64_t EndOffset = BD.getEndAddress() - getAddress();
+  auto Begin = Relocations.lower_bound(Relocation{Offset, 0, 0, 0, 0});
+  auto End = Relocations.upper_bound(Relocation{EndOffset, 0, 0, 0, 0});
+  const StringRef Contents = getContents();
+
+  hash_code Hash = hash_combine(hash_value(BD.getSize()),
+                                hash_value(BD.getSectionName()));
+
+  while (Begin != End) {
+    const Relocation &Rel = *Begin++;
+    Hash = hash_combine(
+        Hash,
+        hash_value(Contents.substr(Offset, Begin->Offset - Offset)));
+    if (BinaryData *RelBD = BC.getBinaryDataByName(Rel.Symbol->getName())) {
+      Hash = hash_combine(Hash, hash(*RelBD, Cache));
+    }
+    Offset = Rel.Offset + Rel.getSize();
+  }
+
+  Hash = hash_combine(
+      Hash,
+      hash_value(Contents.substr(Offset, EndOffset - Offset)));
+
+  Cache[&BD] = Hash;
+
+  return Hash;
+}
+
+void BinarySection::emitAsData(MCStreamer &Streamer, StringRef NewName) const {
+  StringRef SectionName = !NewName.empty() ? NewName : getName();
+  StringRef SectionContents = getContents();
+  MCSectionELF *ELFSection =
+      BC.Ctx->getELFSection(SectionName, getELFType(), getELFFlags());
+
+  Streamer.SwitchSection(ELFSection);
+  Streamer.emitValueToAlignment(getAlignment());
+
+  if (BC.HasRelocations && opts::HotData && isReordered())
+    Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("__hot_data_start"));
+
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitting "
+                    << (isAllocatable() ? "" : "non-")
+                    << "allocatable data section " << SectionName << '\n');
"" : "non-") + << "allocatable data section " << SectionName << '\n'); + + if (!hasRelocations()) { + Streamer.emitBytes(SectionContents); + } else { + uint64_t SectionOffset = 0; + for (const Relocation &Relocation : relocations()) { + assert(Relocation.Offset < SectionContents.size() && "overflow detected"); + // Skip undefined symbols. + if (BC.UndefinedSymbols.count(Relocation.Symbol)) + continue; + if (SectionOffset < Relocation.Offset) { + Streamer.emitBytes( + SectionContents.substr(SectionOffset, + Relocation.Offset - SectionOffset)); + SectionOffset = Relocation.Offset; + } + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitting relocation for symbol " + << (Relocation.Symbol ? Relocation.Symbol->getName() + : StringRef("")) + << " at offset 0x" + << Twine::utohexstr(Relocation.Offset) << " with size " + << Relocation::getSizeForType(Relocation.Type) << '\n'); + size_t RelocationSize = Relocation.emit(&Streamer); + SectionOffset += RelocationSize; + } + assert(SectionOffset <= SectionContents.size() && "overflow error"); + if (SectionOffset < SectionContents.size()) { + Streamer.emitBytes(SectionContents.substr(SectionOffset)); + } + } + + if (BC.HasRelocations && opts::HotData && isReordered()) + Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("__hot_data_end")); +} + +void BinarySection::flushPendingRelocations(raw_pwrite_stream &OS, + SymbolResolverFuncTy Resolver) { + if (PendingRelocations.empty() && Patches.empty()) + return; + + const uint64_t SectionAddress = getAddress(); + + // We apply relocations to original section contents. For allocatable sections + // this means using their input file offsets, since the output file offset + // could change (e.g. for new instance of .text). For non-allocatable + // sections, the output offset should always be a valid one. + const uint64_t SectionFileOffset = isAllocatable() ? 
+  LLVM_DEBUG(
+      dbgs() << "BOLT-DEBUG: flushing pending relocations for section "
+             << getName() << '\n'
+             << "  address: 0x" << Twine::utohexstr(SectionAddress) << '\n'
+             << "  offset: 0x" << Twine::utohexstr(SectionFileOffset) << '\n');
+
+  for (BinaryPatch &Patch : Patches) {
+    OS.pwrite(Patch.Bytes.data(),
+              Patch.Bytes.size(),
+              SectionFileOffset + Patch.Offset);
+  }
+
+  for (Relocation &Reloc : PendingRelocations) {
+    uint64_t Value = Reloc.Addend;
+    if (Reloc.Symbol)
+      Value += Resolver(Reloc.Symbol);
+    switch (Reloc.Type) {
+    default:
+      LLVM_DEBUG(dbgs() << Reloc.Type << '\n';);
+      llvm_unreachable("unhandled relocation type");
+    case ELF::R_X86_64_64:
+    case ELF::R_X86_64_32: {
+      OS.pwrite(reinterpret_cast<const char *>(&Value),
+                Relocation::getSizeForType(Reloc.Type),
+                SectionFileOffset + Reloc.Offset);
+      break;
+    }
+    case ELF::R_X86_64_PC32: {
+      Value -= SectionAddress + Reloc.Offset;
+      OS.pwrite(reinterpret_cast<const char *>(&Value),
+                Relocation::getSizeForType(Reloc.Type),
+                SectionFileOffset + Reloc.Offset);
+      break;
+    }
+    }
+    LLVM_DEBUG(
+        dbgs() << "BOLT-DEBUG: writing value 0x" << Twine::utohexstr(Value)
+               << " of size " << Relocation::getSizeForType(Reloc.Type)
+               << " at section offset 0x" << Twine::utohexstr(Reloc.Offset)
+               << " address 0x"
+               << Twine::utohexstr(SectionAddress + Reloc.Offset)
+               << " file offset 0x"
+               << Twine::utohexstr(SectionFileOffset + Reloc.Offset) << '\n';);
+  }
+
+  clearList(PendingRelocations);
+}
+
+BinarySection::~BinarySection() {
+  if (isReordered()) {
+    delete[] getData();
+    return;
+  }
+
+  if (!isAllocatable() &&
+      (!hasSectionRef() ||
+       OutputContents.data() != getContents(Section).data())) {
+    delete[] getOutputData();
+  }
+}
+
+void BinarySection::clearRelocations() {
+  clearList(Relocations);
+}
+
+void BinarySection::print(raw_ostream &OS) const {
+  OS << getName() << ", "
+     << "0x" << Twine::utohexstr(getAddress()) << ", "
+     << getSize()
+     << " (0x" << Twine::utohexstr(getOutputAddress()) << ", "
+     << getOutputSize() << ")"
+     << ", data = " << getData()
+     << ", output data = " << getOutputData();
+
+  if (isAllocatable())
+    OS << " (allocatable)";
+
+  if (isVirtual())
+    OS << " (virtual)";
+
+  if (isTLS())
+    OS << " (tls)";
+
+  if (opts::PrintRelocations) {
+    for (const Relocation &R : relocations())
+      OS << "\n  " << R;
+  }
+}
+
+BinarySection::RelocationSetType
+BinarySection::reorderRelocations(bool Inplace) const {
+  assert(PendingRelocations.empty() &&
+         "reordering pending relocations not supported");
+  RelocationSetType NewRelocations;
+  for (const Relocation &Rel : relocations()) {
+    uint64_t RelAddr = Rel.Offset + getAddress();
+    BinaryData *BD = BC.getBinaryDataContainingAddress(RelAddr);
+    BD = BD->getAtomicRoot();
+    assert(BD);
+
+    if ((!BD->isMoved() && !Inplace) || BD->isJumpTable())
+      continue;
+
+    Relocation NewRel(Rel);
+    uint64_t RelOffset = RelAddr - BD->getAddress();
+    NewRel.Offset = BD->getOutputOffset() + RelOffset;
+    assert(NewRel.Offset < getSize());
+    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: moving " << Rel << " -> " << NewRel
+                      << "\n");
+    auto Res = NewRelocations.emplace(std::move(NewRel));
+    (void)Res;
+    assert(Res.second && "Can't overwrite existing relocation");
+  }
+  return NewRelocations;
+}
+void BinarySection::reorderContents(const std::vector<BinaryData *> &Order,
+                                    bool Inplace) {
+  IsReordered = true;
+
+  Relocations = reorderRelocations(Inplace);
+
+  std::string Str;
+  raw_string_ostream OS(Str);
+  const char *Src = Contents.data();
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: reorderContents for " << Name << "\n");
+  for (BinaryData *BD : Order) {
+    assert((BD->isMoved() || !Inplace) && !BD->isJumpTable());
+    assert(BD->isAtomic() && BD->isMoveable());
+    const uint64_t SrcOffset = BD->getAddress() - getAddress();
+    assert(SrcOffset < Contents.size());
+    assert(SrcOffset == BD->getOffset());
+    while (OS.tell() < BD->getOutputOffset()) {
+      OS.write((unsigned char)0);
+    }
+    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: " << BD->getName() << " @ " << OS.tell()
+                      << "\n");
+    OS.write(&Src[SrcOffset], BD->getOutputSize());
+  }
+  if (Relocations.empty()) {
+    // If there are no existing relocations, tack a phony one at the end
+    // of the reordered segment to force LLVM to recognize and map this
+    // section.
+    MCSymbol *ZeroSym = BC.registerNameAtAddress("Zero", 0, 0, 0);
+    addRelocation(OS.tell(), ZeroSym, ELF::R_X86_64_64, 0xdeadbeef);
+
+    uint64_t Zero = 0;
+    OS.write(reinterpret_cast<const char *>(&Zero), sizeof(Zero));
+  }
+  auto *NewData = reinterpret_cast<char *>(copyByteArray(OS.str()));
+  Contents = OutputContents = StringRef(NewData, OS.str().size());
+  OutputSize = Contents.size();
+}
+
+std::string BinarySection::encodeELFNote(StringRef NameStr, StringRef DescStr,
+                                         uint32_t Type) {
+  std::string Str;
+  raw_string_ostream OS(Str);
+  const uint32_t NameSz = NameStr.size() + 1;
+  const uint32_t DescSz = DescStr.size();
+  // An ELF note starts with three 32-bit words: name size (including the
+  // terminating zero), description size, and the note type.
+  OS.write(reinterpret_cast<const char *>(&NameSz), 4);
+  OS.write(reinterpret_cast<const char *>(&DescSz), 4);
+  OS.write(reinterpret_cast<const char *>(&Type), 4);
+  OS << NameStr << '\0';
+  // Both the name and the description are padded to 4-byte alignment.
+  for (uint64_t I = NameSz; I < alignTo(NameSz, 4); ++I) {
+    OS << '\0';
+  }
+  OS << DescStr;
+  for (uint64_t I = DescStr.size(); I < alignTo(DescStr.size(), 4); ++I) {
+    OS << '\0';
+  }
+  return OS.str();
+}
diff --git a/bolt/src/BinarySection.h b/bolt/src/BinarySection.h
new file mode 100644
index 000000000000..acb4ba438331
--- /dev/null
+++ b/bolt/src/BinarySection.h
@@ -0,0 +1,545 @@
+//===--- BinarySection.h - Interface for object file section --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_SECTION_H
+#define LLVM_TOOLS_LLVM_BOLT_BINARY_SECTION_H
+
+#include "DebugData.h"
+#include "Relocation.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <memory>
+#include <set>
+
+namespace llvm {
+class MCStreamer;
+class MCSymbol;
+
+using namespace object;
+
+namespace bolt {
+
+class BinaryContext;
+class BinaryData;
+
+/// A class to manage binary sections that also manages related relocations.
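+/// (Editor's note: relocation offsets kept by this class are relative to the
+/// original section start; because they are stored in a set ordered by
+/// offset, hash() in BinarySection.cpp above can walk a BinaryData's range
+/// with lower_bound/upper_bound, and reorderRelocations() rebases them when
+/// contents are reordered.)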
+class BinarySection {
+  friend class BinaryContext;
+
+  BinaryContext &BC;           // Owning BinaryContext
+  std::string Name;            // Section name
+  const SectionRef Section;    // SectionRef (may be null)
+  StringRef Contents;          // Input section contents
+  const uint64_t Address;      // Address of section in input binary (may be 0)
+  const uint64_t Size;         // Input section size
+  uint64_t InputFileOffset{0}; // Offset in the input binary
+  unsigned Alignment;          // alignment in bytes (must be > 0)
+  unsigned ELFType;            // ELF section type
+  unsigned ELFFlags;           // ELF section flags
+
+  // Relocations associated with this section. Relocation offsets are
+  // relative to the original section address and size.
+  using RelocationSetType = std::set<Relocation, std::less<>>;
+  RelocationSetType Relocations;
+
+  // Dynamic relocations associated with this section. Relocation offsets are
+  // from the original section address.
+  RelocationSetType DynamicRelocations;
+
+  // Pending relocations for this section.
+  std::vector<Relocation> PendingRelocations;
+
+  struct BinaryPatch {
+    uint64_t Offset;
+    SmallString<8> Bytes;
+
+    BinaryPatch(uint64_t Offset, const SmallVectorImpl<char> &Bytes)
+      : Offset(Offset), Bytes(Bytes.begin(), Bytes.end()) {}
+  };
+  std::vector<BinaryPatch> Patches;
+  /// Patcher used to apply simple changes to sections of the input binary.
+  std::unique_ptr<BinaryPatcher> Patcher;
+
+  // Output info
+  bool IsFinalized{false};         // Has this section had output information
+                                   // finalized?
+  std::string OutputName;          // Output section name (if the section has
+                                   // been renamed)
+  uint64_t OutputAddress{0};       // Section address for the rewritten binary.
+  uint64_t OutputSize{0};          // Section size in the rewritten binary.
+  uint64_t OutputFileOffset{0};    // File offset in the rewritten binary file.
+  StringRef OutputContents;        // Rewritten section contents.
+  unsigned SectionID{-1u};         // Unique ID used for address mapping.
+                                   // Set by ExecutableFileMemoryManager.
+  uint32_t Index{0};               // Section index in the output file.
+  mutable bool IsReordered{false}; // Have the contents been reordered?
+  bool IsAnonymous{false};         // True if the name should not be included
+                                   // in the output file.
+
+  uint64_t hash(const BinaryData &BD,
+                std::map<const BinaryData *, uint64_t> &Cache) const;
+
+  // non-copyable
+  BinarySection(const BinarySection &) = delete;
+  BinarySection(BinarySection &&) = delete;
+  BinarySection &operator=(const BinarySection &) = delete;
+  BinarySection &operator=(BinarySection &&) = delete;
+
+  static StringRef getName(SectionRef Section) {
+    return cantFail(Section.getName());
+  }
+  static StringRef getContents(SectionRef Section) {
+    if (Section.getObject()->isELF() &&
+        ELFSectionRef(Section).getType() == ELF::SHT_NOBITS)
+      return StringRef();
+
+    Expected<StringRef> ContentsOrErr = Section.getContents();
+    if (!ContentsOrErr) {
+      Error E = ContentsOrErr.takeError();
+      errs() << "BOLT-ERROR: cannot get section contents for "
+             << getName(Section) << ": " << E << ".\n";
+      exit(1);
+    }
+    return *ContentsOrErr;
+  }
+
+  /// Get the set of relocations referring to data in this section that
+  /// has been reordered. The relocation offsets will be modified to
+  /// reflect the new data locations.
+  RelocationSetType reorderRelocations(bool Inplace) const;
+
+  /// Set output info for this section.
+  void update(uint8_t *NewData,
+              uint64_t NewSize,
+              unsigned NewAlignment,
+              unsigned NewELFType,
+              unsigned NewELFFlags) {
+    assert(NewAlignment > 0 && "section alignment must be > 0");
+    Alignment = NewAlignment;
+    ELFType = NewELFType;
+    ELFFlags = NewELFFlags;
+    OutputSize = NewSize;
+    OutputContents = StringRef(reinterpret_cast<char *>(NewData),
+                               NewData ? NewSize : 0);
+    IsFinalized = true;
+  }
+public:
+  /// Copy a section.
+  explicit BinarySection(BinaryContext &BC,
+                         StringRef Name,
+                         const BinarySection &Section)
+    : BC(BC),
+      Name(Name),
+      Section(Section.getSectionRef()),
+      Contents(Section.getContents()),
+      Address(Section.getAddress()),
+      Size(Section.getSize()),
+      Alignment(Section.getAlignment()),
+      ELFType(Section.getELFType()),
+      ELFFlags(Section.getELFFlags()),
+      Relocations(Section.Relocations),
+      PendingRelocations(Section.PendingRelocations),
+      OutputName(Name) {
+  }
+
+  BinarySection(BinaryContext &BC,
+                SectionRef Section)
+    : BC(BC),
+      Name(getName(Section)),
+      Section(Section),
+      Contents(getContents(Section)),
+      Address(Section.getAddress()),
+      Size(Section.getSize()),
+      Alignment(Section.getAlignment()),
+      OutputName(Name) {
+    if (isELF()) {
+      ELFType = ELFSectionRef(Section).getType();
+      ELFFlags = ELFSectionRef(Section).getFlags();
+      InputFileOffset = ELFSectionRef(Section).getOffset();
+    } else if (isMachO()) {
+      auto *O = cast<MachOObjectFile>(Section.getObject());
+      InputFileOffset =
+          O->is64Bit() ? O->getSection64(Section.getRawDataRefImpl()).offset
+                       : O->getSection(Section.getRawDataRefImpl()).offset;
+    }
+  }
+
+  // TODO: pass Data as StringRef/ArrayRef? use StringRef::copy method.
+  BinarySection(BinaryContext &BC,
+                StringRef Name,
+                uint8_t *Data,
+                uint64_t Size,
+                unsigned Alignment,
+                unsigned ELFType,
+                unsigned ELFFlags)
+    : BC(BC),
+      Name(Name),
+      Contents(reinterpret_cast<const char *>(Data), Data ? Size : 0),
+      Address(0),
+      Size(Size),
+      Alignment(Alignment),
+      ELFType(ELFType),
+      ELFFlags(ELFFlags),
+      IsFinalized(true),
+      OutputName(Name),
+      OutputSize(Size),
+      OutputContents(Contents) {
+    assert(Alignment > 0 && "section alignment must be > 0");
+  }
+
+  ~BinarySection();
+
+  /// Helper function to generate the proper ELF flags from section properties.
+  static unsigned getFlags(bool IsReadOnly = true,
+                           bool IsText = false,
+                           bool IsAllocatable = false) {
+    unsigned Flags = 0;
+    if (IsAllocatable)
+      Flags |= ELF::SHF_ALLOC;
+    if (!IsReadOnly)
+      Flags |= ELF::SHF_WRITE;
+    if (IsText)
+      Flags |= ELF::SHF_EXECINSTR;
+    return Flags;
+  }
+
+  operator bool() const {
+    return ELFType != ELF::SHT_NULL;
+  }
+
+  bool operator==(const BinarySection &Other) const {
+    return (Name == Other.Name &&
+            Address == Other.Address &&
+            Size == Other.Size &&
+            getData() == Other.getData() &&
+            Alignment == Other.Alignment &&
+            ELFType == Other.ELFType &&
+            ELFFlags == Other.ELFFlags);
+  }
+
+  bool operator!=(const BinarySection &Other) const {
+    return !operator==(Other);
+  }
+
+  // Order sections by their immutable properties.
+  bool operator<(const BinarySection &Other) const {
+    return (getAddress() < Other.getAddress() ||
+            (getAddress() == Other.getAddress() &&
+             (getSize() < Other.getSize() ||
+              (getSize() == Other.getSize() &&
+               getName() < Other.getName()))));
+  }
+
+  ///
+  /// Basic property access.
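+  /// (Editor's illustration: getFlags(/*IsReadOnly=*/false, /*IsText=*/false,
+  /// /*IsAllocatable=*/true) above yields SHF_ALLOC | SHF_WRITE, i.e. the
+  /// flags of a writable, allocatable data section.)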
+  ///
+  BinaryContext &getBinaryContext() { return BC; }
+  bool isELF() const;
+  bool isMachO() const;
+  StringRef getName() const { return Name; }
+  uint64_t getAddress() const { return Address; }
+  uint64_t getEndAddress() const { return Address + Size; }
+  uint64_t getSize() const { return Size; }
+  uint64_t getInputFileOffset() const { return InputFileOffset; }
+  uint64_t getAlignment() const { return Alignment; }
+  bool isText() const {
+    if (isELF())
+      return (ELFFlags & ELF::SHF_EXECINSTR);
+    return getSectionRef().isText();
+  }
+  bool isData() const {
+    if (isELF())
+      return (ELFType == ELF::SHT_PROGBITS &&
+              (ELFFlags & (ELF::SHF_ALLOC | ELF::SHF_WRITE)));
+    return getSectionRef().isData();
+  }
+  bool isBSS() const {
+    return (ELFType == ELF::SHT_NOBITS &&
+            (ELFFlags & (ELF::SHF_ALLOC | ELF::SHF_WRITE)));
+  }
+  bool isTLS() const {
+    return (ELFFlags & ELF::SHF_TLS);
+  }
+  bool isTBSS() const {
+    return isBSS() && isTLS();
+  }
+  bool isVirtual() const { return ELFType == ELF::SHT_NOBITS; }
+  bool isRela() const { return ELFType == ELF::SHT_RELA; }
+  bool isReadOnly() const {
+    return ((ELFFlags & ELF::SHF_ALLOC) &&
+            !(ELFFlags & ELF::SHF_WRITE) &&
+            ELFType == ELF::SHT_PROGBITS);
+  }
+  bool isAllocatable() const {
+    if (isELF()) {
+      return (ELFFlags & ELF::SHF_ALLOC) && !isTBSS();
+    } else {
+      // On non-ELF, assume all sections are allocatable.
+      return true;
+    }
+  }
+  bool isReordered() const { return IsReordered; }
+  bool isAnonymous() const { return IsAnonymous; }
+  unsigned getELFType() const { return ELFType; }
+  unsigned getELFFlags() const { return ELFFlags; }
+
+  uint8_t *getData() {
+    return reinterpret_cast<uint8_t *>(
+        const_cast<char *>(getContents().data()));
+  }
+  const uint8_t *getData() const {
+    return reinterpret_cast<const uint8_t *>(getContents().data());
+  }
+  StringRef getContents() const { return Contents; }
+  void clearContents() { Contents = {}; }
+  bool hasSectionRef() const { return Section != SectionRef(); }
+  SectionRef getSectionRef() const { return Section; }
+
+  /// Does this section contain the given \p Address?
+  /// Note: this is in terms of the original mapped binary addresses.
+  bool containsAddress(uint64_t Address) const {
+    return (getAddress() <= Address && Address < getEndAddress()) ||
+           (getSize() == 0 && getAddress() == Address);
+  }
+
+  /// Does this section contain the range [\p Address, \p Address + \p Size)?
+  /// Note: this is in terms of the original mapped binary addresses.
+  bool containsRange(uint64_t Address, uint64_t Size) const {
+    return containsAddress(Address) && Address + Size <= getEndAddress();
+  }
+
+  /// Iterate over all non-pending relocations for this section.
+  iterator_range<RelocationSetType::iterator> relocations() {
+    return make_range(Relocations.begin(), Relocations.end());
+  }
+
+  /// Iterate over all non-pending relocations for this section.
+  iterator_range<RelocationSetType::const_iterator> relocations() const {
+    return make_range(Relocations.begin(), Relocations.end());
+  }
+
+  /// Does this section have any non-pending relocations?
+  bool hasRelocations() const {
+    return !Relocations.empty();
+  }
+
+  /// Does this section have any pending relocations?
+  bool hasPendingRelocations() const {
+    return !PendingRelocations.empty();
+  }
+
+  /// Remove the non-pending relocation at the given \p Offset.
+  bool removeRelocationAt(uint64_t Offset) {
+    auto Itr = Relocations.find(Offset);
+    if (Itr != Relocations.end()) {
+      Relocations.erase(Itr);
+      return true;
+    }
+    return false;
+  }
+
+  void clearRelocations();
+
+  /// Add a new relocation at the given \p Offset.
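+  /// For illustration (editor's sketch, hypothetical values): a 32-bit
+  /// PC-relative fixup against Sym at offset 0x10 with addend -4 would be
+  /// registered as
+  ///   Section.addRelocation(0x10, Sym, ELF::R_X86_64_PC32, uint64_t(-4));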
+  void addRelocation(uint64_t Offset,
+                     MCSymbol *Symbol,
+                     uint64_t Type,
+                     uint64_t Addend,
+                     uint64_t Value = 0,
+                     bool Pending = false) {
+    assert(Offset < getSize() && "offset not within section bounds");
+    if (!Pending) {
+      Relocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value});
+    } else {
+      PendingRelocations.emplace_back(
+          Relocation{Offset, Symbol, Type, Addend, Value});
+    }
+  }
+
+  /// Add a dynamic relocation at the given \p Offset.
+  void addDynamicRelocation(uint64_t Offset,
+                            MCSymbol *Symbol,
+                            uint64_t Type,
+                            uint64_t Addend,
+                            uint64_t Value = 0) {
+    assert(Offset < getSize() && "offset not within section bounds");
+    DynamicRelocations.emplace(
+        Relocation{Offset, Symbol, Type, Addend, Value});
+  }
+
+  /// Add a relocation against the original contents of this section.
+  void addPendingRelocation(const Relocation &Rel) {
+    PendingRelocations.push_back(Rel);
+  }
+
+  /// Add a patch to the input contents of this section.
+  void addPatch(uint64_t Offset, const SmallVectorImpl<char> &Bytes) {
+    Patches.emplace_back(BinaryPatch(Offset, Bytes));
+  }
+
+  /// Register a patcher for this section.
+  void registerPatcher(std::unique_ptr<BinaryPatcher> BPatcher) {
+    Patcher = std::move(BPatcher);
+  }
+
+  /// Returns the patcher.
+  BinaryPatcher *getPatcher() { return Patcher.get(); }
+
+  /// Lookup the relocation (if any) at the given \p Offset.
+  const Relocation *getRelocationAt(uint64_t Offset) const {
+    auto Itr = Relocations.find(Offset);
+    return Itr != Relocations.end() ? &*Itr : nullptr;
+  }
+
+  /// Lookup the dynamic relocation (if any) at the given \p Offset.
+  const Relocation *getDynamicRelocationAt(uint64_t Offset) const {
+    Relocation Key{Offset, 0, 0, 0, 0};
+    auto Itr = DynamicRelocations.find(Key);
+    return Itr != DynamicRelocations.end() ? &*Itr : nullptr;
+  }
+
+  uint64_t hash(const BinaryData &BD) const {
+    std::map<const BinaryData *, uint64_t> Cache;
+    return hash(BD, Cache);
+  }
+
+  ///
+  /// Property accessors related to output data.
+  ///
+
+  bool isFinalized() const { return IsFinalized; }
+  void setIsFinalized() { IsFinalized = true; }
+  StringRef getOutputName() const { return OutputName; }
+  uint64_t getOutputSize() const { return OutputSize; }
+  uint8_t *getOutputData() {
+    return reinterpret_cast<uint8_t *>(
+        const_cast<char *>(getOutputContents().data()));
+  }
+  const uint8_t *getOutputData() const {
+    return reinterpret_cast<const uint8_t *>(getOutputContents().data());
+  }
+  StringRef getOutputContents() const { return OutputContents; }
+  uint64_t getAllocAddress() const {
+    return reinterpret_cast<uint64_t>(getOutputData());
+  }
+  uint64_t getOutputAddress() const { return OutputAddress; }
+  uint64_t getOutputFileOffset() const { return OutputFileOffset; }
+  unsigned getSectionID() const {
+    assert(hasValidSectionID() && "trying to use uninitialized section id");
+    return SectionID;
+  }
+  bool hasValidSectionID() const {
+    return SectionID != -1u;
+  }
+  uint32_t getIndex() const {
+    return Index;
+  }
+
+  // mutation
+  void setOutputAddress(uint64_t Address) {
+    OutputAddress = Address;
+  }
+  void setOutputFileOffset(uint64_t Offset) {
+    OutputFileOffset = Offset;
+  }
+  void setSectionID(unsigned ID) {
+    assert(!hasValidSectionID() && "trying to set section id twice");
+    SectionID = ID;
+  }
+  void setIndex(uint32_t I) {
+    Index = I;
+  }
+  void setOutputName(StringRef Name) {
+    OutputName = std::string(Name);
+  }
+  void setAnonymous(bool Flag) {
+    IsAnonymous = Flag;
+  }
+
+  /// Emit the section as data, possibly with relocations. Use name \p NewName
+  /// for the section during emission if non-empty.
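+  /// (Editor's illustration with a hypothetical name:
+  /// Section.emitAsData(Streamer, ".renamed.data") would emit the original
+  /// contents under the new name, with any known relocations emitted
+  /// symbolically as shown in BinarySection.cpp above.)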
+  void emitAsData(MCStreamer &Streamer, StringRef NewName = StringRef()) const;
+
+  using SymbolResolverFuncTy = llvm::function_ref<uint64_t(const MCSymbol *)>;
+
+  /// Flush all pending relocations to patch original contents of sections
+  /// that were not emitted via MCStreamer.
+  void flushPendingRelocations(raw_pwrite_stream &OS,
+                               SymbolResolverFuncTy Resolver);
+
+  /// Reorder the contents of this section according to \p Order. If
+  /// \p Inplace is true, the entire contents of the section are reordered,
+  /// otherwise the new contents contain only the reordered data.
+  void reorderContents(const std::vector<BinaryData *> &Order, bool Inplace);
+
+  void print(raw_ostream &OS) const;
+
+  /// Write the contents of an ELF note section given the name of the producer,
+  /// a number identifying the type of note and the contents of the note in
+  /// \p DescStr.
+  static std::string encodeELFNote(StringRef NameStr, StringRef DescStr,
+                                   uint32_t Type);
+
+  /// Codes for ELF notes written by producer 'BOLT'.
+  enum {
+    NT_BOLT_BAT = 1,
+    NT_BOLT_INSTRUMENTATION_TABLES = 2
+  };
+};
+
+inline uint8_t *copyByteArray(const uint8_t *Data, uint64_t Size) {
+  auto *Array = new uint8_t[Size];
+  memcpy(Array, Data, Size);
+  return Array;
+}
+
+inline uint8_t *copyByteArray(StringRef Buffer) {
+  return copyByteArray(reinterpret_cast<const uint8_t *>(Buffer.data()),
+                       Buffer.size());
+}
+
+inline uint8_t *copyByteArray(ArrayRef<char> Buffer) {
+  return copyByteArray(reinterpret_cast<const uint8_t *>(Buffer.data()),
+                       Buffer.size());
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, const BinarySection &Section) {
+  Section.print(OS);
+  return OS;
+}
+
+struct SDTMarkerInfo {
+  uint64_t PC;
+  uint64_t Base;
+  uint64_t Semaphore;
+  StringRef Provider;
+  StringRef Name;
+  StringRef Args;
+
+  /// The offset of PC within the note section
+  unsigned PCOffset;
+};
+
+/// Linux Kernel special sections point to a specific instruction in many
+/// cases. Unlike SDTMarkerInfo, these markers can come from different
+/// sections.
+struct LKInstructionMarkerInfo {
+  uint64_t SectionOffset;
+  int32_t PCRelativeOffset;
+  bool IsPCRelative;
+  StringRef SectionName;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BoltAddressTranslation.cpp b/bolt/src/BoltAddressTranslation.cpp
new file mode 100644
index 000000000000..698a414f1d31
--- /dev/null
+++ b/bolt/src/BoltAddressTranslation.cpp
@@ -0,0 +1,295 @@
+//===--- BoltAddressTranslation.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+#include "BoltAddressTranslation.h"
+#include "BinaryFunction.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Errc.h"
+
+#define DEBUG_TYPE "bolt-bat"
+
+namespace llvm {
+namespace bolt {
+
+const char *BoltAddressTranslation::SECTION_NAME = ".note.bolt_bat";
+
+void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
+                                               const BinaryBasicBlock &BB,
+                                               uint64_t FuncAddress) {
+  const uint64_t BBOutputOffset =
+      BB.getOutputAddressRange().first - FuncAddress;
+  const uint32_t BBInputOffset = BB.getInputOffset();
+
+  assert(BBInputOffset != BinaryBasicBlock::INVALID_OFFSET &&
+         "Every output BB must track back to an input BB for profile "
+         "collection in bolted binaries");
+
+  LLVM_DEBUG(dbgs() << "BB " << BB.getName() << "\n");
+  LLVM_DEBUG(dbgs() << "  Key: " << Twine::utohexstr(BBOutputOffset)
+                    << " Val: " << Twine::utohexstr(BBInputOffset) << "\n");
+  // In case of conflicts (same Key mapping to different Vals), the last
+  // update takes precedence. Of course it is not ideal to have conflicts and
+  // those happen when we have an empty BB that either contained only
+  // NOPs or a jump to the next block (successor). Either way, the successor
+  // and this deleted block will both share the same output address (the same
+  // key), and we need to map back. We choose here to privilege the successor
+  // by allowing it to overwrite the previously inserted key in the map.
+  Map[BBOutputOffset] = BBInputOffset;
+
+  for (const auto &IOPair : BB.getOffsetTranslationTable()) {
+    const uint64_t OutputOffset = IOPair.first + BBOutputOffset;
+    const uint32_t InputOffset = IOPair.second;
+
+    // Is this the first instruction in the BB? No need to duplicate the entry.
+    if (OutputOffset == BBOutputOffset)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "  Key: " << Twine::utohexstr(OutputOffset)
+                      << " Val: " << Twine::utohexstr(InputOffset)
+                      << " (branch)\n");
+    Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset,
+                                             InputOffset | BRANCHENTRY));
+  }
+}
+
+void BoltAddressTranslation::write(raw_ostream &OS) {
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");
+  for (auto &BFI : BC.getBinaryFunctions()) {
+    BinaryFunction &Function = BFI.second;
+    // We don't need a translation table if the body of the function hasn't
+    // changed.
+    if (!BC.HasRelocations && !Function.isSimple())
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
+    LLVM_DEBUG(dbgs() << " Address reference: 0x"
+                      << Twine::utohexstr(Function.getOutputAddress()) << "\n");
+    MapTy Map;
+    const bool IsSplit = Function.isSplit();
+    for (BinaryBasicBlock *&BB : Function.layout()) {
+      if (IsSplit && BB->isCold())
+        break;
+      writeEntriesForBB(Map, *BB, Function.getOutputAddress());
+    }
+    Maps.insert(std::pair<uint64_t, MapTy>(Function.getOutputAddress(), Map));
+
+    if (!IsSplit)
+      continue;
+
+    // Cold map
+    Map.clear();
+    LLVM_DEBUG(dbgs() << " Cold part\n");
+    for (BinaryBasicBlock *&BB : Function.layout()) {
+      if (!BB->isCold())
+        continue;
+      writeEntriesForBB(Map, *BB, Function.cold().getAddress());
+    }
+    Maps.insert(std::pair<uint64_t, MapTy>(Function.cold().getAddress(), Map));
+    ColdPartSource.insert(std::pair<uint64_t, uint64_t>(
+        Function.cold().getAddress(), Function.getOutputAddress()));
+  }
+
+  const uint32_t NumFuncs = Maps.size();
+  OS.write(reinterpret_cast<const char *>(&NumFuncs), 4);
+  LLVM_DEBUG(dbgs() << "Writing " << NumFuncs << " functions for BAT.\n");
+  for (auto &MapEntry : Maps) {
+    const uint64_t Address = MapEntry.first;
+    MapTy &Map = MapEntry.second;
+    const uint32_t NumEntries = Map.size();
+    LLVM_DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
+                      << Twine::utohexstr(Address) << ".\n");
+    OS.write(reinterpret_cast<const char *>(&Address), 8);
+    OS.write(reinterpret_cast<const char *>(&NumEntries), 4);
+    for (std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
+      OS.write(reinterpret_cast<const char *>(&KeyVal.first), 4);
+      OS.write(reinterpret_cast<const char *>(&KeyVal.second), 4);
+    }
+  }
+  const uint32_t NumColdEntries = ColdPartSource.size();
+  LLVM_DEBUG(dbgs() << "Writing " << NumColdEntries
+                    << " cold part mappings.\n");
+  OS.write(reinterpret_cast<const char *>(&NumColdEntries), 4);
+  for (std::pair<const uint64_t, uint64_t> &ColdEntry : ColdPartSource) {
+    OS.write(reinterpret_cast<const char *>(&ColdEntry.first), 8);
+    OS.write(reinterpret_cast<const char *>(&ColdEntry.second), 8);
+    LLVM_DEBUG(dbgs() << " " << Twine::utohexstr(ColdEntry.first) << " -> "
+                      << Twine::utohexstr(ColdEntry.second) << "\n");
+  }
+
+  outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
+  outs() << "BOLT-INFO: Wrote " << NumColdEntries
+         << " BAT cold-to-hot entries\n";
+}
+
+std::error_code BoltAddressTranslation::parse(StringRef Buf) {
+  DataExtractor DE = DataExtractor(Buf, true, 8);
+  uint64_t Offset = 0;
+  if (Buf.size() < 12)
+    return make_error_code(llvm::errc::io_error);
+
+  const uint32_t NameSz = DE.getU32(&Offset);
+  const uint32_t DescSz = DE.getU32(&Offset);
+  const uint32_t Type = DE.getU32(&Offset);
+
+  if (Type != BinarySection::NT_BOLT_BAT ||
+      Buf.size() - Offset < alignTo(NameSz, 4) + DescSz)
+    return make_error_code(llvm::errc::io_error);
+
+  StringRef Name = Buf.slice(Offset, Offset + NameSz);
+  Offset = alignTo(Offset + NameSz, 4);
+  if (Name.substr(0, 4) != "BOLT")
+    return make_error_code(llvm::errc::io_error);
+
+  if (Buf.size() - Offset < 4)
+    return make_error_code(llvm::errc::io_error);
+
+  const uint32_t NumFunctions = DE.getU32(&Offset);
+  LLVM_DEBUG(dbgs() << "Parsing " << NumFunctions << " functions\n");
+  for (uint32_t I = 0; I < NumFunctions; ++I) {
+    if (Buf.size() - Offset < 12)
+      return make_error_code(llvm::errc::io_error);
+
+    const uint64_t Address = DE.getU64(&Offset);
+    const uint32_t NumEntries = DE.getU32(&Offset);
+    MapTy Map;
+
+    LLVM_DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
+                      << Twine::utohexstr(Address) << "\n");
+    if (Buf.size() - Offset < 8 * NumEntries)
+      return make_error_code(llvm::errc::io_error);
+    for (uint32_t J = 0; J < NumEntries; ++J) {
+      const uint32_t OutputAddr = DE.getU32(&Offset);
+      const uint32_t InputAddr = DE.getU32(&Offset);
+      Map.insert(std::pair<uint32_t, uint32_t>(OutputAddr, InputAddr));
+      LLVM_DEBUG(dbgs() << Twine::utohexstr(OutputAddr) << " -> "
+                        << Twine::utohexstr(InputAddr) << "\n");
+    }
+    Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
+  }
+
+  if (Buf.size() - Offset < 4)
+    return make_error_code(llvm::errc::io_error);
+
+  const uint32_t NumColdEntries = DE.getU32(&Offset);
+  LLVM_DEBUG(dbgs() << "Parsing " << NumColdEntries << " cold part mappings\n");
+  for (uint32_t I = 0; I < NumColdEntries; ++I) {
+    if (Buf.size() - Offset < 16)
+      return make_error_code(llvm::errc::io_error);
+    // These entries are written as 8-byte values, so read them as 64-bit
+    // integers to avoid truncation.
+    const uint64_t ColdAddress = DE.getU64(&Offset);
+    const uint64_t HotAddress = DE.getU64(&Offset);
+    ColdPartSource.insert(
+        std::pair<uint64_t, uint64_t>(ColdAddress, HotAddress));
+    LLVM_DEBUG(dbgs() << Twine::utohexstr(ColdAddress) << " -> "
+                      << Twine::utohexstr(HotAddress) << "\n");
+  }
+  outs() << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
+  outs() << "BOLT-INFO: Parsed " << NumColdEntries
+         << " BAT cold-to-hot entries\n";
+
+  return std::error_code();
+}
+
+uint64_t BoltAddressTranslation::translate(const BinaryFunction &Func,
+                                           uint64_t Offset,
+                                           bool IsBranchSrc) const {
+  auto Iter = Maps.find(Func.getAddress());
+  if (Iter == Maps.end())
+    return Offset;
+
+  const MapTy &Map = Iter->second;
+  auto KeyVal = Map.upper_bound(Offset);
+  if (KeyVal == Map.begin())
+    return Offset;
+
+  --KeyVal;
+
+  const uint32_t Val = KeyVal->second & ~BRANCHENTRY;
+  // Branch source addresses are translated to the first instruction of the
+  // source BB to avoid accounting for modifications BOLT may have made in the
+  // BB regarding deletion/addition of instructions.
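+  // Editor's worked example (hypothetical numbers): with a map of
+  // { 0x0 -> 0x0, 0x10 -> 0x40 } for Func, translating Offset == 0x18 finds
+  // KeyVal == (0x10, 0x40), so a non-branch address becomes
+  // 0x18 - 0x10 + 0x40 = 0x48, while a branch source collapses to 0x40.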
+  if (IsBranchSrc)
+    return Val;
+  return Offset - KeyVal->first + Val;
+}
+
+Optional<BoltAddressTranslation::FallthroughListTy>
+BoltAddressTranslation::getFallthroughsInTrace(
+    const BinaryFunction &Func, uint64_t From, uint64_t To) const {
+  SmallVector<std::pair<uint64_t, uint64_t>, 16> Res;
+
+  // Filter out the trivial case.
+  if (From >= To)
+    return Res;
+
+  From -= Func.getAddress();
+  To -= Func.getAddress();
+
+  auto Iter = Maps.find(Func.getAddress());
+  if (Iter == Maps.end()) {
+    return NoneType();
+  }
+
+  const MapTy &Map = Iter->second;
+  auto FromIter = Map.upper_bound(From);
+  if (FromIter == Map.begin())
+    return Res;
+  // Skip instruction entries; to create fall-throughs we are only interested
+  // in BB boundaries.
+  do {
+    if (FromIter == Map.begin())
+      return Res;
+    --FromIter;
+  } while (FromIter->second & BRANCHENTRY);
+
+  auto ToIter = Map.upper_bound(To);
+  if (ToIter == Map.begin())
+    return Res;
+  --ToIter;
+  if (FromIter->first >= ToIter->first)
+    return Res;
+
+  for (auto Iter = FromIter; Iter != ToIter; ) {
+    const uint32_t Src = Iter->first;
+    if (Iter->second & BRANCHENTRY) {
+      ++Iter;
+      continue;
+    }
+
+    ++Iter;
+    while (Iter->second & BRANCHENTRY && Iter != ToIter) {
+      ++Iter;
+    }
+    if (Iter->second & BRANCHENTRY)
+      break;
+    Res.emplace_back(Src, Iter->first);
+  }
+
+  return Res;
+}
+
+uint64_t BoltAddressTranslation::fetchParentAddress(uint64_t Address) const {
+  auto Iter = ColdPartSource.find(Address);
+  if (Iter == ColdPartSource.end())
+    return 0;
+  return Iter->second;
+}
+
+bool BoltAddressTranslation::enabledFor(
+    llvm::object::ELFObjectFileBase *InputFile) const {
+  for (const SectionRef &Section : InputFile->sections()) {
+    Expected<StringRef> SectionNameOrErr = Section.getName();
+    if (Error E = SectionNameOrErr.takeError())
+      continue;
+
+    if (SectionNameOrErr.get() == SECTION_NAME)
+      return true;
+  }
+  return false;
+}
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/BoltAddressTranslation.h b/bolt/src/BoltAddressTranslation.h
new file mode 100644
index 000000000000..c0b553158eea
--- /dev/null
+++ b/bolt/src/BoltAddressTranslation.h
@@ -0,0 +1,135 @@
+//===--- BoltAddressTranslation.h -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
+#define LLVM_TOOLS_LLVM_BOLT_BOLTADDRESSTRANSLATION_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <map>
+#include <system_error>
+
+namespace llvm {
+class raw_ostream;
+
+namespace object {
+class ELFObjectFileBase;
+} // namespace object
+
+namespace bolt {
+class BinaryBasicBlock;
+class BinaryContext;
+class BinaryFunction;
+
+/// The map of output addresses to input ones to be used when translating
+/// samples collected in a binary that was already processed by BOLT. We do not
+/// support reoptimizing a binary already processed by BOLT, but we do support
+/// collecting samples in a binary processed by BOLT. We then translate samples
+/// back to addresses from the input (original) binary, one that can be
+/// optimized. The goal is to avoid special deployments of non-bolted binaries
+/// just for the purposes of data collection.
+///
+/// The in-memory representation of the map is as follows. Each function has
+/// its own map.
+/// A function is identified by its output address. This is the key to
+/// retrieve a translation map. The translation map is a collection of ordered
+/// keys identifying the start of a region (relative to the function start) in
+/// the output address space (addresses in the binary processed by BOLT).
+///
+/// A translation then happens when perf2bolt needs to convert sample addresses
+/// in the output address space back to input addresses, valid to run BOLT in
+/// the original input binary. To convert, perf2bolt first needs to fetch the
+/// translation map for a sample recorded in a given function. It then finds
+/// the largest key that is still smaller than or equal to the recorded
+/// address. It then converts this address to use the value of this key.
+///
+/// Example translation map for function foo:
+///   KEY                            VALUE                     BB?
+///   Output offset1 (first BB)      Original input offset1    Y
+///   ...
+///   Output offsetN (last branch)   Original input offsetN    N
+///
+/// The information on whether a given entry is a BB start or an instruction
+/// that changes control flow is encoded in the last (highest) bit of VALUE.
+///
+/// Notes:
+/// Instructions that will never appear in LBR because they do not cause
+/// control flow change are omitted from this map. Basic block locations are
+/// recorded because they can be a target of a jump (To address in the LBR)
+/// and also to recreate the BB layout of this function. We use the BB layout
+/// map to recreate fall-through jumps in the profile, given an LBR trace.
+class BoltAddressTranslation {
+public:
+  // In-memory representation of the address translation table
+  using MapTy = std::map<uint32_t, uint32_t>;
+
+  // List of taken fall-throughs
+  using FallthroughListTy = SmallVector<std::pair<uint64_t, uint64_t>, 16>;
+
+  /// Name of the ELF section where the table will be serialized to in the
+  /// output binary
+  static const char *SECTION_NAME;
+
+  BoltAddressTranslation(BinaryContext &BC) : BC(BC) {}
+
+  /// Write the serialized address translation tables for each reordered
+  /// function
+  void write(raw_ostream &OS);
+
+  /// Read the serialized address translation tables and load them internally
+  /// in memory. Return a parse error if failed.
+  std::error_code parse(StringRef Buf);
+
+  /// If the maps are loaded in memory, perform the lookup to translate LBR
+  /// addresses in \p Func.
+  uint64_t translate(const BinaryFunction &Func, uint64_t Offset,
+                     bool IsBranchSrc) const;
+
+  /// Use the map keys containing basic block addresses to infer fall-throughs
+  /// taken in the path started at FirstLBR.To and ending at SecondLBR.From.
+  /// Return NoneType if the trace is invalid or the list of fall-throughs
+  /// otherwise.
+  Optional<FallthroughListTy>
+  getFallthroughsInTrace(const BinaryFunction &Func, uint64_t From,
+                         uint64_t To) const;
+
+  /// If available, fetch the address of the hot part linked to the cold part
+  /// at \p Address. Return 0 otherwise.
+  uint64_t fetchParentAddress(uint64_t Address) const;
+
+  /// True if the input binary has a translation table we can use to convert
+  /// addresses when aggregating profile
+  bool enabledFor(llvm::object::ELFObjectFileBase *InputFile) const;
+
+private:
+  /// Helper to update \p Map by inserting one or more BAT entries reflecting
+  /// \p BB for the function located at \p FuncAddress. At least one entry
+  /// will be emitted for the start of the BB. More entries may be emitted to
+  /// cover the location of calls or any instruction that may change control
+  /// flow.
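+  /// (Editor's illustration: an entry for a control-flow-changing instruction
+  /// at output offset 0x20 that maps back to input offset 0x14 is stored as
+  /// Map[0x20] = 0x14 | BRANCHENTRY, i.e. with the top bit of VALUE set, as
+  /// described in the class comment above.)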
+  void writeEntriesForBB(MapTy &Map, const BinaryBasicBlock &BB,
+                         uint64_t FuncAddress);
+
+  BinaryContext &BC;
+
+  std::map<uint64_t, MapTy> Maps;
+
+  /// Links outlined cold blocks to their original function
+  std::map<uint64_t, uint64_t> ColdPartSource;
+
+  /// Identifies the address of a control-flow-changing instruction in a
+  /// translation map entry
+  const static uint32_t BRANCHENTRY = 0x80000000;
+};
+} // namespace bolt
+
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/BoltDiff.cpp b/bolt/src/BoltDiff.cpp
new file mode 100644
index 000000000000..b80c7f2be618
--- /dev/null
+++ b/bolt/src/BoltDiff.cpp
@@ -0,0 +1,717 @@
+//===--- BoltDiff.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RewriteInstance methods related to comparing one instance to another, used
+// by the boltdiff tool to print a report.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Passes/IdenticalCodeFolding.h"
+#include "ProfileReaderBase.h"
+#include "RewriteInstance.h"
+#include "llvm/Support/CommandLine.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "boltdiff"
+
+using namespace llvm;
+using namespace object;
+using namespace bolt;
+
+namespace opts {
+extern cl::OptionCategory BoltDiffCategory;
+extern cl::opt<bool> NeverPrint;
+extern cl::opt<bool> ICF;
+
+static cl::opt<bool>
+IgnoreLTOSuffix("ignore-lto-suffix",
+  cl::desc("ignore lto_priv or const suffixes when matching functions"),
+  cl::init(true),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+PrintUnmapped("print-unmapped",
+  cl::desc("print functions of binary 2 that were not matched to any "
+           "function in binary 1"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+PrintProfiledUnmapped("print-profiled-unmapped",
+  cl::desc("print functions that have profile in binary 1 but do not "
+           "in binary 2"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+PrintDiffCFG("print-diff-cfg",
+  cl::desc("print the CFG of important functions that changed in "
+           "binary 2"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+PrintDiffBBs("print-diff-bbs",
+  cl::desc("print the basic blocks shown in top differences"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+MatchByHash("match-by-hash",
+  cl::desc("match functions in binary 2 to binary 1 if they have the same "
+           "hash of a function in binary 1"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+IgnoreUnchanged("ignore-unchanged",
+  cl::desc("do not diff functions whose contents have not been changed from "
+           "one binary to another"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<unsigned>
+DisplayCount("display-count",
+  cl::desc("number of functions to display when printing the top largest "
+           "differences in function activity"),
+  cl::init(10),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+static cl::opt<bool>
+NormalizeByBin1("normalize-by-bin1",
+  cl::desc("show execution count of functions in binary 2 as a ratio of the "
+           "total samples in binary 1 - make sure both profiles have equal "
+           "collection time and sampling rate for this to make sense"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(BoltDiffCategory));
+
+} // end namespace opts
+
+namespace llvm {
+namespace bolt {
+
+namespace {
+
+/// Helper used to print colored numbers.
+void printColoredPercentage(double Perc) {
+  if (outs().has_colors() && Perc > 0.0)
+    outs().changeColor(raw_ostream::RED);
+  else if (outs().has_colors() && Perc < 0.0)
+    outs().changeColor(raw_ostream::GREEN);
+  else if (outs().has_colors())
+    outs().changeColor(raw_ostream::YELLOW);
+  outs() << format("%.2f", Perc) << "%";
+  if (outs().has_colors())
+    outs().resetColor();
+}
+
+void setLightColor() {
+  if (opts::PrintDiffBBs && outs().has_colors())
+    outs().changeColor(raw_ostream::CYAN);
+}
+
+void setTitleColor() {
+  if (outs().has_colors())
+    outs().changeColor(raw_ostream::WHITE, /*Bold=*/true);
+}
+
+void setRegularColor() {
+  if (outs().has_colors())
+    outs().resetColor();
+}
+
+} // end anonymous namespace
+
+/// Perform the comparison between two binaries with profiling information.
+class RewriteInstanceDiff {
+  typedef std::tuple<const BinaryBasicBlock *, const BinaryBasicBlock *, double>
+      EdgeTy;
+
+  RewriteInstance &RI1;
+  RewriteInstance &RI2;
+
+  // The map of functions keyed by functions in binary 2, providing its
+  // corresponding function in binary 1
+  std::map<const BinaryFunction *, const BinaryFunction *> FuncMap;
+
+  // The map of basic block correspondence, analogous to FuncMap for BBs,
+  // sorted by score difference
+  std::map<const BinaryBasicBlock *, const BinaryBasicBlock *> BBMap;
+
+  // The map of edge correspondence
+  std::map<double, std::pair<EdgeTy, EdgeTy>> EdgeMap;
+
+  // Maps all known basic blocks back to their parent function
+  std::map<const BinaryBasicBlock *, const BinaryFunction *> BBToFuncMap;
+
+  // Accounting which functions were matched
+  std::set<const BinaryFunction *> Bin1MappedFuncs;
+  std::set<const BinaryFunction *> Bin2MappedFuncs;
+
+  // Structures for our 3 matching strategies: by name, by hash and by LTO
+  // name, from the strongest to the weakest bind between two functions
+  StringMap<const BinaryFunction *> NameLookup;
+  DenseMap<size_t, const BinaryFunction *> HashLookup;
+  StringMap<const BinaryFunction *> LTONameLookup1;
+  StringMap<const BinaryFunction *> LTONameLookup2;
+
+  // Score maps used to order and find the hottest functions
+  std::multimap<double, const BinaryFunction *> LargestBin1;
+  std::multimap<double, const BinaryFunction *> LargestBin2;
+
+  // Map multiple functions in the same LTO bucket to a single parent function
+  // representing all functions sharing the same prefix
+  std::map<const BinaryFunction *, const BinaryFunction *> LTOMap1;
+  std::map<const BinaryFunction *, const BinaryFunction *> LTOMap2;
+  std::map<const BinaryFunction *, double> LTOAggregatedScore1;
+  std::map<const BinaryFunction *, double> LTOAggregatedScore2;
+
+  // Map scores in bin2 and 1 keyed by a binary 2 function - post-matching
+  DenseMap<const BinaryFunction *, std::pair<double, double>> ScoreMap;
+
+  double getNormalizedScore(const BinaryFunction &Function,
+                            const RewriteInstance &Ctx) {
+    if (!opts::NormalizeByBin1)
+      return static_cast<double>(Function.getFunctionScore()) /
+             Ctx.getTotalScore();
+    return static_cast<double>(Function.getFunctionScore()) /
+           RI1.getTotalScore();
+  }
+
+  double getNormalizedScore(const BinaryBasicBlock &BB,
+                            const RewriteInstance &Ctx) {
+    if (!opts::NormalizeByBin1)
+      return static_cast<double>(BB.getKnownExecutionCount()) /
+             Ctx.getTotalScore();
+    return static_cast<double>(BB.getKnownExecutionCount()) /
+           RI1.getTotalScore();
+  }
+
+  double getNormalizedScore(BinaryBasicBlock::branch_info_iterator BIIter,
+                            const RewriteInstance &Ctx) {
+    double Score =
+        BIIter->Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : BIIter->Count;
+    if (!opts::NormalizeByBin1)
+      return Score / Ctx.getTotalScore();
+    return Score / RI1.getTotalScore();
+  }
+
+  /// Initialize data structures used for function lookup in binary 1, used
+  /// later when matching functions in binary 2 to corresponding functions
+  /// in binary 1.
+  void buildLookupMaps() {
+    for (const auto &BFI : RI1.BC->getBinaryFunctions()) {
+      StringRef LTOName;
+      const BinaryFunction &Function = BFI.second;
+      const double Score = getNormalizedScore(Function, RI1);
+      LargestBin1.insert(std::make_pair<>(Score, &Function));
+      for (const StringRef Name : Function.getNames()) {
+        if (Optional<StringRef> OptionalLTOName = getLTOCommonName(Name))
+          LTOName = *OptionalLTOName;
+        NameLookup[Name] = &Function;
+      }
+      if (opts::MatchByHash && Function.hasCFG())
+        HashLookup[Function.computeHash(/*UseDFS=*/true)] = &Function;
+      if (opts::IgnoreLTOSuffix && !LTOName.empty()) {
+        if (!LTONameLookup1.count(LTOName))
+          LTONameLookup1[LTOName] = &Function;
+        LTOMap1[&Function] = LTONameLookup1[LTOName];
+      }
+    }
+
+    // Compute LTONameLookup2 and LargestBin2
+    for (const auto &BFI : RI2.BC->getBinaryFunctions()) {
+      StringRef LTOName;
+      const BinaryFunction &Function = BFI.second;
+      const double Score = getNormalizedScore(Function, RI2);
+      LargestBin2.insert(std::make_pair<>(Score, &Function));
+      for (const StringRef Name : Function.getNames()) {
+        if (Optional<StringRef> OptionalLTOName = getLTOCommonName(Name))
+          LTOName = *OptionalLTOName;
+      }
+      if (opts::IgnoreLTOSuffix && !LTOName.empty()) {
+        if (!LTONameLookup2.count(LTOName))
+          LTONameLookup2[LTOName] = &Function;
+        LTOMap2[&Function] = LTONameLookup2[LTOName];
+      }
+    }
+  }
+
+  /// Match functions in binary 2 with functions in binary 1.
+  void matchFunctions() {
+    outs() << "BOLT-DIFF: Mapping functions in Binary2 to Binary1\n";
+    uint64_t BothHaveProfile = 0ull;
+    std::set<const BinaryFunction *> Bin1ProfiledMapped;
+
+    for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) {
+      const BinaryFunction &Function2 = BFI2.second;
+      StringRef LTOName;
+      bool Match = false;
+      for (const StringRef Name : Function2.getNames()) {
+        auto Iter = NameLookup.find(Name);
+        if (Optional<StringRef> OptionalLTOName = getLTOCommonName(Name))
+          LTOName = *OptionalLTOName;
+        if (Iter == NameLookup.end())
+          continue;
+        FuncMap.insert(std::make_pair<>(&Function2, Iter->second));
+        Bin1MappedFuncs.insert(Iter->second);
+        Bin2MappedFuncs.insert(&Function2);
+        if (Function2.hasValidProfile() && Iter->second->hasValidProfile()) {
+          ++BothHaveProfile;
+          Bin1ProfiledMapped.insert(Iter->second);
+        }
+        Match = true;
+        break;
+      }
+      if (Match || !Function2.hasCFG())
+        continue;
+      auto Iter = HashLookup.find(Function2.computeHash(/*UseDFS=*/true));
+      if (Iter != HashLookup.end()) {
+        FuncMap.insert(std::make_pair<>(&Function2, Iter->second));
+        Bin1MappedFuncs.insert(Iter->second);
+        Bin2MappedFuncs.insert(&Function2);
+        if (Function2.hasValidProfile() && Iter->second->hasValidProfile()) {
+          ++BothHaveProfile;
+          Bin1ProfiledMapped.insert(Iter->second);
+        }
+        continue;
+      }
+      if (LTOName.empty())
+        continue;
+      auto LTOIter = LTONameLookup1.find(LTOName);
+      if (LTOIter != LTONameLookup1.end()) {
+        FuncMap.insert(std::make_pair<>(&Function2, LTOIter->second));
+        Bin1MappedFuncs.insert(LTOIter->second);
+        Bin2MappedFuncs.insert(&Function2);
+        if (Function2.hasValidProfile() && LTOIter->second->hasValidProfile()) {
+          ++BothHaveProfile;
+          Bin1ProfiledMapped.insert(LTOIter->second);
+        }
+      }
+    }
+    PrintProgramStats PPS(opts::NeverPrint);
+    outs() << "* BOLT-DIFF: Starting print program stats pass for binary 1\n";
1\n"; + PPS.runOnFunctions(*RI1.BC); + outs() << "* BOLT-DIFF: Starting print program stats pass for binary 2\n"; + PPS.runOnFunctions(*RI2.BC); + outs() << "=====\n"; + outs() << "Inputs share " << BothHaveProfile + << " functions with valid profile.\n"; + if (opts::PrintProfiledUnmapped) { + outs() << "\nFunctions in profile 1 that are missing in the profile 2:\n"; + std::vector Unmapped; + for (const auto &BFI : RI1.BC->getBinaryFunctions()) { + const BinaryFunction &Function = BFI.second; + if (!Function.hasValidProfile() || Bin1ProfiledMapped.count(&Function)) + continue; + Unmapped.emplace_back(&Function); + } + std::sort(Unmapped.begin(), Unmapped.end(), + [&](const BinaryFunction *A, const BinaryFunction *B) { + return A->getFunctionScore() > B->getFunctionScore(); + }); + for (const BinaryFunction *Function : Unmapped) { + outs() << Function->getPrintName() << " : "; + outs() << Function->getFunctionScore() << "\n"; + } + outs() << "=====\n"; + } + } + + /// Check if opcodes in BB1 match those in BB2 + bool compareBBs(const BinaryBasicBlock &BB1, + const BinaryBasicBlock &BB2) const { + auto Iter1 = BB1.begin(); + auto Iter2 = BB2.begin(); + if ((Iter1 == BB1.end() && Iter2 != BB2.end()) || + (Iter1 != BB1.end() && Iter2 == BB2.end())) + return false; + + while (Iter1 != BB1.end()) { + if (Iter2 == BB2.end() || + Iter1->getOpcode() != Iter2->getOpcode()) + return false; + + ++Iter1; + ++Iter2; + } + + if (Iter2 != BB2.end()) + return false; + return true; + } + + /// For a function in binary 2 that matched one in binary 1, now match each + /// individual basic block in it to its corresponding blocks in binary 1. + /// Also match each edge in binary 2 to the corresponding ones in binary 1. + void matchBasicBlocks() { + for (const auto &MapEntry : FuncMap) { + const BinaryFunction *const &Func1 = MapEntry.second; + const BinaryFunction *const &Func2 = MapEntry.first; + + auto Iter1 = Func1->layout_begin(); + auto Iter2 = Func2->layout_begin(); + + bool Match = true; + std::map Map; + std::map> EMap; + while (Iter1 != Func1->layout_end()) { + if (Iter2 == Func2->layout_end()) { + Match = false; + break; + } + if (!compareBBs(**Iter1, **Iter2)) { + Match = false; + break; + } + Map.insert(std::make_pair<>(*Iter2, *Iter1)); + + auto SuccIter1 = (*Iter1)->succ_begin(); + auto SuccIter2 = (*Iter2)->succ_begin(); + auto BIIter1 = (*Iter1)->branch_info_begin(); + auto BIIter2 = (*Iter2)->branch_info_begin(); + while (SuccIter1 != (*Iter1)->succ_end()) { + if (SuccIter2 == (*Iter2)->succ_end()) { + Match = false; + break; + } + const double ScoreEdge1 = getNormalizedScore(BIIter1, RI1); + const double ScoreEdge2 = getNormalizedScore(BIIter2, RI2); + EMap.insert(std::make_pair<>( + std::abs(ScoreEdge2 - ScoreEdge1), + std::make_pair<>( + std::make_tuple<>(*Iter2, *SuccIter2, ScoreEdge2), + std::make_tuple<>(*Iter1, *SuccIter1, ScoreEdge1)))); + + ++SuccIter1; + ++SuccIter2; + ++BIIter1; + ++BIIter2; + } + if (SuccIter2 != (*Iter2)->succ_end()) + Match = false; + if (!Match) + break; + + BBToFuncMap[*Iter1] = Func1; + BBToFuncMap[*Iter2] = Func2; + ++Iter1; + ++Iter2; + } + if (!Match || Iter2 != Func2->layout_end()) + continue; + + BBMap.insert(Map.begin(), Map.end()); + EdgeMap.insert(EMap.begin(), EMap.end()); + } + } + + /// Print the largest differences in basic block performance from binary 1 + /// to binary 2 + void reportHottestBBDiffs() { + std::map LargestDiffs; + for (const auto &MapEntry : BBMap) { + const BinaryBasicBlock *BB2 = MapEntry.first; + const BinaryBasicBlock *BB1 = 
MapEntry.second; + LargestDiffs.insert( + std::make_pair<>(std::abs(getNormalizedScore(*BB2, RI2) - + getNormalizedScore(*BB1, RI1)), + BB2)); + } + + unsigned Printed = 0; + setTitleColor(); + outs() + << "\nTop " << opts::DisplayCount + << " largest differences in basic block performance bin 2 -> bin 1:\n"; + outs() << "=========================================================\n"; + setRegularColor(); + outs() << " * Functions with different contents do not appear here\n\n"; + for (auto I = LargestDiffs.rbegin(), E = LargestDiffs.rend(); I != E; ++I) { + const BinaryBasicBlock *BB2 = I->second; + const double Score2 = getNormalizedScore(*BB2, RI2); + const double Score1 = getNormalizedScore(*BBMap[BB2], RI1); + outs() << "BB " << BB2->getName() << " from " + << BBToFuncMap[BB2]->getDemangledName() + << "\n\tScore bin1 = " << format("%.4f", Score1 * 100.0) + << "%\n\tScore bin2 = " << format("%.4f", Score2 * 100.0); + outs() << "%\t(Difference: "; + printColoredPercentage((Score2 - Score1) * 100.0); + outs() << ")\n"; + if (opts::PrintDiffBBs) { + setLightColor(); + BB2->dump(); + setRegularColor(); + } + if (Printed++ == opts::DisplayCount) + break; + } + } + + /// Print the largest differences in edge counts from one binary to another + void reportHottestEdgeDiffs() { + unsigned Printed = 0; + setTitleColor(); + outs() + << "\nTop " << opts::DisplayCount + << " largest differences in edge hotness bin 2 -> bin 1:\n"; + outs() << "=========================================================\n"; + setRegularColor(); + outs() << " * Functions with different contents do not appear here\n"; + for (auto I = EdgeMap.rbegin(), E = EdgeMap.rend(); I != E; ++I) { + std::tuple + &Edge2 = I->second.first; + std::tuple + &Edge1 = I->second.second; + const double Score2 = std::get<2>(Edge2); + const double Score1 = std::get<2>(Edge1); + outs() << "Edge (" << std::get<0>(Edge2)->getName() << " -> " + << std::get<1>(Edge2)->getName() << ") in " + << BBToFuncMap[std::get<0>(Edge2)]->getDemangledName() + << "\n\tScore bin1 = " << format("%.4f", Score1 * 100.0) + << "%\n\tScore bin2 = " << format("%.4f", Score2 * 100.0); + outs() << "%\t(Difference: "; + printColoredPercentage((Score2 - Score1) * 100.0); + outs() << ")\n"; + if (opts::PrintDiffBBs) { + setLightColor(); + std::get<0>(Edge2)->dump(); + std::get<1>(Edge2)->dump(); + setRegularColor(); + } + if (Printed++ == opts::DisplayCount) + break; + } + } + + /// For LTO functions sharing the same prefix (for example, func1.lto_priv.1 + /// and func1.lto_priv.2 share the func1.lto_priv prefix), compute aggregated + /// scores for them. This is used to avoid reporting all LTO functions as + /// having a large difference in performance because hotness shifted from + /// LTO variant 1 to variant 2, even though they represent the same function. 
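+ ///
+ /// A worked example (hypothetical numbers, not taken from a real profile):
+ /// suppose binary 1 spends 0.30 of its score in func1.lto_priv.1 and 0.05
+ /// in func1.lto_priv.2, while binary 2 spends 0.06 and 0.29 respectively.
+ /// Comparing the variants pairwise would report two large diffs; folding
+ /// each variant into its canonical LTOMap entry, as the function below
+ /// does, yields 0.35 vs. 0.35 and no spurious difference:
+ ///
+ /// \code
+ ///   LTOAggregatedScore1[LTOMap1[&F]] += getNormalizedScore(F, RI1);
+ ///   LTOAggregatedScore2[LTOMap2[&F]] += getNormalizedScore(F, RI2);
+ /// \endcode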
+ void computeAggregatedLTOScore() { + for (const auto &BFI : RI1.BC->getBinaryFunctions()) { + const BinaryFunction &Function = BFI.second; + double Score = getNormalizedScore(Function, RI1); + auto Iter = LTOMap1.find(&Function); + if (Iter == LTOMap1.end()) + continue; + LTOAggregatedScore1[Iter->second] += Score; + } + + double UnmappedScore = 0; + for (const auto &BFI : RI2.BC->getBinaryFunctions()) { + const BinaryFunction &Function = BFI.second; + bool Matched = FuncMap.find(&Function) != FuncMap.end(); + double Score = getNormalizedScore(Function, RI2); + auto Iter = LTOMap2.find(&Function); + if (Iter == LTOMap2.end()) { + if (!Matched) + UnmappedScore += Score; + continue; + } + LTOAggregatedScore2[Iter->second] += Score; + if (FuncMap.find(Iter->second) == FuncMap.end()) + UnmappedScore += Score; + } + int64_t Unmapped = + RI2.BC->getBinaryFunctions().size() - Bin2MappedFuncs.size(); + outs() << "BOLT-DIFF: " << Unmapped + << " functions in Binary2 have no correspondence to any other " + "function in Binary1.\n"; + + // Print the hotness score of functions in binary 2 that were not matched + // to any function in binary 1 + outs() << "BOLT-DIFF: These unmapped functions in Binary2 represent " + << format("%.2f", UnmappedScore * 100.0) << "% of execution.\n"; + } + + /// Print the largest hotness differences from binary 2 to binary 1 + void reportHottestFuncDiffs() { + std::multimap LargestDiffs; + for (const auto &MapEntry : FuncMap) { + const BinaryFunction *const &Func1 = MapEntry.second; + const BinaryFunction *const &Func2 = MapEntry.first; + double Score1 = getNormalizedScore(*Func1, RI1); + auto Iter1 = LTOMap1.find(Func1); + if (Iter1 != LTOMap1.end()) { + Score1 = LTOAggregatedScore1[Iter1->second]; + } + double Score2 = getNormalizedScore(*Func2, RI2); + auto Iter2 = LTOMap2.find(Func2); + if (Iter2 != LTOMap2.end()) { + Score2 = LTOAggregatedScore2[Iter2->second]; + } + if (Score1 == 0.0 || Score2 == 0.0) + continue; + LargestDiffs.insert( + std::make_pair<>(std::abs(Score1 - Score2), MapEntry)); + ScoreMap[Func2] = std::make_pair<>(Score1, Score2); + } + + unsigned Printed = 0; + setTitleColor(); + outs() << "\nTop " << opts::DisplayCount + << " largest differences in performance bin 2 -> bin 1:\n"; + outs() << "=========================================================\n"; + setRegularColor(); + for (auto I = LargestDiffs.rbegin(), E = LargestDiffs.rend(); I != E; ++I) { + const std::pair + &MapEntry = I->second; + if (opts::IgnoreUnchanged && + MapEntry.second->computeHash(/*UseDFS=*/true) == + MapEntry.first->computeHash(/*UseDFS=*/true)) + continue; + const std::pair &Scores = ScoreMap[MapEntry.first]; + outs() << "Function " << MapEntry.first->getDemangledName(); + if (MapEntry.first->getDemangledName() != + MapEntry.second->getDemangledName()) + outs() << "\nmatched " << MapEntry.second->getDemangledName(); + outs() << "\n\tScore bin1 = " << format("%.2f", Scores.first * 100.0) + << "%\n\tScore bin2 = " << format("%.2f", Scores.second * 100.0) + << "%\t(Difference: "; + printColoredPercentage((Scores.second - Scores.first) * 100.0); + outs() << ")"; + if (MapEntry.second->computeHash(/*UseDFS=*/true) != + MapEntry.first->computeHash(/*UseDFS=*/true)) { + outs() << "\t[Functions have different contents]"; + if (opts::PrintDiffCFG) { + outs() << "\n *** CFG for function in binary 1:\n"; + setLightColor(); + MapEntry.second->dump(); + setRegularColor(); + outs() << "\n *** CFG for function in binary 2:\n"; + setLightColor(); + MapEntry.first->dump(); + 
setRegularColor(); + } + } + outs() << "\n"; + if (Printed++ == opts::DisplayCount) + break; + } + } + + /// Print hottest functions from each binary + void reportHottestFuncs() { + unsigned Printed = 0; + setTitleColor(); + outs() << "\nTop " << opts::DisplayCount + << " hottest functions in binary 2:\n"; + outs() << "=====================================\n"; + setRegularColor(); + for (auto I = LargestBin2.rbegin(), E = LargestBin2.rend(); I != E; ++I) { + const std::pair<const double, const BinaryFunction *> &MapEntry = *I; + outs() << "Function " << MapEntry.second->getDemangledName() << "\n"; + auto Iter = ScoreMap.find(MapEntry.second); + if (Iter != ScoreMap.end()) { + outs() << "\tScore bin1 = " + << format("%.2f", Iter->second.first * 100.0) << "%\n"; + } + outs() << "\tScore bin2 = " << format("%.2f", MapEntry.first * 100.0) + << "%\n"; + if (Printed++ == opts::DisplayCount) + break; + } + + Printed = 0; + setTitleColor(); + outs() << "\nTop " << opts::DisplayCount + << " hottest functions in binary 1:\n"; + outs() << "=====================================\n"; + setRegularColor(); + for (auto I = LargestBin1.rbegin(), E = LargestBin1.rend(); I != E; ++I) { + const std::pair<const double, const BinaryFunction *> &MapEntry = *I; + outs() << "Function " << MapEntry.second->getDemangledName() + << "\n\tScore bin1 = " << format("%.2f", MapEntry.first * 100.0) + << "%\n"; + if (Printed++ == opts::DisplayCount) + break; + } + } + + /// Print functions in binary 2 that did not match anything in binary 1. + /// Unfortunately, in an LTO build, even a small change can lead to several + /// LTO variants being unmapped, corresponding to local functions that never + /// appear in one of the binaries because they were previously inlined. + void reportUnmapped() { + outs() << "List of functions from binary 2 that were not matched with any " + << "function in binary 1:\n"; + for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) { + const BinaryFunction &Function2 = BFI2.second; + if (Bin2MappedFuncs.count(&Function2)) + continue; + outs() << Function2.getPrintName() << "\n"; + } + } + +public: + /// Main entry point: coordinate all tasks necessary to compare two binaries + void compareAndReport() { + buildLookupMaps(); + matchFunctions(); + if (opts::IgnoreLTOSuffix) + computeAggregatedLTOScore(); + matchBasicBlocks(); + reportHottestFuncDiffs(); + reportHottestBBDiffs(); + reportHottestEdgeDiffs(); + reportHottestFuncs(); + if (!opts::PrintUnmapped) + return; + reportUnmapped(); + } + + RewriteInstanceDiff(RewriteInstance &RI1, RewriteInstance &RI2) + : RI1(RI1), RI2(RI2) { + compareAndReport(); + } + +}; + +} // end namespace bolt +} // end namespace llvm + +void RewriteInstance::compare(RewriteInstance &RI2) { + outs() << "BOLT-DIFF: ======== Binary1 vs. 
Binary2 ========\n"; + outs() << "Trace for binary 1 has " << this->getTotalScore() + << " instructions executed.\n"; + outs() << "Trace for binary 2 has " << RI2.getTotalScore() + << " instructions executed.\n"; + if (opts::NormalizeByBin1) { + double Diff2to1 = static_cast(RI2.getTotalScore() - this->getTotalScore()) / + this->getTotalScore(); + outs() << "Binary2 change in score with respect to Binary1: "; + printColoredPercentage(Diff2to1 * 100.0); + outs() << "\n"; + } + + if (!this->getTotalScore() || !RI2.getTotalScore()) { + outs() << "BOLT-DIFF: Both binaries must have recorded activity in known " + "functions.\n"; + return; + } + + // Pre-pass ICF + if (opts::ICF) { + IdenticalCodeFolding ICF(opts::NeverPrint); + outs() << "BOLT-DIFF: Starting ICF pass for binary 1"; + ICF.runOnFunctions(*BC); + outs() << "BOLT-DIFF: Starting ICF pass for binary 2"; + ICF.runOnFunctions(*RI2.BC); + } + + RewriteInstanceDiff RID(*this, RI2); +} diff --git a/bolt/src/CMakeLists.txt b/bolt/src/CMakeLists.txt new file mode 100644 index 000000000000..cc630b3bd1d5 --- /dev/null +++ b/bolt/src/CMakeLists.txt @@ -0,0 +1,123 @@ +add_subdirectory(merge-fdata) +add_subdirectory(Passes) +add_subdirectory(RuntimeLibs) +add_subdirectory(Target) + +# Get the current git revision for BOLT. +function(get_version ofn) + find_program(git_executable NAMES git git.exe git.cmd) + if (git_executable) + execute_process(COMMAND ${git_executable} rev-parse HEAD + WORKING_DIRECTORY ${LLVM_MAIN_SRC_DIR} + TIMEOUT 5 + RESULT_VARIABLE git_result + OUTPUT_VARIABLE git_output) + if( git_result EQUAL 0 ) + string(STRIP "${git_output}" git_ref_id) + set(BOLT_REVISION "${git_ref_id}") + endif() + endif() + + # If we can't find a revision, set it to "". + if (NOT BOLT_REVISION) + set(BOLT_REVISION "") + endif() + + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + COMMAND echo '"${BOLT_REVISION}"' > ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + COMMENT "Generating bogus ${ofn}..." + ) + + set(VERSION_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} PARENT_SCOPE) + + # `make clean' must remove all those generated files: + set_property(DIRECTORY APPEND + PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${ofn}) + set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${ofn} PROPERTIES + GENERATED 1) +endfunction() + +# Creates a public target for generating the revision file. 
+function(add_public_gen_version_target target) + add_custom_target(${target} DEPENDS ${VERSION_OUTPUT}) + set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} ${target} PARENT_SCOPE) +endfunction() + +get_version(BoltRevision.inc) +add_public_gen_version_target(GenBoltRevision) + +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + BOLTPasses + BOLTRuntimeLibs + CodeGen + Core + DebugInfoDWARF + MC + MCDisassembler + MCParser + Object + Orcjit + Support + ) + +string(FIND "${LLVM_TARGETS_TO_BUILD}" "AArch64" POSITION) +if (NOT ${POSITION} EQUAL -1) + list(APPEND LLVM_LINK_COMPONENTS BOLTTargetAArch64) + set(BOLT_AArch64 On) +endif() + +string(FIND "${LLVM_TARGETS_TO_BUILD}" "X86" POSITION) +if (NOT ${POSITION} EQUAL -1) + list(APPEND LLVM_LINK_COMPONENTS BOLTTargetX86) + set(BOLT_X64 On) +endif() + +add_llvm_tool(llvm-bolt + llvm-bolt.cpp + BinaryBasicBlock.cpp + BinaryContext.cpp + BinaryData.cpp + BinaryEmitter.cpp + BinaryFunction.cpp + BinaryFunctionProfile.cpp + BinaryPassManager.cpp + BinarySection.cpp + BoltAddressTranslation.cpp + BoltDiff.cpp + CacheMetrics.cpp + DataAggregator.cpp + DataReader.cpp + DebugData.cpp + DWARFRewriter.cpp + DynoStats.cpp + Exceptions.cpp + ExecutableFileMemoryManager.cpp + Heatmap.cpp + JumpTable.cpp + MachORewriteInstance.cpp + MCPlusBuilder.cpp + ParallelUtilities.cpp + ProfileReaderBase.cpp + Relocation.cpp + RewriteInstance.cpp + Utils.cpp + YAMLProfileReader.cpp + YAMLProfileWriter.cpp + + DEPENDS + intrinsics_gen + bolt_rt + ) + +if (DEFINED BOLT_AArch64) + target_compile_definitions(llvm-bolt PRIVATE AARCH64_AVAILABLE) +endif() + +if (DEFINED BOLT_X64) + target_compile_definitions(llvm-bolt PRIVATE X86_AVAILABLE) +endif() + +add_llvm_tool_symlink(perf2bolt llvm-bolt) +add_llvm_tool_symlink(llvm-boltdiff llvm-bolt) +add_llvm_tool_symlink(llvm-bolt-heatmap llvm-bolt) diff --git a/bolt/src/CacheMetrics.cpp b/bolt/src/CacheMetrics.cpp new file mode 100644 index 000000000000..3bb1cf520085 --- /dev/null +++ b/bolt/src/CacheMetrics.cpp @@ -0,0 +1,316 @@ +//===------ CacheMetrics.cpp - Calculate metrics for instruction cache ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functions to show metrics of cache lines +// +//===----------------------------------------------------------------------===// + +#include "CacheMetrics.h" +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "llvm/Support/CommandLine.h" +#include + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt ForwardWeight; +extern cl::opt BackwardWeight; +extern cl::opt ForwardDistance; +extern cl::opt BackwardDistance; +extern cl::opt ITLBPageSize; +extern cl::opt ITLBEntries; + +} + +namespace { + +/// Initialize and return a position map for binary basic blocks +void extractBasicBlockInfo( + const std::vector &BinaryFunctions, + std::unordered_map &BBAddr, + std::unordered_map &BBSize) { + + for (BinaryFunction *BF : BinaryFunctions) { + const BinaryContext &BC = BF->getBinaryContext(); + for (BinaryBasicBlock *BB : BF->layout()) { + if (BF->isSimple() || BC.HasRelocations) { + // Use addresses/sizes as in the output binary + BBAddr[BB] = BB->getOutputAddressRange().first; + BBSize[BB] = BB->getOutputSize(); + } else { + // Output ranges should match the input if the body hasn't changed + BBAddr[BB] = BB->getInputAddressRange().first + BF->getAddress(); + BBSize[BB] = BB->getOriginalSize(); + } + } + } +} + +/// Calculate TSP metric, which quantifies the number of fallthrough jumps in +/// the ordering of basic blocks +double calcTSPScore( + const std::vector &BinaryFunctions, + const std::unordered_map &BBAddr, + const std::unordered_map &BBSize) { + + double Score = 0; + for (BinaryFunction *BF : BinaryFunctions) { + if (!BF->hasProfile()) + continue; + for (BinaryBasicBlock *SrcBB : BF->layout()) { + auto BI = SrcBB->branch_info_begin(); + for (BinaryBasicBlock *DstBB : SrcBB->successors()) { + if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB)) + Score += BI->Count; + ++BI; + } + } + } + return Score; +} + +/// Calculate Ext-TSP metric, which quantifies the expected number of i-cache +/// misses for a given ordering of basic blocks +double calcExtTSPScore( + const std::vector &BinaryFunctions, + const std::unordered_map &BBAddr, + const std::unordered_map &BBSize) { + + double Score = 0.0; + for (BinaryFunction *BF : BinaryFunctions) { + if (!BF->hasProfile()) + continue; + for (BinaryBasicBlock *SrcBB : BF->layout()) { + auto BI = SrcBB->branch_info_begin(); + for (BinaryBasicBlock *DstBB : SrcBB->successors()) { + if (DstBB != SrcBB) { + Score += CacheMetrics::extTSPScore(BBAddr.at(SrcBB), + BBSize.at(SrcBB), + BBAddr.at(DstBB), + BI->Count); + } + ++BI; + } + } + } + return Score; +} + +using Predecessors = std::vector>; + +/// Build a simplified version of the call graph: For every function, keep +/// its callers and the frequencies of the calls +std::unordered_map +extractFunctionCalls(const std::vector &BinaryFunctions) { + std::unordered_map Calls; + + for (BinaryFunction *SrcFunction : BinaryFunctions) { + const BinaryContext &BC = SrcFunction->getBinaryContext(); + for (BinaryBasicBlock *BB : SrcFunction->layout()) { + // Find call instructions and extract target symbols from each one + for (MCInst &Inst : *BB) { + if (!BC.MIB->isCall(Inst)) + continue; + + // Call info + const MCSymbol* DstSym = BC.MIB->getTargetSymbol(Inst); + uint64_t Count = 
BB->getKnownExecutionCount(); + // Ignore calls w/o information + if (DstSym == nullptr || Count == 0) + continue; + + const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym); + // Ignore recursive calls + if (DstFunction == nullptr || + DstFunction->layout_empty() || + DstFunction == SrcFunction) + continue; + + // Record the call + Calls[DstFunction].emplace_back(SrcFunction, Count); + } + } + } + return Calls; +} + +/// Compute the expected hit ratio of the i-TLB cache (optimized by the +/// HFSortPlus algorithm). Given an assignment of functions to i-TLB pages, we +/// divide all function calls into two categories: +/// - 'short' ones that have a caller-callee distance less than a page; +/// - 'long' ones where the distance exceeds a page. +/// The short calls are likely to result in an i-TLB cache hit. For the long +/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how +/// often the page is accessed). Assuming that functions are sent to the i-TLB +/// cache in a random order, the probability that a page is present in the +/// cache is proportional to the number of samples corresponding to the +/// functions on the page. The following procedure detects short and long +/// calls, and estimates the expected number of cache misses for the long ones. +double expectedCacheHitRatio( + const std::vector<BinaryFunction *> &BinaryFunctions, + const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr, + const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) { + + const double PageSize = opts::ITLBPageSize; + const uint64_t CacheEntries = opts::ITLBEntries; + std::unordered_map<const BinaryFunction *, Predecessors> Calls = + extractFunctionCalls(BinaryFunctions); + // Compute 'hotness' of the functions + double TotalSamples = 0; + std::unordered_map<const BinaryFunction *, double> FunctionSamples; + for (BinaryFunction *BF : BinaryFunctions) { + double Samples = 0; + for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) { + Samples += Pair.second; + } + Samples = std::max(Samples, (double)BF->getKnownExecutionCount()); + FunctionSamples[BF] = Samples; + TotalSamples += Samples; + } + + // Compute 'hotness' of the pages + std::unordered_map<double, double> PageSamples; + for (BinaryFunction *BF : BinaryFunctions) { + if (BF->layout_empty()) + continue; + double Page = BBAddr.at(BF->layout_front()) / PageSize; + PageSamples[Page] += FunctionSamples.at(BF); + } + + // Compute the expected number of misses for every function + double Misses = 0; + for (BinaryFunction *BF : BinaryFunctions) { + // Skip the function if it has no samples + if (BF->layout_empty() || FunctionSamples.at(BF) == 0.0) + continue; + double Samples = FunctionSamples.at(BF); + double Page = BBAddr.at(BF->layout_front()) / PageSize; + // The probability that the page is not present in the cache + double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries); + + // Process all callers of the function + for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) { + BinaryFunction *SrcFunction = Pair.first; + double SrcPage = BBAddr.at(SrcFunction->layout_front()) / PageSize; + // Is this a 'long' or a 'short' call? 
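+ // For intuition, with hypothetical numbers: if TotalSamples = 1000,
+ // PageSamples[Page] = 100, and CacheEntries = 16, then
+ //   MissProb = (1 - 100/1000)^16 ~= 0.185,
+ // so a long call taken 50 times adds about 50 * 0.185 ~= 9.3 expected
+ // misses below.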
+ if (Page != SrcPage) { + // This is a miss + Misses += MissProb * Pair.second; + } + Samples -= Pair.second; + } + assert(Samples >= 0.0 && "Function samples computed incorrectly"); + // The remaining samples likely come from the jitted code + Misses += Samples * MissProb; + } + + return 100.0 * (1.0 - Misses / TotalSamples); +} + +} // end namespace anonymous + +double CacheMetrics::extTSPScore(uint64_t SrcAddr, + uint64_t SrcSize, + uint64_t DstAddr, + uint64_t Count) { + assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE); + + // Fallthrough + if (SrcAddr + SrcSize == DstAddr) { + // Assume that FallthroughWeight = 1.0 after normalization + return static_cast(Count); + } + // Forward + if (SrcAddr + SrcSize < DstAddr) { + const uint64_t Dist = DstAddr - (SrcAddr + SrcSize); + if (Dist <= opts::ForwardDistance) { + double Prob = 1.0 - static_cast(Dist) / opts::ForwardDistance; + return opts::ForwardWeight * Prob * Count; + } + return 0; + } + // Backward + const uint64_t Dist = SrcAddr + SrcSize - DstAddr; + if (Dist <= opts::BackwardDistance) { + double Prob = 1.0 - static_cast(Dist) / opts::BackwardDistance; + return opts::BackwardWeight * Prob * Count; + } + return 0; +} + +void CacheMetrics::printAll(const std::vector &BFs) { + // Stats related to hot-cold code splitting + size_t NumFunctions = 0; + size_t NumProfiledFunctions = 0; + size_t NumHotFunctions = 0; + size_t NumBlocks = 0; + size_t NumHotBlocks = 0; + + size_t TotalCodeMinAddr = std::numeric_limits::max(); + size_t TotalCodeMaxAddr = 0; + size_t HotCodeMinAddr = std::numeric_limits::max(); + size_t HotCodeMaxAddr = 0; + + for (BinaryFunction *BF : BFs) { + NumFunctions++; + if (BF->hasProfile()) + NumProfiledFunctions++; + if (BF->hasValidIndex()) + NumHotFunctions++; + for (BinaryBasicBlock *BB : BF->layout()) { + NumBlocks++; + size_t BBAddrMin = BB->getOutputAddressRange().first; + size_t BBAddrMax = BB->getOutputAddressRange().second; + TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin); + TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax); + if (BF->hasValidIndex() && !BB->isCold()) { + NumHotBlocks++; + HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin); + HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax); + } + } + } + + outs() << format(" There are %zu functions;", NumFunctions) + << format(" %zu (%.2lf%%) are in the hot section,", + NumHotFunctions, 100.0 * NumHotFunctions / NumFunctions) + << format(" %zu (%.2lf%%) have profile\n", + NumProfiledFunctions, 100.0 * NumProfiledFunctions / NumFunctions); + outs() << format(" There are %zu basic blocks;", NumBlocks) + << format(" %zu (%.2lf%%) are in the hot section\n", + NumHotBlocks, 100.0 * NumHotBlocks / NumBlocks); + + assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses"); + size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr; + size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr; + + size_t HugePage2MB = 2 << 20; + outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, %.2lf huge pages)\n", + 100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize, + double(HotCodeSize) / HugePage2MB); + + // Stats related to expected cache performance + std::unordered_map BBAddr; + std::unordered_map BBSize; + extractBasicBlockInfo(BFs, BBAddr, BBSize); + + outs() << " Expected i-TLB cache hit ratio: " + << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize)); + + outs() << " TSP score: " + << format("%.0lf\n", calcTSPScore(BFs, BBAddr, BBSize)); + + outs() << " ExtTSP score: " + << 
format("%.0lf\n", calcExtTSPScore(BFs, BBAddr, BBSize)); +} diff --git a/bolt/src/CacheMetrics.h b/bolt/src/CacheMetrics.h new file mode 100644 index 000000000000..3e797b890bdc --- /dev/null +++ b/bolt/src/CacheMetrics.h @@ -0,0 +1,42 @@ +//===- CacheMetrics.h - Interface for instruction cache evaluation --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functions to show metrics of cache lines +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_CACHEMETRICS_H +#define LLVM_TOOLS_LLVM_BOLT_CACHEMETRICS_H + +#include +#include + +namespace llvm { +namespace bolt { +class BinaryFunction; +namespace CacheMetrics { + +/// Calculate various metrics related to instruction cache performance. +void printAll(const std::vector &BinaryFunctions); + +/// Calculate Extended-TSP metric, which quantifies the expected number of +/// i-cache misses for a given pair of basic blocks. The parameters are: +/// - SrcAddr is the address of the source block; +/// - SrcSize is the size of the source block; +/// - DstAddr is the address of the destination block; +/// - Count is the number of jumps between the pair of blocks. +double extTSPScore(uint64_t SrcAddr, + uint64_t SrcSize, + uint64_t DstAddr, + uint64_t Count); + +} // namespace CacheMetrics +} // namespace bolt +} // namespace llvm + +#endif //LLVM_CACHEMETRICS_H diff --git a/bolt/src/DWARFRewriter.cpp b/bolt/src/DWARFRewriter.cpp new file mode 100644 index 000000000000..bd00da372842 --- /dev/null +++ b/bolt/src/DWARFRewriter.cpp @@ -0,0 +1,1304 @@ +//===--- DWARFRewriter.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DWARFRewriter.h" +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "DebugData.h" +#include "ParallelUtilities.h" +#include "Utils.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ThreadPool.h" +#include "llvm/Support/ToolOutputFile.h" +#include +#include +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt" + +LLVM_ATTRIBUTE_UNUSED +static void printDie(const DWARFDie &DIE) { + DIDumpOptions DumpOpts; + DumpOpts.ShowForm = true; + DumpOpts.Verbose = true; + DumpOpts.ChildRecurseDepth = 0; + DumpOpts.ShowChildren = 0; + DIE.dump(dbgs(), 0, DumpOpts); +} + +using namespace llvm; +using namespace llvm::support::endian; +using namespace object; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltCategory; +extern cl::opt<unsigned> Verbosity; + +static cl::opt<bool> +KeepARanges("keep-aranges", + cl::desc("keep or generate .debug_aranges section if .gdb_index is written"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +static cl::opt<bool> +DeterministicDebugInfo("deterministic-debuginfo", + cl::desc("disables parallel execution of tasks that may produce " + "nondeterministic debug info"), + cl::init(true), + cl::cat(BoltCategory)); + +static cl::opt<std::string> + DwoOutputPath("dwo-output-path", + cl::desc("Path to where .dwo files will be written out to."), + cl::init(""), cl::cat(BoltCategory)); + +static cl::opt<bool> + DebugSkeletonCu("debug-skeleton-cu", + cl::desc("Prints out offsets for abbrev and debug_info of " + "Skeleton CUs that get patched."), + cl::ZeroOrMore, cl::Hidden, cl::init(false), + cl::cat(BoltCategory)); +} // namespace opts + +/// Returns the DWO name to be used. Handles the case where the user specifies +/// an output DWO directory and there are duplicate names. Assumes the DWO ID +/// is unique. 
+static std::string +getDWOName(llvm::DWARFUnit &CU, + std::unordered_map<std::string, uint32_t> *NameToIndexMap, + std::unordered_map<uint64_t, std::string> &DWOIdToName) { + llvm::Optional<uint64_t> DWOId = CU.getDWOId(); + assert(DWOId && "DWO ID not found."); + (void)DWOId; + auto NameIter = DWOIdToName.find(*DWOId); + if (NameIter != DWOIdToName.end()) + return NameIter->second; + + std::string DWOName = dwarf::toString( + CU.getUnitDIE().find({dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + assert(!DWOName.empty() && + "DW_AT_dwo_name/DW_AT_GNU_dwo_name does not exist."); + if (NameToIndexMap && !opts::DwoOutputPath.empty()) { + auto Iter = NameToIndexMap->find(DWOName); + if (Iter == NameToIndexMap->end()) { + Iter = NameToIndexMap->insert({DWOName, 0}).first; + } + DWOName.append(std::to_string(Iter->second)); + ++Iter->second; + } + DWOName.append(".dwo"); + DWOIdToName[*DWOId] = DWOName; + return DWOName; +} + +static bool isHighPcFormEightBytes(dwarf::Form DwarfForm) { + return DwarfForm == dwarf::DW_FORM_addr || DwarfForm == dwarf::DW_FORM_data8; +} + +void DWARFRewriter::updateDebugInfo() { + ErrorOr<BinarySection &> DebugAbbrev = + BC.getUniqueSectionByName(".debug_abbrev"); + ErrorOr<BinarySection &> DebugInfo = BC.getUniqueSectionByName(".debug_info"); + + if (!DebugAbbrev || !DebugInfo) + return; + + DebugAbbrev->registerPatcher(std::make_unique<DebugAbbrevPatcher>()); + auto *AbbrevPatcher = + static_cast<DebugAbbrevPatcher *>(DebugAbbrev->getPatcher()); + + DebugInfo->registerPatcher(std::make_unique<SimpleBinaryPatcher>()); + auto *DebugInfoPatcher = + static_cast<SimpleBinaryPatcher *>(DebugInfo->getPatcher()); + + ARangesSectionWriter = std::make_unique<DebugARangesSectionWriter>(); + RangesSectionWriter = std::make_unique<DebugRangesSectionWriter>(); + StrWriter = std::make_unique<DebugStrWriter>(&BC); + + AddrWriter = std::make_unique<DebugAddrWriter>(&BC); + DebugLoclistWriter::setAddressWriter(AddrWriter.get()); + + uint64_t NumCUs = BC.DwCtx->getNumCompileUnits(); + if ((opts::NoThreads || opts::DeterministicDebugInfo) && + BC.getNumDWOCUs() == 0) { + // Use single entry for efficiency when running single-threaded + NumCUs = 1; + } + + LocListWritersByCU.reserve(NumCUs); + + for (size_t CUIndex = 0; CUIndex < NumCUs; ++CUIndex) { + LocListWritersByCU[CUIndex] = std::make_unique<DebugLocWriter>(&BC); + } + // Unordered maps to handle name collision if output DWO directory is + // specified. 
+ std::unordered_map<std::string, uint32_t> NameToIndexMap; + std::unordered_map<uint64_t, std::string> DWOIdToName; + + auto updateDWONameCompDir = [&](DWARFUnit &Unit) -> void { + const DWARFDie &DIE = Unit.getUnitDIE(); + uint64_t AttrOffset = 0; + Optional<DWARFFormValue> ValDwoName = + DIE.find(dwarf::DW_AT_GNU_dwo_name, &AttrOffset); + (void)ValDwoName; + assert(ValDwoName && "Skeleton CU doesn't have dwo_name."); + + std::string ObjectName = getDWOName(Unit, &NameToIndexMap, DWOIdToName); + uint32_t NewOffset = StrWriter->addString(ObjectName.c_str()); + DebugInfoPatcher->addLE32Patch(AttrOffset, NewOffset); + + Optional<DWARFFormValue> ValCompDir = + DIE.find(dwarf::DW_AT_comp_dir, &AttrOffset); + (void)ValCompDir; + assert(ValCompDir && "DW_AT_comp_dir is not in Skeleton CU."); + if (!opts::DwoOutputPath.empty()) { + uint32_t NewOffset = StrWriter->addString(opts::DwoOutputPath.c_str()); + DebugInfoPatcher->addLE32Patch(AttrOffset, NewOffset); + } + }; + + uint32_t AbbrevOffsetModifier = 0; + // Case 1) Range_base found: patch .debug_info + // Case 2) Range_base not found, but Ranges will be used: patch + // .debug_info/.debug_abbrev + auto updateRangeBase = [&](DWARFUnit &Unit, uint64_t RangeBase, + bool WasRangeBaseUsed) -> void { + uint64_t AttrOffset = 0; + DWARFDie DIE = Unit.getUnitDIE(); + Optional<DWARFFormValue> ValRangeBase = + DIE.find(dwarf::DW_AT_GNU_ranges_base, &AttrOffset); + bool NeedToPatch = ValRangeBase.hasValue(); + uint32_t PrevAbbrevOffsetModifier = AbbrevOffsetModifier; + // Case where Skeleton CU doesn't have DW_AT_GNU_ranges_base + if (!NeedToPatch && WasRangeBaseUsed) { + const DWARFAbbreviationDeclaration *AbbreviationDecl = + DIE.getAbbreviationDeclarationPtr(); + if (Optional<DWARFFormValue> ValLowPC = + DIE.find(dwarf::DW_AT_low_pc, &AttrOffset)) { + + Optional<DWARFFormValue> ValHighPC = DIE.find(dwarf::DW_AT_high_pc); + uint32_t NumBytesToFill = 7; + + AbbrevPatcher->addAttributePatch(AbbreviationDecl, dwarf::DW_AT_low_pc, + dwarf::DW_AT_GNU_ranges_base, + dwarf::DW_FORM_indirect); + // BOLT converts DW_AT_low_pc/DW_AT_high_pc to DW_AT_low_pc/DW_AT_ranges. + // DW_AT_high_pc can be 4 or 8 bytes. If it's 8 bytes, we need to use the + // first 4 bytes. + if (ValHighPC && isHighPcFormEightBytes(ValHighPC->getForm())) { + NumBytesToFill += 4; + } + LLVM_DEBUG(if (opts::DebugSkeletonCu) dbgs() + << "AttrOffset: " << Twine::utohexstr(AttrOffset) << "\n" + << "Die Offset: " << Twine::utohexstr(DIE.getOffset()) + << "\n" + << "AbbrDecl offset: " + << Twine::utohexstr(Unit.getAbbrOffset()) << "\n" + << "Unit Offset: " << Twine::utohexstr(Unit.getOffset()) + << "\n\n";); + DebugInfoPatcher->addUDataPatch(AttrOffset, dwarf::DW_FORM_udata, 1); + DebugInfoPatcher->addUDataPatch(AttrOffset + 1, RangeBase, + NumBytesToFill); + + // 1 Byte for DW_AT_GNU_ranges_base (since it's 2 bytes vs DW_AT_low_pc) + AbbrevOffsetModifier += 1; + } else { + errs() << "BOLT-WARNING: Skeleton CU at 0x" + << Twine::utohexstr(DIE.getOffset()) + << " doesn't have DW_AT_GNU_ranges_base, or " + "DW_AT_low_pc to convert\n"; + return; + } + } + if (NeedToPatch) + DebugInfoPatcher->addLE32Patch(AttrOffset, + static_cast<uint32_t>(RangeBase)); + + // DWARF4 + // unit_length - 4 bytes + // version - 2 bytes + // So + 6 to patch debug_abbrev_offset + if (PrevAbbrevOffsetModifier) + DebugInfoPatcher->addLE32Patch( + Unit.getOffset() + 6, static_cast<uint32_t>(Unit.getAbbrOffset()) + + PrevAbbrevOffsetModifier); + }; + + auto processUnitDIE = [&](size_t CUIndex, DWARFUnit *Unit) { + uint64_t RangeBase = RangesSectionWriter->getSectionOffset(); + updateUnitDebugInfo(CUIndex, *Unit, *DebugInfoPatcher, *AbbrevPatcher); + if (llvm::Optional<uint64_t> DWOId = Unit->getDWOId()) { + Optional<DWARFUnit *> CU = BC.getDWOCU(*DWOId); + if (CU) { + updateDWONameCompDir(*Unit); + // Assuming one unique DWO ID per binary, i.e. two or more CUs don't + // share the same DWO ID. + assert(LocListWritersByCU.count(*DWOId) == 0 && + "LocList writer for DWO unit already exists."); + LocListWritersByCU[*DWOId] = + std::make_unique<DebugLoclistWriter>(&BC, *DWOId); + SimpleBinaryPatcher *DwoDebugInfoPatcher = + getBinaryDWODebugInfoPatcher(*DWOId); + DwoDebugInfoPatcher->setRangeBase(RangeBase); + updateUnitDebugInfo(*DWOId, *(*CU), *DwoDebugInfoPatcher, + *getBinaryDWOAbbrevPatcher(*DWOId)); + static_cast<DebugLoclistWriter *>(LocListWritersByCU[*DWOId].get()) + ->finalizePatches(); + updateRangeBase(*Unit, RangeBase, + DwoDebugInfoPatcher->getWasRangBasedUsed()); + } + } + }; + + if (opts::NoThreads || opts::DeterministicDebugInfo) { + for (std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units()) { + processUnitDIE(0, CU.get()); + } + } else { + // Update unit debug info in parallel + ThreadPool &ThreadPool = ParallelUtilities::getThreadPool(); + size_t CUIndex = 0; + for (std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units()) { + ThreadPool.async(processUnitDIE, CUIndex, CU.get()); + CUIndex++; + } + + ThreadPool.wait(); + } + + flushPendingRanges(*DebugInfoPatcher); + + finalizeDebugSections(*DebugInfoPatcher); + + writeOutDWOFiles(DWOIdToName); + + updateGdbIndexSection(); +} + +void DWARFRewriter::updateUnitDebugInfo(uint64_t CUIndex, DWARFUnit &Unit, + SimpleBinaryPatcher &DebugInfoPatcher, + DebugAbbrevPatcher &AbbrevPatcher) { + // Cache debug ranges so that the offset for identical ranges could be reused. 
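+ // A minimal sketch of the caching idea (addRangesCached and
+ // appendToRangesSection are illustrative names, not the actual
+ // DebugRangesSectionWriter API):
+ //
+ //   uint64_t addRangesCached(const DebugAddressRangesVector &R) {
+ //     auto It = CachedRanges.find(R);
+ //     if (It != CachedRanges.end())
+ //       return It->second;               // identical list: reuse offset
+ //     return CachedRanges[R] = appendToRangesSection(R);
+ //   }
+ //
+ // For example, two inlined copies of the same callee that cover one
+ // address interval then share a single .debug_ranges entry.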
+ std::map CachedRanges; + + auto &DebugLocWriter = *LocListWritersByCU[CUIndex].get(); + + uint64_t DIEOffset = Unit.getOffset() + Unit.getHeaderSize(); + uint64_t NextCUOffset = Unit.getNextUnitOffset(); + DWARFDebugInfoEntry Die; + DWARFDataExtractor DebugInfoData = Unit.getDebugInfoExtractor(); + uint32_t Depth = 0; + + while ( + Die.extractFast(Unit, &DIEOffset, DebugInfoData, NextCUOffset, Depth)) { + if (const DWARFAbbreviationDeclaration *AbbrDecl = + Die.getAbbreviationDeclarationPtr()) { + if (AbbrDecl->hasChildren()) + ++Depth; + } else { + // NULL entry. + if (Depth > 0) + --Depth; + if (Depth == 0) + break; + } + + DWARFDie DIE(&Unit, &Die); + + switch (DIE.getTag()) { + case dwarf::DW_TAG_compile_unit: { + auto ModuleRangesOrError = DIE.getAddressRanges(); + if (!ModuleRangesOrError) { + consumeError(ModuleRangesOrError.takeError()); + break; + } + DWARFAddressRangesVector ModuleRanges = *ModuleRangesOrError; + DebugAddressRangesVector OutputRanges = + BC.translateModuleAddressRanges(ModuleRanges); + const uint64_t RangesSectionOffset = + RangesSectionWriter->addRanges(OutputRanges); + ARangesSectionWriter->addCURanges(Unit.getOffset(), + std::move(OutputRanges)); + updateDWARFObjectAddressRanges(DIE, RangesSectionOffset, DebugInfoPatcher, + AbbrevPatcher); + break; + } + case dwarf::DW_TAG_subprogram: { + // Get function address either from ranges or [LowPC, HighPC) pair. + bool UsesRanges = false; + uint64_t Address; + uint64_t SectionIndex, HighPC; + if (!DIE.getLowAndHighPC(Address, HighPC, SectionIndex)) { + Expected RangesOrError = + DIE.getAddressRanges(); + if (!RangesOrError) { + consumeError(RangesOrError.takeError()); + break; + } + DWARFAddressRangesVector Ranges = *RangesOrError; + // Not a function definition. + if (Ranges.empty()) + break; + + Address = Ranges.front().LowPC; + UsesRanges = true; + } + + // Clear cached ranges as the new function will have its own set. + CachedRanges.clear(); + + DebugAddressRangesVector FunctionRanges; + if (const BinaryFunction *Function = + BC.getBinaryFunctionAtAddress(Address)) + FunctionRanges = Function->getOutputAddressRanges(); + // Update ranges. + if (UsesRanges) { + updateDWARFObjectAddressRanges( + DIE, RangesSectionWriter->addRanges(FunctionRanges), + DebugInfoPatcher, AbbrevPatcher); + } else { + // Delay conversion of [LowPC, HighPC) into DW_AT_ranges if possible. + const DWARFAbbreviationDeclaration *Abbrev = + DIE.getAbbreviationDeclarationPtr(); + assert(Abbrev && "abbrev expected"); + + // Create a critical section. + static std::shared_timed_mutex CriticalSectionMutex; + std::unique_lock Lock(CriticalSectionMutex); + + if (FunctionRanges.size() > 1) { + convertPending(Abbrev, DebugInfoPatcher, AbbrevPatcher); + // Exit critical section early. + Lock.unlock(); + convertToRanges(DIE, FunctionRanges, DebugInfoPatcher); + } else if (ConvertedRangesAbbrevs.find(Abbrev) != + ConvertedRangesAbbrevs.end()) { + // Exit critical section early. 
+ Lock.unlock(); + convertToRanges(DIE, FunctionRanges, DebugInfoPatcher); + } else { + if (FunctionRanges.empty()) + FunctionRanges.emplace_back(DebugAddressRange()); + addToPendingRanges(Abbrev, DIE, FunctionRanges, Unit.getDWOId()); + } + } + break; + } + case dwarf::DW_TAG_lexical_block: + case dwarf::DW_TAG_inlined_subroutine: + case dwarf::DW_TAG_try_block: + case dwarf::DW_TAG_catch_block: { + uint64_t RangesSectionOffset = + RangesSectionWriter->getEmptyRangesOffset(); + Expected RangesOrError = DIE.getAddressRanges(); + const BinaryFunction *Function = RangesOrError && !RangesOrError->empty() + ? BC.getBinaryFunctionContainingAddress(RangesOrError->front().LowPC) + : nullptr; + if (Function) { + DebugAddressRangesVector OutputRanges = + Function->translateInputToOutputRanges(*RangesOrError); + LLVM_DEBUG(if (OutputRanges.empty() != RangesOrError->empty()) { + dbgs() << "BOLT-DEBUG: problem with DIE at 0x" + << Twine::utohexstr(DIE.getOffset()) << " in CU at 0x" + << Twine::utohexstr(Unit.getOffset()) << '\n'; + }); + RangesSectionOffset = RangesSectionWriter->addRanges( + std::move(OutputRanges), CachedRanges); + } else if (!RangesOrError) { + consumeError(RangesOrError.takeError()); + } + updateDWARFObjectAddressRanges(DIE, RangesSectionOffset, DebugInfoPatcher, + AbbrevPatcher); + break; + } + default: { + // Handle any tag that can have DW_AT_location attribute. + DWARFFormValue Value; + uint64_t AttrOffset; + if (Optional V = + DIE.find(dwarf::DW_AT_location, &AttrOffset)) { + Value = *V; + if (Value.isFormClass(DWARFFormValue::FC_Constant) || + Value.isFormClass(DWARFFormValue::FC_SectionOffset)) { + uint64_t Offset = Value.isFormClass(DWARFFormValue::FC_Constant) + ? Value.getAsUnsignedConstant().getValue() + : Value.getAsSectionOffset().getValue(); + DebugLocationsVector InputLL; + + Optional SectionAddress = + Unit.getBaseAddress(); + uint64_t BaseAddress = 0; + if (SectionAddress) + BaseAddress = SectionAddress->Address; + + Error E = Unit.getLocationTable().visitLocationList( + &Offset, [&](const DWARFLocationEntry &Entry) { + switch (Entry.Kind) { + default: + llvm_unreachable("Unsupported DWARFLocationEntry Kind."); + case dwarf::DW_LLE_end_of_list: + return false; + case dwarf::DW_LLE_base_address: + assert(Entry.SectionIndex == SectionedAddress::UndefSection && + "absolute address expected"); + BaseAddress = Entry.Value0; + break; + case dwarf::DW_LLE_offset_pair: + assert( + (Entry.SectionIndex == SectionedAddress::UndefSection && + !Unit.isDWOUnit()) && + "absolute address expected"); + InputLL.emplace_back(DebugLocationEntry{ + BaseAddress + Entry.Value0, BaseAddress + Entry.Value1, + Entry.Loc}); + break; + case dwarf::DW_LLE_startx_length: + assert(Unit.isDWOUnit() && + "None DWO Unit with DW_LLE_startx_length encoding."); + Optional EntryAddress = + Unit.getAddrOffsetSectionItem(Entry.Value0); + assert(EntryAddress && "Address does not exist."); + InputLL.emplace_back(DebugLocationEntry{ + EntryAddress->Address, + EntryAddress->Address + Entry.Value1, Entry.Loc}); + break; + } + return true; + }); + + uint64_t OutputLocListOffset = DebugLocWriter::EmptyListTag; + if (E || InputLL.empty()) { + errs() << "BOLT-WARNING: empty location list detected at 0x" + << Twine::utohexstr(Offset) << " for DIE at 0x" + << Twine::utohexstr(DIE.getOffset()) << " in CU at 0x" + << Twine::utohexstr(Unit.getOffset()) << '\n'; + } else { + const uint64_t Address = InputLL.front().LowPC; + if (const BinaryFunction *Function = + BC.getBinaryFunctionContainingAddress(Address)) { + 
const DebugLocationsVector OutputLL = Function + ->translateInputToOutputLocationList(InputLL); + LLVM_DEBUG(if (OutputLL.empty()) { + dbgs() << "BOLT-DEBUG: location list translated to an empty " + "one at 0x" + << Twine::utohexstr(DIE.getOffset()) << " in CU at 0x" + << Twine::utohexstr(Unit.getOffset()) << '\n'; + }); + OutputLocListOffset = DebugLocWriter.addList(OutputLL); + } + } + + if (OutputLocListOffset != DebugLocWriter::EmptyListTag) { + std::lock_guard<std::mutex> Lock(LocListDebugInfoPatchesMutex); + if (Unit.isDWOUnit()) { + // Not sure if better approach is to hide all of this away in a + // class. Also re-using LocListDebugInfoPatchType. Wasting some + // space for DWOID/CUIndex. + DwoLocListDebugInfoPatches[CUIndex].push_back( + {AttrOffset, CUIndex, OutputLocListOffset}); + } else { + LocListDebugInfoPatches.push_back( + {AttrOffset, CUIndex, OutputLocListOffset}); + } + } else { + std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex); + DebugInfoPatcher.addLE32Patch(AttrOffset, + DebugLocWriter::EmptyListOffset); + } + } else { + assert((Value.isFormClass(DWARFFormValue::FC_Exprloc) || + Value.isFormClass(DWARFFormValue::FC_Block)) && + "unexpected DW_AT_location form"); + if (Unit.isDWOUnit()) { + ArrayRef<uint8_t> Expr = *Value.getAsBlock(); + DataExtractor Data( + StringRef((const char *)Expr.data(), Expr.size()), + Unit.getContext().isLittleEndian(), 0); + DWARFExpression LocExpr(Data, Unit.getAddressByteSize(), + Unit.getFormParams().Format); + for (auto &Expr : LocExpr) { + if (Expr.getCode() != dwarf::DW_OP_GNU_addr_index) + continue; + uint64_t Index = Expr.getRawOperand(0); + Optional<object::SectionedAddress> EntryAddress = + Unit.getAddrOffsetSectionItem(Index); + assert(EntryAddress && "Address is not found."); + assert(Index <= std::numeric_limits<uint32_t>::max() && + "Invalid Operand Index."); + AddrWriter->addIndexAddress(EntryAddress->Address, + static_cast<uint32_t>(Index), + *Unit.getDWOId()); + } + } + } + } else if (Optional<DWARFFormValue> V = + DIE.find(dwarf::DW_AT_low_pc, &AttrOffset)) { + Value = *V; + const Optional<uint64_t> Result = Value.getAsAddress(); + if (Result.hasValue()) { + const uint64_t Address = Result.getValue(); + uint64_t NewAddress = 0; + if (const BinaryFunction *Function = + BC.getBinaryFunctionContainingAddress(Address)) { + NewAddress = Function->translateInputToOutputAddress(Address); + LLVM_DEBUG(dbgs() + << "BOLT-DEBUG: Fixing low_pc 0x" + << Twine::utohexstr(Address) << " for DIE with tag " + << DIE.getTag() << " to 0x" + << Twine::utohexstr(NewAddress) << '\n'); + } + + dwarf::Form Form = Value.getForm(); + assert(Form != dwarf::DW_FORM_LLVM_addrx_offset && + "DW_FORM_LLVM_addrx_offset is not supported"); + std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex); + if (Form == dwarf::DW_FORM_GNU_addr_index) { + assert(Unit.isDWOUnit() && + "DW_FORM_GNU_addr_index in non-DWO unit."); + uint64_t Index = Value.getRawUValue(); + // If there is no new address, store the old address. + // Re-using Index to make the implementation easier. + // DW_FORM_GNU_addr_index is a variable length encoding, so we either + // have to create indices of the same sizes, or use the same index. + AddrWriter->addIndexAddress(NewAddress ? 
NewAddress : Address, + Index, *Unit.getDWOId()); + } else { + DebugInfoPatcher.addLE64Patch(AttrOffset, NewAddress); + } + } else if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: unexpected form value for attribute at 0x" + << Twine::utohexstr(AttrOffset); + } + } + } + } + } + + if (DIEOffset > NextCUOffset) { + errs() << "BOLT-WARNING: corrupt DWARF detected at 0x" + << Twine::utohexstr(Unit.getOffset()) << '\n'; + } +} + +void DWARFRewriter::updateDWARFObjectAddressRanges( + const DWARFDie DIE, uint64_t DebugRangesOffset, + SimpleBinaryPatcher &DebugInfoPatcher, DebugAbbrevPatcher &AbbrevPatcher) { + + // Some objects don't have an associated DIE and cannot be updated (such as + // compiler-generated functions). + if (!DIE) { + return; + } + + const DWARFAbbreviationDeclaration *AbbreviationDecl = + DIE.getAbbreviationDeclarationPtr(); + if (!AbbreviationDecl) { + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: object's DIE doesn't have an abbreviation: " + << "skipping update. DIE at offset 0x" + << Twine::utohexstr(DIE.getOffset()) << '\n'; + } + return; + } + + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges)) { + // Case 1: The object was already non-contiguous and had DW_AT_ranges. + // In this case we simply need to update the value of DW_AT_ranges. + uint64_t AttrOffset = -1U; + DIE.find(dwarf::DW_AT_ranges, &AttrOffset); + assert(AttrOffset != -1U && "failed to locate DWARF attribute"); + + std::lock_guard Lock(DebugInfoPatcherMutex); + DebugInfoPatcher.addLE32Patch( + AttrOffset, DebugRangesOffset - DebugInfoPatcher.getRangeBase()); + } else { + // Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc emitted back + // to back. We replace the attributes with DW_AT_ranges and DW_AT_low_pc. + // The low_pc attribute is required for DW_TAG_compile_units to set a base + // address. + // + // Since DW_AT_ranges takes 4-byte DW_FROM_sec_offset value, we have to fill + // in up to 12-bytes left after removal of low/high pc field from + // .debug_info. + // + // To fill in the gap we use a variable length DW_FORM_udata encoding for + // DW_AT_low_pc. We exploit the fact that the encoding can take an arbitrary + // large size. + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) && + AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc)) { + convertToRanges(AbbreviationDecl, AbbrevPatcher); + convertToRanges(DIE, DebugRangesOffset, DebugInfoPatcher); + } else { + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" + << Twine::utohexstr(DIE.getOffset()) << '\n'; + } + } + } +} + +void DWARFRewriter::updateLineTableOffsets() { + const MCSection *LineSection = + BC.Ctx->getObjectFileInfo()->getDwarfLineSection(); + auto CurrentFragment = LineSection->begin(); + uint64_t CurrentOffset = 0; + uint64_t Offset = 0; + + ErrorOr DbgInfoSection = + BC.getUniqueSectionByName(".debug_info"); + ErrorOr TypeInfoSection = + BC.getUniqueSectionByName(".debug_types"); + assert(((BC.DwCtx->getNumTypeUnits() > 0 && TypeInfoSection) || + BC.DwCtx->getNumTypeUnits() == 0) && + "Was not able to retrieve Debug Types section."); + + // There is no direct connection between CU and TU, but same offsets, + // encoded in DW_AT_stmt_list, into .debug_line get modified. + // We take advantage of that to map original CU line table offsets to new + // ones. 
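+  // A hypothetical illustration of the mapping built below: if a CU's
+  // DW_AT_stmt_list held 0x1200 in the input and its line table starts at
+  // 0x1480 in the output, we record DebugLineOffsetMap[0x1200] = 0x1480; a
+  // type unit whose DW_AT_stmt_list is also 0x1200 is later patched to
+  // 0x1480 without walking the fragments again.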
+ std::unordered_map<uint64_t, uint64_t> DebugLineOffsetMap; + + auto GetStatementListValue = [](DWARFUnit *Unit) { + Optional<DWARFFormValue> StmtList = + Unit->getUnitDIE().find(dwarf::DW_AT_stmt_list); + Optional<uint64_t> Offset = dwarf::toSectionOffset(StmtList); + assert(Offset && "Was not able to retrieve value of DW_AT_stmt_list."); + return *Offset; + }; + + for (const std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units()) { + const unsigned CUID = CU->getOffset(); + MCSymbol *Label = BC.Ctx->getMCDwarfLineTable(CUID).getLabel(); + if (!Label) + continue; + + const uint64_t LTOffset = + BC.DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_stmt_list); + if (!LTOffset) + continue; + + // Line tables are stored in MCContext in ascending order of offset in the + // output file, thus we can compute all tables' offsets by passing through + // each fragment at most once, continuing from the last CU's beginning + // instead of from the first fragment. + MCFragment *Fragment = Label->getFragment(); + while (&*CurrentFragment != Fragment) { + switch (CurrentFragment->getKind()) { + case MCFragment::FT_Dwarf: + Offset += cast<MCDwarfLineAddrFragment>(*CurrentFragment) + .getContents().size() - CurrentOffset; + break; + case MCFragment::FT_Data: + Offset += cast<MCDataFragment>(*CurrentFragment) + .getContents().size() - CurrentOffset; + break; + default: + llvm_unreachable(".debug_line section shouldn't contain other types " + "of fragments."); + } + ++CurrentFragment; + CurrentOffset = 0; + } + + Offset += Label->getOffset() - CurrentOffset; + CurrentOffset = Label->getOffset(); + + DebugLineOffsetMap[GetStatementListValue(CU.get())] = Offset; + assert(DbgInfoSection && ".debug_info section must exist"); + DbgInfoSection->addRelocation(LTOffset, + nullptr, + ELF::R_X86_64_32, + Offset, + 0, + /*Pending=*/true); + + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUID + << " has line table at " << Offset << "\n"); + } + + for (const std::unique_ptr<DWARFUnit> &TU : BC.DwCtx->types_section_units()) { + DWARFUnit *Unit = TU.get(); + const uint64_t LTOffset = + BC.DwCtx->getAttrFieldOffsetForUnit(Unit, dwarf::DW_AT_stmt_list); + if (!LTOffset) + continue; + auto Iter = DebugLineOffsetMap.find(GetStatementListValue(Unit)); + assert(Iter != DebugLineOffsetMap.end() && + "Type Unit Updated Line Number Entry does not exist."); + TypeInfoSection->addRelocation(LTOffset, nullptr, ELF::R_X86_64_32, + Iter->second, 0, /*Pending=*/true); + } + + // Set .debug_info as finalized so it won't be skipped over when + // we process sections while writing out the new binary. This ensures + // that the pending relocations will be processed and not ignored. + if (DbgInfoSection) + DbgInfoSection->setIsFinalized(); + + if (TypeInfoSection) + TypeInfoSection->setIsFinalized(); +} + +void DWARFRewriter::finalizeDebugSections( + SimpleBinaryPatcher &DebugInfoPatcher) { + // Skip .debug_aranges if we are re-generating .gdb_index. 
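+  // The decision below, spelled out:
+  //
+  //   KeepARanges | input has .gdb_index | emit .debug_aranges?
+  //   ------------+----------------------+---------------------
+  //   true        | either               | yes
+  //   false       | no                   | yes
+  //   false       | yes                  | no (the index carries the ranges)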
+ if (opts::KeepARanges || !BC.getGdbIndexSection()) { + SmallVector ARangesBuffer; + raw_svector_ostream OS(ARangesBuffer); + + auto MAB = std::unique_ptr<MCAsmBackend>(BC.TheTarget->createMCAsmBackend( + *BC.STI, *BC.MRI, MCTargetOptions())); + + ARangesSectionWriter->writeARangesSection(OS); + const StringRef &ARangesContents = OS.str(); + + BC.registerOrUpdateNoteSection(".debug_aranges", + copyByteArray(ARangesContents), + ARangesContents.size()); + } + + if (StrWriter->isInitialized()) { + RewriteInstance::addToDebugSectionsToOverwrite(".debug_str"); + std::unique_ptr DebugStrSectionContents = + StrWriter->finalize(); + BC.registerOrUpdateNoteSection(".debug_str", + copyByteArray(*DebugStrSectionContents), + DebugStrSectionContents->size()); + } + + if (AddrWriter->isInitialized()) { + AddressSectionBuffer AddressSectionContents = AddrWriter->finalize(); + BC.registerOrUpdateNoteSection(".debug_addr", + copyByteArray(AddressSectionContents), + AddressSectionContents.size()); + for (auto &CU : BC.DwCtx->compile_units()) { + DWARFDie DIE = CU->getUnitDIE(); + uint64_t AttrOffset = 0; + if (Optional<DWARFFormValue> Val = + DIE.find(dwarf::DW_AT_GNU_addr_base, &AttrOffset)) { + uint64_t Offset = AddrWriter->getOffset(*CU->getDWOId()); + DebugInfoPatcher.addLE32Patch(AttrOffset, static_cast<uint32_t>(Offset)); + } + } + } + + std::unique_ptr RangesSectionContents = + RangesSectionWriter->finalize(); + BC.registerOrUpdateNoteSection(".debug_ranges", + copyByteArray(*RangesSectionContents), + RangesSectionContents->size()); + + std::unique_ptr LocationListSectionContents = + makeFinalLocListsSection(DebugInfoPatcher); + BC.registerOrUpdateNoteSection(".debug_loc", + copyByteArray(*LocationListSectionContents), + LocationListSectionContents->size()); +} + +void DWARFRewriter::writeOutDWOFiles( + std::unordered_map<uint64_t, std::string> &DWOIdToName) { + std::string DebugData = ""; + auto applyPatch = [&](BinaryPatcher *Patcher, StringRef Data, + uint32_t Offset) -> StringRef { + DebugData = Data.str(); + Patcher->patchBinary(DebugData, Offset); + return StringRef(DebugData.c_str(), DebugData.size()); + }; + + using DWOSectionContribution = + const DWARFUnitIndex::Entry::SectionContribution; + auto getSliceData = [&](const DWARFUnitIndex::Entry *DWOEntry, + StringRef OutData, DWARFSectionKind Sec, + uint32_t &DWPOffset) -> StringRef { + if (DWOEntry) { + DWOSectionContribution *DWOContribution = DWOEntry->getContribution(Sec); + DWPOffset = DWOContribution->Offset; + OutData = OutData.substr(DWPOffset, DWOContribution->Length); + } + return OutData; + }; + + // Setup DWP code once. + DWARFContext *DWOCtx = BC.getDWOContext(); + const DWARFUnitIndex *CUIndex = nullptr; + bool IsDWP = false; + if (DWOCtx) { + CUIndex = &DWOCtx->getCUIndex(); + IsDWP = !CUIndex->getRows().empty(); + } + + for (const std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units()) { + Optional<uint64_t> DWOId = CU->getDWOId(); + if (!DWOId) + continue; + + Optional<DWARFUnit *> DWOCU = BC.getDWOCU(*DWOId); + if (!DWOCU) + continue; + + const object::ObjectFile *File = + (*DWOCU)->getContext().getDWARFObj().getFile(); + std::string CompDir = opts::DwoOutputPath.empty() + ? CU->getCompilationDir() + : opts::DwoOutputPath.c_str(); + std::string ObjectName = getDWOName(*CU.get(), nullptr, DWOIdToName); + auto FullPath = CompDir.append("/").append(ObjectName); + + std::error_code EC; + std::unique_ptr<ToolOutputFile> TempOut = + std::make_unique<ToolOutputFile>(FullPath, EC, sys::fs::OF_None); + + std::unique_ptr<BinaryContext> TmpBC = BinaryContext::createBinaryContext( + File, false, + DWARFContext::create(*File, nullptr, "", WithColor::defaultErrorHandler, + WithColor::defaultWarningHandler, + /*UsesRelocs=*/false)); + std::unique_ptr<MCStreamer> Streamer = TmpBC->createStreamer(TempOut->os()); + const MCObjectFileInfo &MCOFI = *Streamer->getContext().getObjectFileInfo(); + + const DWARFUnitIndex::Entry *DWOEntry = nullptr; + if (IsDWP) + DWOEntry = CUIndex->getFromHash(*DWOId); + + const StringMap<MCSection *> KnownSections = { + {".debug_info.dwo", MCOFI.getDwarfInfoDWOSection()}, + {".debug_types.dwo", MCOFI.getDwarfTypesDWOSection()}, + {".debug_str.dwo", MCOFI.getDwarfStrDWOSection()}, + {".debug_str_offsets.dwo", MCOFI.getDwarfStrOffDWOSection()}, + {".debug_abbrev.dwo", MCOFI.getDwarfAbbrevDWOSection()}, + {".debug_loc.dwo", MCOFI.getDwarfLocDWOSection()}, + {".debug_line.dwo", MCOFI.getDwarfLineDWOSection()}}; + + for (const SectionRef &Section : File->sections()) { + Expected<StringRef> SectionName = Section.getName(); + assert(SectionName && "Invalid section name."); + auto SectionIter = KnownSections.find(*SectionName); + if (SectionIter == KnownSections.end()) + continue; + Streamer->SwitchSection(SectionIter->second); + Expected<StringRef> Contents = Section.getContents(); + assert(Contents && "Invalid contents."); + StringRef OutData(*Contents); + std::unique_ptr Data; + uint32_t DWPOffset = 0; + + if (SectionName->equals(".debug_info.dwo")) { + OutData = getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_INFO, DWPOffset); + SimpleBinaryPatcher *Patcher = getBinaryDWODebugInfoPatcher(*DWOId); + OutData = applyPatch(Patcher, OutData, DWPOffset); + } else if (SectionName->equals(".debug_types.dwo")) { + OutData = getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_EXT_TYPES, DWPOffset); + } else if (SectionName->equals(".debug_str.dwo")) { + OutData = (*DWOCU)->getStringSection(); + } else if (SectionName->equals(".debug_str_offsets.dwo")) { + OutData = + getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_STR_OFFSETS, DWPOffset); + } else if (SectionName->equals(".debug_abbrev.dwo")) { + OutData = getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_ABBREV, DWPOffset); + DebugAbbrevPatcher *Patcher = getBinaryDWOAbbrevPatcher(*DWOId); + OutData = applyPatch(Patcher, OutData, DWPOffset); + } else if (SectionName->equals(".debug_loc.dwo")) { + DebugLocWriter *LocWriter = LocListWritersByCU[*DWOId].get(); + Data = LocWriter->finalize(); + // Create the StringRef explicitly here; otherwise the implicit + // conversion would treat a null byte as the end of the string. + OutData = StringRef(reinterpret_cast<const char *>(Data->data()), + Data->size()); + } else if (SectionName->equals(".debug_line.dwo")) { + OutData = getSliceData(DWOEntry, OutData, + DWARFSectionKind::DW_SECT_LINE, DWPOffset); + } else { + errs() << "BOLT-WARNING: Unsupported Debug section: " << *SectionName + << "\n"; + } + + Streamer->emitBytes(OutData); + } + Streamer->Finish(); + TempOut->keep(); + } +} + +void DWARFRewriter::updateGdbIndexSection() { + if (!BC.getGdbIndexSection()) + return; + + // See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html for + // .gdb_index section format. 
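+  // Header layout assumed by the parsing below (all fields little-endian;
+  // the struct is illustrative only, the code reads raw offsets instead):
+  //
+  //   struct GdbIndexHeader {          // 24 bytes total
+  //     uint32_t Version;              // only versions 7 and 8 are handled
+  //     uint32_t CUListOffset;         // CU list entries: 16 bytes each
+  //     uint32_t CUTypesOffset;
+  //     uint32_t AddressTableOffset;   // entries: u64 LowPC, u64 HighPC,
+  //     uint32_t SymbolTableOffset;    //          u32 CU index (20 bytes)
+  //     uint32_t ConstantPoolOffset;
+  //   };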
+
+  StringRef GdbIndexContents = BC.getGdbIndexSection()->getContents();
+
+  const char *Data = GdbIndexContents.data();
+
+  // Parse the header.
+  const uint32_t Version = read32le(Data);
+  if (Version != 7 && Version != 8) {
+    errs() << "BOLT-ERROR: can only process .gdb_index versions 7 and 8\n";
+    exit(1);
+  }
+
+  // Some .gdb_index generators use file offsets while others use section
+  // offsets. Hence we can only rely on offsets relative to each other,
+  // and ignore their absolute values.
+  const uint32_t CUListOffset = read32le(Data + 4);
+  const uint32_t CUTypesOffset = read32le(Data + 8);
+  const uint32_t AddressTableOffset = read32le(Data + 12);
+  const uint32_t SymbolTableOffset = read32le(Data + 16);
+  const uint32_t ConstantPoolOffset = read32le(Data + 20);
+  Data += 24;
+
+  // Map CU offsets to indices and verify the existing index table.
+  std::map<uint64_t, uint32_t> OffsetToIndexMap;
+  const uint32_t CUListSize = CUTypesOffset - CUListOffset;
+  const unsigned NumCUs = BC.DwCtx->getNumCompileUnits();
+  if (CUListSize != NumCUs * 16) {
+    errs() << "BOLT-ERROR: .gdb_index: CU count mismatch\n";
+    exit(1);
+  }
+  for (unsigned Index = 0; Index < NumCUs; ++Index, Data += 16) {
+    const DWARFUnit *CU = BC.DwCtx->getUnitAtIndex(Index);
+    const uint64_t Offset = read64le(Data);
+    if (CU->getOffset() != Offset) {
+      errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n";
+      exit(1);
+    }
+
+    OffsetToIndexMap[Offset] = Index;
+  }
+
+  // Ignore the old address table.
+  const uint32_t OldAddressTableSize = SymbolTableOffset - AddressTableOffset;
+  // Move Data to the beginning of the symbol table.
+  Data += SymbolTableOffset - CUTypesOffset;
+
+  // Calculate the size of the new address table.
+  uint32_t NewAddressTableSize = 0;
+  for (const auto &CURangesPair : ARangesSectionWriter->getCUAddressRanges()) {
+    const SmallVector<DebugAddressRange, 2> &Ranges = CURangesPair.second;
+    NewAddressTableSize += Ranges.size() * 20;
+  }
+
+  // Difference between old and new table (and section) sizes.
+  // Could be negative.
+  int32_t Delta = NewAddressTableSize - OldAddressTableSize;
+
+  size_t NewGdbIndexSize = GdbIndexContents.size() + Delta;
+
+  // Freed by ExecutableFileMemoryManager.
+  auto *NewGdbIndexContents = new uint8_t[NewGdbIndexSize];
+  uint8_t *Buffer = NewGdbIndexContents;
+
+  write32le(Buffer, Version);
+  write32le(Buffer + 4, CUListOffset);
+  write32le(Buffer + 8, CUTypesOffset);
+  write32le(Buffer + 12, AddressTableOffset);
+  write32le(Buffer + 16, SymbolTableOffset + Delta);
+  write32le(Buffer + 20, ConstantPoolOffset + Delta);
+  Buffer += 24;
+
+  // Copy over the CU list and the types CU list.
+  memcpy(Buffer, GdbIndexContents.data() + 24,
+         AddressTableOffset - CUListOffset);
+  Buffer += AddressTableOffset - CUListOffset;
+
+  // Generate the new address table.
+  for (const std::pair<const uint64_t, DebugAddressRangesVector> &CURangesPair :
+       ARangesSectionWriter->getCUAddressRanges()) {
+    const uint32_t CUIndex = OffsetToIndexMap[CURangesPair.first];
+    const DebugAddressRangesVector &Ranges = CURangesPair.second;
+    for (const DebugAddressRange &Range : Ranges) {
+      write64le(Buffer, Range.LowPC);
+      write64le(Buffer + 8, Range.HighPC);
+      write32le(Buffer + 16, CUIndex);
+      Buffer += 20;
+    }
+  }
+
+  const size_t TrailingSize =
+      GdbIndexContents.data() + GdbIndexContents.size() - Data;
+  assert(Buffer + TrailingSize == NewGdbIndexContents + NewGdbIndexSize &&
+         "size calculation error");
+
+  // Copy over the rest of the original data.
+  memcpy(Buffer, Data, TrailingSize);
+
+  // Register the new section.
+  BC.registerOrUpdateNoteSection(".gdb_index",
+                                 NewGdbIndexContents,
+                                 NewGdbIndexSize);
+}
+
+void DWARFRewriter::convertToRanges(const DWARFAbbreviationDeclaration *Abbrev,
+                                    DebugAbbrevPatcher &AbbrevPatcher) {
+  dwarf::Form HighPCForm = Abbrev->findAttribute(dwarf::DW_AT_high_pc)->Form;
+  dwarf::Form LowPCForm = Abbrev->findAttribute(dwarf::DW_AT_low_pc)->Form;
+
+  std::lock_guard<std::mutex> Lock(AbbrevPatcherMutex);
+  // DW_FORM_GNU_addr_index is already a variable-length encoding, so there is
+  // nothing to do in that case. If HighPCForm is 8 bytes, low_pc needs to be
+  // changed to a variable-length encoding to consume the extra bytes from
+  // high_pc, since DW_FORM_sec_offset is only 4 bytes for DWARF32.
+  if (LowPCForm != dwarf::DW_FORM_GNU_addr_index &&
+      isHighPcFormEightBytes(HighPCForm))
+    AbbrevPatcher.addAttributePatch(Abbrev, dwarf::DW_AT_low_pc,
+                                    dwarf::DW_AT_low_pc,
+                                    dwarf::DW_FORM_indirect);
+
+  AbbrevPatcher.addAttributePatch(Abbrev, dwarf::DW_AT_high_pc,
+                                  dwarf::DW_AT_ranges,
+                                  dwarf::DW_FORM_sec_offset);
+}
+
+void DWARFRewriter::convertToRanges(DWARFDie DIE,
+                                    const DebugAddressRangesVector &Ranges,
+                                    SimpleBinaryPatcher &DebugInfoPatcher) {
+  uint64_t RangesSectionOffset;
+  if (Ranges.empty()) {
+    RangesSectionOffset = RangesSectionWriter->getEmptyRangesOffset();
+  } else {
+    RangesSectionOffset = RangesSectionWriter->addRanges(Ranges);
+  }
+
+  convertToRanges(DIE, RangesSectionOffset, DebugInfoPatcher);
+}
+
+void DWARFRewriter::convertPending(const DWARFAbbreviationDeclaration *Abbrev,
+                                   SimpleBinaryPatcher &DebugInfoPatcher,
+                                   DebugAbbrevPatcher &AbbrevPatcher) {
+  if (ConvertedRangesAbbrevs.count(Abbrev))
+    return;
+
+  convertToRanges(Abbrev, AbbrevPatcher);
+
+  auto I = PendingRanges.find(Abbrev);
+  if (I != PendingRanges.end()) {
+    for (std::pair<DWARFDieWrapper, DebugAddressRange> &Pair : I->second) {
+      convertToRanges(Pair.first, {Pair.second}, DebugInfoPatcher);
+    }
+    PendingRanges.erase(I);
+  }
+
+  ConvertedRangesAbbrevs.emplace(Abbrev);
+}
+
+void DWARFRewriter::addToPendingRanges(
+    const DWARFAbbreviationDeclaration *Abbrev, DWARFDie DIE,
+    DebugAddressRangesVector &FunctionRanges, Optional<uint64_t> DWOId) {
+  Optional<DWARFFormValue> LowPcValue = DIE.find(dwarf::DW_AT_low_pc);
+  Optional<DWARFFormValue> HighPcValue = DIE.find(dwarf::DW_AT_high_pc);
+  if (LowPcValue &&
+      LowPcValue->getForm() == dwarf::Form::DW_FORM_GNU_addr_index) {
+    assert(DWOId && "Invalid DWO ID.");
+    (void)DWOId;
+    assert(HighPcValue && "Low PC exists, but not High PC.");
+    (void)HighPcValue;
+    uint64_t IndexL = LowPcValue->getRawUValue();
+    uint64_t IndexH = HighPcValue->getRawUValue();
+    for (auto Address : FunctionRanges) {
+      AddrWriter->addIndexAddress(Address.LowPC, IndexL, *DWOId);
+      // DWARF spec, Section 2.17.2:
+      // If the value of the DW_AT_high_pc is of class address, it is the
+      // relocated address of the first location past the last instruction
+      // associated with the entity; if it is of class constant, the value is
+      // an unsigned integer offset which, when added to the low PC, gives the
+      // address of the first location past the last instruction associated
+      // with the entity.
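+      // Hence, when high_pc is of the constant class, only low_pc is a real
+      // address, and only IndexL needs a .debug_addr entry.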
+      if (!HighPcValue->isFormClass(DWARFFormValue::FC_Constant))
+        AddrWriter->addIndexAddress(Address.HighPC, IndexH, *DWOId);
+    }
+  }
+  PendingRanges[Abbrev].emplace_back(
+      std::make_pair(DWARFDieWrapper(DIE), FunctionRanges.front()));
+}
+
+std::unique_ptr<DebugBufferVector>
+DWARFRewriter::makeFinalLocListsSection(SimpleBinaryPatcher &DebugInfoPatcher) {
+  auto LocBuffer = std::make_unique<DebugBufferVector>();
+  auto LocStream = std::make_unique<raw_svector_ostream>(*LocBuffer);
+  auto Writer =
+      std::unique_ptr<MCObjectWriter>(BC.createObjectWriter(*LocStream));
+
+  uint64_t SectionOffset = 0;
+
+  // Add an empty list as the first entry.
+  const char Zeroes[16] = {0};
+  *LocStream << StringRef(Zeroes, 16);
+  SectionOffset += 2 * 8;
+
+  std::unordered_map<uint64_t, uint64_t> SectionOffsetByCU(
+      LocListWritersByCU.size());
+
+  for (std::pair<const uint64_t, std::unique_ptr<DebugLocWriter>> &Loc :
+       LocListWritersByCU) {
+    uint64_t CUIndex = Loc.first;
+    DebugLocWriter *LocWriter = Loc.second.get();
+    if (llvm::isa<DebugLoclistWriter>(*LocWriter))
+      continue;
+    SectionOffsetByCU[CUIndex] = SectionOffset;
+    std::unique_ptr<DebugBufferVector> CurrCULocationLists =
+        LocWriter->finalize();
+    *LocStream << *CurrCULocationLists;
+    SectionOffset += CurrCULocationLists->size();
+  }
+
+  for (std::pair<const uint64_t, VectorLocListDebugInfoPatchType> &Iter :
+       DwoLocListDebugInfoPatches) {
+    uint64_t DWOId = Iter.first;
+    SimpleBinaryPatcher *Patcher = getBinaryDWODebugInfoPatcher(DWOId);
+    for (LocListDebugInfoPatchType &Patch : Iter.second) {
+      Patcher->addLE32Patch(Patch.DebugInfoOffset,
+                            SectionOffsetByCU[Patch.CUIndex] +
+                                Patch.CUWriterOffset);
+    }
+  }
+
+  for (LocListDebugInfoPatchType &Patch : LocListDebugInfoPatches) {
+    DebugInfoPatcher.addLE32Patch(Patch.DebugInfoOffset,
+                                  SectionOffsetByCU[Patch.CUIndex] +
+                                      Patch.CUWriterOffset);
+  }
+
+  return LocBuffer;
+}
+
+void DWARFRewriter::flushPendingRanges(SimpleBinaryPatcher &DebugInfoPatcher) {
+  for (std::pair<const DWARFAbbreviationDeclaration *const,
+                 std::vector<std::pair<DWARFDieWrapper, DebugAddressRange>>>
+           &I : PendingRanges) {
+    for (std::pair<DWARFDieWrapper, DebugAddressRange> &RangePair : I.second) {
+      patchLowHigh(RangePair.first, RangePair.second, DebugInfoPatcher);
+    }
+  }
+  clearList(PendingRanges);
+}
+
+namespace {
+
+void getRangeAttrData(
+    DWARFDie DIE,
+    uint64_t &LowPCOffset, uint64_t &HighPCOffset,
+    DWARFFormValue &LowPCFormValue, DWARFFormValue &HighPCFormValue) {
+  LowPCOffset = -1U;
+  HighPCOffset = -1U;
+  LowPCFormValue = *DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset);
+  HighPCFormValue = *DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset);
+
+  if ((LowPCFormValue.getForm() != dwarf::DW_FORM_addr &&
+       LowPCFormValue.getForm() != dwarf::DW_FORM_GNU_addr_index) ||
+      (HighPCFormValue.getForm() != dwarf::DW_FORM_addr &&
+       HighPCFormValue.getForm() != dwarf::DW_FORM_data8 &&
+       HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) {
+    errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE "
+           << "at offset 0x" << Twine::utohexstr(DIE.getOffset()) << "\n";
+    return;
+  }
+  if ((LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) &&
+      LowPCFormValue.getForm() != dwarf::DW_FORM_GNU_addr_index) {
+    errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. "
" + << "Cannot update DIE at offset 0x" + << Twine::utohexstr(DIE.getOffset()) << '\n'; + return; + } +} + +} + +void DWARFRewriter::patchLowHigh(DWARFDie DIE, DebugAddressRange Range, + SimpleBinaryPatcher &DebugInfoPatcher) { + uint64_t LowPCOffset, HighPCOffset; + DWARFFormValue LowPCFormValue, HighPCFormValue; + getRangeAttrData( + DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue); + auto *TempDebugPatcher = &DebugInfoPatcher; + if (LowPCFormValue.getForm() == dwarf::DW_FORM_GNU_addr_index) { + DWARFUnit *Unit = DIE.getDwarfUnit(); + assert(Unit->isDWOUnit() && "DW_FORM_GNU_addr_index not part of DWO."); + uint32_t AddressIndex = + AddrWriter->getIndexFromAddress(Range.LowPC, *Unit->getDWOId()); + TempDebugPatcher = getBinaryDWODebugInfoPatcher(*Unit->getDWOId()); + TempDebugPatcher->addUDataPatch(LowPCOffset, AddressIndex, + std::abs(int(HighPCOffset - LowPCOffset))); + // TODO: In DWARF5 support ULEB128 for high_pc + } else { + TempDebugPatcher->addLE64Patch(LowPCOffset, Range.LowPC); + } + + if (isHighPcFormEightBytes(HighPCFormValue.getForm())) { + TempDebugPatcher->addLE64Patch(HighPCOffset, Range.HighPC - Range.LowPC); + } else { + TempDebugPatcher->addLE32Patch(HighPCOffset, Range.HighPC - Range.LowPC); + } +} + +void DWARFRewriter::convertToRanges(DWARFDie DIE, uint64_t RangesSectionOffset, + SimpleBinaryPatcher &DebugInfoPatcher) { + uint64_t LowPCOffset, HighPCOffset; + DWARFFormValue LowPCFormValue, HighPCFormValue; + getRangeAttrData(DIE, LowPCOffset, HighPCOffset, LowPCFormValue, + HighPCFormValue); + + unsigned LowPCSize = 0; + assert(DIE.getDwarfUnit()->getAddressByteSize() == 8); + if (isHighPcFormEightBytes(HighPCFormValue.getForm())) { + LowPCSize = 12; + } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) { + LowPCSize = 8; + } else { + llvm_unreachable("unexpected form"); + } + + std::lock_guard Lock(DebugInfoPatcherMutex); + uint32_t BaseOffset = 0; + if (LowPCFormValue.getForm() == dwarf::DW_FORM_GNU_addr_index) { + // Add Indexer is already variable length encoding. + DebugInfoPatcher.addUDataPatch(LowPCOffset, 0, + std::abs(int(HighPCOffset - LowPCOffset)) + + LowPCSize - 8); + // Ranges are relative to DW_AT_GNU_ranges_base. + BaseOffset = DebugInfoPatcher.getRangeBase(); + } else if (LowPCSize == 12) { + // Creatively encoding dwarf::DW_FORM_addr in to 4 bytes. + // Write an indirect 0 value for DW_AT_low_pc so that we can fill + // 12 bytes of space. + // The Abbrev wa already changed. + DebugInfoPatcher.addUDataPatch(LowPCOffset, dwarf::DW_FORM_addr, 4); + DebugInfoPatcher.addLE64Patch(LowPCOffset + 4, 0); + } else { + DebugInfoPatcher.addLE64Patch(LowPCOffset, 0); + } + DebugInfoPatcher.addLE32Patch(HighPCOffset + LowPCSize - 8, + RangesSectionOffset - BaseOffset); +} diff --git a/bolt/src/DWARFRewriter.h b/bolt/src/DWARFRewriter.h new file mode 100644 index 000000000000..d7f7904f443d --- /dev/null +++ b/bolt/src/DWARFRewriter.h @@ -0,0 +1,229 @@ +//===--- DWARFRewriter.h --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
+#define LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H
+
+#include "DebugData.h"
+#include "RewriteInstance.h"
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+namespace llvm {
+
+namespace bolt {
+
+class BinaryContext;
+
+class DWARFRewriter {
+  DWARFRewriter() = delete;
+
+  BinaryContext &BC;
+
+  std::mutex DebugInfoPatcherMutex;
+
+  std::mutex AbbrevPatcherMutex;
+
+  /// Stores and serializes information that will be put into the
+  /// .debug_ranges DWARF section.
+  std::unique_ptr<DebugRangesSectionWriter> RangesSectionWriter;
+
+  /// Stores and serializes information that will be put into the
+  /// .debug_aranges DWARF section.
+  std::unique_ptr<DebugARangesSectionWriter> ARangesSectionWriter;
+
+  /// Stores and serializes information that will be put into the
+  /// .debug_addr DWARF section.
+  std::unique_ptr<DebugAddrWriter> AddrWriter;
+
+  /// Stores and serializes information that will be put into the
+  /// .debug_str DWARF section.
+  /// Does not do de-duplication.
+  std::unique_ptr<DebugStrWriter> StrWriter;
+
+  using LocWriters =
+      std::unordered_map<uint64_t, std::unique_ptr<DebugLocWriter>>;
+  /// Use a separate location list writer for each compilation unit.
+  LocWriters LocListWritersByCU;
+
+  using DebugAbbrevDWOPatchers =
+      std::unordered_map<uint64_t, std::unique_ptr<DebugAbbrevPatcher>>;
+  /// Binary patchers for DWO Abbrev sections.
+  DebugAbbrevDWOPatchers BinaryDWOAbbrevPatchers;
+
+  using DebugInfoDWOPatchers =
+      std::unordered_map<uint64_t, std::unique_ptr<SimpleBinaryPatcher>>;
+  /// Binary patchers for DWO debug_info sections.
+  DebugInfoDWOPatchers BinaryDWODebugInfoPatchers;
+
+  struct LocListDebugInfoPatchType {
+    uint64_t DebugInfoOffset;
+    size_t CUIndex;
+    uint64_t CUWriterOffset;
+  };
+  using VectorLocListDebugInfoPatchType =
+      std::vector<LocListDebugInfoPatchType>;
+  /// The list of debug info patches to be made once individual
+  /// location list writers have been filled.
+  VectorLocListDebugInfoPatchType LocListDebugInfoPatches;
+
+  std::unordered_map<uint64_t, VectorLocListDebugInfoPatchType>
+      DwoLocListDebugInfoPatches;
+
+  std::mutex LocListDebugInfoPatchesMutex;
+
+  /// Update debug info for all DIEs in \p Unit.
+  void updateUnitDebugInfo(uint64_t CUIndex, DWARFUnit &Unit,
+                           SimpleBinaryPatcher &DebugInfoPatcher,
+                           DebugAbbrevPatcher &AbbrevPatcher);
+
+  /// Patches the binary for an object's address ranges to be updated.
+  /// The object can be anything that has associated address ranges via
+  /// either DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical
+  /// blocks, etc).
+  /// \p DebugRangesOffset is the offset in .debug_ranges of the object's
+  /// new address ranges in the output binary.
+  /// \p Unit Compile unit the object belongs to.
+  /// \p DIE is the object's DIE in the input binary.
+  void updateDWARFObjectAddressRanges(const DWARFDie DIE,
+                                      uint64_t DebugRangesOffset,
+                                      SimpleBinaryPatcher &DebugInfoPatcher,
+                                      DebugAbbrevPatcher &AbbrevPatcher);
+
+  std::unique_ptr<DebugBufferVector>
+  makeFinalLocListsSection(SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Generate new contents for the .debug_ranges and .debug_aranges sections.
+  void finalizeDebugSections(SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Patches the binary for DWARF address ranges (e.g. in functions and
+  /// lexical blocks) to be updated.
+  void updateDebugAddressRanges();
+
+  /// Rewrite .gdb_index section if present.
+  void updateGdbIndexSection();
+
+  /// Output .dwo files.
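+  /// For split-DWARF (debug fission) inputs, an updated .dwo file is written
+  /// for each CU, either next to the original (in the CU's compilation
+  /// directory) or under the directory given by opts::DwoOutputPath when set.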
+  void writeOutDWOFiles(std::unordered_map<uint64_t, std::string> &DWOIdToName);
+
+  /// Abbreviations that were converted to use DW_AT_ranges.
+  std::set<const DWARFAbbreviationDeclaration *> ConvertedRangesAbbrevs;
+
+  /// DWARFDie contains a pointer to a DIE and hence gets invalidated once the
+  /// embedded DIE is destroyed. This wrapper class stores a DIE internally and
+  /// could be cast to a DWARFDie that is valid even after the initial DIE is
+  /// destroyed.
+  struct DWARFDieWrapper {
+    DWARFUnit *Unit;
+    DWARFDebugInfoEntry DIE;
+
+    DWARFDieWrapper(DWARFUnit *Unit, DWARFDebugInfoEntry DIE) :
+      Unit(Unit),
+      DIE(DIE) {}
+
+    DWARFDieWrapper(DWARFDie &Die) :
+      Unit(Die.getDwarfUnit()),
+      DIE(*Die.getDebugInfoEntry()) {}
+
+    operator DWARFDie() {
+      return DWARFDie(Unit, &DIE);
+    }
+  };
+
+  /// DIEs with abbrevs that were not converted to DW_AT_ranges.
+  /// We only update those when all DIEs have been processed to guarantee that
+  /// the abbrev (which is shared) is intact.
+  using PendingRangesType = std::unordered_map<
+      const DWARFAbbreviationDeclaration *,
+      std::vector<std::pair<DWARFDieWrapper, DebugAddressRange>>>;
+
+  PendingRangesType PendingRanges;
+
+  /// Convert \p Abbrev from using a simple DW_AT_(low|high)_pc range to
+  /// DW_AT_ranges.
+  void convertToRanges(const DWARFAbbreviationDeclaration *Abbrev,
+                       DebugAbbrevPatcher &AbbrevPatcher);
+
+  /// Update \p DIE that was using DW_AT_(low|high)_pc with DW_AT_ranges offset.
+  void convertToRanges(DWARFDie DIE, uint64_t RangesSectionOffset,
+                       SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Same as above, but takes a vector of \p Ranges as a parameter.
+  void convertToRanges(DWARFDie DIE, const DebugAddressRangesVector &Ranges,
+                       SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Patch DW_AT_(low|high)_pc values for the \p DIE based on \p Range.
+  void patchLowHigh(DWARFDie DIE, DebugAddressRange Range,
+                    SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Convert pending ranges associated with the given \p Abbrev.
+  void convertPending(const DWARFAbbreviationDeclaration *Abbrev,
+                      SimpleBinaryPatcher &DebugInfoPatcher,
+                      DebugAbbrevPatcher &AbbrevPatcher);
+
+  /// Adds to pending ranges.
+  /// For debug fission, also adds entries to .debug_addr to handle the case
+  /// where some entries are not converted to ranges and are left as
+  /// DW_AT_low_pc/DW_AT_high_pc.
+  void addToPendingRanges(const DWARFAbbreviationDeclaration *Abbrev,
+                          DWARFDie DIE, DebugAddressRangesVector &Ranges,
+                          Optional<uint64_t> DWOId);
+
+  /// Once all DIEs were seen, update DW_AT_(low|high)_pc values.
+  void flushPendingRanges(SimpleBinaryPatcher &DebugInfoPatcher);
+
+  /// Helper function for creating and returning DWO DebugInfo and Abbrev
+  /// patchers.
+  template <typename T, typename Patcher>
+  Patcher *getBinaryDWOPatcherHelper(T &BinaryPatchers, uint64_t DwoId) {
+    auto Iter = BinaryPatchers.find(DwoId);
+    if (Iter == BinaryPatchers.end()) {
+      // Using make_pair instead of {} to work around a bug in older versions
+      // of the library. https://timsong-cpp.github.io/lwg-issues/2354
+      Iter = BinaryPatchers
+                 .insert(std::make_pair(DwoId, std::make_unique<Patcher>()))
+                 .first;
+    }
+
+    return static_cast<Patcher *>(Iter->second.get());
+  }
+
+public:
+  DWARFRewriter(BinaryContext &BC) : BC(BC) {}
+
+  /// Main function for updating the DWARF debug info.
+  void updateDebugInfo();
+
+  /// Computes output .debug_line line table offsets for each compile unit,
+  /// and updates stmt_list for a corresponding compile unit.
+  void updateLineTableOffsets();
+
+  /// Returns a DWO Debug Info Patcher for the given DWO ID.
+  /// Creates a new instance if one does not already exist.
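+  /// Illustrative use (AttrOffset and NewValue are placeholders):
+  ///   SimpleBinaryPatcher *Patcher = getBinaryDWODebugInfoPatcher(*DWOId);
+  ///   Patcher->addLE32Patch(AttrOffset, NewValue);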
+  SimpleBinaryPatcher *getBinaryDWODebugInfoPatcher(uint64_t DwoId) {
+    return getBinaryDWOPatcherHelper<DebugInfoDWOPatchers, SimpleBinaryPatcher>(
+        BinaryDWODebugInfoPatchers, DwoId);
+  }
+
+  /// Returns a DWO Abbrev Patcher for the given DWO ID.
+  /// Creates a new instance if one does not already exist.
+  DebugAbbrevPatcher *getBinaryDWOAbbrevPatcher(uint64_t DwoId) {
+    return getBinaryDWOPatcherHelper<DebugAbbrevDWOPatchers, DebugAbbrevPatcher>(
+        BinaryDWOAbbrevPatchers, DwoId);
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp
new file mode 100644
index 000000000000..d560ce11ebec
--- /dev/null
+++ b/bolt/src/DataAggregator.cpp
@@ -0,0 +1,2284 @@
+//===-- DataAggregator.cpp - Perf data aggregator ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions reads profile data written by perf record,
+// aggregates it, and then writes it back to an output file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryContext.h"
+#include "BinaryFunction.h"
+#include "BoltAddressTranslation.h"
+#include "DataAggregator.h"
+#include "Heatmap.h"
+#include "Utils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/Timer.h"
+#include <map>
+#include <unordered_map>
+
+#include <unistd.h>
+
+#define DEBUG_TYPE "aggregator"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory AggregatorCategory;
+extern bool HeatmapMode;
+extern bool LinuxKernelMode;
+extern cl::SubCommand HeatmapCommand;
+extern cl::opt<bool> AggregateOnly;
+extern cl::opt<std::string> OutputFilename;
+
+static cl::opt<bool>
+BasicAggregation("nl",
+  cl::desc("aggregate basic samples (without LBR info)"),
+  cl::init(false),
+  cl::ZeroOrMore,
+  cl::cat(AggregatorCategory));
+
+static cl::opt<bool>
+FilterMemProfile("filter-mem-profile",
+  cl::desc("if processing a memory profile, filter out stack or heap accesses "
+           "that won't be useful for BOLT to reduce profile file size"),
+  cl::init(true),
+  cl::cat(AggregatorCategory));
+
+static cl::opt<unsigned long long>
+FilterPID("pid",
+  cl::desc("only use samples from process with specified PID"),
+  cl::init(0),
+  cl::Optional,
+  cl::cat(AggregatorCategory));
+
+static cl::opt<unsigned>
+HeatmapBlock("block-size",
+  cl::desc("size of a heat map block in bytes (default 64)"),
+  cl::init(64),
+  cl::sub(HeatmapCommand));
+
+static cl::opt<std::string>
+HeatmapFile("o",
+  cl::init("-"),
+  cl::desc("heatmap output file (default stdout)"),
+  cl::Optional,
+  cl::sub(HeatmapCommand));
+
+static cl::opt<unsigned long long>
+HeatmapMaxAddress("max-address",
+  cl::init(0xffffffff),
+  cl::desc("maximum address considered valid for heatmap (default 4GB)"),
+  cl::Optional,
+  cl::sub(HeatmapCommand));
+
+static cl::opt<unsigned long long>
+HeatmapMinAddress("min-address",
+  cl::init(0x0),
+  cl::desc("minimum address considered valid for heatmap (default 0)"),
+  cl::Optional,
+  cl::sub(HeatmapCommand));
+
+static cl::opt<bool>
+IgnoreBuildID("ignore-build-id",
+  cl::desc("continue even if build-ids in input binary and perf.data mismatch"),
+  cl::init(false),
+  cl::cat(AggregatorCategory));
+
+static cl::opt<bool>
+IgnoreInterruptLBR("ignore-interrupt-lbr",
cl::desc("ignore kernel interrupt LBR that happens asynchronously"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +static cl::opt +MaxSamples("max-samples", + cl::init(-1ULL), + cl::desc("maximum number of samples to read from LBR profile"), + cl::Optional, + cl::Hidden, + cl::cat(AggregatorCategory)); + +static cl::opt +ReadPreAggregated("pa", + cl::desc("skip perf and read data from a pre-aggregated file format"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +static cl::opt +TimeAggregator("time-aggr", + cl::desc("time BOLT aggregator"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +static cl::opt +UseEventPC("use-event-pc", + cl::desc("use event PC in combination with LBR sampling"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +static cl::opt +WriteAutoFDOData("autofdo", + cl::desc("generate autofdo textual data instead of bolt data"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +} + +namespace { + +const char TimerGroupName[] = "aggregator"; +const char TimerGroupDesc[] = "Aggregator"; + +} + +constexpr uint64_t DataAggregator::KernelBaseAddr; + +DataAggregator::~DataAggregator() { + deleteTempFiles(); +} + +namespace { +void deleteTempFile(const std::string &FileName) { + if (std::error_code Errc = sys::fs::remove(FileName.c_str())) { + errs() << "PERF2BOLT: failed to delete temporary file " + << FileName << " with error " << Errc.message() << "\n"; + } +} +} + +void DataAggregator::deleteTempFiles() { + for (std::string &FileName : TempFiles) { + deleteTempFile(FileName); + } + TempFiles.clear(); +} + +void DataAggregator::findPerfExecutable() { + Optional PerfExecutable = + sys::Process::FindInEnvPath("PATH", "perf"); + if (!PerfExecutable) { + outs() << "PERF2BOLT: No perf executable found!\n"; + exit(1); + } + PerfPath = *PerfExecutable; +} + +void DataAggregator::start() { + outs() << "PERF2BOLT: Starting data aggregation job for " << Filename + << "\n"; + + // Don't launch perf for pre-aggregated files + if (opts::ReadPreAggregated) + return; + + findPerfExecutable(); + + if (opts::BasicAggregation) { + launchPerfProcess("events without LBR", + MainEventsPPI, + "script -F pid,event,ip", + /*Wait = */false); + } else { + launchPerfProcess("branch events", + MainEventsPPI, + "script -F pid,ip,brstack", + /*Wait = */false); + } + + // Note: we launch script for mem events regardless of the option, as the + // command fails fairly fast if mem events were not collected. 
+ launchPerfProcess("mem events", + MemEventsPPI, + "script -F pid,event,addr,ip", + /*Wait = */false); + + launchPerfProcess("process events", + MMapEventsPPI, + "script --show-mmap-events", + /*Wait = */false); + + launchPerfProcess("task events", + TaskEventsPPI, + "script --show-task-events", + /*Wait = */false); +} + +void DataAggregator::abort() { + if (opts::ReadPreAggregated) + return; + + std::string Error; + + // Kill subprocesses in case they are not finished + sys::Wait(TaskEventsPPI.PI, 1, false, &Error); + sys::Wait(MMapEventsPPI.PI, 1, false, &Error); + sys::Wait(MainEventsPPI.PI, 1, false, &Error); + sys::Wait(MemEventsPPI.PI, 1, false, &Error); + + deleteTempFiles(); + + exit(1); +} + +void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, + const char *ArgsString, bool Wait) { + SmallVector Argv; + + outs() << "PERF2BOLT: spawning perf job to read " << Name << '\n'; + Argv.push_back(PerfPath.data()); + + char *WritableArgsString = strdup(ArgsString); + char *Str = WritableArgsString; + do { + Argv.push_back(Str); + while (*Str && *Str != ' ') + ++Str; + if (!*Str) + break; + *Str++ = 0; + } while (true); + + Argv.push_back("-f"); + Argv.push_back("-i"); + Argv.push_back(Filename.c_str()); + + if (std::error_code Errc = + sys::fs::createTemporaryFile("perf.script", "out", PPI.StdoutPath)) { + errs() << "PERF2BOLT: failed to create temporary file " + << PPI.StdoutPath << " with error " << Errc.message() + << "\n"; + exit(1); + } + TempFiles.push_back(PPI.StdoutPath.data()); + + if (std::error_code Errc = + sys::fs::createTemporaryFile("perf.script", "err", PPI.StderrPath)) { + errs() << "PERF2BOLT: failed to create temporary file " + << PPI.StderrPath << " with error " << Errc.message() << "\n"; + exit(1); + } + TempFiles.push_back(PPI.StderrPath.data()); + + Optional Redirects[] = { + llvm::None, // Stdin + StringRef(PPI.StdoutPath.data()), // Stdout + StringRef(PPI.StderrPath.data())}; // Stderr + + LLVM_DEBUG({ + dbgs() << "Launching perf: "; + for (StringRef Arg : Argv) + dbgs() << Arg << " "; + dbgs() << " 1> " << PPI.StdoutPath.data() << " 2> " << PPI.StderrPath.data() + << "\n"; + }); + + if (Wait) { + PPI.PI.ReturnCode = sys::ExecuteAndWait(PerfPath.data(), Argv, + /*envp*/ llvm::None, Redirects); + } else { + PPI.PI = sys::ExecuteNoWait(PerfPath.data(), Argv, /*envp*/ llvm::None, + Redirects); + } + + free(WritableArgsString); +} + +void DataAggregator::processFileBuildID(StringRef FileBuildID) { + PerfProcessInfo BuildIDProcessInfo; + launchPerfProcess("buildid list", + BuildIDProcessInfo, + "buildid-list", + /*Wait = */true); + + if (BuildIDProcessInfo.PI.ReturnCode != 0) { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(BuildIDProcessInfo.StderrPath.data()); + StringRef ErrBuf = (*MB)->getBuffer(); + + errs() << "PERF-ERROR: return code " << BuildIDProcessInfo.PI.ReturnCode + << '\n'; + errs() << ErrBuf; + return; + } + + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(BuildIDProcessInfo.StdoutPath.data()); + if (std::error_code EC = MB.getError()) { + errs() << "Cannot open " << BuildIDProcessInfo.StdoutPath.data() << ": " + << EC.message() << "\n"; + return; + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + if (ParsingBuf.empty()) { + errs() << "PERF2BOLT-WARNING: build-id will not be checked because perf " + "data was recorded without it\n"; + return; + } + + Col = 0; + Line = 1; + Optional FileName = getFileNameForBuildID(FileBuildID); + if (!FileName) { + errs() << "PERF2BOLT-ERROR: failed to match build-id from 
perf output. " + "This indicates the input binary supplied for data aggregation " + "is not the same recorded by perf when collecting profiling " + "data, or there were no samples recorded for the binary. " + "Use -ignore-build-id option to override.\n"; + if (!opts::IgnoreBuildID) { + abort(); + } + } else if (*FileName != llvm::sys::path::filename(BC->getFilename())) { + errs() << "PERF2BOLT-WARNING: build-id matched a different file name\n"; + BuildIDBinaryName = std::string(*FileName); + } else { + outs() << "PERF2BOLT: matched build-id and file name\n"; + } + + return; +} + +bool DataAggregator::checkPerfDataMagic(StringRef FileName) { + if (opts::ReadPreAggregated) + return true; + + int FD; + if (sys::fs::openFileForRead(FileName, FD)) { + return false; + } + + char Buf[7] = {0, 0, 0, 0, 0, 0, 0}; + + if (::read(FD, Buf, 7) == -1) { + ::close(FD); + return false; + } + ::close(FD); + + if (strncmp(Buf, "PERFILE", 7) == 0) + return true; + return false; +} + +void DataAggregator::parsePreAggregated() { + std::string Error; + + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = MB.getError()) { + errs() << "PERF2BOLT-ERROR: cannot open " << Filename << ": " + << EC.message() << "\n"; + exit(1); + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + if (parsePreAggregatedLBRSamples()) { + errs() << "PERF2BOLT: failed to parse samples\n"; + exit(1); + } +} + +std::error_code DataAggregator::writeAutoFDOData(StringRef OutputFilename) { + outs() << "PERF2BOLT: writing data for autofdo tools...\n"; + NamedRegionTimer T("writeAutoFDO", "Processing branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + std::error_code EC; + raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::OF_None); + if (EC) + return EC; + + // Format: + // number of unique traces + // from_1-to_1:count_1 + // from_2-to_2:count_2 + // ...... + // from_n-to_n:count_n + // number of unique sample addresses + // addr_1:count_1 + // addr_2:count_2 + // ...... + // addr_n:count_n + // number of unique LBR entries + // src_1->dst_1:count_1 + // src_2->dst_2:count_2 + // ...... 
+ // src_n->dst_n:count_n + + const uint64_t FirstAllocAddress = this->BC->FirstAllocAddress; + + // AutoFDO addresses are relative to the first allocated loadable program + // segment + auto filterAddress = [&FirstAllocAddress](uint64_t Address) -> uint64_t { + if (Address < FirstAllocAddress) + return 0; + return Address - FirstAllocAddress; + }; + + OutFile << FallthroughLBRs.size() << "\n"; + for (const auto &AggrLBR : FallthroughLBRs) { + const Trace &Trace = AggrLBR.first; + const FTInfo &Info = AggrLBR.second; + OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "-" + << Twine::utohexstr(filterAddress(Trace.To)) << ":" + << (Info.InternCount + Info.ExternCount) << "\n"; + } + + OutFile << BasicSamples.size() << "\n"; + for (const auto &Sample : BasicSamples) { + uint64_t PC = Sample.first; + uint64_t HitCount = Sample.second; + OutFile << Twine::utohexstr(filterAddress(PC)) << ":" << HitCount << "\n"; + } + + OutFile << BranchLBRs.size() << "\n"; + for (const auto &AggrLBR : BranchLBRs) { + const Trace &Trace = AggrLBR.first; + const BranchInfo &Info = AggrLBR.second; + OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "->" + << Twine::utohexstr(filterAddress(Trace.To)) << ":" + << Info.TakenCount << "\n"; + } + + outs() << "PERF2BOLT: wrote " << FallthroughLBRs.size() << " unique traces, " + << BasicSamples.size() << " sample addresses and " << BranchLBRs.size() + << " unique branches to " << OutputFilename << "\n"; + + return std::error_code(); +} + +void DataAggregator::filterBinaryMMapInfo() { + if (opts::FilterPID) { + auto MMapInfoIter = BinaryMMapInfo.find(opts::FilterPID); + if (MMapInfoIter != BinaryMMapInfo.end()) { + MMapInfo MMap = MMapInfoIter->second; + BinaryMMapInfo.clear(); + BinaryMMapInfo.insert(std::make_pair(MMap.PID, MMap)); + } else { + if (errs().has_colors()) + errs().changeColor(raw_ostream::RED); + errs() << "PERF2BOLT-ERROR: could not find a profile matching PID \"" + << opts::FilterPID << "\"" << " for binary \"" + << BC->getFilename() <<"\"."; + assert(!BinaryMMapInfo.empty() && "No memory map for matching binary"); + errs() << " Profile for the following process is available:\n"; + for (std::pair &MMI : BinaryMMapInfo) { + outs() << " " << MMI.second.PID + << (MMI.second.Forked ? 
" (forked)\n" : "\n"); + } + if (errs().has_colors()) + errs().resetColor(); + + exit(1); + } + } +} + +Error DataAggregator::preprocessProfile(BinaryContext &BC) { + this->BC = &BC; + + if (opts::ReadPreAggregated) { + parsePreAggregated(); + return Error::success(); + } + + if (Optional FileBuildID = BC.getFileBuildID()) { + outs() << "BOLT-INFO: binary build-id is: " << *FileBuildID << "\n"; + processFileBuildID(*FileBuildID); + } else { + errs() << "BOLT-WARNING: build-id will not be checked because we could " + "not read one from input binary\n"; + } + + auto prepareToParse = [&](StringRef Name, PerfProcessInfo &Process) { + std::string Error; + outs() << "PERF2BOLT: waiting for perf " << Name + << " collection to finish...\n"; + sys::ProcessInfo PI = sys::Wait(Process.PI, 0, true, &Error); + + if (!Error.empty()) { + errs() << "PERF-ERROR: " << PerfPath << ": " << Error << "\n"; + deleteTempFiles(); + exit(1); + } + + if (PI.ReturnCode != 0) { + ErrorOr> ErrorMB = + MemoryBuffer::getFileOrSTDIN(Process.StderrPath.data()); + StringRef ErrBuf = (*ErrorMB)->getBuffer(); + + errs() << "PERF-ERROR: return code " << PI.ReturnCode << "\n"; + errs() << ErrBuf; + deleteTempFiles(); + exit(1); + } + + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(Process.StdoutPath.data()); + if (std::error_code EC = MB.getError()) { + errs() << "Cannot open " << Process.StdoutPath.data() << ": " + << EC.message() << "\n"; + deleteTempFiles(); + exit(1); + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + }; + + if (opts::LinuxKernelMode) { + // Current MMap parsing logic does not work with linux kernel. + // MMap entries for linux kernel uses PERF_RECORD_MMAP + // format instead of typical PERF_RECORD_MMAP2 format. + // Since linux kernel address mapping is absolute (same as + // in the ELF file), we avoid parsing MMap in linux kernel mode. + // While generating optimized linux kernel binary, we may need + // to parse MMap entries. + + // In linux kernel mode, we analyze and optimize + // all linux kernel binary instructions, irrespective + // of whether they are due to system calls or due to + // interrupts. Therefore, we cannot ignore interrupt + // in Linux kernel mode. 
+ opts::IgnoreInterruptLBR = false; + } else { + prepareToParse("mmap events", MMapEventsPPI); + if (parseMMapEvents()) { + errs() << "PERF2BOLT: failed to parse mmap events\n"; + } + } + + prepareToParse("task events", TaskEventsPPI); + if (parseTaskEvents()) { + errs() << "PERF2BOLT: failed to parse task events\n"; + } + + filterBinaryMMapInfo(); + prepareToParse("events", MainEventsPPI); + + if (opts::HeatmapMode) { + if (std::error_code EC = printLBRHeatMap()) { + errs() << "ERROR: failed to print heat map: " << EC.message() << '\n'; + exit(1); + } + exit(0); + } + + if ((!opts::BasicAggregation && parseBranchEvents()) || + (opts::BasicAggregation && parseBasicEvents())) { + errs() << "PERF2BOLT: failed to parse samples\n"; + } + + // We can finish early if the goal is just to generate data for autofdo + if (opts::WriteAutoFDOData) { + if (std::error_code EC = writeAutoFDOData(opts::OutputFilename)) { + errs() << "Error writing autofdo data to file: " << EC.message() << "\n"; + } + deleteTempFiles(); + exit(0); + } + + // Special handling for memory events + std::string Error; + sys::ProcessInfo PI = sys::Wait(MemEventsPPI.PI, 0, true, &Error); + if (PI.ReturnCode != 0) { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(MemEventsPPI.StderrPath.data()); + StringRef ErrBuf = (*MB)->getBuffer(); + + deleteTempFiles(); + + Regex NoData("Samples for '.*' event do not have ADDR attribute set. " + "Cannot print 'addr' field."); + if (!NoData.match(ErrBuf)) { + errs() << "PERF-ERROR: return code " << PI.ReturnCode << "\n"; + errs() << ErrBuf; + exit(1); + } + return Error::success(); + } + + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(MemEventsPPI.StdoutPath.data()); + if (std::error_code EC = MB.getError()) { + errs() << "Cannot open " << MemEventsPPI.StdoutPath.data() << ": " + << EC.message() << "\n"; + deleteTempFiles(); + exit(1); + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + if (const std::error_code EC = parseMemEvents()) { + errs() << "PERF2BOLT: failed to parse memory events: " + << EC.message() << '\n'; + } + + deleteTempFiles(); + + return Error::success(); +} + +Error DataAggregator::readProfile(BinaryContext &BC) { + processProfile(BC); + + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + convertBranchData(Function); + } + + if (opts::AggregateOnly) { + if (std::error_code EC = writeAggregatedFile(opts::OutputFilename)) { + report_error("cannot create output data file", EC); + } + } + + return Error::success(); +} + +bool DataAggregator::mayHaveProfileData(const BinaryFunction &Function) { + return Function.hasProfileAvailable(); +} + +void DataAggregator::processProfile(BinaryContext &BC) { + if (opts::ReadPreAggregated) + processPreAggregated(); + else if (opts::BasicAggregation) + processBasicEvents(); + else + processBranchEvents(); + + processMemEvents(); + + // Mark all functions with registered events as having a valid profile. + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &BF = BFI.second; + if (getBranchData(BF)) { + const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE + : BinaryFunction::PF_LBR; + BF.markProfiled(Flags); + } + } + + // Release intermediate storage. 
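+  // (By this point the raw aggregates below have been fully converted into
+  // per-function profile data, so they are no longer needed.)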
+ clear(BranchLBRs); + clear(FallthroughLBRs); + clear(AggregatedLBRs); + clear(BasicSamples); + clear(MemSamples); +} + +BinaryFunction * +DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) const { + if (!BC->containsAddress(Address)) + return nullptr; + + return BC->getBinaryFunctionContainingAddress(Address, /*CheckPastEnd=*/false, + /*UseMaxSize=*/true); +} + +StringRef DataAggregator::getLocationName(BinaryFunction &Func, + uint64_t Count) { + if (!BAT) + return Func.getOneName(); + + const BinaryFunction *OrigFunc = &Func; + if (const uint64_t HotAddr = BAT->fetchParentAddress(Func.getAddress())) { + NumColdSamples += Count; + BinaryFunction *HotFunc = getBinaryFunctionContainingAddress(HotAddr); + if (HotFunc) + OrigFunc = HotFunc; + } + // If it is a local function, prefer the name containing the file name where + // the local function was declared + for (StringRef AlternativeName : OrigFunc->getNames()) { + size_t FileNameIdx = AlternativeName.find('/'); + // Confirm the alternative name has the pattern Symbol/FileName/1 before + // using it + if (FileNameIdx == StringRef::npos || + AlternativeName.find('/', FileNameIdx + 1) == StringRef::npos) + continue; + return AlternativeName; + } + return OrigFunc->getOneName(); +} + +bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address, + uint64_t Count) { + auto I = NamesToSamples.find(Func.getOneName()); + if (I == NamesToSamples.end()) { + bool Success; + StringRef LocName = getLocationName(Func, Count); + std::tie(I, Success) = NamesToSamples.insert(std::make_pair( + Func.getOneName(), + FuncSampleData(LocName, FuncSampleData::ContainerTy()))); + } + + Address -= Func.getAddress(); + if (BAT) + Address = BAT->translate(Func, Address, /*IsBranchSrc=*/false); + + I->second.bumpCount(Address, Count); + return true; +} + +bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From, + uint64_t To, uint64_t Count, + uint64_t Mispreds) { + FuncBranchData *AggrData = getBranchData(Func); + if (!AggrData) { + AggrData = &NamesToBranches[Func.getOneName()]; + AggrData->Name = getLocationName(Func, Count); + setBranchData(Func, AggrData); + } + + From -= Func.getAddress(); + To -= Func.getAddress(); + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: bumpBranchCount: " << Func.getPrintName() + << " @ " << Twine::utohexstr(From) << " -> " + << Func.getPrintName() << " @ " << Twine::utohexstr(To) + << '\n'); + if (BAT) { + From = BAT->translate(Func, From, /*IsBranchSrc=*/true); + To = BAT->translate(Func, To, /*IsBranchSrc=*/false); + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: BAT translation on bumpBranchCount: " + << Func.getPrintName() << " @ " << Twine::utohexstr(From) + << " -> " << Func.getPrintName() << " @ " + << Twine::utohexstr(To) << '\n'); + } + + AggrData->bumpBranchCount(From, To, Count, Mispreds); + return true; +} + +bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, + BinaryFunction *ToFunc, uint64_t From, + uint64_t To, uint64_t Count, + uint64_t Mispreds) { + FuncBranchData *FromAggrData = nullptr; + FuncBranchData *ToAggrData = nullptr; + StringRef SrcFunc; + StringRef DstFunc; + if (FromFunc) { + SrcFunc = getLocationName(*FromFunc, Count); + FromAggrData = getBranchData(*FromFunc); + if (!FromAggrData) { + FromAggrData = &NamesToBranches[FromFunc->getOneName()]; + FromAggrData->Name = SrcFunc; + setBranchData(*FromFunc, FromAggrData); + } + From -= FromFunc->getAddress(); + if (BAT) + From = BAT->translate(*FromFunc, From, /*IsBranchSrc=*/true); + + recordExit(*FromFunc, From, Mispreds, Count); + 
} + if (ToFunc) { + DstFunc = getLocationName(*ToFunc, 0); + ToAggrData = getBranchData(*ToFunc); + if (!ToAggrData) { + ToAggrData = &NamesToBranches[ToFunc->getOneName()]; + ToAggrData->Name = DstFunc; + setBranchData(*ToFunc, ToAggrData); + } + To -= ToFunc->getAddress(); + if (BAT) + To = BAT->translate(*ToFunc, To, /*IsBranchSrc=*/false); + + recordEntry(*ToFunc, To, Mispreds, Count); + } + + if (FromAggrData) + FromAggrData->bumpCallCount(From, Location(!DstFunc.empty(), DstFunc, To), + Count, Mispreds); + if (ToAggrData) + ToAggrData->bumpEntryCount(Location(!SrcFunc.empty(), SrcFunc, From), To, + Count, Mispreds); + return true; +} + +bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, + uint64_t Mispreds) { + BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From); + BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To); + if (!FromFunc && !ToFunc) + return false; + + if (FromFunc == ToFunc) { + recordBranch(*FromFunc, From - FromFunc->getAddress(), + To - FromFunc->getAddress(), Count, Mispreds); + return doIntraBranch(*FromFunc, From, To, Count, Mispreds); + } + + return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds); +} + +bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, + uint64_t Count) { + BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(First.To); + BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(Second.From); + if (!FromFunc || !ToFunc) { + LLVM_DEBUG( + dbgs() << "Out of range trace starting in " << FromFunc->getPrintName() + << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress()) + << " and ending in " << ToFunc->getPrintName() << " @ " + << ToFunc->getPrintName() << " @ " + << Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n'); + NumLongRangeTraces += Count; + return false; + } + if (FromFunc != ToFunc) { + NumInvalidTraces += Count; + LLVM_DEBUG( + dbgs() << "Invalid trace starting in " << FromFunc->getPrintName() + << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress()) + << " and ending in " << ToFunc->getPrintName() << " @ " + << ToFunc->getPrintName() << " @ " + << Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n'); + return false; + } + + Optional FTs = + BAT ? BAT->getFallthroughsInTrace(*FromFunc, First.To, Second.From) + : getFallthroughsInTrace(*FromFunc, First, Second, Count); + if (!FTs) { + LLVM_DEBUG( + dbgs() << "Invalid trace starting in " << FromFunc->getPrintName() + << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress()) + << " and ending in " << ToFunc->getPrintName() << " @ " + << ToFunc->getPrintName() << " @ " + << Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n'); + NumInvalidTraces += Count; + return false; + } + + LLVM_DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for " + << FromFunc->getPrintName() << ":" + << Twine::utohexstr(First.To) << " to " + << Twine::utohexstr(Second.From) << ".\n"); + for (const std::pair &Pair : *FTs) { + doIntraBranch(*FromFunc, Pair.first + FromFunc->getAddress(), + Pair.second + FromFunc->getAddress(), Count, false); + } + + return true; +} + +bool DataAggregator::recordTrace( + BinaryFunction &BF, + const LBREntry &FirstLBR, + const LBREntry &SecondLBR, + uint64_t Count, + SmallVector, 16> *Branches) const { + BinaryContext &BC = BF.getBinaryContext(); + + if (!BF.isSimple()) + return false; + + assert(BF.hasCFG() && "can only record traces in CFG state"); + + // Offsets of the trace within this function. 
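+  // A trace is the fall-through region executed between two taken branches:
+  // it starts at the destination of the first LBR entry (FirstLBR.To) and
+  // ends at the source of the next one (SecondLBR.From).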
+ const uint64_t From = FirstLBR.To - BF.getAddress(); + const uint64_t To = SecondLBR.From - BF.getAddress(); + + if (From > To) + return false; + + BinaryBasicBlock *FromBB = BF.getBasicBlockContainingOffset(From); + BinaryBasicBlock *ToBB = BF.getBasicBlockContainingOffset(To); + + if (!FromBB || !ToBB) + return false; + + // Adjust FromBB if the first LBR is a return from the last instruction in + // the previous block (that instruction should be a call). + if (From == FromBB->getOffset() && !BF.containsAddress(FirstLBR.From) && + !FromBB->isEntryPoint() && !FromBB->isLandingPad()) { + BinaryBasicBlock *PrevBB = BF.BasicBlocksLayout[FromBB->getIndex() - 1]; + if (PrevBB->getSuccessor(FromBB->getLabel())) { + const MCInst *Instr = PrevBB->getLastNonPseudoInstr(); + if (Instr && BC.MIB->isCall(*Instr)) { + FromBB = PrevBB; + } else { + LLVM_DEBUG(dbgs() << "invalid incoming LBR (no call): " << FirstLBR + << '\n'); + } + } else { + LLVM_DEBUG(dbgs() << "invalid incoming LBR: " << FirstLBR << '\n'); + } + } + + // Fill out information for fall-through edges. The From and To could be + // within the same basic block, e.g. when two call instructions are in the + // same block. In this case we skip the processing. + if (FromBB == ToBB) { + return true; + } + + // Process blocks in the original layout order. + BinaryBasicBlock *BB = BF.BasicBlocksLayout[FromBB->getIndex()]; + assert(BB == FromBB && "index mismatch"); + while (BB != ToBB) { + BinaryBasicBlock *NextBB = BF.BasicBlocksLayout[BB->getIndex() + 1]; + assert((NextBB && NextBB->getOffset() > BB->getOffset()) && "bad layout"); + + // Check for bad LBRs. + if (!BB->getSuccessor(NextBB->getLabel())) { + LLVM_DEBUG(dbgs() << "no fall-through for the trace:\n" + << " " << FirstLBR << '\n' + << " " << SecondLBR << '\n'); + return false; + } + + // Record fall-through jumps + BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(*NextBB); + BI.Count += Count; + + if (Branches) { + const MCInst *Instr = BB->getLastNonPseudoInstr(); + uint64_t Offset = 0; + if (Instr) { + Offset = BC.MIB->getAnnotationWithDefault(*Instr, "Offset"); + } else { + Offset = BB->getOffset(); + } + Branches->emplace_back(Offset, NextBB->getOffset()); + } + + BB = NextBB; + } + + return true; +} + +Optional, 16>> +DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, + const LBREntry &FirstLBR, + const LBREntry &SecondLBR, + uint64_t Count) const { + SmallVector, 16> Res; + + if (!recordTrace(BF, FirstLBR, SecondLBR, Count, &Res)) + return NoneType(); + + return Res; +} + +bool DataAggregator::recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, + uint64_t Count) const { + if (To > BF.getSize()) + return false; + + if (!BF.hasProfile()) + BF.ExecutionCount = 0; + + BinaryBasicBlock *EntryBB = nullptr; + if (To == 0) { + BF.ExecutionCount += Count; + if (!BF.empty()) + EntryBB = &BF.front(); + } else if (BinaryBasicBlock *BB = BF.getBasicBlockAtOffset(To)) { + if (BB->isEntryPoint()) + EntryBB = BB; + } + + if (EntryBB) + EntryBB->setExecutionCount(EntryBB->getKnownExecutionCount() + Count); + + return true; +} + +bool DataAggregator::recordExit(BinaryFunction &BF, uint64_t From, + bool Mispred, uint64_t Count) const { + if (!BF.isSimple() || From > BF.getSize()) + return false; + + if (!BF.hasProfile()) + BF.ExecutionCount = 0; + + return true; +} + +ErrorOr DataAggregator::parseLBREntry() { + LBREntry Res; + ErrorOr FromStrRes = parseString('/'); + if (std::error_code EC = FromStrRes.getError()) + return EC; + StringRef OffsetStr = FromStrRes.get(); 
+ if (OffsetStr.getAsInteger(0, Res.From)) { + reportError("expected hexadecimal number with From address"); + Diag << "Found: " << OffsetStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + + ErrorOr ToStrRes = parseString('/'); + if (std::error_code EC = ToStrRes.getError()) + return EC; + OffsetStr = ToStrRes.get(); + if (OffsetStr.getAsInteger(0, Res.To)) { + reportError("expected hexadecimal number with To address"); + Diag << "Found: " << OffsetStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + + ErrorOr MispredStrRes = parseString('/'); + if (std::error_code EC = MispredStrRes.getError()) + return EC; + StringRef MispredStr = MispredStrRes.get(); + if (MispredStr.size() != 1 || + (MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) { + reportError("expected single char for mispred bit"); + Diag << "Found: " << MispredStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + Res.Mispred = MispredStr[0] == 'M'; + + static bool MispredWarning = true;; + if (MispredStr[0] == '-' && MispredWarning) { + errs() << "PERF2BOLT-WARNING: misprediction bit is missing in profile\n"; + MispredWarning = false; + } + + ErrorOr Rest = parseString(FieldSeparator, true); + if (std::error_code EC = Rest.getError()) + return EC; + if (Rest.get().size() < 5) { + reportError("expected rest of LBR entry"); + Diag << "Found: " << Rest.get() << "\n"; + return make_error_code(llvm::errc::io_error); + } + return Res; +} + +bool DataAggregator::checkAndConsumeFS() { + if (ParsingBuf[0] != FieldSeparator) { + return false; + } + ParsingBuf = ParsingBuf.drop_front(1); + Col += 1; + return true; +} + +void DataAggregator::consumeRestOfLine() { + size_t LineEnd = ParsingBuf.find_first_of('\n'); + if (LineEnd == StringRef::npos) { + ParsingBuf = StringRef(); + Col = 0; + Line += 1; + return; + } + ParsingBuf = ParsingBuf.drop_front(LineEnd + 1); + Col = 0; + Line += 1; +} + +ErrorOr DataAggregator::parseBranchSample() { + PerfBranchSample Res; + + while (checkAndConsumeFS()) {} + + ErrorOr PIDRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = PIDRes.getError()) + return EC; + auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); + if (!opts::LinuxKernelMode && MMapInfoIter == BinaryMMapInfo.end()) { + consumeRestOfLine(); + return make_error_code(errc::no_such_process); + } + + while (checkAndConsumeFS()) {} + + ErrorOr PCRes = parseHexField(FieldSeparator, true); + if (std::error_code EC = PCRes.getError()) + return EC; + Res.PC = PCRes.get(); + + if (checkAndConsumeNewLine()) + return Res; + + while (!checkAndConsumeNewLine()) { + checkAndConsumeFS(); + + ErrorOr LBRRes = parseLBREntry(); + if (std::error_code EC = LBRRes.getError()) + return EC; + LBREntry LBR = LBRRes.get(); + if (ignoreKernelInterrupt(LBR)) + continue; + if (!BC->HasFixedLoadAddress) + adjustLBR(LBR, MMapInfoIter->second); + Res.LBR.push_back(LBR); + } + + return Res; +} + +ErrorOr DataAggregator::parseBasicSample() { + while (checkAndConsumeFS()) {} + + ErrorOr PIDRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = PIDRes.getError()) + return EC; + + auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); + if (MMapInfoIter == BinaryMMapInfo.end()) { + consumeRestOfLine(); + return PerfBasicSample{StringRef(), 0}; + } + + while (checkAndConsumeFS()) {} + + ErrorOr Event = parseString(FieldSeparator); + if (std::error_code EC = Event.getError()) + return EC; + + while (checkAndConsumeFS()) {} + + ErrorOr AddrRes = parseHexField(FieldSeparator, true); + if 
(std::error_code EC = AddrRes.getError()) { + return EC; + } + + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + uint64_t Address = *AddrRes; + if (!BC->HasFixedLoadAddress) + adjustAddress(Address, MMapInfoIter->second); + + return PerfBasicSample{Event.get(), Address}; +} + +ErrorOr DataAggregator::parseMemSample() { + PerfMemSample Res{0,0}; + + while (checkAndConsumeFS()) {} + + ErrorOr PIDRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = PIDRes.getError()) + return EC; + + auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); + if (MMapInfoIter == BinaryMMapInfo.end()) { + consumeRestOfLine(); + return Res; + } + + while (checkAndConsumeFS()) {} + + ErrorOr Event = parseString(FieldSeparator); + if (std::error_code EC = Event.getError()) + return EC; + if (Event.get().find("mem-loads") == StringRef::npos) { + consumeRestOfLine(); + return Res; + } + + while (checkAndConsumeFS()) {} + + ErrorOr AddrRes = parseHexField(FieldSeparator); + if (std::error_code EC = AddrRes.getError()) { + return EC; + } + + while (checkAndConsumeFS()) {} + + ErrorOr PCRes = parseHexField(FieldSeparator, true); + if (std::error_code EC = PCRes.getError()) { + consumeRestOfLine(); + return EC; + } + + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + uint64_t Address = *AddrRes; + if (!BC->HasFixedLoadAddress) + adjustAddress(Address, MMapInfoIter->second); + + return PerfMemSample{PCRes.get(), Address}; +} + +ErrorOr DataAggregator::parseLocationOrOffset() { + auto parseOffset = [this]() -> ErrorOr { + ErrorOr Res = parseHexField(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + return Location(Res.get()); + }; + + size_t Sep = ParsingBuf.find_first_of(" \n"); + if (Sep == StringRef::npos) + return parseOffset(); + StringRef LookAhead = ParsingBuf.substr(0, Sep); + if (LookAhead.find_first_of(":") == StringRef::npos) + return parseOffset(); + + ErrorOr BuildID = parseString(':'); + if (std::error_code EC = BuildID.getError()) + return EC; + ErrorOr Offset = parseHexField(FieldSeparator); + if (std::error_code EC = Offset.getError()) + return EC; + return Location(true, BuildID.get(), Offset.get()); +} + +ErrorOr +DataAggregator::parseAggregatedLBREntry() { + while (checkAndConsumeFS()) {} + + ErrorOr TypeOrErr = parseString(FieldSeparator); + if (std::error_code EC = TypeOrErr.getError()) + return EC; + auto Type = AggregatedLBREntry::BRANCH; + if (TypeOrErr.get() == "B") { + Type = AggregatedLBREntry::BRANCH; + } else if (TypeOrErr.get() == "F") { + Type = AggregatedLBREntry::FT; + } else if (TypeOrErr.get() == "f") { + Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN; + } else { + reportError("expected B, F or f"); + return make_error_code(llvm::errc::io_error); + } + + while (checkAndConsumeFS()) {} + ErrorOr From = parseLocationOrOffset(); + if (std::error_code EC = From.getError()) + return EC; + + while (checkAndConsumeFS()) {} + ErrorOr To = parseLocationOrOffset(); + if (std::error_code EC = To.getError()) + return EC; + + while (checkAndConsumeFS()) {} + ErrorOr Frequency = + parseNumberField(FieldSeparator, Type != AggregatedLBREntry::BRANCH); + if (std::error_code EC = Frequency.getError()) + return EC; + + uint64_t Mispreds = 0; + if (Type == AggregatedLBREntry::BRANCH) { + while (checkAndConsumeFS()) {} + ErrorOr MispredsOrErr = parseNumberField(FieldSeparator, true); + if (std::error_code EC = 
MispredsOrErr.getError()) + return EC; + Mispreds = static_cast(MispredsOrErr.get()); + } + + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return AggregatedLBREntry{From.get(), To.get(), + static_cast(Frequency.get()), Mispreds, + Type}; +} + +bool DataAggregator::hasData() { + if (ParsingBuf.size() == 0) + return false; + + return true; +} + +bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const { + return opts::IgnoreInterruptLBR && + (LBR.From >= KernelBaseAddr || LBR.To >= KernelBaseAddr); +} + +std::error_code DataAggregator::printLBRHeatMap() { + outs() << "PERF2BOLT: parse branch events...\n"; + NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + if (opts::LinuxKernelMode) { + opts::HeatmapMaxAddress = 0xffffffffffffffff; + opts::HeatmapMinAddress = KernelBaseAddr; + } + Heatmap HM(opts::HeatmapBlock, opts::HeatmapMinAddress, + opts::HeatmapMaxAddress); + uint64_t NumTotalSamples = 0; + + while (hasData()) { + ErrorOr SampleRes = parseBranchSample(); + if (std::error_code EC = SampleRes.getError()) { + if (EC == errc::no_such_process) + continue; + return EC; + } + + PerfBranchSample &Sample = SampleRes.get(); + + // LBRs are stored in reverse execution order. NextLBR refers to the next + // executed branch record. + const LBREntry *NextLBR = nullptr; + for (const LBREntry &LBR : Sample.LBR) { + if (NextLBR) { + // Record fall-through trace. + const uint64_t TraceFrom = LBR.To; + const uint64_t TraceTo = NextLBR->From; + ++FallthroughLBRs[Trace(TraceFrom, TraceTo)].InternCount; + } + NextLBR = &LBR; + } + if (!Sample.LBR.empty()) { + HM.registerAddress(Sample.LBR.front().To); + HM.registerAddress(Sample.LBR.back().From); + } + NumTotalSamples += Sample.LBR.size(); + } + + if (!NumTotalSamples) { + errs() << "HEATMAP-ERROR: no LBR traces detected in profile. 
" + "Cannot build heatmap.\n"; + exit(1); + } + + outs() << "HEATMAP: read " << NumTotalSamples << " LBR samples\n"; + outs() << "HEATMAP: " << FallthroughLBRs.size() << " unique traces\n"; + + outs() << "HEATMAP: building heat map...\n"; + + for (const auto &LBR : FallthroughLBRs) { + const Trace &Trace = LBR.first; + const FTInfo &Info = LBR.second; + HM.registerAddressRange(Trace.From, Trace.To, Info.InternCount); + } + + if (HM.getNumInvalidRanges()) + outs() << "HEATMAP: invalid traces: " << HM.getNumInvalidRanges() << '\n'; + + if (!HM.size()) { + errs() << "HEATMAP-ERROR: no valid traces registered\n"; + exit(1); + } + + HM.print(opts::HeatmapFile); + if (opts::HeatmapFile == "-") { + HM.printCDF(opts::HeatmapFile); + } else { + HM.printCDF(opts::HeatmapFile + ".csv"); + } + + return std::error_code(); +} + +std::error_code DataAggregator::parseBranchEvents() { + outs() << "PERF2BOLT: parse branch events...\n"; + NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + uint64_t NumTotalSamples = 0; + uint64_t NumEntries = 0; + uint64_t NumSamples = 0; + uint64_t NumSamplesNoLBR = 0; + uint64_t NumTraces = 0; + bool NeedsSkylakeFix = false; + + while (hasData() && NumTotalSamples < opts::MaxSamples) { + ++NumTotalSamples; + + ErrorOr SampleRes = parseBranchSample(); + if (std::error_code EC = SampleRes.getError()) { + if (EC == errc::no_such_process) + continue; + return EC; + } + ++NumSamples; + + PerfBranchSample &Sample = SampleRes.get(); + if (opts::WriteAutoFDOData) + ++BasicSamples[Sample.PC]; + + if (Sample.LBR.empty()) { + ++NumSamplesNoLBR; + continue; + } + + NumEntries += Sample.LBR.size(); + if (BAT && Sample.LBR.size() == 32 && !NeedsSkylakeFix) { + errs() << "PERF2BOLT-WARNING: using Intel Skylake bug workaround\n"; + NeedsSkylakeFix = true; + } + + // LBRs are stored in reverse execution order. NextPC refers to the next + // recorded executed PC. + uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0; + uint32_t NumEntry = 0; + for (const LBREntry &LBR : Sample.LBR) { + ++NumEntry; + // Hardware bug workaround: Intel Skylake (which has 32 LBR entries) + // sometimes record entry 32 as an exact copy of entry 31. This will cause + // us to likely record an invalid trace and generate a stale function for + // BAT mode (non BAT disassembles the function and is able to ignore this + // trace at aggregation time). Drop first 2 entries (last two, in + // chronological order) + if (NeedsSkylakeFix && NumEntry <= 2) + continue; + if (NextPC) { + // Record fall-through trace. + const uint64_t TraceFrom = LBR.To; + const uint64_t TraceTo = NextPC; + const BinaryFunction *TraceBF = + getBinaryFunctionContainingAddress(TraceFrom); + if (TraceBF && TraceBF->containsAddress(TraceTo)) { + FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; + if (TraceBF->containsAddress(LBR.From)) { + ++Info.InternCount; + } else { + ++Info.ExternCount; + } + } else { + if (TraceBF && getBinaryFunctionContainingAddress(TraceTo)) { + LLVM_DEBUG(dbgs() + << "Invalid trace starting in " + << TraceBF->getPrintName() << " @ " + << Twine::utohexstr(TraceFrom - TraceBF->getAddress()) + << " and ending @ " << Twine::utohexstr(TraceTo) + << '\n'); + ++NumInvalidTraces; + } else { + LLVM_DEBUG(dbgs() + << "Out of range trace starting in " + << (TraceBF ? TraceBF->getPrintName() : "None") << " @ " + << Twine::utohexstr( + TraceFrom - (TraceBF ? 
TraceBF->getAddress() : 0)) + << " and ending in " + << (getBinaryFunctionContainingAddress(TraceTo) + ? getBinaryFunctionContainingAddress(TraceTo) + ->getPrintName() + : "None") + << " @ " + << Twine::utohexstr( + TraceTo - + (getBinaryFunctionContainingAddress(TraceTo) + ? getBinaryFunctionContainingAddress(TraceTo) + ->getAddress() + : 0)) + << '\n'); + ++NumLongRangeTraces; + } + } + ++NumTraces; + } + NextPC = LBR.From; + + uint64_t From = LBR.From; + if (!getBinaryFunctionContainingAddress(From)) + From = 0; + uint64_t To = LBR.To; + if (!getBinaryFunctionContainingAddress(To)) + To = 0; + if (!From && !To) + continue; + BranchInfo &Info = BranchLBRs[Trace(From, To)]; + ++Info.TakenCount; + Info.MispredCount += LBR.Mispred; + } + } + + for (const auto &LBR : BranchLBRs) { + const Trace &Trace = LBR.first; + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.From)) + BF->setHasProfileAvailable(); + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.To)) + BF->setHasProfileAvailable(); + } + + auto printColored = [](raw_ostream &OS, float Percent, float T1, float T2) { + OS << " ("; + if (OS.has_colors()) { + if (Percent > T2) { + OS.changeColor(raw_ostream::RED); + } else if (Percent > T1) { + OS.changeColor(raw_ostream::YELLOW); + } else { + OS.changeColor(raw_ostream::GREEN); + } + } + OS << format("%.1f%%", Percent); + if (OS.has_colors()) + OS.resetColor(); + OS << ")"; + }; + + outs() << "PERF2BOLT: read " << NumSamples << " samples and " + << NumEntries << " LBR entries\n"; + if (NumTotalSamples) { + if (NumSamples && NumSamplesNoLBR == NumSamples) { + // Note: we don't know if perf2bolt is being used to parse memory samples + // at this point. In this case, it is OK to parse zero LBRs. + errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack " + "LBR. Record profile with perf record -j any or run perf2bolt " + "in no-LBR mode with -nl (the performance improvement in -nl " + "mode may be limited)\n"; + } else { + const uint64_t IgnoredSamples = NumTotalSamples - NumSamples; + const float PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples; + outs() << "PERF2BOLT: " << IgnoredSamples << " samples"; + printColored(outs(), PercentIgnored, 20, 50); + outs() << " were ignored\n"; + if (PercentIgnored > 50.0f) { + errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples " + "were attributed to the input binary\n"; + } + } + } + outs() << "PERF2BOLT: traces mismatching disassembled function contents: " + << NumInvalidTraces; + float Perc = 0.0f; + if (NumTraces > 0) { + Perc = NumInvalidTraces * 100.0f / NumTraces; + printColored(outs(), Perc, 5, 10); + } + outs() << "\n"; + if (Perc > 10.0f) { + outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. 
The generated data may be ineffective for improving " + "performance.\n\n"; + } + + outs() << "PERF2BOLT: out of range traces involving unknown regions: " + << NumLongRangeTraces; + if (NumTraces > 0) { + outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); + } + outs() << "\n"; + + if (NumColdSamples > 0) { + const float ColdSamples = NumColdSamples * 100.0f / NumTotalSamples; + outs() << "PERF2BOLT: " << NumColdSamples + << format(" (%.1f%%)", ColdSamples) + << " samples recorded in cold regions of split functions.\n"; + if (ColdSamples > 5.0f) { + outs() + << "WARNING: The BOLT-processed binary where samples were collected " + "likely used bad data or your service observed a large shift in " + "profile. You may want to audit this.\n"; + } + } + + return std::error_code(); +} + +void DataAggregator::processBranchEvents() { + outs() << "PERF2BOLT: processing branch events...\n"; + NamedRegionTimer T("processBranch", "Processing branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + for (const auto &AggrLBR : FallthroughLBRs) { + const Trace &Loc = AggrLBR.first; + const FTInfo &Info = AggrLBR.second; + LBREntry First{Loc.From, Loc.From, false}; + LBREntry Second{Loc.To, Loc.To, false}; + if (Info.InternCount) { + doTrace(First, Second, Info.InternCount); + } + if (Info.ExternCount) { + First.From = 0; + doTrace(First, Second, Info.ExternCount); + } + } + + for (const auto &AggrLBR : BranchLBRs) { + const Trace &Loc = AggrLBR.first; + const BranchInfo &Info = AggrLBR.second; + doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); + } +} + +std::error_code DataAggregator::parseBasicEvents() { + outs() << "PERF2BOLT: parsing basic events (without LBR)...\n"; + NamedRegionTimer T("parseBasic", "Parsing basic events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + while (hasData()) { + ErrorOr Sample = parseBasicSample(); + if (std::error_code EC = Sample.getError()) + return EC; + + if (!Sample->PC) + continue; + + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC)) + BF->setHasProfileAvailable(); + + ++BasicSamples[Sample->PC]; + EventNames.insert(Sample->EventName); + } + + return std::error_code(); +} + +void DataAggregator::processBasicEvents() { + outs() << "PERF2BOLT: processing basic events (without LBR)...\n"; + NamedRegionTimer T("processBasic", "Processing basic events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + uint64_t OutOfRangeSamples = 0; + uint64_t NumSamples = 0; + for (auto &Sample : BasicSamples) { + const uint64_t PC = Sample.first; + const uint64_t HitCount = Sample.second; + NumSamples += HitCount; + BinaryFunction *Func = getBinaryFunctionContainingAddress(PC); + if (!Func) { + OutOfRangeSamples += HitCount; + continue; + } + + doSample(*Func, PC, HitCount); + } + outs() << "PERF2BOLT: read " << NumSamples << " samples\n"; + + outs() << "PERF2BOLT: out of range samples recorded in unknown regions: " + << OutOfRangeSamples; + float Perc = 0.0f; + if (NumSamples > 0) { + outs() << " ("; + Perc = OutOfRangeSamples * 100.0f / NumSamples; + if (outs().has_colors()) { + if (Perc > 60.0f) { + outs().changeColor(raw_ostream::RED); + } else if (Perc > 40.0f) { + outs().changeColor(raw_ostream::YELLOW); + } else { + outs().changeColor(raw_ostream::GREEN); + } + } + outs() << format("%.1f%%", Perc); + if (outs().has_colors()) + outs().resetColor(); + outs() << ")"; + } + outs() << "\n"; + if (Perc > 80.0f) { + outs() << "\n !! WARNING !! 
This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. The generated data may be ineffective for improving " + "performance.\n\n"; + } +} + +std::error_code DataAggregator::parseMemEvents() { + outs() << "PERF2BOLT: parsing memory events...\n"; + NamedRegionTimer T("parseMemEvents", "Parsing mem events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + while (hasData()) { + ErrorOr Sample = parseMemSample(); + if (std::error_code EC = Sample.getError()) + return EC; + + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC)) + BF->setHasProfileAvailable(); + + MemSamples.emplace_back(std::move(Sample.get())); + } + + return std::error_code(); +} + +void DataAggregator::processMemEvents() { + NamedRegionTimer T("ProcessMemEvents", "Processing mem events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + for (const PerfMemSample &Sample : MemSamples) { + uint64_t PC = Sample.PC; + uint64_t Addr = Sample.Addr; + StringRef FuncName; + StringRef MemName; + + // Try to resolve symbol for PC + BinaryFunction *Func = getBinaryFunctionContainingAddress(PC); + if (!Func) { + LLVM_DEBUG(if (PC != 0) { + dbgs() << "Skipped mem event: 0x" << Twine::utohexstr(PC) << " => 0x" + << Twine::utohexstr(Addr) << "\n"; + }); + continue; + } + + FuncName = Func->getOneName(); + PC -= Func->getAddress(); + + // Try to resolve symbol for memory load + if (BinaryData *BD = BC->getBinaryDataContainingAddress(Addr)) { + MemName = BD->getName(); + Addr -= BD->getAddress(); + } else if (opts::FilterMemProfile) { + // Filter out heap/stack accesses + continue; + } + + const Location FuncLoc(!FuncName.empty(), FuncName, PC); + const Location AddrLoc(!MemName.empty(), MemName, Addr); + + FuncMemData *MemData = &NamesToMemEvents[FuncName]; + setMemData(*Func, MemData); + MemData->update(FuncLoc, AddrLoc); + LLVM_DEBUG(dbgs() << "Mem event: " << FuncLoc << " = " << AddrLoc << "\n"); + } +} + +std::error_code DataAggregator::parsePreAggregatedLBRSamples() { + outs() << "PERF2BOLT: parsing pre-aggregated profile...\n"; + NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + while (hasData()) { + ErrorOr AggrEntry = parseAggregatedLBREntry(); + if (std::error_code EC = AggrEntry.getError()) + return EC; + + if (BinaryFunction *BF = + getBinaryFunctionContainingAddress(AggrEntry->From.Offset)) + BF->setHasProfileAvailable(); + if (BinaryFunction *BF = + getBinaryFunctionContainingAddress(AggrEntry->To.Offset)) + BF->setHasProfileAvailable(); + + AggregatedLBRs.emplace_back(std::move(AggrEntry.get())); + } + + return std::error_code(); +} + +void DataAggregator::processPreAggregated() { + outs() << "PERF2BOLT: processing pre-aggregated profile...\n"; + NamedRegionTimer T("processAggregated", "Processing aggregated branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + uint64_t NumTraces = 0; + for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) { + switch (AggrEntry.EntryType) { + case AggregatedLBREntry::BRANCH: + doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count, + AggrEntry.Mispreds); + break; + case AggregatedLBREntry::FT: + case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: { + LBREntry First{AggrEntry.EntryType == AggregatedLBREntry::FT + ? 
AggrEntry.From.Offset + : 0, + AggrEntry.From.Offset, false}; + LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false}; + doTrace(First, Second, AggrEntry.Count); + NumTraces += AggrEntry.Count; + break; + } + } + } + + outs() << "PERF2BOLT: read " << AggregatedLBRs.size() + << " aggregated LBR entries\n"; + outs() << "PERF2BOLT: traces mismatching disassembled function contents: " + << NumInvalidTraces; + float Perc = 0.0f; + if (NumTraces > 0) { + outs() << " ("; + Perc = NumInvalidTraces * 100.0f / NumTraces; + if (outs().has_colors()) { + if (Perc > 10.0f) { + outs().changeColor(raw_ostream::RED); + } else if (Perc > 5.0f) { + outs().changeColor(raw_ostream::YELLOW); + } else { + outs().changeColor(raw_ostream::GREEN); + } + } + outs() << format("%.1f%%", Perc); + if (outs().has_colors()) + outs().resetColor(); + outs() << ")"; + } + outs() << "\n"; + if (Perc > 10.0f) { + outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. The generated data may be ineffective for improving " + "performance.\n\n"; + } + + outs() << "PERF2BOLT: Out of range traces involving unknown regions: " + << NumLongRangeTraces; + if (NumTraces > 0) { + outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); + } + outs() << "\n"; +} + +Optional +DataAggregator::parseCommExecEvent() { + size_t LineEnd = ParsingBuf.find_first_of("\n"); + if (LineEnd == StringRef::npos) { + reportError("expected rest of line"); + Diag << "Found: " << ParsingBuf << "\n"; + return NoneType(); + } + StringRef Line = ParsingBuf.substr(0, LineEnd); + + size_t Pos = Line.find("PERF_RECORD_COMM exec"); + if (Pos == StringRef::npos) { + return NoneType(); + } + Line = Line.drop_front(Pos); + + // Line: + // PERF_RECORD_COMM exec: :/" + StringRef PIDStr = Line.rsplit(':').second.split('/').first; + pid_t PID; + if (PIDStr.getAsInteger(10, PID)) { + reportError("expected PID"); + Diag << "Found: " << PIDStr << "in '" << Line << "'\n"; + return NoneType(); + } + + return PID; +} + +namespace { +Optional parsePerfTime(const StringRef TimeStr) { + const StringRef SecTimeStr = TimeStr.split('.').first; + const StringRef USecTimeStr = TimeStr.split('.').second; + uint64_t SecTime; + uint64_t USecTime; + if (SecTimeStr.getAsInteger(10, SecTime) || + USecTimeStr.getAsInteger(10, USecTime)) { + return NoneType(); + } + return SecTime * 1000000ULL + USecTime; +} +} + +Optional +DataAggregator::parseForkEvent() { + while (checkAndConsumeFS()) {} + + size_t LineEnd = ParsingBuf.find_first_of("\n"); + if (LineEnd == StringRef::npos) { + reportError("expected rest of line"); + Diag << "Found: " << ParsingBuf << "\n"; + return NoneType(); + } + StringRef Line = ParsingBuf.substr(0, LineEnd); + + size_t Pos = Line.find("PERF_RECORD_FORK"); + if (Pos == StringRef::npos) { + consumeRestOfLine(); + return NoneType(); + } + + ForkInfo FI; + + const StringRef TimeStr = + Line.substr(0, Pos).rsplit(':').first.rsplit(FieldSeparator).second; + if (Optional TimeRes = parsePerfTime(TimeStr)) { + FI.Time = *TimeRes; + } + + Line = Line.drop_front(Pos); + + // Line: + // PERF_RECORD_FORK(:):(:) + const StringRef ChildPIDStr = Line.split('(').second.split(':').first; + if (ChildPIDStr.getAsInteger(10, FI.ChildPID)) { + reportError("expected PID"); + Diag << "Found: " << ChildPIDStr << "in '" << Line << "'\n"; + return NoneType(); + } + + const StringRef ParentPIDStr = Line.rsplit('(').second.split(':').first; + if 
(ParentPIDStr.getAsInteger(10, FI.ParentPID)) { + reportError("expected PID"); + Diag << "Found: " << ParentPIDStr << "in '" << Line << "'\n"; + return NoneType(); + } + + consumeRestOfLine(); + + return FI; +} + +ErrorOr> +DataAggregator::parseMMapEvent() { + while (checkAndConsumeFS()) {} + + MMapInfo ParsedInfo; + + size_t LineEnd = ParsingBuf.find_first_of("\n"); + if (LineEnd == StringRef::npos) { + reportError("expected rest of line"); + Diag << "Found: " << ParsingBuf << "\n"; + return make_error_code(llvm::errc::io_error); + } + StringRef Line = ParsingBuf.substr(0, LineEnd); + + size_t Pos = Line.find("PERF_RECORD_MMAP2"); + if (Pos == StringRef::npos) { + consumeRestOfLine(); + return std::make_pair(StringRef(), ParsedInfo); + } + + // Line: + // { .* .: }PERF_RECORD_MMAP2 /: .* + + const StringRef TimeStr = + Line.substr(0, Pos).rsplit(':').first.rsplit(FieldSeparator).second; + if (Optional TimeRes = parsePerfTime(TimeStr)) { + ParsedInfo.Time = *TimeRes; + } + + Line = Line.drop_front(Pos); + + // Line: + // PERF_RECORD_MMAP2 /: [() .*]: .* + + StringRef FileName = Line.rsplit(FieldSeparator).second; + if (FileName.startswith("//") || FileName.startswith("[")) { + consumeRestOfLine(); + return std::make_pair(StringRef(), ParsedInfo); + } + FileName = sys::path::filename(FileName); + + const StringRef PIDStr = Line.split(FieldSeparator).second.split('/').first; + if (PIDStr.getAsInteger(10, ParsedInfo.PID)) { + reportError("expected PID"); + Diag << "Found: " << PIDStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + const StringRef BaseAddressStr = Line.split('[').second.split('(').first; + if (BaseAddressStr.getAsInteger(0, ParsedInfo.BaseAddress)) { + reportError("expected base address"); + Diag << "Found: " << BaseAddressStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + const StringRef SizeStr = Line.split('(').second.split(')').first; + if (SizeStr.getAsInteger(0, ParsedInfo.Size)) { + reportError("expected mmaped size"); + Diag << "Found: " << SizeStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + const StringRef OffsetStr = + Line.split('@').second.ltrim().split(FieldSeparator).first; + if (OffsetStr.getAsInteger(0, ParsedInfo.Offset)) { + reportError("expected mmaped page-aligned offset"); + Diag << "Found: " << OffsetStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + consumeRestOfLine(); + + return std::make_pair(FileName, ParsedInfo); +} + +std::error_code DataAggregator::parseMMapEvents() { + outs() << "PERF2BOLT: parsing perf-script mmap events output\n"; + NamedRegionTimer T("parseMMapEvents", "Parsing mmap events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + std::multimap GlobalMMapInfo; + while (hasData()) { + ErrorOr> FileMMapInfoRes = parseMMapEvent(); + if (std::error_code EC = FileMMapInfoRes.getError()) + return EC; + + std::pair FileMMapInfo = FileMMapInfoRes.get(); + if (FileMMapInfo.second.PID == -1) + continue; + + // Consider only the first mapping of the file for any given PID + bool PIDExists = false; + auto Range = GlobalMMapInfo.equal_range(FileMMapInfo.first); + for (auto MI = Range.first; MI != Range.second; ++MI) { + if (MI->second.PID == FileMMapInfo.second.PID) { + PIDExists = true; + break; + } + } + if (PIDExists) + continue; + + GlobalMMapInfo.insert(FileMMapInfo); + } + + LLVM_DEBUG({ + dbgs() << "FileName -> mmap info:\n"; + for (const std::pair &Pair : GlobalMMapInfo) { + 
dbgs() << " " << Pair.first << " : " << Pair.second.PID << " [0x" + << Twine::utohexstr(Pair.second.BaseAddress) << ", " + << Twine::utohexstr(Pair.second.Size) << " @ " + << Twine::utohexstr(Pair.second.Offset) << "]\n"; + } + }); + + StringRef NameToUse = llvm::sys::path::filename(BC->getFilename()); + if (GlobalMMapInfo.count(NameToUse) == 0 && !BuildIDBinaryName.empty()) { + errs() << "PERF2BOLT-WARNING: using \"" << BuildIDBinaryName + << "\" for profile matching\n"; + NameToUse = BuildIDBinaryName; + } + + auto Range = GlobalMMapInfo.equal_range(NameToUse); + for (auto I = Range.first; I != Range.second; ++I) { + const MMapInfo &MMapInfo = I->second; + if (BC->HasFixedLoadAddress && MMapInfo.BaseAddress) { + // Check that the binary mapping matches one of the segments. + bool MatchFound = false; + for (auto &KV : BC->SegmentMapInfo) { + SegmentInfo &SegInfo = KV.second; + // The mapping is page-aligned and hence the BaseAddress could be + // different from the segment start address. We cannot know the page + // size of the mapping, but we know it should not exceed the segment + // alignment value. Hence we are performing an approximate check. + if (SegInfo.Address >= MMapInfo.BaseAddress && + SegInfo.Address - MMapInfo.BaseAddress < SegInfo.Alignment) { + MatchFound = true; + break; + } + } + if (!MatchFound) { + errs() << "PERF2BOLT-WARNING: ignoring mapping of " << NameToUse + << " at 0x" << Twine::utohexstr(MMapInfo.BaseAddress) << '\n'; + continue; + } + } + + BinaryMMapInfo.insert(std::make_pair(MMapInfo.PID, MMapInfo)); + } + + if (BinaryMMapInfo.empty()) { + if (errs().has_colors()) + errs().changeColor(raw_ostream::RED); + errs() << "PERF2BOLT-ERROR: could not find a profile matching binary \"" + << BC->getFilename() << "\"."; + if (!GlobalMMapInfo.empty()) { + errs() << " Profile for the following binary name(s) is available:\n"; + for (auto I = GlobalMMapInfo.begin(), IE = GlobalMMapInfo.end(); I != IE; + I = GlobalMMapInfo.upper_bound(I->first)) { + errs() << " " << I->first << '\n'; + } + errs() << "Please rename the input binary.\n"; + } else { + errs() << " Failed to extract any binary name from a profile.\n"; + } + if (errs().has_colors()) + errs().resetColor(); + + exit(1); + } + + return std::error_code(); +} + +std::error_code DataAggregator::parseTaskEvents() { + outs() << "PERF2BOLT: parsing perf-script task events output\n"; + NamedRegionTimer T("parseTaskEvents", "Parsing task events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + while (hasData()) { + if (Optional CommInfo = parseCommExecEvent()) { + // Remove forked child that ran execve + auto MMapInfoIter = BinaryMMapInfo.find(*CommInfo); + if (MMapInfoIter != BinaryMMapInfo.end() && + MMapInfoIter->second.Forked) { + BinaryMMapInfo.erase(MMapInfoIter); + } + consumeRestOfLine(); + continue; + } + + Optional ForkInfo = parseForkEvent(); + if (!ForkInfo) + continue; + + if (ForkInfo->ParentPID == ForkInfo->ChildPID) + continue; + + if (ForkInfo->Time == 0) { + // Process was forked and mmaped before perf ran. In this case the child + // should have its own mmap entry unless it was execve'd. 
+ continue; + } + + auto MMapInfoIter = BinaryMMapInfo.find(ForkInfo->ParentPID); + if (MMapInfoIter == BinaryMMapInfo.end()) + continue; + + MMapInfo MMapInfo = MMapInfoIter->second; + MMapInfo.PID = ForkInfo->ChildPID; + MMapInfo.Forked = true; + BinaryMMapInfo.insert(std::make_pair(MMapInfo.PID, MMapInfo)); + } + + outs() << "PERF2BOLT: input binary is associated with " + << BinaryMMapInfo.size() << " PID(s)\n"; + + LLVM_DEBUG({ + for (std::pair &MMI : BinaryMMapInfo) { + outs() << " " << MMI.second.PID << (MMI.second.Forked ? " (forked)" : "") + << ": (0x" << Twine::utohexstr(MMI.second.BaseAddress) + << ": 0x" << Twine::utohexstr(MMI.second.Size) << ")\n"; + } + }); + + return std::error_code(); +} + +Optional> +DataAggregator::parseNameBuildIDPair() { + while (checkAndConsumeFS()) {} + + ErrorOr BuildIDStr = parseString(FieldSeparator, true); + if (std::error_code EC = BuildIDStr.getError()) + return NoneType(); + + ErrorOr NameStr = parseString(FieldSeparator, true); + if (std::error_code EC = NameStr.getError()) + return NoneType(); + + consumeRestOfLine(); + return std::make_pair(NameStr.get(), BuildIDStr.get()); +} + +Optional +DataAggregator::getFileNameForBuildID(StringRef FileBuildID) { + while (hasData()) { + Optional> IDPair = parseNameBuildIDPair(); + if (!IDPair) + return NoneType(); + + if (IDPair->second.startswith(FileBuildID)) + return sys::path::filename(IDPair->first); + } + return NoneType(); +} + +std::error_code +DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { + std::error_code EC; + raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::OF_None); + if (EC) + return EC; + + bool WriteMemLocs = false; + + auto writeLocation = [&OutFile,&WriteMemLocs](const Location &Loc) { + if (WriteMemLocs) + OutFile << (Loc.IsSymbol ? "4 " : "3 "); + else + OutFile << (Loc.IsSymbol ? "1 " : "0 "); + OutFile << (Loc.Name.empty() ? 
"[unknown]" : Loc.Name) << " " + << Twine::utohexstr(Loc.Offset) + << FieldSeparator; + }; + + uint64_t BranchValues = 0; + uint64_t MemValues = 0; + + if (BAT) + OutFile << "boltedcollection\n"; + if (opts::BasicAggregation) { + OutFile << "no_lbr"; + for (const StringMapEntry &Entry : EventNames) { + OutFile << " " << Entry.getKey(); + } + OutFile << "\n"; + + for (const StringMapEntry &Func : NamesToSamples) { + for (const SampleInfo &SI : Func.getValue().Data) { + writeLocation(SI.Loc); + OutFile << SI.Hits << "\n"; + ++BranchValues; + } + } + } else { + for (const StringMapEntry &Func : NamesToBranches) { + for (const llvm::bolt::BranchInfo &BI : Func.getValue().Data) { + writeLocation(BI.From); + writeLocation(BI.To); + OutFile << BI.Mispreds << " " << BI.Branches << "\n"; + ++BranchValues; + } + for (const llvm::bolt::BranchInfo &BI : Func.getValue().EntryData) { + // Do not output if source is a known symbol, since this was already + // accounted for in the source function + if (BI.From.IsSymbol) + continue; + writeLocation(BI.From); + writeLocation(BI.To); + OutFile << BI.Mispreds << " " << BI.Branches << "\n"; + ++BranchValues; + } + } + + WriteMemLocs = true; + for (const StringMapEntry &Func : NamesToMemEvents) { + for (const MemInfo &MemEvent : Func.getValue().Data) { + writeLocation(MemEvent.Offset); + writeLocation(MemEvent.Addr); + OutFile << MemEvent.Count << "\n"; + ++MemValues; + } + } + } + + outs() << "PERF2BOLT: wrote " << BranchValues << " objects and " + << MemValues << " memory objects to " << OutputFilename << "\n"; + + return std::error_code(); +} + +void DataAggregator::dump() const { + DataReader::dump(); +} + +void DataAggregator::dump(const LBREntry &LBR) const { + Diag << "From: " << Twine::utohexstr(LBR.From) + << " To: " << Twine::utohexstr(LBR.To) << " Mispred? " << LBR.Mispred + << "\n"; +} + +void DataAggregator::dump(const PerfBranchSample &Sample) const { + Diag << "Sample LBR entries: " << Sample.LBR.size() << "\n"; + for (const LBREntry &LBR : Sample.LBR) { + dump(LBR); + } +} + +void DataAggregator::dump(const PerfMemSample &Sample) const { + Diag << "Sample mem entries: " << Sample.PC << ": " << Sample.Addr << "\n"; +} diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h new file mode 100644 index 000000000000..0d153e7af454 --- /dev/null +++ b/bolt/src/DataAggregator.h @@ -0,0 +1,487 @@ +//===-- DataAggregator.h - Perf data aggregator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by perf record, +// aggregates it and then writes it back to an output file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DATA_AGGREGATOR_H +#define LLVM_TOOLS_LLVM_BOLT_DATA_AGGREGATOR_H + +#include "DataReader.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/Program.h" +#include + +namespace llvm { +namespace bolt { + +class BinaryFunction; +class BinaryContext; +class BoltAddressTranslation; + +/// DataAggregator inherits all parsing logic from DataReader as well as +/// its data structures used to represent aggregated profile data in memory. 
+/// +/// The aggregator works by dispatching two separate perf-script jobs that +/// read perf samples and perf task annotations. Later, we read the output +/// files to extract information about which PID was used for this binary. +/// With the PID, we filter the samples and extract all LBR entries. +/// +/// To aggregate LBR entries, we rely on a BinaryFunction map to locate the +/// original function where the event happened. Then, we convert a raw address +/// to an offset relative to the start of this function and aggregate branch +/// information for each function. +/// +/// This must be coordinated with RewriteInstance so we have BinaryFunctions in +/// State::Disassembled. After this state, BinaryFunction will drop the +/// instruction map with original addresses we rely on to validate the traces +/// found in the LBR. +/// +/// The last step is to write the aggregated data to disk in the output file +/// specified by the user. +class DataAggregator : public DataReader { +public: + explicit DataAggregator(StringRef Filename) + : DataReader(Filename) { + start(); + } + + ~DataAggregator(); + + StringRef getReaderName() const override { + return "perf data aggregator"; + } + + bool isTrustedSource() const override { + return true; + } + + Error preprocessProfile(BinaryContext &BC) override; + + Error readProfilePreCFG(BinaryContext &BC) override { + return Error::success(); + } + + Error readProfile(BinaryContext &BC) override; + + bool mayHaveProfileData(const BinaryFunction &BF) override; + + /// Set Bolt Address Translation Table when processing samples collected in + /// bolted binaries + void setBAT(BoltAddressTranslation *B) override { BAT = B; } + + /// Check whether \p FileName is a perf.data file + static bool checkPerfDataMagic(StringRef FileName); + +private: + struct PerfBranchSample { + SmallVector LBR; + uint64_t PC; + }; + + struct PerfBasicSample { + StringRef EventName; + uint64_t PC; + }; + + struct PerfMemSample { + uint64_t PC; + uint64_t Addr; + }; + + /// Used for parsing specific pre-aggregated input files. + struct AggregatedLBREntry { + enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN }; + Location From; + Location To; + uint64_t Count; + uint64_t Mispreds; + Type EntryType; + }; + + struct Trace { + uint64_t From; + uint64_t To; + Trace(uint64_t From, uint64_t To) + : From(From), To(To) {} + bool operator==(const Trace &Other) const { + return From == Other.From && To == Other.To; + } + }; + + struct TraceHash { + size_t operator()(const Trace &L) const { + return std::hash()(L.From << 32 | L.To); + } + }; + + struct FTInfo { + uint64_t InternCount{0}; + uint64_t ExternCount{0}; + }; + + struct BranchInfo { + uint64_t TakenCount{0}; + uint64_t MispredCount{0}; + }; + + /// Intermediate storage for profile data. We save the results of parsing + /// and use them later for processing and assigning profile. 
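+  ///
+  /// For illustration (addresses are made up): a taken branch
+  /// 0x400100 -> 0x400150 observed 10 times with 2 mispredictions ends up
+  /// stored as
+  ///   BranchLBRs[Trace(0x400100, 0x400150)] == BranchInfo{10, 2}
+  /// while fall-through traces accumulate in FallthroughLBRs the same way.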
+ std::unordered_map BranchLBRs; + std::unordered_map FallthroughLBRs; + std::vector AggregatedLBRs; + std::unordered_map BasicSamples; + std::vector MemSamples; + + template void clear(T& Container) { + T TempContainer; + TempContainer.swap(Container); + } + + /// Perf utility full path name + std::string PerfPath; + + /// Perf process spawning bookkeeping + struct PerfProcessInfo { + bool IsFinished{false}; + sys::ProcessInfo PI; + SmallVector StdoutPath; + SmallVector StderrPath; + }; + + /// Process info for spawned processes + PerfProcessInfo MainEventsPPI; + PerfProcessInfo MemEventsPPI; + PerfProcessInfo MMapEventsPPI; + PerfProcessInfo TaskEventsPPI; + + /// Kernel VM starts at fixed based address + /// https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt + static constexpr uint64_t KernelBaseAddr = 0xffff800000000000; + + /// Current list of created temporary files + std::vector TempFiles; + + /// Name of the binary with matching build-id from perf.data if different + /// from the file name in BC. + std::string BuildIDBinaryName; + + /// Memory map info for a single file + struct MMapInfo { + pid_t PID{-1LL}; + uint64_t BaseAddress; + uint64_t Size; + uint64_t Offset; + bool Forked{false}; + uint64_t Time{0ULL}; // time in micro seconds + }; + + /// Per-PID map info for the binary + std::unordered_map BinaryMMapInfo; + + /// Fork event info + struct ForkInfo { + pid_t ParentPID; + pid_t ChildPID; + uint64_t Time{0ULL}; + }; + + /// References to core BOLT data structures + BinaryContext *BC{nullptr}; + + BoltAddressTranslation *BAT{nullptr}; + + /// Update function execution profile with a recorded trace. + /// A trace is region of code executed between two LBR entries supplied in + /// execution order. + /// + /// Return true if the trace is valid, false otherwise. + bool recordTrace(BinaryFunction &BF, + const LBREntry &First, + const LBREntry &Second, + uint64_t Count = 1, + SmallVector, 16> *Branches = nullptr) const; + + /// Return a vector of offsets corresponding to a trace in a function + /// (see recordTrace() above). + Optional, 16>> + getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First, + const LBREntry &Second, uint64_t Count = 1) const; + + /// Record external entry into the function \p BF. + /// + /// Return true if the entry is valid, false otherwise. + bool recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, + uint64_t Count = 1) const; + + /// Record exit from the function \p BF via a call or return. + /// + /// Return true if the exit point is valid, false otherwise. + bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, + uint64_t Count = 1) const; + + /// Aggregation statistics + uint64_t NumInvalidTraces{0}; + uint64_t NumLongRangeTraces{0}; + uint64_t NumColdSamples{0}; + + /// Looks into system PATH for Linux Perf and set up the aggregator to use it + void findPerfExecutable(); + + /// Launch a perf subprocess with given args and save output for later + /// parsing. + void launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, + const char *ArgsString, bool Wait); + + /// Delete all temporary files created to hold the output generated by spawned + /// subprocesses during the aggregation job + void deleteTempFiles(); + + // Semantic pass helpers + + /// Look up which function contains an address by using out map of + /// disassembled BinaryFunctions + BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const; + + /// Retrieve the location name to be used for samples recorded in \p Func. 
+  /// If doing BAT translation, link cold parts to the hot part names (used by
+  /// the original binary). \p Count specifies how many samples were recorded
+  /// at that location, so we can tally total activity in cold areas if we are
+  /// dealing with profiling data collected in a bolted binary. For LBRs,
+  /// \p Count should only be used for the source of the branch to avoid
+  /// counting cold activity twice (one for source and another for destination).
+  StringRef getLocationName(BinaryFunction &Func, uint64_t Count);
+
+  /// Semantic actions - parser hooks to interpret parsed perf samples
+  /// Register a sample (non-LBR mode), i.e. a new hit at \p Address
+  bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count);
+
+  /// Register an intraprocedural branch \p Branch.
+  bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To,
+                     uint64_t Count, uint64_t Mispreds);
+
+  /// Register an interprocedural branch from \p FromFunc to \p ToFunc with
+  /// offsets \p From and \p To, respectively.
+  bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc,
+                     uint64_t From, uint64_t To, uint64_t Count,
+                     uint64_t Mispreds);
+
+  /// Register a \p Branch.
+  bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
+
+  /// Register a trace between two LBR entries supplied in execution order.
+  bool doTrace(const LBREntry &First, const LBREntry &Second,
+               uint64_t Count = 1);
+
+  /// Parser helpers
+  /// Return false if we exhausted our parser buffer and finished parsing
+  /// everything
+  bool hasData();
+
+  /// Print heat map based on LBR samples.
+  std::error_code printLBRHeatMap();
+
+  /// Parse a single perf sample containing a PID associated with a sequence of
+  /// LBR entries. If the PID does not correspond to the binary we are looking
+  /// for, return std::errc::no_such_process. If other parsing errors occur,
+  /// return the error. Otherwise, return the parsed sample.
+  ErrorOr<PerfBranchSample> parseBranchSample();
+
+  /// Parse a single perf sample containing a PID associated with an event name
+  /// and a PC
+  ErrorOr<PerfBasicSample> parseBasicSample();
+
+  /// Parse a single perf sample containing a PID associated with an IP and
+  /// address.
+  ErrorOr<PerfMemSample> parseMemSample();
+
+  /// Parse pre-aggregated LBR samples created by an external tool
+  ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry();
+
+  /// Parse either buildid:offset or just offset, representing a location in the
+  /// binary. Used exclusively for pre-aggregated LBR samples.
+  ErrorOr<Location> parseLocationOrOffset();
+
+  /// Check if a field separator is the next char to parse and, if yes, consume
+  /// it and return true
+  bool checkAndConsumeFS();
+
+  /// Consume the entire line
+  void consumeRestOfLine();
+
+  /// Parse a single LBR entry as output by perf script -Fbrstack
+  ErrorOr<LBREntry> parseLBREntry();
+
+  /// Parse and pre-aggregate branch events.
+  std::error_code parseBranchEvents();
+
+  /// Process all branch events.
+  void processBranchEvents();
+
+  /// This member function supports generating data for AutoFDO LLVM tools.
+  std::error_code writeAutoFDOData(StringRef OutputFilename);
+
+  /// Parse the full output generated by perf script to report non-LBR samples.
+  std::error_code parseBasicEvents();
+
+  /// Process non-LBR events.
+  void processBasicEvents();
+
+  /// Parse the full output generated by perf script to report memory events.
+  std::error_code parseMemEvents();
+
+  /// Process parsed memory events profile.
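+  /// For example (illustrative values): a load sampled at PC foo+0xd0 that
+  /// touched address bar+0x8 is aggregated as the Location pair
+  /// (foo, 0xd0) -> (bar, 0x8) under NamesToMemEvents["foo"].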
+  void processMemEvents();
+
+  /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping
+  /// between the binary name and its memory layout in a process with a given
+  /// PID.
+  /// On success return a pair of <file name, mmap info>.
+  ErrorOr<std::pair<StringRef, MMapInfo>> parseMMapEvent();
+
+  /// Parse PERF_RECORD_FORK event.
+  Optional<ForkInfo> parseForkEvent();
+
+  /// Parse 'PERF_RECORD_COMM exec'. Don't consume the string.
+  Optional<pid_t> parseCommExecEvent();
+
+  /// Parse the full output generated by `perf script --show-mmap-events`
+  /// to generate mapping between binary files and their memory mappings for
+  /// all PIDs.
+  std::error_code parseMMapEvents();
+
+  /// Parse output of `perf script --show-task-events`, and add forked
+  /// processes to the set of tracked PIDs.
+  std::error_code parseTaskEvents();
+
+  /// Parse a single pair of binary full path and associated build-id
+  Optional<std::pair<StringRef, StringRef>> parseNameBuildIDPair();
+
+  /// Parse the output generated by "perf buildid-list" to extract build-ids
+  /// and return a file name matching a given \p FileBuildID.
+  Optional<StringRef> getFileNameForBuildID(StringRef FileBuildID);
+
+  /// Coordinate reading and parsing of pre-aggregated file
+  ///
+  /// The regular perf2bolt aggregation job is to read perf output directly.
+  /// However, if the data is coming from a database instead of perf, one could
+  /// write a query to produce a pre-aggregated file. This function deals with
+  /// this case.
+  ///
+  /// The pre-aggregated file contains aggregated LBR data, but without binary
+  /// knowledge. BOLT will parse it and, using information from the disassembled
+  /// binary, augment it with fall-through edge frequency information. After
+  /// this step is finished, this data can be either written to disk to be
+  /// consumed by BOLT later, or can be used by BOLT immediately if kept in
+  /// memory.
+  ///
+  /// File format syntax:
+  /// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
+  ///       [<mispred_count>]
+  ///
+  /// B - indicates an aggregated branch
+  /// F - an aggregated fall-through
+  /// f - an aggregated fall-through with external origin - used to disambiguate
+  ///       between a return hitting a basic block head and a regular internal
+  ///       jump to the block
+  ///
+  /// <start_id> - build id of the object containing the start address. We can
+  /// skip it for the main binary and use "X" for an unknown object. This will
+  /// save some space and facilitate human parsing.
+  ///
+  /// <start_offset> - hex offset from the object base load address (0 for the
+  /// main executable unless it's PIE) to the start address.
+  ///
+  /// <end_id>, <end_offset> - same for the end address.
+  ///
+  /// <count> - total aggregated count of the branch or a fall-through.
+  ///
+  /// <mispred_count> - the number of times the branch was mispredicted.
+  /// Omitted for fall-throughs.
+  ///
+  /// Example:
+  /// F 41be50 41be50 3
+  /// F 41be90 41be90 4
+  /// B 4b1942 39b57f0 3 0
+  /// B 4b196f 4b19e0 2 0
+  void parsePreAggregated();
+
+  /// Parse the full output of pre-aggregated LBR samples generated by
+  /// an external tool.
+  std::error_code parsePreAggregatedLBRSamples();
+
+  /// Process parsed pre-aggregated data.
+  void processPreAggregated();
+
+  /// If \p Address falls into the binary address space based on memory
+  /// mapping info \p MMI, then adjust it for further processing by subtracting
+  /// the base load address. External addresses, i.e. addresses that do not
+  /// correspond to the binary allocated address space, are adjusted to avoid
+  /// conflicts.
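+  ///
+  /// For example (illustrative numbers): with BaseAddress = 0x200000,
+  /// Size = 0x100000, and Offset = 0, an address 0x200400 becomes 0x400,
+  /// while a stray address 0x400 (smaller than Size but outside the mapping)
+  /// is rewritten to -1ULL so it cannot be mistaken for a binary offset.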
+ void adjustAddress(uint64_t &Address, const MMapInfo &MMI) const { + if (Address >= MMI.BaseAddress && Address < MMI.BaseAddress + MMI.Size) { + // NOTE: Assumptions about the binary segment load table (PH for ELF) + // Segment file offset equals virtual address (which is true for .so) + // There aren't multiple executable segments loaded because MMapInfo + // doesn't support them. + Address -= MMI.BaseAddress - MMI.Offset; + } else if (Address < MMI.Size) { + // Make sure the address is not treated as belonging to the binary. + Address = (-1ULL); + } + } + + /// Adjust addresses in \p LBR entry. + void adjustLBR(LBREntry &LBR, const MMapInfo &MMI) const { + adjustAddress(LBR.From, MMI); + adjustAddress(LBR.To, MMI); + } + + /// Ignore kernel/user transition LBR if requested + bool ignoreKernelInterrupt(LBREntry &LBR) const; + + /// Populate functions in \p BC with profile. + void processProfile(BinaryContext &BC); + + /// Start an aggregation job asynchronously. + void start(); + + /// Returns true if this aggregation job is using a translation table to + /// remap samples collected on binaries already processed by BOLT. + bool usesBAT() const { return BAT; } + + /// Force all subprocesses to stop and cancel aggregation + void abort(); + + /// Dump data structures into a file readable by llvm-bolt + std::error_code writeAggregatedFile(StringRef OutputFilename) const; + + /// Filter out binaries based on PID + void filterBinaryMMapInfo(); + + /// If we have a build-id available for the input file, use it to assist + /// matching profile to a binary. + /// + /// If the binary name changed after profile collection, use build-id + /// to get the proper name in perf data when build-ids are available. + /// If \p FileBuildID has no match, then issue an error and exit. + void processFileBuildID(StringRef FileBuildID); + + /// Debugging dump methods + void dump() const; + void dump(const LBREntry &LBR) const; + void dump(const PerfBranchSample &Sample) const; + void dump(const PerfMemSample &Sample) const; +}; +} +} + +#endif diff --git a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp new file mode 100644 index 000000000000..42b995b19586 --- /dev/null +++ b/bolt/src/DataReader.cpp @@ -0,0 +1,1448 @@ +//===-- DataReader.cpp - Perf data reader -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by the perf2bolt +// utility and stores it in memory for llvm-bolt consumption. 
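+//
+// For reference, a branch record in that format looks like (illustrative
+// values):
+//   1 main a 1 foo 0 0 225
+// i.e. <from-is-symbol> <from-name> <from-offset> <to-is-symbol> <to-name>
+// <to-offset> <mispredictions> <count>, mirroring what BranchInfo::print()
+// below emits.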
+// +//===----------------------------------------------------------------------===// + + +#include "BinaryFunction.h" +#include "DataReader.h" +#include "Passes/MCF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-prof" + +using namespace llvm; + +namespace opts { + +extern cl::OptionCategory BoltCategory; +extern llvm::cl::opt Verbosity; + +static cl::opt +DumpData("dump-data", + cl::desc("dump parsed bolt data for debugging"), + cl::Hidden, + cl::cat(BoltCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +Optional getLTOCommonName(const StringRef Name) { + size_t LTOSuffixPos = Name.find(".lto_priv."); + if (LTOSuffixPos != StringRef::npos) { + return Name.substr(0, LTOSuffixPos + 10); + } else if ((LTOSuffixPos = Name.find(".constprop.")) != StringRef::npos) { + return Name.substr(0, LTOSuffixPos + 11); + } else { + return NoneType(); + } +} + +namespace { + +/// Return true if the function name can change across compilations. +bool hasVolatileName(const BinaryFunction &BF) { + for (const StringRef Name : BF.getNames()) { + if (getLTOCommonName(Name)) + return true; + } + return false; +} + +/// Return standard name of the function possibly renamed by BOLT. +StringRef normalizeName(StringRef Name) { + // Strip "PG." prefix used for globalized locals. + return Name.startswith("PG.") ? Name.substr(2) : Name; +} + +} // anonymous namespace + +raw_ostream &operator<<(raw_ostream &OS, const Location &Loc) { + if (Loc.IsSymbol) { + OS << Loc.Name; + if (Loc.Offset) + OS << "+" << Twine::utohexstr(Loc.Offset); + } else { + OS << Twine::utohexstr(Loc.Offset); + } + return OS; +} + +iterator_range +FuncBranchData::getBranchRange(uint64_t From) const { + assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const BranchInfo &BI, const uint64_t Val) const { + return BI.From.Offset < Val; + } + bool operator()(const uint64_t Val, const BranchInfo &BI) const { + return Val < BI.From.Offset; + } + }; + auto Range = std::equal_range(Data.begin(), Data.end(), From, Compare()); + return iterator_range(Range.first, Range.second); +} + +void FuncBranchData::appendFrom(const FuncBranchData &FBD, uint64_t Offset) { + Data.insert(Data.end(), FBD.Data.begin(), FBD.Data.end()); + for (auto I = Data.begin(), E = Data.end(); I != E; ++I) { + if (I->From.Name == FBD.Name) { + I->From.Name = this->Name; + I->From.Offset += Offset; + } + if (I->To.Name == FBD.Name) { + I->To.Name = this->Name; + I->To.Offset += Offset; + } + } + std::stable_sort(Data.begin(), Data.end()); + ExecutionCount += FBD.ExecutionCount; + for (auto I = FBD.EntryData.begin(), E = FBD.EntryData.end(); I != E; ++I) { + assert(I->To.Name == FBD.Name); + auto NewElmt = EntryData.insert(EntryData.end(), *I); + NewElmt->To.Name = this->Name; + NewElmt->To.Offset += Offset; + } +} + +uint64_t FuncBranchData::getNumExecutedBranches() const { + uint64_t ExecutedBranches = 0; + for (const BranchInfo &BI : Data) { + int64_t BranchCount = BI.Branches; + assert(BranchCount >= 0 && "branch execution count should not be negative"); + ExecutedBranches += BranchCount; + } + return ExecutedBranches; +} + +void SampleInfo::mergeWith(const SampleInfo &SI) { + Hits += SI.Hits; +} + +void SampleInfo::print(raw_ostream &OS) const { + OS << Loc.IsSymbol << " " << Loc.Name << " " << Twine::utohexstr(Loc.Offset) + << " " << Hits << "\n"; +} + +uint64_t +FuncSampleData::getSamples(uint64_t Start, uint64_t End) const { + 
assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const SampleInfo &SI, const uint64_t Val) const { + return SI.Loc.Offset < Val; + } + bool operator()(const uint64_t Val, const SampleInfo &SI) const { + return Val < SI.Loc.Offset; + } + }; + uint64_t Result = 0; + for (auto I = std::lower_bound(Data.begin(), Data.end(), Start, Compare()), + E = std::lower_bound(Data.begin(), Data.end(), End, Compare()); + I != E; ++I) { + Result += I->Hits; + } + return Result; +} + +void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) { + auto Iter = Index.find(Offset); + if (Iter == Index.end()) { + Data.emplace_back(Location(true, Name, Offset), Count); + Index[Offset] = Data.size() - 1; + return; + } + SampleInfo &SI = Data[Iter->second]; + SI.Hits += Count; +} + +void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, + uint64_t Count, uint64_t Mispreds) { + auto Iter = IntraIndex[OffsetFrom].find(OffsetTo); + if (Iter == IntraIndex[OffsetFrom].end()) { + Data.emplace_back(Location(true, Name, OffsetFrom), + Location(true, Name, OffsetTo), Mispreds, Count); + IntraIndex[OffsetFrom][OffsetTo] = Data.size() - 1; + return; + } + BranchInfo &BI = Data[Iter->second]; + BI.Branches += Count; + BI.Mispreds += Mispreds; +} + +void FuncBranchData::bumpCallCount(uint64_t OffsetFrom, const Location &To, + uint64_t Count, uint64_t Mispreds) { + auto Iter = InterIndex[OffsetFrom].find(To); + if (Iter == InterIndex[OffsetFrom].end()) { + Data.emplace_back(Location(true, Name, OffsetFrom), To, Mispreds, Count); + InterIndex[OffsetFrom][To] = Data.size() - 1; + return; + } + BranchInfo &BI = Data[Iter->second]; + BI.Branches += Count; + BI.Mispreds += Mispreds; +} + +void FuncBranchData::bumpEntryCount(const Location &From, uint64_t OffsetTo, + uint64_t Count, uint64_t Mispreds) { + auto Iter = EntryIndex[OffsetTo].find(From); + if (Iter == EntryIndex[OffsetTo].end()) { + EntryData.emplace_back(From, Location(true, Name, OffsetTo), Mispreds, + Count); + EntryIndex[OffsetTo][From] = EntryData.size() - 1; + return; + } + BranchInfo &BI = EntryData[Iter->second]; + BI.Branches += Count; + BI.Mispreds += Mispreds; +} + +void BranchInfo::mergeWith(const BranchInfo &BI) { + Branches += BI.Branches; + Mispreds += BI.Mispreds; +} + +void BranchInfo::print(raw_ostream &OS) const { + OS << From.IsSymbol << " " << From.Name << " " + << Twine::utohexstr(From.Offset) << " " + << To.IsSymbol << " " << To.Name << " " + << Twine::utohexstr(To.Offset) << " " + << Mispreds << " " << Branches << '\n'; +} + +ErrorOr FuncBranchData::getBranch(uint64_t From, + uint64_t To) const { + for (const BranchInfo &I : Data) { + if (I.From.Offset == From && I.To.Offset == To && + I.From.Name == I.To.Name) + return I; + } + return make_error_code(llvm::errc::invalid_argument); +} + +ErrorOr +FuncBranchData::getDirectCallBranch(uint64_t From) const { + // Commented out because it can be expensive. 
+ // assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const BranchInfo &BI, const uint64_t Val) const { + return BI.From.Offset < Val; + } + bool operator()(const uint64_t Val, const BranchInfo &BI) const { + return Val < BI.From.Offset; + } + }; + auto Range = std::equal_range(Data.begin(), Data.end(), From, Compare()); + for (auto I = Range.first; I != Range.second; ++I) { + if (I->From.Name != I->To.Name) + return *I; + } + return make_error_code(llvm::errc::invalid_argument); +} + +void MemInfo::print(raw_ostream &OS) const { + OS << (Offset.IsSymbol + 3) << " " << Offset.Name << " " + << Twine::utohexstr(Offset.Offset) << " " + << (Addr.IsSymbol + 3) << " " << Addr.Name << " " + << Twine::utohexstr(Addr.Offset) << " " + << Count << "\n"; +} + +void MemInfo::prettyPrint(raw_ostream &OS) const { + OS << "(PC: " << Offset << ", M: " << Addr << ", C: " << Count << ")"; +} + +void FuncMemData::update(const Location &Offset, const Location &Addr) { + auto Iter = EventIndex[Offset.Offset].find(Addr); + if (Iter == EventIndex[Offset.Offset].end()) { + Data.emplace_back(MemInfo(Offset, Addr, 1)); + EventIndex[Offset.Offset][Addr] = Data.size() - 1; + return; + } + ++Data[Iter->second].Count; +} + +Error DataReader::preprocessProfile(BinaryContext &BC) { + if (std::error_code EC = parseInput()) { + return errorCodeToError(EC); + } + + if (opts::DumpData) { + dump(); + } + + if (collectedInBoltedBinary()) { + outs() << "BOLT-INFO: profile collection done on a binary already " + "processed by BOLT\n"; + } + + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + if (FuncMemData *MemData = getMemDataForNames(Function.getNames())) { + setMemData(Function, MemData); + MemData->Used = true; + } + if (FuncBranchData *FuncData = getBranchDataForNames(Function.getNames())) { + setBranchData(Function, FuncData); + Function.ExecutionCount = FuncData->ExecutionCount; + FuncData->Used = true; + } + } + + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + matchProfileMemData(Function); + } + + return Error::success(); +} + +Error DataReader::readProfilePreCFG(BinaryContext &BC) { + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + FuncMemData *MemoryData = getMemData(Function); + if (!MemoryData) + continue; + + for (MemInfo &MI : MemoryData->Data) { + const uint64_t Offset = MI.Offset.Offset; + auto II = Function.Instructions.find(Offset); + if (II == Function.Instructions.end()) { + // Ignore bad instruction address. + continue; + } + + auto &MemAccessProfile = + BC.MIB->getOrCreateAnnotationAs( + II->second, "MemoryAccessProfile"); + BinaryData *BD = nullptr; + if (MI.Addr.IsSymbol) { + BD = BC.getBinaryDataByName(MI.Addr.Name); + } + MemAccessProfile.AddressAccessInfo. 
+ push_back({BD, MI.Addr.Offset, MI.Count}); + auto NextII = std::next(II); + if (NextII == Function.Instructions.end()) { + MemAccessProfile.NextInstrOffset = Function.getSize(); + } else { + MemAccessProfile.NextInstrOffset = II->first; + } + } + Function.HasMemoryProfile = true; + } + + return Error::success(); +} + +Error DataReader::readProfile(BinaryContext &BC) { + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + readProfile(Function); + } + + uint64_t NumUnused = 0; + for (const StringMapEntry &FuncData : NamesToBranches) + if (!FuncData.getValue().Used) + ++NumUnused; + BC.setNumUnusedProfiledObjects(NumUnused); + + return Error::success(); +} + +std::error_code DataReader::parseInput() { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = MB.getError()) { + Diag << "cannot open " << Filename << ": " << EC.message() << "\n"; + return EC; + } + FileBuf = std::move(MB.get()); + ParsingBuf = FileBuf->getBuffer(); + if (std::error_code EC = parse()) { + return EC; + } + if (!ParsingBuf.empty()) { + Diag << "WARNING: invalid profile data detected at line " << Line + << ". Possibly corrupted profile.\n"; + } + + buildLTONameMaps(); + + return std::error_code(); +} + +void DataReader::readProfile(BinaryFunction &BF) { + if (BF.empty()) + return; + + if (!hasLBR()) { + BF.ProfileFlags = BinaryFunction::PF_SAMPLE; + readSampleData(BF); + return; + } + + BF.ProfileFlags = BinaryFunction::PF_LBR; + + // Possibly assign/re-assign branch profile data. + matchProfileData(BF); + + FuncBranchData *FBD = getBranchData(BF); + if (!FBD) + return; + + // Assign basic block counts to function entry points. These only include + // counts for outside entries. + // + // There is a slight skew introduced here as branches originated from RETs + // may be accounted for in the execution count of an entry block if the last + // instruction in a predecessor fall-through block is a call. This situation + // should rarely happen because there are few multiple-entry functions. + for (const BranchInfo &BI : FBD->EntryData) { + BinaryBasicBlock *BB = BF.getBasicBlockAtOffset(BI.To.Offset); + if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { + uint64_t Count = BB->getExecutionCount(); + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE) + Count = 0; + BB->setExecutionCount(Count + BI.Branches); + } + } + + uint64_t MismatchedBranches = 0; + for (const BranchInfo &BI : FBD->Data) { + if (BI.From.Name != BI.To.Name) { + continue; + } + + if (!recordBranch(BF, BI.From.Offset, BI.To.Offset, + BI.Branches, BI.Mispreds)) { + LLVM_DEBUG(dbgs() << "bad branch : " << BI.From.Offset << " -> " + << BI.To.Offset << '\n'); + ++MismatchedBranches; + } + } + + // Convert branch data into annotations. 
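+  // (This attaches per-instruction profile data, such as call counts, so
+  // later rewriting passes can query it without going back to the reader.)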
+ convertBranchData(BF); +} + +void DataReader::matchProfileData(BinaryFunction &BF) { + // This functionality is available for LBR-mode only + // TODO: Implement evaluateProfileData() for samples, checking whether + // sample addresses match instruction addresses in the function + if (!hasLBR()) + return; + + FuncBranchData *FBD = getBranchData(BF); + if (FBD) { + BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); + BF.RawBranchCount = FBD->getNumExecutedBranches(); + if (BF.ProfileMatchRatio == 1.0f) { + if (fetchProfileForOtherEntryPoints(BF)) { + BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); + BF.ExecutionCount = FBD->ExecutionCount; + BF.RawBranchCount = FBD->getNumExecutedBranches(); + } + return; + } + } + + // Check if the function name can fluctuate between several compilations + // possibly triggered by minor unrelated code changes in the source code + // of the input binary. + if (!hasVolatileName(BF)) + return; + + // Check for a profile that matches with 100% confidence. + const std::vector AllBranchData = + getBranchDataForNamesRegex(BF.getNames()); + for (FuncBranchData *NewBranchData : AllBranchData) { + // Prevent functions from sharing the same profile. + if (NewBranchData->Used) + continue; + + if (evaluateProfileData(BF, *NewBranchData) != 1.0f) + continue; + + if (FBD) + FBD->Used = false; + + // Update function profile data with the new set. + setBranchData(BF, NewBranchData); + NewBranchData->Used = true; + BF.ExecutionCount = NewBranchData->ExecutionCount; + BF.ProfileMatchRatio = 1.0f; + break; + } +} + +void DataReader::matchProfileMemData(BinaryFunction &BF) { + const std::vector AllMemData = + getMemDataForNamesRegex(BF.getNames()); + for (FuncMemData *NewMemData : AllMemData) { + // Prevent functions from sharing the same profile. + if (NewMemData->Used) + continue; + + if (FuncMemData *MD = getMemData(BF)) + MD->Used = false; + + // Update function profile data with the new set. + setMemData(BF, NewMemData); + NewMemData->Used = true; + break; + } +} + +bool DataReader::fetchProfileForOtherEntryPoints(BinaryFunction &BF) { + BinaryContext &BC = BF.getBinaryContext(); + + FuncBranchData *FBD = getBranchData(BF); + if (!FBD) + return false; + + // Check if we are missing profiling data for secondary entry points + bool First = true; + bool Updated = false; + for (BinaryBasicBlock *BB : BF.BasicBlocks) { + if (First) { + First = false; + continue; + } + if (BB->isEntryPoint()) { + uint64_t EntryAddress = BB->getOffset() + BF.getAddress(); + // Look for branch data associated with this entry point + if (BinaryData *BD = BC.getBinaryDataAtAddress(EntryAddress)) { + if (FuncBranchData *Data = getBranchDataForSymbols(BD->getSymbols())) { + FBD->appendFrom(*Data, BB->getOffset()); + Data->Used = true; + Updated = true; + } + } + } + } + + return Updated; +} + +float DataReader::evaluateProfileData(BinaryFunction &BF, + const FuncBranchData &BranchData) const { + BinaryContext &BC = BF.getBinaryContext(); + + // Until we define a minimal profile, we consider an empty branch data to be + // a valid profile. It could happen to a function without branches when we + // still have an EntryData for the execution count. + if (BranchData.Data.empty()) { + return 1.0f; + } + + uint64_t NumMatchedBranches = 0; + for (const BranchInfo &BI : BranchData.Data) { + bool IsValid = false; + if (BI.From.Name == BI.To.Name) { + // Try to record information with 0 count. 
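+      // Passing a zero count makes recordBranch() only verify that the edge
+      // exists in the reconstructed CFG, without bumping any profile
+      // counters.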
+ IsValid = recordBranch(BF, BI.From.Offset, BI.To.Offset, 0); + } else if (collectedInBoltedBinary()) { + // We can't check branch source for collections in bolted binaries because + // the source of the branch may be mapped to the first instruction in a BB + // instead of the original branch (which may not exist in the source bin). + IsValid = true; + } else { + // The branch has to originate from this function. + // Check for calls, tail calls, rets and indirect branches. + // When matching profiling info, we did not reach the stage + // when we identify tail calls, so they are still represented + // by regular branch instructions and we need isBranch() here. + MCInst *Instr = BF.getInstructionAtOffset(BI.From.Offset); + // If it's a prefix - skip it. + if (Instr && BC.MIB->isPrefix(*Instr)) + Instr = BF.getInstructionAtOffset(BI.From.Offset + 1); + if (Instr && + (BC.MIB->isCall(*Instr) || + BC.MIB->isBranch(*Instr) || + BC.MIB->isReturn(*Instr))) { + IsValid = true; + } + } + + if (IsValid) { + ++NumMatchedBranches; + continue; + } + + LLVM_DEBUG(dbgs() << "\tinvalid branch in " << BF << " : 0x" + << Twine::utohexstr(BI.From.Offset) << " -> "; + if (BI.From.Name == BI.To.Name) dbgs() + << "0x" << Twine::utohexstr(BI.To.Offset) << '\n'; + else dbgs() << "\n";); + } + + const float MatchRatio = (float)NumMatchedBranches / BranchData.Data.size(); + if (opts::Verbosity >= 2 && NumMatchedBranches < BranchData.Data.size()) { + errs() << "BOLT-WARNING: profile branches match only " + << format("%.1f%%", MatchRatio * 100.0f) << " (" + << NumMatchedBranches << '/' << BranchData.Data.size() + << ") for function " << BF << '\n'; + } + + return MatchRatio; +} + +void DataReader::readSampleData(BinaryFunction &BF) { + FuncSampleData *SampleDataOrErr = getFuncSampleData(BF.getNames()); + if (!SampleDataOrErr) + return; + + // Basic samples mode territory (without LBR info) + // First step is to assign BB execution count based on samples from perf + BF.ProfileMatchRatio = 1.0f; + BF.removeTagsFromProfile(); + bool NormalizeByInsnCount = usesEvent("cycles") || usesEvent("instructions"); + bool NormalizeByCalls = usesEvent("branches"); + static bool NagUser = true; + if (NagUser) { + outs() + << "BOLT-INFO: operating with basic samples profiling data (no LBR).\n"; + if (NormalizeByInsnCount) { + outs() << "BOLT-INFO: normalizing samples by instruction count.\n"; + } else if (NormalizeByCalls) { + outs() << "BOLT-INFO: normalizing samples by branches.\n"; + } + NagUser = false; + } + uint64_t LastOffset = BF.getSize(); + uint64_t TotalEntryCount = 0; + for (auto I = BF.BasicBlockOffsets.rbegin(), E = BF.BasicBlockOffsets.rend(); + I != E; ++I) { + uint64_t CurOffset = I->first; + // Always work with samples multiplied by 1000 to avoid losing them if we + // later need to normalize numbers + uint64_t NumSamples = + SampleDataOrErr->getSamples(CurOffset, LastOffset) * 1000; + if (NormalizeByInsnCount && I->second->getNumNonPseudos()) + NumSamples /= I->second->getNumNonPseudos(); + else if (NormalizeByCalls) { + uint32_t NumCalls = I->second->getNumCalls(); + NumSamples /= NumCalls + 1; + } + I->second->setExecutionCount(NumSamples); + if (I->second->isEntryPoint()) + TotalEntryCount += NumSamples; + LastOffset = CurOffset; + } + + BF.ExecutionCount = TotalEntryCount; + + estimateEdgeCounts(BF); +} + +void DataReader::convertBranchData(BinaryFunction &BF) const { + BinaryContext &BC = BF.getBinaryContext(); + + if (BF.empty()) + return; + + FuncBranchData *FBD = getBranchData(BF); + if (!FBD) + return; + 
+ // Profile information for calls. + // + // There are 3 cases that we annotate differently: + // 1) Conditional tail calls that could be mispredicted. + // 2) Indirect calls to multiple destinations with mispredictions. + // Before we validate CFG we have to handle indirect branches here too. + // 3) Regular direct calls. The count could be different from containing + // basic block count. Keep this data in case we find it useful. + // + for (BranchInfo &BI : FBD->Data) { + // Ignore internal branches. + if (BI.To.IsSymbol && BI.To.Name == BI.From.Name && BI.To.Offset != 0) + continue; + + MCInst *Instr = BF.getInstructionAtOffset(BI.From.Offset); + if (!Instr || + (!BC.MIB->isCall(*Instr) && !BC.MIB->isIndirectBranch(*Instr))) + continue; + + auto setOrUpdateAnnotation = [&](StringRef Name, uint64_t Count) { + if (opts::Verbosity >= 1 && BC.MIB->hasAnnotation(*Instr, Name)) { + errs() << "BOLT-WARNING: duplicate " << Name << " info for offset 0x" + << Twine::utohexstr(BI.From.Offset) + << " in function " << BF << '\n'; + } + auto &Value = BC.MIB->getOrCreateAnnotationAs(*Instr, Name); + Value += Count; + }; + + if (BC.MIB->isIndirectCall(*Instr) || BC.MIB->isIndirectBranch(*Instr)) { + IndirectCallSiteProfile &CSP = + BC.MIB->getOrCreateAnnotationAs( + *Instr, "CallProfile"); + MCSymbol *CalleeSymbol = nullptr; + if (BI.To.IsSymbol) { + if (BinaryData *BD = BC.getBinaryDataByName(BI.To.Name)) { + CalleeSymbol = BD->getSymbol(); + } + } + CSP.emplace_back(CalleeSymbol, BI.Branches, BI.Mispreds); + } else if (BC.MIB->getConditionalTailCall(*Instr)) { + setOrUpdateAnnotation("CTCTakenCount", BI.Branches); + setOrUpdateAnnotation("CTCMispredCount", BI.Mispreds); + } else { + setOrUpdateAnnotation("Count", BI.Branches); + } + } +} + +bool DataReader::recordBranch(BinaryFunction &BF, + uint64_t From, uint64_t To, + uint64_t Count, uint64_t Mispreds) const { + BinaryContext &BC = BF.getBinaryContext(); + + BinaryBasicBlock *FromBB = BF.getBasicBlockContainingOffset(From); + BinaryBasicBlock *ToBB = BF.getBasicBlockContainingOffset(To); + + if (!FromBB || !ToBB) { + LLVM_DEBUG(dbgs() << "failed to get block for recorded branch\n"); + return false; + } + + // Could be bad LBR data; ignore the branch. In the case of data collected + // in binaries optimized by BOLT, a source BB may be mapped to two output + // BBs as a result of optimizations. In that case, a branch between these + // two will be recorded as a branch from A going to A in the source address + // space. Keep processing. + if (From == To) { + return true; + } + + if (FromBB->succ_size() == 0) { + // Return from a tail call. + return true; + } + + // Very rarely we will see ignored branches. Do a linear check. + for (std::pair &Branch : BF.IgnoredBranches) { + if (Branch == std::make_pair(static_cast(From), + static_cast(To))) + return true; + } + + if (To != ToBB->getOffset()) { + // "To" could be referring to nop instructions in between 2 basic blocks. + // While building the CFG we make sure these nops are attributed to the + // previous basic block, thus we check if the destination belongs to the + // gap past the last instruction. + const MCInst *LastInstr = ToBB->getLastNonPseudoInstr(); + if (LastInstr) { + const uint32_t LastInstrOffset = + BC.MIB->getAnnotationWithDefault(*LastInstr, "Offset"); + + // With old .fdata we are getting FT branches for "jcc,jmp" sequences. 
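+      // For instance (hypothetical layout), a block ending in "jcc L1; jmp L2"
+      // could have a fall-through recorded as landing on the jmp itself; the
+      // check below accepts such records as valid.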
+      if (To == LastInstrOffset && BC.MIB->isUnconditionalBranch(*LastInstr)) {
+        return true;
+      }
+
+      if (To <= LastInstrOffset) {
+        LLVM_DEBUG(dbgs() << "branch recorded into the middle of the block"
+                          << " in " << BF << " : " << From << " -> " << To
+                          << '\n');
+        return false;
+      }
+    }
+
+    // The real destination is the layout successor of the detected ToBB.
+    if (ToBB == BF.BasicBlocksLayout.back())
+      return false;
+    BinaryBasicBlock *NextBB = BF.BasicBlocksLayout[ToBB->getIndex() + 1];
+    assert((NextBB && NextBB->getOffset() > ToBB->getOffset()) && "bad layout");
+    ToBB = NextBB;
+  }
+
+  // If there's no corresponding instruction for 'From', we have probably
+  // discarded it as a FT from __builtin_unreachable.
+  MCInst *FromInstruction = BF.getInstructionAtOffset(From);
+  if (!FromInstruction) {
+    // If the data was collected in a bolted binary, the From addresses may be
+    // translated to the first instruction of the source BB if BOLT inserted
+    // a new branch that did not exist in the source (we can't map it to the
+    // source instruction, so we map it to the first instr of the source BB).
+    // We do not keep offsets for random instructions. So the check above will
+    // evaluate to true if the first instr is not a branch (call/jmp/ret/etc).
+    if (collectedInBoltedBinary()) {
+      if (FromBB->getInputOffset() != From) {
+        LLVM_DEBUG(dbgs() << "offset " << From << " does not match a BB in "
+                          << BF << '\n');
+        return false;
+      }
+      FromInstruction = nullptr;
+    } else {
+      LLVM_DEBUG(dbgs() << "no instruction for offset " << From << " in " << BF
+                        << '\n');
+      return false;
+    }
+  }
+
+  if (!FromBB->getSuccessor(ToBB->getLabel())) {
+    // Check if this is a recursive call or a return from a recursive call.
+    if (FromInstruction && ToBB->isEntryPoint() &&
+        (BC.MIB->isCall(*FromInstruction) ||
+         BC.MIB->isIndirectBranch(*FromInstruction))) {
+      // Execution count is already accounted for.
+      return true;
+    }
+    // For data collected in a bolted binary, we may have created two output
+    // BBs that map to one original block. Branches between these two blocks
+    // will appear here as one BB jumping to itself, even though it has no
+    // loop edges. Ignore these.
+    if (collectedInBoltedBinary() && FromBB == ToBB)
+      return true;
+
+    LLVM_DEBUG(dbgs() << "invalid branch in " << BF << '\n'
+                      << Twine::utohexstr(From) << " -> "
+                      << Twine::utohexstr(To) << '\n');
+    return false;
+  }
+
+  BinaryBasicBlock::BinaryBranchInfo &BI = FromBB->getBranchInfo(*ToBB);
+  BI.Count += Count;
+  // Only update the mispredicted count if the branch count was real.
+  if (Count) {
+    BI.MispredictedCount += Mispreds;
+  }
+
+  return true;
+}
+
+void DataReader::reportError(StringRef ErrorMsg) {
+  Diag << "Error reading BOLT data input file: line " << Line << ", column "
+       << Col << ": " << ErrorMsg << '\n';
+}
+
+bool DataReader::expectAndConsumeFS() {
+  if (ParsingBuf[0] != FieldSeparator) {
+    reportError("expected field separator");
+    return false;
+  }
+  ParsingBuf = ParsingBuf.drop_front(1);
+  Col += 1;
+  return true;
+}
+
+void DataReader::consumeAllRemainingFS() {
+  while (ParsingBuf[0] == FieldSeparator) {
+    ParsingBuf = ParsingBuf.drop_front(1);
+    Col += 1;
+  }
+}
+
+bool DataReader::checkAndConsumeNewLine() {
+  if (ParsingBuf[0] != '\n')
+    return false;
+
+  ParsingBuf = ParsingBuf.drop_front(1);
+  Col = 0;
+  Line += 1;
+  return true;
+}
+
+ErrorOr<StringRef> DataReader::parseString(char EndChar, bool EndNl) {
+  std::string EndChars(1, EndChar);
+  if (EndNl)
+    EndChars.push_back('\n');
+  size_t StringEnd = ParsingBuf.find_first_of(EndChars);
+  if (StringEnd == StringRef::npos || StringEnd == 0) {
+    reportError("malformed field");
+    return make_error_code(llvm::errc::io_error);
+  }
+
+  StringRef Str = ParsingBuf.substr(0, StringEnd);
+
+  // If EndNl was set and nl was found instead of EndChar, do not consume the
+  // new line.
+  bool EndNlInsteadOfEndChar =
+      ParsingBuf[StringEnd] == '\n' && EndChar != '\n';
+  unsigned End = EndNlInsteadOfEndChar ? StringEnd : StringEnd + 1;
+
+  ParsingBuf = ParsingBuf.drop_front(End);
+  if (EndChar == '\n') {
+    Col = 0;
+    Line += 1;
+  } else {
+    Col += End;
+  }
+  return Str;
+}
+
+ErrorOr<int64_t> DataReader::parseNumberField(char EndChar, bool EndNl) {
+  ErrorOr<StringRef> NumStrRes = parseString(EndChar, EndNl);
+  if (std::error_code EC = NumStrRes.getError())
+    return EC;
+  StringRef NumStr = NumStrRes.get();
+  int64_t Num;
+  if (NumStr.getAsInteger(10, Num)) {
+    reportError("expected decimal number");
+    Diag << "Found: " << NumStr << "\n";
+    return make_error_code(llvm::errc::io_error);
+  }
+  return Num;
+}
+
+ErrorOr<uint64_t> DataReader::parseHexField(char EndChar, bool EndNl) {
+  ErrorOr<StringRef> NumStrRes = parseString(EndChar, EndNl);
+  if (std::error_code EC = NumStrRes.getError())
+    return EC;
+  StringRef NumStr = NumStrRes.get();
+  uint64_t Num;
+  if (NumStr.getAsInteger(16, Num)) {
+    reportError("expected hexadecimal number");
+    Diag << "Found: " << NumStr << "\n";
+    return make_error_code(llvm::errc::io_error);
+  }
+  return Num;
+}
+
+ErrorOr<Location> DataReader::parseLocation(char EndChar,
+                                            bool EndNl,
+                                            bool ExpectMemLoc) {
+  // Read whether the location of the branch should be DSO or a symbol.
+  // 0 means it is a DSO. 1 means it is a global symbol. 2 means it is a local
+  // symbol.
+  // The symbol flag is also used to tag memory load events by adding 3 to the
+  // base values, i.e. 3 not a symbol, 4 global symbol and 5 local symbol.
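+  // For example, in the branch record "1 main 3fb 0 /lib/ld-2.21.so 12 4 221"
+  // (see the parse() documentation in DataReader.h), the leading "1" marks a
+  // global-symbol source and the "0" a DSO destination; a memory record would
+  // use "4" and "3" respectively for the same two cases.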
+ if (!ExpectMemLoc && + ParsingBuf[0] != '0' && ParsingBuf[0] != '1' && ParsingBuf[0] != '2') { + reportError("expected 0, 1 or 2"); + return make_error_code(llvm::errc::io_error); + } + + if (ExpectMemLoc && + ParsingBuf[0] != '3' && ParsingBuf[0] != '4' && ParsingBuf[0] != '5') { + reportError("expected 3, 4 or 5"); + return make_error_code(llvm::errc::io_error); + } + + bool IsSymbol = + (!ExpectMemLoc && (ParsingBuf[0] == '1' || ParsingBuf[0] == '2')) || + (ExpectMemLoc && (ParsingBuf[0] == '4' || ParsingBuf[0] == '5')); + ParsingBuf = ParsingBuf.drop_front(1); + Col += 1; + + if (!expectAndConsumeFS()) + return make_error_code(llvm::errc::io_error); + consumeAllRemainingFS(); + + // Read the string containing the symbol or the DSO name + ErrorOr NameRes = parseString(FieldSeparator); + if (std::error_code EC = NameRes.getError()) + return EC; + StringRef Name = NameRes.get(); + consumeAllRemainingFS(); + + // Read the offset + ErrorOr Offset = parseHexField(EndChar, EndNl); + if (std::error_code EC = Offset.getError()) + return EC; + + return Location(IsSymbol, Name, Offset.get()); +} + +ErrorOr DataReader::parseBranchInfo() { + ErrorOr Res = parseLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location From = Res.get(); + + consumeAllRemainingFS(); + Res = parseLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location To = Res.get(); + + consumeAllRemainingFS(); + ErrorOr MRes = parseNumberField(FieldSeparator); + if (std::error_code EC = MRes.getError()) + return EC; + int64_t NumMispreds = MRes.get(); + + consumeAllRemainingFS(); + ErrorOr BRes = parseNumberField(FieldSeparator, /* EndNl = */ true); + if (std::error_code EC = BRes.getError()) + return EC; + int64_t NumBranches = BRes.get(); + + consumeAllRemainingFS(); + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return BranchInfo(std::move(From), std::move(To), NumMispreds, NumBranches); +} + +ErrorOr DataReader::parseMemInfo() { + ErrorOr Res = parseMemLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location Offset = Res.get(); + + consumeAllRemainingFS(); + Res = parseMemLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location Addr = Res.get(); + + consumeAllRemainingFS(); + ErrorOr CountRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = CountRes.getError()) + return EC; + + consumeAllRemainingFS(); + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return MemInfo(Offset, Addr, CountRes.get()); +} + +ErrorOr DataReader::parseSampleInfo() { + ErrorOr Res = parseLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location Address = Res.get(); + + consumeAllRemainingFS(); + ErrorOr BRes = parseNumberField(FieldSeparator, /* EndNl = */ true); + if (std::error_code EC = BRes.getError()) + return EC; + int64_t Occurrences = BRes.get(); + + consumeAllRemainingFS(); + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return SampleInfo(std::move(Address), Occurrences); +} + +ErrorOr DataReader::maybeParseNoLBRFlag() { + if (ParsingBuf.size() < 6 || ParsingBuf.substr(0, 6) != "no_lbr") + return false; + ParsingBuf = ParsingBuf.drop_front(6); + Col += 6; + + if (ParsingBuf.size() > 0 && ParsingBuf[0] == ' ') 
+ ParsingBuf = ParsingBuf.drop_front(1); + + while (ParsingBuf.size() > 0 && ParsingBuf[0] != '\n') { + ErrorOr EventName = parseString(' ', true); + if (!EventName) + return make_error_code(llvm::errc::io_error); + EventNames.insert(EventName.get()); + } + + if (!checkAndConsumeNewLine()) { + reportError("malformed no_lbr line"); + return make_error_code(llvm::errc::io_error); + } + return true; +} + +ErrorOr DataReader::maybeParseBATFlag() { + if (ParsingBuf.size() < 16 || ParsingBuf.substr(0, 16) != "boltedcollection") + return false; + ParsingBuf = ParsingBuf.drop_front(16); + Col += 16; + + if (!checkAndConsumeNewLine()) { + reportError("malformed boltedcollection line"); + return make_error_code(llvm::errc::io_error); + } + return true; +} + + +bool DataReader::hasBranchData() { + if (ParsingBuf.size() == 0) + return false; + + if (ParsingBuf[0] == '0' || ParsingBuf[0] == '1' || ParsingBuf[0] == '2') + return true; + return false; +} + +bool DataReader::hasMemData() { + if (ParsingBuf.size() == 0) + return false; + + if (ParsingBuf[0] == '3' || ParsingBuf[0] == '4' || ParsingBuf[0] == '5') + return true; + return false; +} + +std::error_code DataReader::parseInNoLBRMode() { + auto GetOrCreateFuncEntry = [&](StringRef Name) { + auto I = NamesToSamples.find(Name); + if (I == NamesToSamples.end()) { + bool Success; + std::tie(I, Success) = NamesToSamples.insert(std::make_pair( + Name, FuncSampleData(Name, FuncSampleData::ContainerTy()))); + + assert(Success && "unexpected result of insert"); + } + return I; + }; + + auto GetOrCreateFuncMemEntry = [&](StringRef Name) { + auto I = NamesToMemEvents.find(Name); + if (I == NamesToMemEvents.end()) { + bool Success; + std::tie(I, Success) = NamesToMemEvents.insert( + std::make_pair(Name, FuncMemData(Name, FuncMemData::ContainerTy()))); + assert(Success && "unexpected result of insert"); + } + return I; + }; + + while (hasBranchData()) { + ErrorOr Res = parseSampleInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + SampleInfo SI = Res.get(); + + // Ignore samples not involving known locations + if (!SI.Loc.IsSymbol) + continue; + + StringMapIterator I = GetOrCreateFuncEntry(SI.Loc.Name); + I->getValue().Data.emplace_back(std::move(SI)); + } + + while (hasMemData()) { + ErrorOr Res = parseMemInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + MemInfo MI = Res.get(); + + // Ignore memory events not involving known pc. 
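+    // Events whose PC resolves only to a raw DSO address cannot be
+    // attributed to any function and are dropped.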
+ if (!MI.Offset.IsSymbol) + continue; + + StringMapIterator I = GetOrCreateFuncMemEntry(MI.Offset.Name); + I->getValue().Data.emplace_back(std::move(MI)); + } + + for (StringMapEntry &FuncSamples : NamesToSamples) { + std::stable_sort(FuncSamples.second.Data.begin(), + FuncSamples.second.Data.end()); + } + + for (StringMapEntry &MemEvents : NamesToMemEvents) { + std::stable_sort(MemEvents.second.Data.begin(), + MemEvents.second.Data.end()); + } + + return std::error_code(); +} + +std::error_code DataReader::parse() { + auto GetOrCreateFuncEntry = [&](StringRef Name) { + auto I = NamesToBranches.find(Name); + if (I == NamesToBranches.end()) { + bool Success; + std::tie(I, Success) = NamesToBranches.insert(std::make_pair( + Name, FuncBranchData(Name, FuncBranchData::ContainerTy(), + FuncBranchData::ContainerTy()))); + assert(Success && "unexpected result of insert"); + } + return I; + }; + + auto GetOrCreateFuncMemEntry = [&](StringRef Name) { + auto I = NamesToMemEvents.find(Name); + if (I == NamesToMemEvents.end()) { + bool Success; + std::tie(I, Success) = NamesToMemEvents.insert( + std::make_pair(Name, FuncMemData(Name, FuncMemData::ContainerTy()))); + assert(Success && "unexpected result of insert"); + } + return I; + }; + + Col = 0; + Line = 1; + ErrorOr FlagOrErr = maybeParseNoLBRFlag(); + if (!FlagOrErr) + return FlagOrErr.getError(); + NoLBRMode = *FlagOrErr; + + ErrorOr BATFlagOrErr = maybeParseBATFlag(); + if (!BATFlagOrErr) + return BATFlagOrErr.getError(); + BATMode = *BATFlagOrErr; + + if (!hasBranchData() && !hasMemData()) { + Diag << "ERROR: no valid profile data found\n"; + return make_error_code(llvm::errc::io_error); + } + + if (NoLBRMode) + return parseInNoLBRMode(); + + while (hasBranchData()) { + ErrorOr Res = parseBranchInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + BranchInfo BI = Res.get(); + + // Ignore branches not involving known location. + if (!BI.From.IsSymbol && !BI.To.IsSymbol) + continue; + + StringMapIterator I = GetOrCreateFuncEntry(BI.From.Name); + I->getValue().Data.emplace_back(std::move(BI)); + + // Add entry data for branches to another function or branches + // to entry points (including recursive calls) + if (BI.To.IsSymbol && + (!BI.From.Name.equals(BI.To.Name) || BI.To.Offset == 0)) { + I = GetOrCreateFuncEntry(BI.To.Name); + I->getValue().EntryData.emplace_back(std::move(BI)); + } + + // If destination is the function start - update execution count. + // NB: the data is skewed since we cannot tell tail recursion from + // branches to the function start. + if (BI.To.IsSymbol && BI.To.Offset == 0) { + I = GetOrCreateFuncEntry(BI.To.Name); + I->getValue().ExecutionCount += BI.Branches; + } + } + + while (hasMemData()) { + ErrorOr Res = parseMemInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + MemInfo MI = Res.get(); + + // Ignore memory events not involving known pc. 
+    if (!MI.Offset.IsSymbol)
+      continue;
+
+    StringMapIterator<FuncMemData> I = GetOrCreateFuncMemEntry(MI.Offset.Name);
+    I->getValue().Data.emplace_back(std::move(MI));
+  }
+
+  for (StringMapEntry<FuncBranchData> &FuncBranches : NamesToBranches) {
+    std::stable_sort(FuncBranches.second.Data.begin(),
+                     FuncBranches.second.Data.end());
+  }
+
+  for (StringMapEntry<FuncMemData> &MemEvents : NamesToMemEvents) {
+    std::stable_sort(MemEvents.second.Data.begin(),
+                     MemEvents.second.Data.end());
+  }
+
+  return std::error_code();
+}
+
+void DataReader::buildLTONameMaps() {
+  for (StringMapEntry<FuncBranchData> &FuncData : NamesToBranches) {
+    const StringRef FuncName = FuncData.getKey();
+    const Optional<StringRef> CommonName = getLTOCommonName(FuncName);
+    if (CommonName)
+      LTOCommonNameMap[*CommonName].push_back(&FuncData.getValue());
+  }
+
+  for (StringMapEntry<FuncMemData> &FuncData : NamesToMemEvents) {
+    const StringRef FuncName = FuncData.getKey();
+    const Optional<StringRef> CommonName = getLTOCommonName(FuncName);
+    if (CommonName)
+      LTOCommonNameMemMap[*CommonName].push_back(&FuncData.getValue());
+  }
+}
+
+namespace {
+template <typename MapTy>
+decltype(MapTy::MapEntryTy::second) *
+fetchMapEntry(MapTy &Map, const std::vector<MCSymbol *> &Symbols) {
+  // Do a reverse order iteration since the name in profile has a higher chance
+  // of matching a name at the end of the list.
+  for (auto SI = Symbols.rbegin(), SE = Symbols.rend(); SI != SE; ++SI) {
+    auto I = Map.find(normalizeName((*SI)->getName()));
+    if (I != Map.end())
+      return &I->getValue();
+  }
+  return nullptr;
+}
+
+template <typename MapTy>
+decltype(MapTy::MapEntryTy::second) *
+fetchMapEntry(MapTy &Map, const std::vector<StringRef> &FuncNames) {
+  // Do a reverse order iteration since the name in profile has a higher chance
+  // of matching a name at the end of the list.
+  for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) {
+    auto I = Map.find(normalizeName(*FI));
+    if (I != Map.end())
+      return &I->getValue();
+  }
+  return nullptr;
+}
+
+template <typename MapTy>
+std::vector<decltype(MapTy::MapEntryTy::second) *>
+fetchMapEntriesRegex(
+    MapTy &Map,
+    const StringMap<std::vector<decltype(MapTy::MapEntryTy::second) *>>
+        &LTOCommonNameMap,
+    const std::vector<StringRef> &FuncNames) {
+  std::vector<decltype(MapTy::MapEntryTy::second) *> AllData;
+  // Do a reverse order iteration since the name in profile has a higher chance
+  // of matching a name at the end of the list.
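+  // For LTO-mangled names such as "foo.lto_priv.42" (hypothetical), every
+  // profile filed under the common prefix "foo.lto_priv." is collected, since
+  // the numeric suffix is not stable across builds.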
+ for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) { + StringRef Name = *FI; + Name = normalizeName(Name); + const Optional LTOCommonName = getLTOCommonName(Name); + if (LTOCommonName) { + auto I = LTOCommonNameMap.find(*LTOCommonName); + if (I != LTOCommonNameMap.end()) { + const std::vector &CommonData = + I->getValue(); + AllData.insert(AllData.end(), CommonData.begin(), CommonData.end()); + } + } else { + auto I = Map.find(Name); + if (I != Map.end()) { + return {&I->getValue()}; + } + } + } + return AllData; +} + +} + +bool DataReader::mayHaveProfileData(const BinaryFunction &Function) { + if (getBranchData(Function) || getMemData(Function)) + return true; + + if (getBranchDataForNames(Function.getNames()) || + getMemDataForNames(Function.getNames())) + return true; + + if (!hasVolatileName(Function)) + return false; + + const std::vector AllBranchData = + getBranchDataForNamesRegex(Function.getNames()); + if (!AllBranchData.empty()) + return true; + + const std::vector AllMemData = + getMemDataForNamesRegex(Function.getNames()); + if (!AllMemData.empty()) + return true; + + return false; +} + +FuncBranchData * +DataReader::getBranchDataForNames(const std::vector &FuncNames) { + return fetchMapEntry(NamesToBranches, FuncNames); +} + +FuncBranchData * +DataReader::getBranchDataForSymbols(const std::vector &Symbols) { + return fetchMapEntry(NamesToBranches, Symbols); +} + +FuncMemData * +DataReader::getMemDataForNames(const std::vector &FuncNames) { + return fetchMapEntry(NamesToMemEvents, FuncNames); +} + +FuncSampleData * +DataReader::getFuncSampleData(const std::vector &FuncNames) { + return fetchMapEntry(NamesToSamples, FuncNames); +} + +std::vector +DataReader::getBranchDataForNamesRegex( + const std::vector &FuncNames) { + return fetchMapEntriesRegex(NamesToBranches, LTOCommonNameMap, FuncNames); +} + +std::vector +DataReader::getMemDataForNamesRegex(const std::vector &FuncNames) { + return fetchMapEntriesRegex(NamesToMemEvents, LTOCommonNameMemMap, FuncNames); +} + +bool DataReader::hasLocalsWithFileName() const { + for (const StringMapEntry &Func : NamesToBranches) { + const StringRef &FuncName = Func.getKey(); + if (FuncName.count('/') == 2 && FuncName[0] != '/') + return true; + } + return false; +} + +void DataReader::dump() const { + for (const StringMapEntry &Func : NamesToBranches) { + Diag << Func.getKey() << " branches:\n"; + for (const BranchInfo &BI : Func.getValue().Data) { + Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " + << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; + } + Diag << Func.getKey() << " entry points:\n"; + for (const BranchInfo &BI : Func.getValue().EntryData) { + Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " + << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; + } + } + + for (auto I = EventNames.begin(), E = EventNames.end(); I != E; ++I) { + StringRef Event = I->getKey(); + Diag << "Data was collected with event: " << Event << "\n"; + } + for (const StringMapEntry &Func : NamesToSamples) { + Diag << Func.getKey() << " samples:\n"; + for (const SampleInfo &SI : Func.getValue().Data) { + Diag << SI.Loc.Name << " " << SI.Loc.Offset << " " + << SI.Hits << "\n"; + } + } + + for (const StringMapEntry &Func : NamesToMemEvents) { + Diag << "Memory events for " << Func.getValue().Name; + Location LastOffset(0); + for (const MemInfo &MI : Func.getValue().Data) { + if (MI.Offset == LastOffset) { + Diag << ", " << MI.Addr << "/" << MI.Count; + 
} else { + Diag << "\n" << MI.Offset << ": " << MI.Addr << "/" << MI.Count; + } + LastOffset = MI.Offset; + } + Diag << "\n"; + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/DataReader.h b/bolt/src/DataReader.h new file mode 100644 index 000000000000..294dec7f2cb3 --- /dev/null +++ b/bolt/src/DataReader.h @@ -0,0 +1,551 @@ +//===-- Reader/DataReader.h - Perf data reader ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by the perf2bolt +// utility and stores it in memory for llvm-bolt consumption. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DATA_READER_H +#define LLVM_TOOLS_LLVM_BOLT_DATA_READER_H + +#include "ProfileReaderBase.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +namespace llvm { +class MCSymbol; + +namespace bolt { + +class BinaryFunction; + +struct LBREntry { + uint64_t From; + uint64_t To; + bool Mispred; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const LBREntry &LBR) { + OS << "0x" << Twine::utohexstr(LBR.From) + << " -> 0x" << Twine::utohexstr(LBR.To); + return OS; +} + +struct Location { + bool IsSymbol; + StringRef Name; + uint64_t Offset; + + explicit Location(uint64_t Offset) + : IsSymbol(false), Name("[unknown]"), Offset(Offset) {} + + Location(bool IsSymbol, StringRef Name, uint64_t Offset) + : IsSymbol(IsSymbol), Name(Name), Offset(Offset) {} + + bool operator==(const Location &RHS) const { + return IsSymbol == RHS.IsSymbol && + Name == RHS.Name && + (Name == "[heap]" || Offset == RHS.Offset); + } + + bool operator<(const Location &RHS) const { + if (IsSymbol != RHS.IsSymbol) + return IsSymbol < RHS.IsSymbol; + + if (Name != RHS.Name) + return Name < RHS.Name; + + return Name != "[heap]" && Offset < RHS.Offset; + } + + friend raw_ostream &operator<<(raw_ostream &OS, const Location &Loc); +}; + +typedef std::vector> BranchContext; + +struct BranchInfo { + Location From; + Location To; + int64_t Mispreds; + int64_t Branches; + + BranchInfo(Location From, Location To, int64_t Mispreds, int64_t Branches) + : From(std::move(From)), To(std::move(To)), Mispreds(Mispreds), + Branches(Branches) {} + + bool operator==(const BranchInfo &RHS) const { + return From == RHS.From && + To == RHS.To; + } + + bool operator<(const BranchInfo &RHS) const { + if (From < RHS.From) + return true; + + if (From == RHS.From) + return (To < RHS.To); + + return false; + } + + /// Merges branch and misprediction counts of \p BI with those of this object. + void mergeWith(const BranchInfo &BI); + + void print(raw_ostream &OS) const; +}; + +struct FuncBranchData { + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + ContainerTy EntryData; + + /// Total execution count for the function. + int64_t ExecutionCount{0}; + + /// Indicate if the data was used. 
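+  /// Set once the profile has been attached to a BinaryFunction, both to
+  /// keep two functions from sharing one profile and to let the reader
+  /// report how many profiled objects went unused.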
+ bool Used{false}; + + FuncBranchData() {} + + FuncBranchData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} + + FuncBranchData(StringRef Name, ContainerTy Data, ContainerTy EntryData) + : Name(Name), Data(std::move(Data)), EntryData(std::move(EntryData)) {} + + ErrorOr getBranch(uint64_t From, uint64_t To) const; + + /// Returns the branch info object associated with a direct call originating + /// from the given offset. If no branch info object is found, an error is + /// returned. If the offset corresponds to an indirect call the behavior is + /// undefined. + ErrorOr getDirectCallBranch(uint64_t From) const; + + /// Find all the branches originating at From. + iterator_range getBranchRange( + uint64_t From) const; + + /// Append the branch data of another function located \p Offset bytes away + /// from the entry of this function. + void appendFrom(const FuncBranchData &FBD, uint64_t Offset); + + /// Returns the total number of executed branches in this function + /// by counting the number of executed branches for each BranchInfo + uint64_t getNumExecutedBranches() const; + + /// Aggregation helpers + DenseMap> IntraIndex; + DenseMap> InterIndex; + DenseMap> EntryIndex; + + void bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, uint64_t Count, + uint64_t Mispreds); + void bumpCallCount(uint64_t OffsetFrom, const Location &To, uint64_t Count, + uint64_t Mispreds); + void bumpEntryCount(const Location &From, uint64_t OffsetTo, uint64_t Count, + uint64_t Mispreds); +}; + +/// MemInfo represents a single memory load from an address \p Addr at an \p +/// Offset within a function. \p Count represents how many times a particular +/// address was seen. +struct MemInfo { + Location Offset; + Location Addr; + uint64_t Count; + + bool operator==(const MemInfo &RHS) const { + return Offset == RHS.Offset && Addr == RHS.Addr; + } + + bool operator<(const MemInfo &RHS) const { + if (Offset < RHS.Offset) + return true; + + if (Offset == RHS.Offset) + return (Addr < RHS.Addr); + + return false; + } + + void mergeWith(const MemInfo &MI) { + Count += MI.Count; + } + + void print(raw_ostream &OS) const; + void prettyPrint(raw_ostream &OS) const; + + MemInfo(const Location &Offset, const Location &Addr, uint64_t Count = 0) + : Offset(Offset), Addr(Addr), Count(Count) {} + + friend raw_ostream &operator<<(raw_ostream &OS, const MemInfo &MI) { + MI.prettyPrint(OS); + return OS; + } +}; + +/// Helper class to store memory load events recorded in the address space of +/// a given function, analogous to FuncBranchData but for memory load events +/// instead of branches. +struct FuncMemData { + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + + /// Indicate if the data was used. + bool Used{false}; + + DenseMap> EventIndex; + + /// Update \p Data with a memory event. Events with the same + /// \p Offset and \p Addr will be coalesced. + void update(const Location &Offset, const Location &Addr); + + FuncMemData() {} + + FuncMemData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} +}; + +/// Similar to BranchInfo, but instead of recording from-to address (an edge), +/// it records the address of a perf event and the number of times samples hit +/// this address. 
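+/// For example, the no_lbr line "1 BZ2_compressBlock 466c 3" documented for
+/// parseInNoLBRMode() below decodes to Loc = (symbol "BZ2_compressBlock",
+/// offset 0x466c) and Hits = 3.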
+struct SampleInfo { + Location Loc; + int64_t Hits; + + SampleInfo(Location Loc, int64_t Hits) + : Loc(std::move(Loc)), Hits(Hits) {} + + bool operator==(const SampleInfo &RHS) const { + return Loc == RHS.Loc; + } + + bool operator<(const SampleInfo &RHS) const { + if (Loc < RHS.Loc) + return true; + + return false; + } + + void print(raw_ostream &OS) const; + + void mergeWith(const SampleInfo &SI); +}; + +/// Helper class to store samples recorded in the address space of a given +/// function, analogous to FuncBranchData but for samples instead of branches. +struct FuncSampleData { + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + + FuncSampleData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} + + /// Get the number of samples recorded in [Start, End) + uint64_t getSamples(uint64_t Start, uint64_t End) const; + + /// Aggregation helper + DenseMap Index; + + void bumpCount(uint64_t Offset, uint64_t Count); +}; + +/// DataReader Class +/// +class DataReader : public ProfileReaderBase { +public: + explicit DataReader(StringRef Filename) + : ProfileReaderBase(Filename), + Diag(errs()) {} + + StringRef getReaderName() const override { + return "branch profile reader"; + } + + bool isTrustedSource() const override { + return false; + } + + virtual Error preprocessProfile(BinaryContext &BC) override; + + virtual Error readProfilePreCFG(BinaryContext &BC) override; + + virtual Error readProfile(BinaryContext &BC) override; + + virtual bool hasLocalsWithFileName() const override; + + virtual bool mayHaveProfileData(const BinaryFunction &BF) override; + + /// Return all event names used to collect this profile + virtual StringSet<> getEventNames() const override { + return EventNames; + } + +protected: + /// Read profile information available for the function. + void readProfile(BinaryFunction &BF); + + /// In functions with multiple entry points, the profile collection records + /// data for other entry points in a different function entry. This function + /// attempts to fetch extra profile data for each secondary entry point. + bool fetchProfileForOtherEntryPoints(BinaryFunction &BF); + + /// Find the best matching profile for a function after the creation of basic + /// blocks. + void matchProfileData(BinaryFunction &BF); + + /// Find the best matching memory data profile for a function before the + /// creation of basic blocks. + void matchProfileMemData(BinaryFunction &BF); + + /// Check how closely \p BranchData matches the function \p BF. + /// Return accuracy (ranging from 0.0 to 1.0) of the matching. + float evaluateProfileData(BinaryFunction &BF, + const FuncBranchData &BranchData) const; + + /// If our profile data comes from sample addresses instead of LBR entries, + /// collect sample count for all addresses in this function address space, + /// aggregating them per basic block and assigning an execution count to each + /// basic block based on the number of samples recorded at those addresses. + /// The last step is to infer edge counts based on BB execution count. Note + /// this is the opposite of the LBR way, where we infer BB execution count + /// based on edge counts. + void readSampleData(BinaryFunction &BF); + + /// Convert function-level branch data into instruction annotations. + void convertBranchData(BinaryFunction &BF) const; + + /// Update function \p BF profile with a taken branch. + /// \p Count could be 0 if verification of the branch is required. + /// + /// Return true if the branch is valid, false otherwise. 
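+  /// A zero \p Count is how evaluateProfileData() probes whether a recorded
+  /// branch is consistent with the CFG without altering any edge weights.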
+  bool recordBranch(BinaryFunction &BF,
+                    uint64_t From, uint64_t To, uint64_t Count = 1,
+                    uint64_t Mispreds = 0) const;
+
+  /// Parses the input bolt data file into internal data structures. We expect
+  /// the file format to follow the syntax below.
+  ///
+  /// <is symbol?> <closest elf symbol or DSO name> <relative FROM address>
+  /// <is symbol?> <closest elf symbol or DSO name> <relative TO address>
+  /// <number of mispredictions> <number of branches>
+  ///
+  /// In the <is symbol?> field we record 0 if our closest address is a DSO
+  /// load address or 1 if our closest address is an ELF symbol.
+  ///
+  /// Examples:
+  ///
+  /// 1 main 3fb 0 /lib/ld-2.21.so 12 4 221
+  ///
+  /// The example records branches from symbol main, offset 3fb, to DSO
+  /// ld-2.21, offset 12, with 4 mispredictions and 221 branches.
+  ///
+  /// 2 t2.c/func 11 1 globalfunc 1d 0 1775 2
+  /// 0 1002 2
+  /// 2 t2.c/func 31 2 t2.c/func d
+  /// 2 t2.c/func 18 2 t2.c/func 20
+  /// 0 773 2
+  /// 2 t2.c/func 71 2 t2.c/func d
+  /// 2 t2.c/func 18 2 t2.c/func 60
+  ///
+  /// The example records branches from local symbol func (from t2.c), offset
+  /// 11, to global symbol globalfunc, offset 1d, with 1775 branches, no
+  /// mispreds. Of these branches, 1002 were preceded by a sequence of
+  /// branches from func, offset 18 to offset 20 and then from offset 31 to
+  /// offset d. The remaining 773 branches were preceded by a different
+  /// sequence of branches, from func, offset 18 to offset 60 and then from
+  /// offset 71 to offset d.
+  std::error_code parse();
+
+  /// When no_lbr is the first line of the file, activate No LBR mode. In this
+  /// mode we read the addresses where samples were recorded directly instead
+  /// of LBR entries. The line format is almost the same, except for a missing
+  /// <TO> triple and a missing mispredictions field:
+  ///
+  /// no_lbr
+  /// <is symbol?> <closest elf symbol or DSO name> <relative address> <count>
+  /// ...
+  ///
+  /// Example:
+  ///
+  /// no_lbr                           # First line of fdata file
+  /// 1 BZ2_compressBlock 466c 3
+  /// 1 BZ2_hbMakeCodeLengths 29c 1
+  ///
+  std::error_code parseInNoLBRMode();
+
+  /// Return branch data matching one of the names in \p FuncNames.
+  FuncBranchData *
+  getBranchDataForNames(const std::vector<StringRef> &FuncNames);
+
+  /// Return branch data matching one of the \p Symbols.
+  FuncBranchData *
+  getBranchDataForSymbols(const std::vector<MCSymbol *> &Symbols);
+
+  /// Return mem data matching one of the names in \p FuncNames.
+  FuncMemData *getMemDataForNames(const std::vector<StringRef> &FuncNames);
+
+  FuncSampleData *
+  getFuncSampleData(const std::vector<StringRef> &FuncNames);
+
+  /// Return a vector of all FuncBranchData matching the list of names.
+  /// Internally use fuzzy matching to match special names like LTO-generated
+  /// function names.
+  std::vector<FuncBranchData *>
+  getBranchDataForNamesRegex(const std::vector<StringRef> &FuncNames);
+
+  /// Return a vector of all FuncMemData matching the list of names.
+  /// Internally use fuzzy matching to match special names like LTO-generated
+  /// function names.
+  std::vector<FuncMemData *>
+  getMemDataForNamesRegex(const std::vector<StringRef> &FuncNames);
+
+  /// Return branch data profile associated with function \p BF or nullptr
+  /// if the function has no associated profile.
+  FuncBranchData *getBranchData(const BinaryFunction &BF) const {
+    auto FBDI = FuncsToBranches.find(&BF);
+    if (FBDI == FuncsToBranches.end())
+      return nullptr;
+    return FBDI->second;
+  }
+
+  /// Updates branch profile data associated with function \p BF.
+  void setBranchData(const BinaryFunction &BF, FuncBranchData *FBD) {
+    FuncsToBranches[&BF] = FBD;
+  }
+
+  /// Return memory profile data associated with function \p BF, or nullptr
+  /// if the function has no associated profile.
+ FuncMemData *getMemData(const BinaryFunction &BF) const { + auto FMDI = FuncsToMemData.find(&BF); + if (FMDI == FuncsToMemData.end()) + return nullptr; + return FMDI->second; + } + + /// Updates the memory profile data associated with function \p BF. + void setMemData(const BinaryFunction &BF, FuncMemData *FMD) { + FuncsToMemData[&BF] = FMD; + } + + using NamesToBranchesMapTy = StringMap; + using NamesToSamplesMapTy = StringMap; + using NamesToMemEventsMapTy = StringMap; + using FuncsToBranchesMapTy = + std::unordered_map; + using FuncsToMemDataMapTy = + std::unordered_map; + + /// Dumps the entire data structures parsed. Used for debugging. + void dump() const; + + /// Return false only if we are running with profiling data that lacks LBR. + bool hasLBR() const { return !NoLBRMode; } + + /// Return true if the profiling data was collected in a bolted binary. This + /// means we lose the ability to identify stale data at some branch locations, + /// since we have to be more permissive in some cases. + bool collectedInBoltedBinary() const { return BATMode; } + + /// Return true if event named \p Name was used to collect this profile data. + bool usesEvent(StringRef Name) const { + for (auto I = EventNames.begin(), E = EventNames.end(); I != E; ++I) { + StringRef Event = I->getKey(); + if (Event.find(Name) != StringRef::npos) + return true; + } + return false; + } + + /// Open the file and parse the contents. + std::error_code parseInput(); + + /// Build suffix map once the profile data is parsed. + void buildLTONameMaps(); + + void reportError(StringRef ErrorMsg); + bool expectAndConsumeFS(); + void consumeAllRemainingFS(); + bool checkAndConsumeNewLine(); + ErrorOr parseString(char EndChar, bool EndNl=false); + ErrorOr parseNumberField(char EndChar, bool EndNl=false); + ErrorOr parseHexField(char EndChar, bool EndNl=false); + ErrorOr parseLocation(char EndChar, bool EndNl, bool ExpectMemLoc); + ErrorOr parseLocation(char EndChar, bool EndNl=false) { + return parseLocation(EndChar, EndNl, false); + } + ErrorOr parseMemLocation(char EndChar, bool EndNl=false) { + return parseLocation(EndChar, EndNl, true); + } + ErrorOr parseBranchInfo(); + ErrorOr parseSampleInfo(); + ErrorOr parseMemInfo(); + ErrorOr maybeParseNoLBRFlag(); + ErrorOr maybeParseBATFlag(); + bool hasBranchData(); + bool hasMemData(); + + /// An in-memory copy of the input data file - owns strings used in reader. + std::unique_ptr FileBuf; + raw_ostream &Diag; + StringRef ParsingBuf; + unsigned Line{0}; + unsigned Col{0}; + NamesToBranchesMapTy NamesToBranches; + NamesToSamplesMapTy NamesToSamples; + NamesToMemEventsMapTy NamesToMemEvents; + FuncsToBranchesMapTy FuncsToBranches; + FuncsToMemDataMapTy FuncsToMemData; + bool NoLBRMode{false}; + bool BATMode{false}; + StringSet<> EventNames; + static const char FieldSeparator = ' '; + + /// Maps of common LTO names to possible matching profiles. 
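+  /// E.g. both "foo.lto_priv.1" and "foo.lto_priv.7" (hypothetical names)
+  /// would be filed under the common key "foo.lto_priv.".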
+ StringMap> LTOCommonNameMap; + StringMap> LTOCommonNameMemMap; +}; + +} + +/// DenseMapInfo allows us to use the DenseMap LLVM data structure to store +/// Locations +template<> struct DenseMapInfo { + static inline bolt::Location getEmptyKey() { + return bolt::Location(true, StringRef(), static_cast(-1LL)); + } + static inline bolt::Location getTombstoneKey() { + return bolt::Location(true, StringRef(), static_cast(-2LL));; + } + static unsigned getHashValue(const bolt::Location &L) { + return (unsigned(DenseMapInfo::getHashValue(L.Name)) >> 4) ^ + (unsigned(L.Offset)); + } + static bool isEqual(const bolt::Location &LHS, + const bolt::Location &RHS) { + return LHS.IsSymbol == RHS.IsSymbol && LHS.Name == RHS.Name && + LHS.Offset == RHS.Offset; + } +}; + +} + +#endif diff --git a/bolt/src/DebugData.cpp b/bolt/src/DebugData.cpp new file mode 100644 index 000000000000..4681a27cfd50 --- /dev/null +++ b/bolt/src/DebugData.cpp @@ -0,0 +1,473 @@ +//===- DebugData.cpp - Representation and writing of debugging information. ==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DebugData.h" +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" + +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/LEB128.h" +#include +#include +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-debug-info" + +namespace opts { +extern llvm::cl::opt Verbosity; +} + +namespace llvm { +namespace bolt { + +const DebugLineTableRowRef DebugLineTableRowRef::NULL_ROW{0, 0}; + +namespace { + +// Writes address ranges to Writer as pairs of 64-bit (address, size). +// If RelativeRange is true, assumes the address range to be written must be of +// the form (begin address, range size), otherwise (begin address, end address). +// Terminates the list by writing a pair of two zeroes. +// Returns the number of written bytes. +uint64_t writeAddressRanges( + raw_svector_ostream &Stream, + const DebugAddressRangesVector &AddressRanges, + const bool WriteRelativeRanges = false) { + for (const DebugAddressRange &Range : AddressRanges) { + support::endian::write(Stream, Range.LowPC, support::little); + support::endian::write( + Stream, WriteRelativeRanges ? Range.HighPC - Range.LowPC : Range.HighPC, + support::little); + } + // Finish with 0 entries. 
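+  // A (0, 0) pair is the end-of-list marker that DWARF consumers expect in
+  // .debug_ranges; this is also why an empty range list still takes 16 bytes.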
+ support::endian::write(Stream, 0ULL, support::little); + support::endian::write(Stream, 0ULL, support::little); + return AddressRanges.size() * 16 + 16; +} + +} // namespace + +DebugRangesSectionWriter::DebugRangesSectionWriter() { + RangesBuffer = std::make_unique(); + RangesStream = std::make_unique(*RangesBuffer); + + // Add an empty range as the first entry; + SectionOffset += + writeAddressRanges(*RangesStream.get(), DebugAddressRangesVector{}); +} + +uint64_t DebugRangesSectionWriter::addRanges( + DebugAddressRangesVector &&Ranges, + std::map &CachedRanges) { + if (Ranges.empty()) + return getEmptyRangesOffset(); + + const auto RI = CachedRanges.find(Ranges); + if (RI != CachedRanges.end()) + return RI->second; + + const uint64_t EntryOffset = addRanges(Ranges); + CachedRanges.emplace(std::move(Ranges), EntryOffset); + + return EntryOffset; +} + +uint64_t +DebugRangesSectionWriter::addRanges(const DebugAddressRangesVector &Ranges) { + if (Ranges.empty()) + return getEmptyRangesOffset(); + + // Reading the SectionOffset and updating it should be atomic to guarantee + // unique and correct offsets in patches. + std::lock_guard Lock(WriterMutex); + const uint32_t EntryOffset = SectionOffset; + SectionOffset += writeAddressRanges(*RangesStream.get(), Ranges); + + return EntryOffset; +} + +uint64_t DebugRangesSectionWriter::getSectionOffset() { + std::lock_guard Lock(WriterMutex); + return SectionOffset; +} + +void DebugARangesSectionWriter::addCURanges(uint64_t CUOffset, + DebugAddressRangesVector &&Ranges) { + std::lock_guard Lock(CUAddressRangesMutex); + CUAddressRanges.emplace(CUOffset, std::move(Ranges)); +} + +void DebugARangesSectionWriter::writeARangesSection( + raw_svector_ostream &RangesStream) const { + // For reference on the format of the .debug_aranges section, see the DWARF4 + // specification, section 6.1.4 Lookup by Address + // http://www.dwarfstd.org/doc/DWARF4.pdf + for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { + const uint64_t Offset = CUOffsetAddressRangesPair.first; + const DebugAddressRangesVector &AddressRanges = + CUOffsetAddressRangesPair.second; + + // Emit header. + + // Size of this set: 8 (size of the header) + 4 (padding after header) + // + 2*sizeof(uint64_t) bytes for each of the ranges, plus an extra + // pair of uint64_t's for the terminating, zero-length range. + // Does not include size field itself. + uint32_t Size = 8 + 4 + 2*sizeof(uint64_t) * (AddressRanges.size() + 1); + + // Header field #1: set size. + support::endian::write(RangesStream, Size, support::little); + + // Header field #2: version number, 2 as per the specification. + support::endian::write(RangesStream, static_cast(2), + support::little); + + // Header field #3: debug info offset of the correspondent compile unit. + support::endian::write(RangesStream, static_cast(Offset), + support::little); + + // Header field #4: address size. + // 8 since we only write ELF64 binaries for now. + RangesStream << char(8); + + // Header field #5: segment size of target architecture. + RangesStream << char(0); + + // Padding before address table - 4 bytes in the 64-bit-pointer case. + support::endian::write(RangesStream, static_cast(0), + support::little); + + writeAddressRanges(RangesStream, AddressRanges, true); + } +} + +DebugAddrWriter::DebugAddrWriter(BinaryContext *Bc) { BC = Bc; } + +void DebugAddrWriter::AddressForDWOCU::dump() { + std::vector SortedMap(indexToAddressBegin(), + indexToAdddessEnd()); + // Sorting address in increasing order of indices. 
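+  // E.g. {(2, 0x2000), (0, 0x1000)} becomes {(0, 0x1000), (2, 0x2000)}, so
+  // the addresses below print in index order.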
+  std::sort(SortedMap.begin(), SortedMap.end(),
+            [](const IndexAddressPair &A, const IndexAddressPair &B) {
+              return A.first < B.first;
+            });
+  for (auto &Pair : SortedMap)
+    dbgs() << Twine::utohexstr(Pair.second) << "\t" << Pair.first << "\n";
+}
+
+uint32_t DebugAddrWriter::getIndexFromAddress(uint64_t Address,
+                                              uint64_t DWOId) {
+  if (!AddressMaps.count(DWOId))
+    AddressMaps[DWOId] = AddressForDWOCU();
+
+  AddressForDWOCU &Map = AddressMaps[DWOId];
+  auto Entry = Map.find(Address);
+  if (Entry == Map.end()) {
+    auto Index = Map.getNextIndex();
+    Entry = Map.insert(Address, Index).first;
+  }
+  return Entry->second;
+}
+
+// Case 1) Address is not in the map: insert into both AddressToIndex and
+// IndexToAddress.
+// Case 2) Address is in the map and the new Index is higher or equal: only
+// IndexToAddress needs updating.
+// Case 3) Address is in the map and the new Index is lower: both
+// AddressToIndex and IndexToAddress need updating.
+void DebugAddrWriter::addIndexAddress(uint64_t Address, uint32_t Index,
+                                      uint64_t DWOId) {
+  AddressForDWOCU &Map = AddressMaps[DWOId];
+  auto Entry = Map.find(Address);
+  if (Entry != Map.end()) {
+    if (Entry->second > Index)
+      Map.updateAddressToIndex(Address, Index);
+    Map.updateIndexToAddrss(Address, Index);
+  } else
+    Map.insert(Address, Index);
+}
+
+AddressSectionBuffer DebugAddrWriter::finalize() {
+  // Need to lay out all sections within .debug_addr.
+  // Within each section, sort addresses by index.
+  AddressSectionBuffer Buffer;
+  raw_svector_ostream AddressStream(Buffer);
+  for (std::unique_ptr<DWARFUnit> &CU : BC->DwCtx->compile_units()) {
+    Optional<uint64_t> DWOId = CU->getDWOId();
+    // Handle the case where debug information is a mix of Debug Fission and
+    // monolithic.
+    if (!DWOId)
+      continue;
+    auto AM = AddressMaps.find(*DWOId);
+    // Adding to the map even if it did not contribute to .debug_addr.
+    // The Skeleton CU will still have DW_AT_GNU_addr_base.
+    DWOIdToOffsetMap[*DWOId] = Buffer.size();
+    // If no entry exists, this CU's DWO section didn't contribute to
+    // .debug_addr.
+    if (AM == AddressMaps.end())
+      continue;
+    std::vector<IndexAddressPair> SortedMap(AM->second.indexToAddressBegin(),
+                                            AM->second.indexToAdddessEnd());
+    // Sorting addresses in increasing order of indices.
+    std::sort(SortedMap.begin(), SortedMap.end(),
+              [](const IndexAddressPair &A, const IndexAddressPair &B) {
+                return A.first < B.first;
+              });
+
+    uint8_t AddrSize = CU->getAddressByteSize();
+    uint32_t Counter = 0;
+    auto WriteAddress = [&](uint64_t Address) -> void {
+      ++Counter;
+      switch (AddrSize) {
+      default:
+        assert(false && "Address Size is invalid.");
+        break;
+      case 4:
+        support::endian::write(AddressStream, static_cast<uint32_t>(Address),
+                               support::little);
+        break;
+      case 8:
+        support::endian::write(AddressStream, Address, support::little);
+        break;
+      }
+    };
+
+    for (const IndexAddressPair &Val : SortedMap) {
+      while (Val.first > Counter)
+        WriteAddress(0);
+      WriteAddress(Val.second);
+    }
+  }
+
+  return Buffer;
+}
+
+uint64_t DebugAddrWriter::getOffset(uint64_t DWOId) {
+  auto Iter = DWOIdToOffsetMap.find(DWOId);
+  assert(Iter != DWOIdToOffsetMap.end() &&
+         "Offset into .debug_addr was not found for DWO ID.");
+  return Iter->second;
+}
+
+DebugLocWriter::DebugLocWriter(BinaryContext *BC) {
+  LocBuffer = std::make_unique<DebugBufferVector>();
+  LocStream = std::make_unique<raw_svector_ostream>(*LocBuffer);
+}
+
+// DWARF 4: 2.6.2
+uint64_t
+DebugLocWriter::addList(const DebugLocationsVector &LocList) {
+  if (LocList.empty())
+    return EmptyListTag;
+
+  // Since there is a separate DebugLocWriter for each thread,
+  // we don't need a lock to read the SectionOffset and update it.
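+  // Each entry written below is (LowPC, HighPC, expression size, expression
+  // bytes); the list is terminated by 16 zero bytes, i.e. a DWARF
+  // end-of-list entry.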
+ const uint32_t EntryOffset = SectionOffset; + + for (const DebugLocationEntry &Entry : LocList) { + support::endian::write(*LocStream, static_cast(Entry.LowPC), + support::little); + support::endian::write(*LocStream, static_cast(Entry.HighPC), + support::little); + support::endian::write(*LocStream, static_cast(Entry.Expr.size()), + support::little); + *LocStream << StringRef(reinterpret_cast(Entry.Expr.data()), + Entry.Expr.size()); + SectionOffset += 2 * 8 + 2 + Entry.Expr.size(); + } + LocStream->write_zeros(16); + SectionOffset += 16; + + return EntryOffset; +} + +DebugAddrWriter *DebugLoclistWriter::AddrWriter = nullptr; +void DebugLoclistWriter::finalizePatches() { + auto numOfBytes = [](uint32_t Val) -> uint32_t { + int LogVal = (int)std::log2(Val) + 1; + uint32_t CeilVal = (LogVal + 8 - 1) / 8; + return !Val ? 1 : CeilVal; + }; + (void)numOfBytes; + + for (const auto &Patch : IndexPatches) { + uint32_t Index = AddrWriter->getIndexFromAddress(Patch.Address, DWOId); + assert(numOfBytes(Index) <= DebugLoclistWriter::NumBytesForIndex && + "Index size in DebugLocation too large."); + std::string Buff; + raw_string_ostream OS(Buff); + encodeULEB128(Index, OS, DebugLoclistWriter::NumBytesForIndex); + for (uint32_t I = 0; I < DebugLoclistWriter::NumBytesForIndex; ++I) { + (*LocBuffer)[Patch.Offset + I] = Buff[I]; + } + } +} + +uint64_t DebugLoclistWriter::addList(const DebugLocationsVector &LocList) { + if (LocList.empty()) + return EmptyListTag; + uint64_t EntryOffset = LocBuffer->size(); + + for (const DebugLocationEntry &Entry : LocList) { + support::endian::write(*LocStream, + static_cast(dwarf::DW_LLE_startx_length), + support::little); + IndexPatches.emplace_back(static_cast(LocBuffer->size()), + Entry.LowPC); + LocStream->write_zeros(DebugLoclistWriter::NumBytesForIndex); + // TODO: Support DWARF5 + support::endian::write(*LocStream, + static_cast(Entry.HighPC - Entry.LowPC), + support::little); + support::endian::write(*LocStream, static_cast(Entry.Expr.size()), + support::little); + *LocStream << StringRef(reinterpret_cast(Entry.Expr.data()), + Entry.Expr.size()); + } + support::endian::write(*LocStream, + static_cast(dwarf::DW_LLE_end_of_list), + support::little); + return EntryOffset; +} + +void SimpleBinaryPatcher::addBinaryPatch(uint32_t Offset, + const std::string &NewValue) { + Patches.emplace_back(Offset, NewValue); +} + +void SimpleBinaryPatcher::addBytePatch(uint32_t Offset, uint8_t Value) { + Patches.emplace_back(Offset, std::string(1, Value)); +} + +void SimpleBinaryPatcher::addLEPatch(uint32_t Offset, uint64_t NewValue, + size_t ByteSize) { + std::string LE64(ByteSize, 0); + for (size_t I = 0; I < ByteSize; ++I) { + LE64[I] = NewValue & 0xff; + NewValue >>= 8; + } + Patches.emplace_back(Offset, LE64); +} + +void SimpleBinaryPatcher::addUDataPatch(uint32_t Offset, uint64_t Value, uint64_t Size) { + std::string Buff; + raw_string_ostream OS(Buff); + encodeULEB128(Value, OS, Size); + + Patches.emplace_back(Offset, OS.str()); +} + +void SimpleBinaryPatcher::addLE64Patch(uint32_t Offset, uint64_t NewValue) { + addLEPatch(Offset, NewValue, 8); +} + +void SimpleBinaryPatcher::addLE32Patch(uint32_t Offset, uint32_t NewValue) { + addLEPatch(Offset, NewValue, 4); +} + +void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents, + uint32_t DWPOffset = 0) { + for (const auto &Patch : Patches) { + uint32_t Offset = Patch.first - DWPOffset; + const std::string &ByteSequence = Patch.second; + assert(Offset + ByteSequence.size() <= BinaryContents.size() && + "Applied patch 
+void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents,
+                                      uint32_t DWPOffset = 0) {
+  for (const auto &Patch : Patches) {
+    uint32_t Offset = Patch.first - DWPOffset;
+    const std::string &ByteSequence = Patch.second;
+    assert(Offset + ByteSequence.size() <= BinaryContents.size() &&
+           "Applied patch runs over binary size.");
+    for (uint64_t I = 0, Size = ByteSequence.size(); I < Size; ++I) {
+      BinaryContents[Offset + I] = ByteSequence[I];
+    }
+  }
+}
+
+void DebugAbbrevPatcher::addAttributePatch(
+    const DWARFAbbreviationDeclaration *Abbrev, dwarf::Attribute AttrTag,
+    dwarf::Attribute NewAttrTag, uint8_t NewAttrForm) {
+  assert(Abbrev && "no abbreviation specified");
+
+  if (std::numeric_limits<uint8_t>::max() >= NewAttrTag)
+    AbbrevPatches.emplace(
+        AbbrevAttrPatch{Abbrev, AttrTag, NewAttrTag, NewAttrForm});
+  else
+    AbbrevNonStandardPatches.emplace_back(
+        AbbrevAttrPatch{Abbrev, AttrTag, NewAttrTag, NewAttrForm});
+}
+
+void DebugAbbrevPatcher::patchBinary(std::string &Contents,
+                                     uint32_t DWPOffset = 0) {
+  SimpleBinaryPatcher Patcher;
+
+  for (const AbbrevAttrPatch &Patch : AbbrevPatches) {
+    const DWARFAbbreviationDeclaration::AttributeSpec *const Attribute =
+        Patch.Abbrev->findAttribute(Patch.Attr);
+    assert(Attribute && "Specified attribute doesn't occur in abbreviation.");
+
+    Patcher.addBytePatch(Attribute->AttrOffset - DWPOffset,
+                         static_cast<uint8_t>(Patch.NewAttr));
+    Patcher.addBytePatch(Attribute->FormOffset - DWPOffset, Patch.NewForm);
+  }
+  Patcher.patchBinary(Contents);
+
+  if (AbbrevNonStandardPatches.empty())
+    return;
+
+  std::string Section;
+  Section.reserve(Contents.size() + AbbrevNonStandardPatches.size() / 2);
+
+  const char *Ptr = Contents.c_str();
+  uint32_t Start = 0;
+  std::string Buff;
+  auto Encode = [&](uint16_t Value) -> std::string {
+    Buff.clear();
+    raw_string_ostream OS(Buff);
+    encodeULEB128(Value, OS, 2);
+    return Buff;
+  };
+
+  for (const AbbrevAttrPatch &Patch : AbbrevNonStandardPatches) {
+    const DWARFAbbreviationDeclaration::AttributeSpec *const Attribute =
+        Patch.Abbrev->findAttribute(Patch.Attr);
+    assert(Attribute && "Specified attribute doesn't occur in abbreviation.");
+    assert(Start <= Attribute->AttrOffset &&
+           "Offsets are not in sequential order.");
+    // Assuming the old attribute is 1 byte. Otherwise we will need to add
+    // logic to determine its size at runtime.
+    assert(std::numeric_limits<uint8_t>::max() >= Patch.Attr &&
+           "Old attribute is greater than 1 byte.");
+
+    Section.append(Ptr + Start, Attribute->AttrOffset - Start);
+    Section.append(Encode(Patch.NewAttr));
+    Section.append(std::string(1, Patch.NewForm));
+
+    Start = Attribute->FormOffset + 1;
+  }
+  Section.append(Ptr + Start, Contents.size() - Start);
+  Contents = std::move(Section);
+}
+
+void DebugStrWriter::create() {
+  StrBuffer = std::make_unique<DebugStrBufferVector>();
+  StrStream = std::make_unique<raw_svector_ostream>(*StrBuffer);
+}
+
+void DebugStrWriter::initialize() {
+  auto StrSection = BC->DwCtx->getDWARFObj().getStrSection();
+  (*StrStream) << StrSection;
+}
+
+uint32_t DebugStrWriter::addString(StringRef Str) {
+  if (StrBuffer->empty())
+    initialize();
+  auto Offset = StrBuffer->size();
+  (*StrStream) << Str;
+  StrStream->write_zeros(1);
+  return Offset;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/DebugData.h b/bolt/src/DebugData.h
new file mode 100644
index 000000000000..47aa86da8dbf
--- /dev/null
+++ b/bolt/src/DebugData.h
@@ -0,0 +1,491 @@
+//===-- DebugData.h - Representation and writing of debugging information. -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Classes that represent and serialize DWARF-related entities.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUG_DATA_H
+#define LLVM_TOOLS_LLVM_BOLT_DEBUG_DATA_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+namespace bolt {
+
+class BinaryContext;
+
+/// Address range representation. Takes less space than DWARFAddressRange.
+struct DebugAddressRange {
+  uint64_t LowPC{0};
+  uint64_t HighPC{0};
+
+  DebugAddressRange() = default;
+
+  DebugAddressRange(uint64_t LowPC, uint64_t HighPC)
+      : LowPC(LowPC), HighPC(HighPC) {}
+};
+
+static inline bool operator<(const DebugAddressRange &LHS,
+                             const DebugAddressRange &RHS) {
+  return std::tie(LHS.LowPC, LHS.HighPC) < std::tie(RHS.LowPC, RHS.HighPC);
+}
+
+/// DebugAddressRangesVector - represents a set of absolute address ranges.
+using DebugAddressRangesVector = SmallVector<DebugAddressRange, 2>;
+
+/// Address range with location used by .debug_loc section.
+/// More compact than DWARFLocationEntry and uses absolute addresses.
+struct DebugLocationEntry {
+  uint64_t LowPC;
+  uint64_t HighPC;
+  SmallVector<uint8_t, 8> Expr;
+};
+
+using DebugLocationsVector = SmallVector<DebugLocationEntry, 4>;
+
+/// References a row in a DWARFDebugLine::LineTable by the DWARF
+/// Context index of the DWARF Compile Unit that owns the Line Table and the
+/// row index. This is tied to our IR during disassembly so that we can later
+/// update .debug_line information. RowIndex has a base of 1, which means a
+/// RowIndex of 1 maps to the first row of the line table and a RowIndex of 0
+/// is invalid.
+struct DebugLineTableRowRef {
+  uint32_t DwCompileUnitIndex;
+  uint32_t RowIndex;
+
+  const static DebugLineTableRowRef NULL_ROW;
+
+  bool operator==(const DebugLineTableRowRef &Rhs) const {
+    return DwCompileUnitIndex == Rhs.DwCompileUnitIndex &&
+           RowIndex == Rhs.RowIndex;
+  }
+
+  bool operator!=(const DebugLineTableRowRef &Rhs) const {
+    return !(*this == Rhs);
+  }
+
+  static DebugLineTableRowRef fromSMLoc(const SMLoc &Loc) {
+    union {
+      decltype(Loc.getPointer()) Ptr;
+      DebugLineTableRowRef Ref;
+    } U;
+    U.Ptr = Loc.getPointer();
+    return U.Ref;
+  }
+
+  SMLoc toSMLoc() const {
+    union {
+      decltype(SMLoc().getPointer()) Ptr;
+      DebugLineTableRowRef Ref;
+    } U;
+    U.Ref = *this;
+    return SMLoc::getFromPointer(U.Ptr);
+  }
+};
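DebugLineTableRowRef packs two 32-bit indices into the pointer-sized payload of an SMLoc via a union, so a line-table reference can ride along on an instruction's debug location. A minimal standalone sketch of the same round-trip technique (plain C++, no LLVM types; the struct and values are illustrative):

#include <cassert>
#include <cstdint>
#include <cstring>

struct RowRef {
  uint32_t UnitIndex;
  uint32_t RowIndex; // base 1; 0 means "invalid"
};

int main() {
  static_assert(sizeof(RowRef) == sizeof(const char *),
                "assumes a 64-bit pointer to hold both indices");

  RowRef Ref = {7, 42};

  // Pack the struct into a pointer-sized value and back. memcpy is the
  // well-defined way to do this; the union in the original code relies on
  // the same object representation.
  const char *Opaque;
  std::memcpy(&Opaque, &Ref, sizeof(Ref));

  RowRef Decoded;
  std::memcpy(&Decoded, &Opaque, sizeof(Decoded));

  assert(Decoded.UnitIndex == 7 && Decoded.RowIndex == 42);
  return 0;
}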
+
+using RangesBufferVector = SmallVector<char, 16>;
+
+/// Serializes the .debug_ranges DWARF section.
+class DebugRangesSectionWriter {
+public:
+  DebugRangesSectionWriter();
+
+  /// Add ranges with caching.
+  uint64_t
+  addRanges(DebugAddressRangesVector &&Ranges,
+            std::map<DebugAddressRangesVector, uint64_t> &CachedRanges);
+
+  /// Add ranges and return offset into section.
+  uint64_t addRanges(const DebugAddressRangesVector &Ranges);
+
+  /// Returns an offset of an empty address ranges list that is always written
+  /// to .debug_ranges
+  uint64_t getEmptyRangesOffset() const { return EmptyRangesOffset; }
+
+  /// Returns the SectionOffset.
+  uint64_t getSectionOffset();
+
+  std::unique_ptr<RangesBufferVector> finalize() {
+    return std::move(RangesBuffer);
+  }
+
+private:
+  std::unique_ptr<RangesBufferVector> RangesBuffer;
+
+  std::unique_ptr<raw_svector_ostream> RangesStream;
+
+  std::mutex WriterMutex;
+
+  /// Current offset in the section (updated as new entries are written).
+  /// Starts with 16 since the first 16 bytes are reserved for an empty range.
+  uint32_t SectionOffset{0};
+
+  /// Offset of an empty address ranges list.
+  static constexpr uint64_t EmptyRangesOffset{0};
+};
+
+/// Serializes the .debug_aranges DWARF section.
+class DebugARangesSectionWriter {
+public:
+  /// Add ranges for CU matching \p CUOffset.
+  void addCURanges(uint64_t CUOffset, DebugAddressRangesVector &&Ranges);
+
+  /// Writes .debug_aranges with the added ranges to the MCObjectWriter.
+  void writeARangesSection(raw_svector_ostream &RangesStream) const;
+
+  /// Resets the writer to a clear state.
+  void reset() {
+    CUAddressRanges.clear();
+  }
+
+  /// Map DWARFCompileUnit index to ranges.
+  using CUAddressRangesType = std::map<uint64_t, DebugAddressRangesVector>;
+
+  /// Return ranges for a given CU.
+  const CUAddressRangesType &getCUAddressRanges() const {
+    return CUAddressRanges;
+  }
+
+private:
+  /// Map from compile unit offset to the list of address intervals that belong
+  /// to that compile unit. Each interval is a pair
+  /// (first address, interval size).
+  CUAddressRangesType CUAddressRanges;
+
+  std::mutex CUAddressRangesMutex;
+};
+
+using IndexAddressPair = std::pair<uint32_t, uint64_t>;
+using AddressToIndexMap = std::unordered_map<uint64_t, uint32_t>;
+using IndexToAddressMap = std::unordered_map<uint32_t, uint64_t>;
+using AddressSectionBuffer = SmallVector<char, 4>;
+class DebugAddrWriter {
+public:
+  DebugAddrWriter() = delete;
+  DebugAddrWriter(BinaryContext *BC_);
+  /// Given an address returns an index in .debug_addr.
+  /// Adds Address to map.
+  uint32_t getIndexFromAddress(uint64_t Address, uint64_t DWOId);
+
+  /// Adds {Address, Index} to DWO ID CU.
+  void addIndexAddress(uint64_t Address, uint32_t Index, uint64_t DWOId);
+
+  /// Creates consolidated .debug_addr section, and builds DWOID to offset map.
+  AddressSectionBuffer finalize();
+
+  /// Given DWOID returns offset of this CU into the .debug_addr section.
+  uint64_t getOffset(uint64_t DWOId);
+
+  /// Returns true if any addresses were added, i.e. a .debug_addr section
+  /// will be created.
+  bool isInitialized() const { return !AddressMaps.empty(); }
+
+private:
+  class AddressForDWOCU {
+  public:
+    AddressToIndexMap::iterator find(uint64_t Address) {
+      return AddressToIndex.find(Address);
+    }
+    AddressToIndexMap::iterator end() { return AddressToIndex.end(); }
+    AddressToIndexMap::iterator begin() { return AddressToIndex.begin(); }
+
+    IndexToAddressMap::iterator indexToAdddessEnd() {
+      return IndexToAddress.end();
+    }
+    IndexToAddressMap::iterator indexToAddressBegin() {
+      return IndexToAddress.begin();
+    }
+    uint32_t getNextIndex() {
+      while (IndexToAddress.count(CurrentIndex))
+        ++CurrentIndex;
+      return CurrentIndex;
+    }
+
+    /// Inserts elements into IndexToAddress and AddressToIndex.
+    /// Follows the same semantics as unordered_map insert.
+    std::pair<AddressToIndexMap::iterator, bool> insert(uint64_t Address,
+                                                        uint32_t Index) {
+      IndexToAddress.insert({Index, Address});
+      return AddressToIndex.insert({Address, Index});
+    }
+
+    /// Updates the AddressToIndex map.
+    /// Follows the same semantics as unordered_map operator[].
+    void updateAddressToIndex(uint64_t Address, uint32_t Index) {
+      AddressToIndex[Address] = Index;
+    }
+
+    /// Updates the IndexToAddress map.
+    /// Follows the same semantics as unordered_map operator[].
+    void updateIndexToAddress(uint64_t Address, uint32_t Index) {
+      IndexToAddress[Index] = Address;
+    }
+
+    void dump();
+
+  private:
+    AddressToIndexMap AddressToIndex;
+    IndexToAddressMap IndexToAddress;
+    uint32_t CurrentIndex{0};
+  };
+  BinaryContext *BC;
+  /// Maps DWOID to AddressForDWOCU.
+  std::unordered_map<uint64_t, AddressForDWOCU> AddressMaps;
+  /// Maps DWOID to offset within .debug_addr section.
+  std::unordered_map<uint64_t, uint64_t> DWOIdToOffsetMap;
+};
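DebugAddrWriter keeps a pair of hash maps per DWO CU so addresses and .debug_addr indices can be looked up in both directions, with getNextIndex handing out the lowest index not already pinned by addIndexAddress. A standalone sketch of that allocation scheme (plain C++; the class and names are illustrative, not BOLT's API):

#include <cassert>
#include <cstdint>
#include <unordered_map>

// Two-way address <-> index map with lowest-free-index allocation.
class AddressIndexMap {
public:
  // Pin a pre-existing pair (e.g. one read from the input binary).
  void pin(uint64_t Address, uint32_t Index) {
    IndexToAddress[Index] = Address;
    AddressToIndex[Address] = Index;
  }

  // Return the index for Address, allocating the next free one if needed.
  uint32_t indexFor(uint64_t Address) {
    auto It = AddressToIndex.find(Address);
    if (It != AddressToIndex.end())
      return It->second;
    while (IndexToAddress.count(CurrentIndex)) // skip pinned indices
      ++CurrentIndex;
    pin(Address, CurrentIndex);
    return CurrentIndex;
  }

private:
  std::unordered_map<uint64_t, uint32_t> AddressToIndex;
  std::unordered_map<uint32_t, uint64_t> IndexToAddress;
  uint32_t CurrentIndex = 0;
};

int main() {
  AddressIndexMap M;
  M.pin(0x1000, 0);                // index 0 is taken
  assert(M.indexFor(0x2000) == 1); // next free index is 1
  assert(M.indexFor(0x2000) == 1); // stable on repeated queries
  return 0;
}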
+
+using DebugStrBufferVector = SmallVector<char, 16>;
+class DebugStrWriter {
+public:
+  DebugStrWriter() = delete;
+  DebugStrWriter(BinaryContext *Bc) : BC(Bc) { create(); }
+  std::unique_ptr<DebugStrBufferVector> finalize() {
+    return std::move(StrBuffer);
+  }
+
+  /// Adds string to .debug_str.
+  /// On first invocation it initializes internal data structures.
+  uint32_t addString(StringRef Str);
+
+  /// Returns false if no strings were added to .debug_str.
+  bool isInitialized() const { return !StrBuffer->empty(); }
+
+private:
+  /// Initializes Buffer and Stream.
+  void initialize();
+  /// Creates internal data structures.
+  void create();
+  std::unique_ptr<DebugStrBufferVector> StrBuffer;
+  std::unique_ptr<raw_svector_ostream> StrStream;
+  BinaryContext *BC;
+};
+
+using LocBufferVector = SmallVector<char, 16>;
+
+enum class LocWriterKind { DebugLocWriter, DebugLoclistWriter };
+
+/// Serializes part of a .debug_loc DWARF section with LocationLists.
+class DebugLocWriter {
+public:
+  DebugLocWriter() = delete;
+  DebugLocWriter(BinaryContext *BC);
+  virtual ~DebugLocWriter(){};
+
+  virtual uint64_t addList(const DebugLocationsVector &LocList);
+
+  virtual std::unique_ptr<LocBufferVector> finalize() {
+    return std::move(LocBuffer);
+  }
+
+  /// Offset of an empty location list.
+  static constexpr uint32_t EmptyListOffset = 0;
+
+  /// Value returned by addList if the list is empty.
+  /// Use 64 bits here so that a max 32-bit value can still
+  /// be stored while we use the max 64-bit value as the empty tag.
+  static constexpr uint64_t EmptyListTag = -1;
+
+  LocWriterKind getKind() const { return Kind; }
+
+  static bool classof(const DebugLocWriter *Writer) {
+    return Writer->getKind() == LocWriterKind::DebugLocWriter;
+  }
+
+protected:
+  std::unique_ptr<LocBufferVector> LocBuffer;
+
+  std::unique_ptr<raw_svector_ostream> LocStream;
+  /// Current offset in the section (updated as new entries are written).
+  /// Starts with 0 here since this only writes part of a full location lists
+  /// section. In the final section, the first 16 bytes are reserved for an
+  /// empty list.
+  uint32_t SectionOffset{0};
+  LocWriterKind Kind{LocWriterKind::DebugLocWriter};
+};
+
+class DebugLoclistWriter : public DebugLocWriter {
+public:
+  ~DebugLoclistWriter() {}
+  DebugLoclistWriter() = delete;
+  DebugLoclistWriter(BinaryContext *BC, uint64_t DWOId_)
+      : DebugLocWriter(BC), DWOId(DWOId_) {
+    Kind = LocWriterKind::DebugLoclistWriter;
+    assert(DebugLoclistWriter::AddrWriter &&
+           "Please use setAddressWriter to initialize "
+           "DebugAddrWriter before instantiation.");
+  }
+
+  static void setAddressWriter(DebugAddrWriter *AddrW) { AddrWriter = AddrW; }
+
+  /// Writes out a location list, with indices into .debug_addr to be patched
+  /// later.
+  uint64_t addList(const DebugLocationsVector &LocList) override;
+
+  /// Finalizes all location lists by patching the correct indices into
+  /// .debug_addr.
+  void finalizePatches();
+
+  static bool classof(const DebugLocWriter *Writer) {
+    return Writer->getKind() == LocWriterKind::DebugLoclistWriter;
+  }
+
+private:
+  class Patch {
+  public:
+    Patch() = delete;
+    Patch(uint64_t O, uint64_t A) : Offset(O), Address(A) {}
+    uint64_t Offset{0};
+    uint64_t Address{0};
+  };
+  static DebugAddrWriter *AddrWriter;
+  uint64_t DWOId{0};
+  std::unordered_map<uint64_t, uint32_t> AddressMap;
+  std::vector<Patch> IndexPatches;
+  constexpr static uint32_t NumBytesForIndex{3};
+};
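DebugLoclistWriter reserves NumBytesForIndex zero bytes for each DW_LLE_startx_length index and later overwrites them with a ULEB128 value padded to exactly that width, so the offsets recorded in IndexPatches stay valid. A standalone sketch of padded-ULEB128 encoding and in-place patching (plain C++; mirrors the idea, not BOLT's API):

#include <cassert>
#include <cstdint>
#include <string>

// Encode Value as ULEB128, padded to exactly PadTo bytes.
static std::string encodeULEB128Padded(uint64_t Value, unsigned PadTo) {
  std::string Out;
  unsigned Count = 0;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    ++Count;
    if (Value != 0 || Count < PadTo)
      Byte |= 0x80; // continuation bit
    Out.push_back(static_cast<char>(Byte));
  } while (Value != 0);
  // Pad with 0x80 continuation bytes, then a final 0x00, so decoders
  // still read the same value.
  while (Count < PadTo) {
    ++Count;
    Out.push_back(static_cast<char>(Count < PadTo ? 0x80 : 0x00));
  }
  return Out;
}

int main() {
  std::string Section(8, '\0'); // pretend section with a 3-byte hole at 2
  const unsigned Offset = 2, Width = 3;
  std::string Bytes = encodeULEB128Padded(42, Width);
  assert(Bytes.size() == Width);
  Section.replace(Offset, Width, Bytes); // patch the reserved bytes in place
  return 0;
}

LLVM's encodeULEB128 with a PadTo argument, used in finalizePatches above, produces the same fixed-width encoding.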
+
+/// Abstract interface for classes that apply modifications to a binary string.
+class BinaryPatcher {
+public:
+  virtual ~BinaryPatcher() {}
+  /// Applies in-place modifications to the binary string \p BinaryContents.
+  /// \p DWPOffset is used to correctly patch sections that come from a DWP
+  /// file.
+  virtual void patchBinary(std::string &BinaryContents,
+                           uint32_t DWPOffset) = 0;
+};
+
+/// Applies simple modifications to a binary string, such as directly replacing
+/// the contents of a certain portion with a string or an integer.
+class SimpleBinaryPatcher : public BinaryPatcher {
+private:
+  std::vector<std::pair<uint32_t, std::string>> Patches;
+
+  /// Adds a patch to replace the contents of \p ByteSize bytes with the
+  /// integer \p NewValue encoded in little-endian, with the least-significant
+  /// byte being written at the offset \p Offset.
+  void addLEPatch(uint32_t Offset, uint64_t NewValue, size_t ByteSize);
+
+  /// RangeBase for DWO DebugInfo Patcher.
+  uint64_t RangeBase{0};
+
+  /// Gets reset to false when setRangeBase is invoked.
+  /// Gets set to true when getRangeBase is called.
+  uint64_t WasRangeBaseUsed{false};
+
+public:
+  virtual ~SimpleBinaryPatcher() {}
+
+  /// Adds a patch to replace the contents of the binary string starting at the
+  /// specified \p Offset with the string \p NewValue.
+  void addBinaryPatch(uint32_t Offset, const std::string &NewValue);
+
+  /// Adds a patch to replace the contents of a single byte of the string, at
+  /// the offset \p Offset, with the value \p Value.
+  void addBytePatch(uint32_t Offset, uint8_t Value);
+
+  /// Adds a patch to put the integer \p NewValue encoded as a 64-bit
+  /// little-endian value at offset \p Offset.
+  void addLE64Patch(uint32_t Offset, uint64_t NewValue);
+
+  /// Adds a patch to put the integer \p NewValue encoded as a 32-bit
+  /// little-endian value at offset \p Offset.
+  void addLE32Patch(uint32_t Offset, uint32_t NewValue);
+
+  /// Add a patch at \p Offset with \p Value using unsigned LEB128 encoding
+  /// with size \p Size. \p Size should not be less than the minimum number of
+  /// bytes needed to encode \p Value.
+  void addUDataPatch(uint32_t Offset, uint64_t Value, uint64_t Size);
+
+  /// Sets DW_AT_GNU_ranges_base.
+  void setRangeBase(uint64_t Rb) {
+    WasRangeBaseUsed = false;
+    RangeBase = Rb;
+  }
+
+  /// Gets DW_AT_GNU_ranges_base.
+  uint64_t getRangeBase() {
+    WasRangeBaseUsed = true;
+    return RangeBase;
+  }
+
+  /// Returns true if low_pc/high_pc was broken up into ranges, i.e. if
+  /// getRangeBase was called.
+  bool getWasRangBasedUsed() const { return WasRangeBaseUsed; }
+
+  virtual void patchBinary(std::string &BinaryContents,
+                           uint32_t DWPOffset) override;
+};
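A standalone sketch of the patch-list idea behind SimpleBinaryPatcher: collect (offset, bytes) pairs, then splat them over a string buffer in one pass (plain C++; not BOLT's API, values are illustrative):

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<uint32_t, std::string>> Patches;

  // Queue a 32-bit little-endian value at offset 4.
  uint32_t V = 0xdeadbeef;
  std::string LE32;
  for (int I = 0; I < 4; ++I, V >>= 8)
    LE32.push_back(static_cast<char>(V & 0xff));
  Patches.emplace_back(4, LE32);

  // Queue a single-byte patch at offset 0.
  Patches.emplace_back(0, std::string(1, '\x2a'));

  // Apply all patches in place.
  std::string Contents(8, '\0');
  for (const auto &P : Patches) {
    assert(P.first + P.second.size() <= Contents.size());
    Contents.replace(P.first, P.second.size(), P.second);
  }

  assert(Contents[0] == '\x2a');
  assert(static_cast<uint8_t>(Contents[4]) == 0xef); // LSB written first
  return 0;
}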
+
+/// Apply small modifications to the .debug_abbrev DWARF section.
+class DebugAbbrevPatcher : public BinaryPatcher {
+public:
+  /// Patch of changing one attribute to another.
+  struct AbbrevAttrPatch {
+    const DWARFAbbreviationDeclaration *Abbrev;
+    dwarf::Attribute Attr;    // ID of attribute to be replaced.
+    dwarf::Attribute NewAttr; // ID of the new attribute.
+    uint8_t NewForm;          // Form of the new attribute.
+
+    bool operator==(const AbbrevAttrPatch &RHS) const {
+      return Abbrev == RHS.Abbrev && Attr == RHS.Attr;
+    }
+  };
+
+  struct AbbrevHash {
+    size_t operator()(const AbbrevAttrPatch &P) const {
+      return std::hash<uint64_t>()(((uint64_t)P.Abbrev << 16) + P.Attr);
+    }
+  };
+
+private:
+  std::unordered_set<AbbrevAttrPatch, AbbrevHash> AbbrevPatches;
+  /// Using a vector because this is currently used for the specific purpose
+  /// of patching DW_AT_GNU_ranges_base. We don't need fast lookup, but we do
+  /// need the patches to be in order.
+  std::vector<AbbrevAttrPatch> AbbrevNonStandardPatches;
+
+public:
+  virtual ~DebugAbbrevPatcher() {}
+  /// Adds a patch to change an attribute of the abbreviation.
+  /// \p Abbrev the abbreviation to be modified.
+  /// \p AttrTag ID of the attribute to be replaced.
+  /// \p NewAttrTag ID of the new attribute.
+  /// \p NewAttrForm Form of the new attribute.
+  /// If a non-standard form is added through this API, the section will be
+  /// expanded to accommodate it.
+  void addAttributePatch(const DWARFAbbreviationDeclaration *Abbrev,
+                         dwarf::Attribute AttrTag, dwarf::Attribute NewAttrTag,
+                         uint8_t NewAttrForm);
+
+  virtual void patchBinary(std::string &Contents, uint32_t DWPOffset) override;
+
+  /// Finds an abbreviation patch.
+  /// \p AbbrevDecl the abbreviation declaration.
+  /// \p AttrTag Attribute to search for.
+  /// Returns nullptr if the patch doesn't exist.
+  const AbbrevAttrPatch *find(const DWARFAbbreviationDeclaration *AbbrevDecl,
+                              dwarf::Attribute AttrTag) {
+    auto Iter = AbbrevPatches.find({AbbrevDecl, AttrTag, AttrTag, 0});
+    if (Iter == AbbrevPatches.end())
+      return nullptr;
+    return &(*Iter);
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/DynoStats.cpp b/bolt/src/DynoStats.cpp
new file mode 100644
index 000000000000..1aa787df5872
--- /dev/null
+++ b/bolt/src/DynoStats.cpp
@@ -0,0 +1,251 @@
+//===--- DynoStats.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "DynoStats.h"
+#include "BinaryBasicBlock.h"
+#include "BinaryFunction.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <string>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory BoltCategory;
+
+static cl::opt<unsigned>
+DynoStatsScale("dyno-stats-scale",
+  cl::desc("scale to be applied while reporting dyno stats"),
+  cl::Optional,
+  cl::init(1),
+  cl::Hidden,
+  cl::cat(BoltCategory));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+constexpr const char *DynoStats::Desc[];
+
+bool DynoStats::operator<(const DynoStats &Other) const {
+  return std::lexicographical_compare(
+    &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT],
+    &Other.Stats[FIRST_DYNO_STAT], &Other.Stats[LAST_DYNO_STAT]
+  );
+}
+
+bool DynoStats::operator==(const DynoStats &Other) const {
+  return std::equal(
+    &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT],
+    &Other.Stats[FIRST_DYNO_STAT]
+  );
+}
+
+bool DynoStats::lessThan(const DynoStats &Other,
+                         ArrayRef<Category> Keys) const {
+  return std::lexicographical_compare(
+    Keys.begin(), Keys.end(),
+    Keys.begin(), Keys.end(),
+    [this,&Other](const Category A, const Category) {
+      return Stats[A] < Other.Stats[A];
+    }
+  );
+}
+
+void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const {
+  auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat,
+                                uint64_t OtherStat) {
+    OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name;
+    if (Other) {
+      if (Stat != OtherStat) {
+        OtherStat = std::max(OtherStat, uint64_t(1)); // to prevent divide by 0
+        OS << format(" (%+.1f%%)",
+                     ((float)Stat - (float)OtherStat) * 100.0 /
+                       (float)(OtherStat));
+      } else {
+        OS << " (=)";
+      }
+    }
+    OS << '\n';
+  };
+
+  for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
+       Stat < DynoStats::LAST_DYNO_STAT;
+       ++Stat) {
+
+    if (!PrintAArch64Stats && Stat == DynoStats::VENEER_CALLS_AARCH64)
+      continue;
+
+    printStatWithDelta(Desc[Stat], Stats[Stat], Other ? (*Other)[Stat] : 0);
+  }
+}
+
+void DynoStats::operator+=(const DynoStats &Other) {
+  for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
+       Stat < DynoStats::LAST_DYNO_STAT;
+       ++Stat) {
+    Stats[Stat] += Other[Stat];
+  }
+}
+
+DynoStats getDynoStats(const BinaryFunction &BF) {
+  auto &BC = BF.getBinaryContext();
+
+  DynoStats Stats(/*PrintAArch64Stats*/ BC.isAArch64());
+
+  // Return empty stats for functions we don't completely understand.
+  if (!BF.isSimple() || !BF.hasValidProfile() || !BF.hasCanonicalCFG())
+    return Stats;
+
+  // Update enumeration of basic blocks for correct detection of branch
+  // direction.
+  BF.updateLayoutIndices();
+
+  for (BinaryBasicBlock *const &BB : BF.layout()) {
+    // The basic block execution count equals the sum of incoming branch
+    // frequencies. This may deviate from the sum of outgoing branches of the
+    // basic block, especially since the block may contain a call to a
+    // function that does not return or that throws an exception.
+    const uint64_t BBExecutionCount = BB->getKnownExecutionCount();
+
+    // Ignore empty blocks and blocks that were not executed.
+    if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0)
+      continue;
+
+    // Count AArch64 linker-inserted veneers.
+    if (BF.isAArch64Veneer())
+      Stats[DynoStats::VENEER_CALLS_AARCH64] += BF.getKnownExecutionCount();
+
+    // Count the number of calls by iterating through all instructions.
+    for (const MCInst &Instr : *BB) {
+      if (BC.MIB->isStore(Instr)) {
+        Stats[DynoStats::STORES] += BBExecutionCount;
+      }
+      if (BC.MIB->isLoad(Instr)) {
+        Stats[DynoStats::LOADS] += BBExecutionCount;
+      }
+
+      if (!BC.MIB->isCall(Instr))
+        continue;
+
+      uint64_t CallFreq = BBExecutionCount;
+      if (BC.MIB->getConditionalTailCall(Instr)) {
+        CallFreq =
+          BC.MIB->getAnnotationWithDefault<uint64_t>(Instr, "CTCTakenCount");
+      }
+      Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
+      if (BC.MIB->isIndirectCall(Instr)) {
+        Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
+      } else if (const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(Instr)) {
+        const BinaryFunction *BF = BC.getFunctionForSymbol(CallSymbol);
+        if (BF && BF->isPLTFunction()) {
+          Stats[DynoStats::PLT_CALLS] += CallFreq;
+
+          // We don't process PLT functions and hence have to adjust relevant
+          // dynostats here for:
+          //
+          //   jmp *GOT_ENTRY(%rip)
+          //
+          // NOTE: this is arch-specific.
+          Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
+          Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
+          Stats[DynoStats::LOADS] += CallFreq;
+          Stats[DynoStats::INSTRUCTIONS] += CallFreq;
+        }
+      }
+    }
+
+    Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount;
+
+    // Jump tables.
+    const MCInst *LastInstr = BB->getLastNonPseudoInstr();
+    if (BC.MIB->getJumpTable(*LastInstr)) {
+      Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount;
+      LLVM_DEBUG(
+        static uint64_t MostFrequentJT;
+        if (BBExecutionCount > MostFrequentJT) {
+          MostFrequentJT = BBExecutionCount;
+          dbgs() << "BOLT-INFO: most frequently executed jump table is in "
+                 << "function " << BF << " in basic block " << BB->getName()
+                 << " executed a total of " << BBExecutionCount << " times.\n";
+        }
+      );
+      continue;
+    }
+
+    if (BC.MIB->isIndirectBranch(*LastInstr) && !BC.MIB->isCall(*LastInstr)) {
+      Stats[DynoStats::UNKNOWN_INDIRECT_BRANCHES] += BBExecutionCount;
+      continue;
+    }
+
+    // Update stats for branches.
+    const MCSymbol *TBB = nullptr;
+    const MCSymbol *FBB = nullptr;
+    MCInst *CondBranch = nullptr;
+    MCInst *UncondBranch = nullptr;
+    if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
+      continue;
+    }
+
+    if (!CondBranch && !UncondBranch) {
+      continue;
+    }
+
+    // Simple unconditional branch.
+    if (!CondBranch) {
+      Stats[DynoStats::UNCOND_BRANCHES] += BBExecutionCount;
+      continue;
+    }
+
+    // CTCs: instruction annotations could be stripped, hence check the number
+    // of successors to identify conditional tail calls.
+    if (BB->succ_size() == 1) {
+      if (BB->branch_info_begin() != BB->branch_info_end())
+        Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count;
+      continue;
+    }
+
+    // Conditional branch that could be followed by an unconditional branch.
+    uint64_t TakenCount = BB->getTakenBranchInfo().Count;
+    if (TakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      TakenCount = 0;
+
+    uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
+    if (NonTakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+      NonTakenCount = 0;
+
+    if (BF.isForwardBranch(BB, BB->getConditionalSuccessor(true))) {
+      Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount;
+      Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount;
+    } else {
+      Stats[DynoStats::BACKWARD_COND_BRANCHES] += BBExecutionCount;
+      Stats[DynoStats::BACKWARD_COND_BRANCHES_TAKEN] += TakenCount;
+    }
+
+    if (UncondBranch) {
+      Stats[DynoStats::UNCOND_BRANCHES] += NonTakenCount;
+    }
+  }
+
+  return Stats;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/DynoStats.h b/bolt/src/DynoStats.h
new file mode 100644
index 000000000000..f3a9b3042764
--- /dev/null
+++ b/bolt/src/DynoStats.h
@@ -0,0 +1,177 @@
+//===--- DynoStats.h ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
+#define LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+namespace bolt {
+class BinaryFunction;
+
+/// Class encapsulating runtime statistics about an execution unit.
+class DynoStats {
+
+#define DYNO_STATS\
+  D(FIRST_DYNO_STAT, "", Fn)\
+  D(FORWARD_COND_BRANCHES, "executed forward branches", Fn)\
+  D(FORWARD_COND_BRANCHES_TAKEN, "taken forward branches", Fn)\
+  D(BACKWARD_COND_BRANCHES, "executed backward branches", Fn)\
+  D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\
+  D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\
+  D(FUNCTION_CALLS, "all function calls", Fn)\
+  D(INDIRECT_CALLS, "indirect calls", Fn)\
+  D(PLT_CALLS, "PLT calls", Fn)\
+  D(INSTRUCTIONS, "executed instructions", Fn)\
+  D(LOADS, "executed load instructions", Fn)\
+  D(STORES, "executed store instructions", Fn)\
+  D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\
+  D(UNKNOWN_INDIRECT_BRANCHES, "taken unknown indirect branches", Fn)\
+  D(ALL_BRANCHES, "total branches",\
+      Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\
+  D(ALL_TAKEN, "taken branches",\
+      Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\
+  D(NONTAKEN_CONDITIONAL, "non-taken conditional branches",\
+      Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\
+  D(TAKEN_CONDITIONAL, "taken conditional branches",\
+      Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\
+  D(ALL_CONDITIONAL, "all conditional branches",\
+      Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\
+  D(VENEER_CALLS_AARCH64, "linker-inserted veneer calls", Fn)\
+  D(LAST_DYNO_STAT, "", 0)
+
+public:
+#define D(name, ...) name,
+  enum Category : uint8_t { DYNO_STATS };
+#undef D
+
+
+private:
+  uint64_t Stats[LAST_DYNO_STAT+1];
+  bool PrintAArch64Stats;
+
+#define D(name, desc, ...) desc,
+  static constexpr const char *Desc[] = { DYNO_STATS };
+#undef D
+
+public:
+  DynoStats(bool PrintAArch64Stats) {
+    this->PrintAArch64Stats = PrintAArch64Stats;
+    for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat)
+      Stats[Stat] = 0;
+  }
+
+  uint64_t &operator[](size_t I) {
+    assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT &&
+           "index out of bounds");
+    return Stats[I];
+  }
+
+  uint64_t operator[](size_t I) const {
+    switch (I) {
+#define D(name, desc, func) \
+    case name: \
+      return func;
+#define Fn Stats[I]
+#define Fadd(a, b) operator[](a) + operator[](b)
+#define Fsub(a, b) operator[](a) - operator[](b)
+#define F(a) operator[](a)
+#define Radd(a, b) (a + b)
+#define Rsub(a, b) (a - b)
+    DYNO_STATS
+#undef Rsub
+#undef Radd
+#undef F
+#undef Fsub
+#undef Fadd
+#undef Fn
+#undef D
+    default:
+      llvm_unreachable("index out of bounds");
+    }
+    return 0;
+  }
+
+  void print(raw_ostream &OS, const DynoStats *Other = nullptr) const;
+
+  void operator+=(const DynoStats &Other);
+  bool operator<(const DynoStats &Other) const;
+  bool operator==(const DynoStats &Other) const;
+  bool operator!=(const DynoStats &Other) const { return !operator==(Other); }
+  bool lessThan(const DynoStats &Other, ArrayRef<Category> Keys) const;
+
+  static const char* Description(const Category C) {
+    return Desc[C];
+  }
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) {
+  Stats.print(OS, nullptr);
+  return OS;
+}
+
+DynoStats operator+(const DynoStats &A, const DynoStats &B);
+
+/// Return dynostats for the function.
+///
+/// The function relies on branch instructions being in-sync with CFG for
+/// branch instructions stats. Thus it is better to call it after
+/// fixBranches().
+DynoStats getDynoStats(const BinaryFunction &BF);
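The DYNO_STATS X-macro above expands once into the Category enum, once into the description array, and once into the switch that derives aggregate stats such as ALL_CONDITIONAL from the raw counters. A self-contained miniature of the same pattern (illustrative names, not the BOLT list):

#include <cassert>
#include <cstdint>

#define MINI_STATS \
  X(FIRST, "", Raw) \
  X(LOADS, "loads", Raw) \
  X(STORES, "stores", Raw) \
  X(MEM_OPS, "memory ops", Add(LOADS, STORES)) \
  X(LAST, "", 0)

struct MiniStats {
#define X(name, desc, expr) name,
  enum Category : uint8_t { MINI_STATS };
#undef X

  uint64_t Raw[LAST + 1] = {};

  // Computed lookup: raw categories read the array, derived ones recurse.
  uint64_t get(Category C) const {
    switch (C) {
#define X(name, desc, expr) case name: return expr;
#define Raw Raw[C]
#define Add(a, b) get(a) + get(b)
      MINI_STATS
#undef Add
#undef Raw
#undef X
    default:
      return 0;
    }
  }
};

int main() {
  MiniStats S;
  S.Raw[MiniStats::LOADS] = 3;
  S.Raw[MiniStats::STORES] = 4;
  assert(S.get(MiniStats::MEM_OPS) == 7); // derived from the two raw stats
  return 0;
}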
+/// Return program-wide dynostats.
+template <typename FuncsType>
+inline DynoStats getDynoStats(const FuncsType &Funcs) {
+  bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
+  DynoStats dynoStats(IsAArch64);
+  for (auto &BFI : Funcs) {
+    auto &BF = BFI.second;
+    if (BF.isSimple()) {
+      dynoStats += getDynoStats(BF);
+    }
+  }
+  return dynoStats;
+}
+
+/// Call a function with optional before and after dynostats printing.
+template <typename FnType, typename FuncsType>
+inline void
+callWithDynoStats(FnType &&Func,
+                  const FuncsType &Funcs,
+                  StringRef Phase,
+                  const bool Flag) {
+  bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64();
+  DynoStats DynoStatsBefore(IsAArch64);
+  if (Flag) {
+    DynoStatsBefore = getDynoStats(Funcs);
+  }
+
+  Func();
+
+  if (Flag) {
+    const DynoStats DynoStatsAfter = getDynoStats(Funcs);
+    const bool Changed = (DynoStatsAfter != DynoStatsBefore);
+    outs() << "BOLT-INFO: program-wide dynostats after running "
+           << Phase << (Changed ? "" : " (no change)") << ":\n\n"
+           << DynoStatsBefore << '\n';
+    if (Changed) {
+      DynoStatsAfter.print(outs(), &DynoStatsBefore);
+    }
+    outs() << '\n';
+  }
+}
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
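callWithDynoStats wraps an arbitrary pass in an optional before/after measurement and only prints a delta when something changed. The same wrapper shape in miniature (plain C++; the stats type and names are stand-ins, not BOLT's API):

#include <cstdint>
#include <functional>
#include <iostream>

struct Stats {
  uint64_t Instructions = 0;
  bool operator!=(const Stats &O) const {
    return Instructions != O.Instructions;
  }
};

// Run Phase, sampling Stats before and after only when Enabled.
template <typename FnType>
void callWithStats(FnType &&Phase, const std::function<Stats()> &Sample,
                   const char *Name, bool Enabled) {
  Stats Before;
  if (Enabled)
    Before = Sample();

  Phase();

  if (Enabled) {
    const Stats After = Sample();
    const bool Changed = After != Before;
    std::cout << "stats after " << Name << (Changed ? "" : " (no change)")
              << ": " << After.Instructions << "\n";
  }
}

int main() {
  uint64_t Count = 100;
  auto Sample = [&] { return Stats{Count}; };
  callWithStats([&] { Count -= 10; }, Sample, "shrink-pass", true);
  return 0;
}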
diff --git a/bolt/src/Exceptions.cpp b/bolt/src/Exceptions.cpp
new file mode 100644
index 000000000000..22b1a8439f08
--- /dev/null
+++ b/bolt/src/Exceptions.cpp
@@ -0,0 +1,782 @@
+//===-- Exceptions.cpp - Helpers for processing C++ exceptions ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Some of the code is taken from examples/ExceptionDemo
+//
+//===----------------------------------------------------------------------===//
+
+#include "Exceptions.h"
+#include "BinaryFunction.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt-exceptions"
+
+using namespace llvm::dwarf;
+
+namespace opts {
+
+extern llvm::cl::OptionCategory BoltCategory;
+
+extern llvm::cl::opt<unsigned> Verbosity;
+
+static llvm::cl::opt<bool>
+PrintExceptions("print-exceptions",
+  llvm::cl::desc("print exception handling data"),
+  llvm::cl::ZeroOrMore,
+  llvm::cl::Hidden,
+  llvm::cl::cat(BoltCategory));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+// Read and dump the .gcc_except_table section entry.
+//
+// The .gcc_except_table section contains a set of Language-Specific Data
+// Areas - a fancy name for exception handling tables. There's one LSDA entry
+// per function. However, we can't actually tell which function an LSDA refers
+// to unless we parse the .eh_frame entry that refers to the LSDA.
+// Then, inside the LSDA, most addresses are encoded relative to the function
+// start, so we need the function context in order to get to real addresses.
+//
+// The best visual representation of the tables comprising an LSDA and the
+// relationships between them is illustrated at:
+// https://github.com/itanium-cxx-abi/cxx-abi/blob/master/exceptions.pdf
+// Keep in mind that the GCC implementation deviates slightly from that
+// document.
+//
+// To summarize, there are 4 tables in an LSDA: the call site table, the
+// action table, the type table, and the type index table (for indirection).
+// The main table contains call site entries. Each call site includes a PC
+// range that can throw an exception, a handler (landing pad), and a reference
+// to an entry in the action table. The handler and/or action could be 0. An
+// action entry is the head of a list of actions associated with a call site.
+// The action table contains all such lists (it could be optimized to share
+// list tails). Each action could be either to catch an exception of a given
+// type, to perform a cleanup, or to propagate the exception after filtering
+// it out (e.g. to make sure the function exception specification is not
+// violated). A catch action contains a reference to an entry in the type
+// table, and a filter action refers to an entry in the type index table to
+// encode a set of types to filter.
+//
+// The call site table follows the LSDA header. The action table immediately
+// follows the call site table.
+//
+// Both the type table and the type index table start at the same location,
+// but they grow in opposite directions (types go up, indices go down). The
+// beginning of these tables is encoded in the LSDA header. Sizes for both of
+// the tables are not included anywhere.
+//
+// We have to parse all of the tables to determine their sizes. Then we have
+// to parse the call site table and associate discovered information with
+// actual call instructions and landing pad blocks.
+//
+// For the purpose of rewriting exception handling tables, we can reuse the
+// action table and the type index table in their original binary format.
+//
+// The type table could be encoded using position-independent references, and
+// thus may require relocation.
+//
+// Ideally we should be able to re-write the LSDA in-place, without the need
+// to allocate a new space for it. Sadly, there's no guarantee that the new
+// call site table will be the same size, since GCC uses uleb128 encodings for
+// PC offsets.
+//
+// Note: some functions have LSDA entries with 0 call site entries.
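A standalone sketch of walking a call-site table in the spirit of the layout described above, using a small ULEB128 reader. The byte values and the fixed "all fields are ULEB128" encoding are simplifying assumptions for illustration; real tables use the encoding byte recorded in the LSDA header:

#include <cstdint>
#include <cstdio>
#include <vector>

// Read one ULEB128 value, advancing Off.
static uint64_t readULEB128(const std::vector<uint8_t> &Buf, size_t &Off) {
  uint64_t Result = 0;
  unsigned Shift = 0;
  uint8_t Byte;
  do {
    Byte = Buf[Off++];
    Result |= uint64_t(Byte & 0x7f) << Shift;
    Shift += 7;
  } while (Byte & 0x80);
  return Result;
}

int main() {
  // One call-site record: start=0x10, length=0x20, landing pad=0x40,
  // action=1 (first action-table entry), each ULEB128-encoded.
  const std::vector<uint8_t> CallSiteTable = {0x10, 0x20, 0x40, 0x01};

  size_t Off = 0;
  while (Off < CallSiteTable.size()) {
    uint64_t Start = readULEB128(CallSiteTable, Off);
    uint64_t Length = readULEB128(CallSiteTable, Off);
    uint64_t LandingPad = readULEB128(CallSiteTable, Off);
    uint64_t Action = readULEB128(CallSiteTable, Off);
    std::printf("call site [+0x%llx, +0x%llx) -> LP +0x%llx, action %llu\n",
                (unsigned long long)Start,
                (unsigned long long)(Start + Length),
                (unsigned long long)LandingPad,
                (unsigned long long)Action);
  }
  return 0;
}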
+void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
+                               uint64_t LSDASectionAddress) {
+  assert(CurrentState == State::Disassembled && "unexpected function state");
+
+  if (!getLSDAAddress())
+    return;
+
+  DWARFDataExtractor Data(
+      StringRef(reinterpret_cast<const char *>(LSDASectionData.data()),
+                LSDASectionData.size()),
+      BC.DwCtx->getDWARFObj().isLittleEndian(), 8);
+  uint64_t Offset = getLSDAAddress() - LSDASectionAddress;
+  assert(Data.isValidOffset(Offset) && "wrong LSDA address");
+
+  uint8_t LPStartEncoding = Data.getU8(&Offset);
+  uint64_t LPStart = 0;
+  if (Optional<uint64_t> MaybeLPStart = Data.getEncodedPointer(
+          &Offset, LPStartEncoding, Offset + LSDASectionAddress))
+    LPStart = *MaybeLPStart;
+
+  assert(LPStart == 0 && "support for split functions not implemented");
+
+  const uint8_t TTypeEncoding = Data.getU8(&Offset);
+  size_t TTypeEncodingSize = 0;
+  uintptr_t TTypeEnd = 0;
+  if (TTypeEncoding != DW_EH_PE_omit) {
+    TTypeEnd = Data.getULEB128(&Offset);
+    TTypeEncodingSize = BC.getDWARFEncodingSize(TTypeEncoding);
+  }
+
+  if (opts::PrintExceptions) {
+    outs() << "[LSDA at 0x" << Twine::utohexstr(getLSDAAddress())
+           << " for function " << *this << "]:\n";
+    outs() << "LPStart Encoding = 0x"
+           << Twine::utohexstr(LPStartEncoding) << '\n';
+    outs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n';
+    outs() << "TType Encoding = 0x" << Twine::utohexstr(TTypeEncoding) << '\n';
+    outs() << "TType End = " << TTypeEnd << '\n';
+  }
+
+  // Table to store list of indices in type table. Entries are uleb128 values.
+  const uint64_t TypeIndexTableStart = Offset + TTypeEnd;
+
+  // Offset past the last decoded index.
+  uint64_t MaxTypeIndexTableOffset = 0;
+
+  // Max positive index used in type table.
+  unsigned MaxTypeIndex = 0;
+
+  // The actual type info table starts at the same location, but grows in
+  // the opposite direction. TTypeEncoding is used to encode stored values.
+  const uint64_t TypeTableStart = Offset + TTypeEnd;
+
+  uint8_t CallSiteEncoding = Data.getU8(&Offset);
+  uint32_t CallSiteTableLength = Data.getULEB128(&Offset);
+  uint64_t CallSiteTableStart = Offset;
+  uint64_t CallSiteTableEnd = CallSiteTableStart + CallSiteTableLength;
+  uint64_t CallSitePtr = CallSiteTableStart;
+  uint64_t ActionTableStart = CallSiteTableEnd;
+
+  if (opts::PrintExceptions) {
+    outs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n';
+    outs() << "CallSite table length = " << CallSiteTableLength << '\n';
+    outs() << '\n';
+  }
+
+  this->HasEHRanges = CallSitePtr < CallSiteTableEnd;
+  const uint64_t RangeBase = getAddress();
+  while (CallSitePtr < CallSiteTableEnd) {
+    uint64_t Start = *Data.getEncodedPointer(&CallSitePtr, CallSiteEncoding,
+                                             CallSitePtr + LSDASectionAddress);
+    uint64_t Length = *Data.getEncodedPointer(
+        &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress);
+    uint64_t LandingPad = *Data.getEncodedPointer(
+        &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress);
+    uint64_t ActionEntry = Data.getULEB128(&CallSitePtr);
+
+    if (opts::PrintExceptions) {
+      outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start)
+             << ", 0x" << Twine::utohexstr(RangeBase + Start + Length)
+             << "); landing pad: 0x" << Twine::utohexstr(LPStart + LandingPad)
+             << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n";
+      outs() << " current offset is " << (CallSitePtr - CallSiteTableStart)
+             << '\n';
+    }
+
+    // Create a handler entry if necessary.
+    MCSymbol *LPSymbol = nullptr;
+    if (LandingPad) {
+      if (Instructions.find(LandingPad) == Instructions.end()) {
+        if (opts::Verbosity >= 1) {
+          errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LandingPad)
+                 << " not pointing to an instruction in function "
+                 << *this << " - ignoring.\n";
+        }
+      } else {
+        auto Label = Labels.find(LandingPad);
+        if (Label != Labels.end()) {
+          LPSymbol = Label->second;
+        } else {
+          LPSymbol = BC.Ctx->createNamedTempSymbol("LP");
+          Labels[LandingPad] = LPSymbol;
+        }
+      }
+    }
+
+    // Mark all call instructions in the range.
+    auto II = Instructions.find(Start);
+    auto IE = Instructions.end();
+    assert(II != IE && "exception range not pointing to an instruction");
+    do {
+      MCInst &Instruction = II->second;
+      if (BC.MIB->isCall(Instruction) &&
+          !BC.MIB->getConditionalTailCall(Instruction)) {
+        assert(!BC.MIB->isInvoke(Instruction) &&
+               "overlapping exception ranges detected");
+        // Add extra operands to a call instruction making it an invoke from
+        // now on.
+        BC.MIB->addEHInfo(Instruction,
+                          MCPlus::MCLandingPad(LPSymbol, ActionEntry));
+      }
+      ++II;
+    } while (II != IE && II->first < Start + Length);
+
+    if (ActionEntry != 0) {
+      auto printType = [&](int Index, raw_ostream &OS) {
+        assert(Index > 0 && "only positive indices are valid");
+        uint64_t TTEntry = TypeTableStart - Index * TTypeEncodingSize;
+        const uint64_t TTEntryAddress = TTEntry + LSDASectionAddress;
+        uint64_t TypeAddress =
+            *Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress);
+        if ((TTypeEncoding & DW_EH_PE_pcrel) && TypeAddress == TTEntryAddress) {
+          TypeAddress = 0;
+        }
+        if (TypeAddress == 0) {
+          OS << "<all>";
+          return;
+        }
+        if (TTypeEncoding & DW_EH_PE_indirect) {
+          ErrorOr<uint64_t> PointerOrErr = BC.getPointerAtAddress(TypeAddress);
+          assert(PointerOrErr && "failed to decode indirect address");
+          TypeAddress = *PointerOrErr;
+        }
+        if (BinaryData *TypeSymBD = BC.getBinaryDataAtAddress(TypeAddress)) {
+          OS << TypeSymBD->getName();
+        } else {
+          OS << "0x" << Twine::utohexstr(TypeAddress);
+        }
+      };
+      if (opts::PrintExceptions)
+        outs() << " actions: ";
+      uint64_t ActionPtr = ActionTableStart + ActionEntry - 1;
+      int64_t ActionType;
+      int64_t ActionNext;
+      const char *Sep = "";
+      do {
+        ActionType = Data.getSLEB128(&ActionPtr);
+        const uint32_t Self = ActionPtr;
+        ActionNext = Data.getSLEB128(&ActionPtr);
+        if (opts::PrintExceptions)
+          outs() << Sep << "(" << ActionType << ", " << ActionNext << ") ";
+        if (ActionType == 0) {
+          if (opts::PrintExceptions)
+            outs() << "cleanup";
+        } else if (ActionType > 0) {
+          // It's an index into a type table.
+          MaxTypeIndex = std::max(MaxTypeIndex,
+                                  static_cast<unsigned>(ActionType));
+          if (opts::PrintExceptions) {
+            outs() << "catch type ";
+            printType(ActionType, outs());
+          }
+        } else { // ActionType < 0
+          if (opts::PrintExceptions)
+            outs() << "filter exception types ";
+          const char *TSep = "";
+          // ActionType is a negative *byte* offset into the *uleb128-encoded*
+          // table of indices with base 1.
+          // E.g. -1 means offset 0, -2 is offset 1, etc. The indices are
+          // encoded using uleb128, thus we cannot directly dereference them.
+          uint64_t TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1;
+          while (uint64_t Index = Data.getULEB128(&TypeIndexTablePtr)) {
+            MaxTypeIndex = std::max(MaxTypeIndex, static_cast<unsigned>(Index));
+            if (opts::PrintExceptions) {
+              outs() << TSep;
+              printType(Index, outs());
+              TSep = ", ";
+            }
+          }
+          MaxTypeIndexTableOffset =
+              std::max(MaxTypeIndexTableOffset,
+                       TypeIndexTablePtr - TypeIndexTableStart);
+        }
+
+        Sep = "; ";
+
+        ActionPtr = Self + ActionNext;
+      } while (ActionNext);
+      if (opts::PrintExceptions)
+        outs() << '\n';
+    }
+  }
+  if (opts::PrintExceptions)
+    outs() << '\n';
+
+  assert(TypeIndexTableStart + MaxTypeIndexTableOffset <=
+             Data.getData().size() &&
+         "LSDA entry has crossed section boundary");
+
+  if (TTypeEnd) {
+    LSDAActionTable = LSDASectionData.slice(
+        ActionTableStart, TypeIndexTableStart -
+                              MaxTypeIndex * TTypeEncodingSize -
+                              ActionTableStart);
+    for (unsigned Index = 1; Index <= MaxTypeIndex; ++Index) {
+      uint64_t TTEntry = TypeTableStart - Index * TTypeEncodingSize;
+      const uint64_t TTEntryAddress = TTEntry + LSDASectionAddress;
+      uint64_t TypeAddress =
+          *Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress);
+      if ((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress))
+        TypeAddress = 0;
+      if (TTypeEncoding & DW_EH_PE_indirect) {
+        LSDATypeAddressTable.emplace_back(TypeAddress);
+        if (TypeAddress) {
+          ErrorOr<uint64_t> PointerOrErr = BC.getPointerAtAddress(TypeAddress);
+          assert(PointerOrErr && "failed to decode indirect address");
+          TypeAddress = *PointerOrErr;
+        }
+      }
+      LSDATypeTable.emplace_back(TypeAddress);
+    }
+    LSDATypeIndexTable =
+        LSDASectionData.slice(TypeIndexTableStart, MaxTypeIndexTableOffset);
+  }
+}
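Action records in the code above chain through self-relative SLEB128 "next" offsets: ActionPtr = Self + ActionNext, with 0 terminating the list. A compact standalone model of that chaining (plain C++; synthetic one-byte SLEB128 values for brevity):

#include <cstdint>
#include <cstdio>
#include <vector>

// Minimal SLEB128 reader (enough for the tiny values used below).
static int64_t readSLEB128(const std::vector<uint8_t> &Buf, size_t &Off) {
  int64_t Result = 0;
  unsigned Shift = 0;
  uint8_t Byte;
  do {
    Byte = Buf[Off++];
    Result |= int64_t(Byte & 0x7f) << Shift;
    Shift += 7;
  } while (Byte & 0x80);
  if (Shift < 64 && (Byte & 0x40)) // sign-extend
    Result |= -(int64_t(1) << Shift);
  return Result;
}

int main() {
  // Two chained actions: (type 1, next +1) -> (type 2, next 0 = end).
  // Each record is: SLEB128 type index, SLEB128 self-relative next offset.
  const std::vector<uint8_t> ActionTable = {0x01, 0x01, 0x02, 0x00};

  size_t Ptr = 0;
  int64_t Next;
  do {
    int64_t Type = readSLEB128(ActionTable, Ptr);
    size_t Self = Ptr; // offset the 'next' field is relative to
    Next = readSLEB128(ActionTable, Ptr);
    std::printf("action: type %lld, next %lld\n", (long long)Type,
                (long long)Next);
    Ptr = Self + Next; // follow the chain
  } while (Next);
  return 0;
}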
+
+void BinaryFunction::updateEHRanges() {
+  if (getSize() == 0)
+    return;
+
+  assert(CurrentState == State::CFG_Finalized && "unexpected state");
+
+  // Build call sites table.
+  struct EHInfo {
+    const MCSymbol *LP; // landing pad
+    uint64_t Action;
+  };
+
+  // If previous call can throw, this is its exception handler.
+  EHInfo PreviousEH = {nullptr, 0};
+
+  // Marker for the beginning of exceptions range.
+  const MCSymbol *StartRange = nullptr;
+
+  // Indicates whether the start range is located in a cold part.
+  bool IsStartInCold = false;
+
+  // Have we crossed hot/cold border for split functions?
+  bool SeenCold = false;
+
+  // Sites to update - either regular or cold.
+  std::vector<CallSite> *Sites = &CallSites;
+
+  for (BinaryBasicBlock *&BB : BasicBlocksLayout) {
+
+    if (BB->isCold() && !SeenCold) {
+      SeenCold = true;
+
+      // Close the range (if any) and change the target call sites.
+      if (StartRange) {
+        Sites->emplace_back(CallSite{StartRange, getFunctionEndLabel(),
+                                     PreviousEH.LP, PreviousEH.Action});
+      }
+      Sites = &ColdCallSites;
+
+      // Reset the range.
+      StartRange = nullptr;
+      PreviousEH = {nullptr, 0};
+    }
+
+    for (auto II = BB->begin(); II != BB->end(); ++II) {
+      if (!BC.MIB->isCall(*II))
+        continue;
+
+      // Instruction can throw an exception that should be handled.
+      const bool Throws = BC.MIB->isInvoke(*II);
+
+      // Ignore the call if it's a continuation of a no-throw gap.
+      if (!Throws && !StartRange)
+        continue;
+
+      // Extract exception handling information from the instruction.
+      const MCSymbol *LP = nullptr;
+      uint64_t Action = 0;
+      if (const Optional<MCPlus::MCLandingPad> EHInfo = BC.MIB->getEHInfo(*II))
+        std::tie(LP, Action) = *EHInfo;
+
+      // No action if the exception handler has not changed.
+      if (Throws &&
+          StartRange &&
+          PreviousEH.LP == LP &&
+          PreviousEH.Action == Action)
+        continue;
+
+      // Same symbol is used for the beginning and the end of the range.
+      const MCSymbol *EHSymbol;
+      MCInst EHLabel;
+      {
+        std::unique_lock<std::mutex> Lock(BC.CtxMutex);
+        EHSymbol = BC.Ctx->createNamedTempSymbol("EH");
+        BC.MIB->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get());
+      }
+
+      II = std::next(BB->insertPseudoInstr(II, EHLabel));
+
+      // At this point we could be in one of the following states:
+      //
+      // I. Exception handler has changed and we need to close the previous
+      //    range and start a new one.
+      //
+      // II. Start a new exception range after the gap.
+      //
+      // III. Close the current exception range and start a new gap.
+      const MCSymbol *EndRange;
+      if (StartRange) {
+        // I, III:
+        EndRange = EHSymbol;
+      } else {
+        // II:
+        StartRange = EHSymbol;
+        IsStartInCold = SeenCold;
+        EndRange = nullptr;
+      }
+
+      // Close the previous range.
+      if (EndRange) {
+        Sites->emplace_back(CallSite{StartRange, EndRange,
+                                     PreviousEH.LP, PreviousEH.Action});
+      }
+
+      if (Throws) {
+        // I, II:
+        StartRange = EHSymbol;
+        IsStartInCold = SeenCold;
+        PreviousEH = EHInfo{LP, Action};
+      } else {
+        StartRange = nullptr;
+      }
+    }
+  }
+
+  // Check if we need to close the range.
+  if (StartRange) {
+    assert((!isSplit() || Sites == &ColdCallSites) && "sites mismatch");
+    const MCSymbol *EndRange =
+        IsStartInCold ? getFunctionColdEndLabel() : getFunctionEndLabel();
+    Sites->emplace_back(CallSite{StartRange, EndRange,
+                                 PreviousEH.LP, PreviousEH.Action});
+  }
+}
+
+const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0;
+
+CFIReaderWriter::CFIReaderWriter(const DWARFDebugFrame &EHFrame) {
+  // Prepare FDEs for fast lookup.
+  for (const dwarf::FrameEntry &Entry : EHFrame.entries()) {
+    const auto *CurFDE = dyn_cast<dwarf::FDE>(&Entry);
+    // Skip CIEs.
+    if (!CurFDE)
+      continue;
+    // There could be multiple FDEs with the same initial address, and perhaps
+    // different sizes (address ranges). Use the first entry with non-zero
+    // size.
+    auto FDEI = FDEs.lower_bound(CurFDE->getInitialLocation());
+    if (FDEI != FDEs.end() && FDEI->first == CurFDE->getInitialLocation()) {
+      if (CurFDE->getAddressRange()) {
+        if (FDEI->second->getAddressRange() == 0) {
+          FDEI->second = CurFDE;
+        } else if (opts::Verbosity > 0) {
+          errs() << "BOLT-WARNING: different FDEs for function at 0x"
+                 << Twine::utohexstr(FDEI->first)
+                 << " detected; sizes: "
+                 << FDEI->second->getAddressRange() << " and "
+                 << CurFDE->getAddressRange() << '\n';
+        }
+      }
+    } else {
+      FDEs.emplace_hint(FDEI, CurFDE->getInitialLocation(), CurFDE);
+    }
+  }
+}
+
+bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
+  uint64_t Address = Function.getAddress();
+  auto I = FDEs.find(Address);
+  // Ignore zero-length FDE ranges.
+  if (I == FDEs.end() || !I->second->getAddressRange())
+    return true;
+
+  const FDE &CurFDE = *I->second;
+  Optional<uint64_t> LSDA = CurFDE.getLSDAAddress();
+  Function.setLSDAAddress(LSDA ? *LSDA : 0);
+
+  uint64_t Offset = 0;
+  uint64_t CodeAlignment = CurFDE.getLinkedCIE()->getCodeAlignmentFactor();
+  uint64_t DataAlignment = CurFDE.getLinkedCIE()->getDataAlignmentFactor();
+  if (CurFDE.getLinkedCIE()->getPersonalityAddress()) {
+    Function.setPersonalityFunction(
+        *CurFDE.getLinkedCIE()->getPersonalityAddress());
+    Function.setPersonalityEncoding(
+        *CurFDE.getLinkedCIE()->getPersonalityEncoding());
+  }
+
+  auto decodeFrameInstruction =
+      [&Function, &Offset, Address, CodeAlignment, DataAlignment](
+          const CFIProgram::Instruction &Instr) {
+        uint8_t Opcode = Instr.Opcode;
+        if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK)
+          Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK;
+        switch (Instr.Opcode) {
+        case DW_CFA_nop:
+          break;
+        case DW_CFA_advance_loc4:
+        case DW_CFA_advance_loc2:
+        case DW_CFA_advance_loc1:
+        case DW_CFA_advance_loc:
+          // Advance our current address.
+          Offset += CodeAlignment * int64_t(Instr.Ops[0]);
+          break;
+        case DW_CFA_offset_extended_sf:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createOffset(
+                          nullptr, Instr.Ops[0],
+                          DataAlignment * int64_t(Instr.Ops[1])));
+          break;
+        case DW_CFA_offset_extended:
+        case DW_CFA_offset:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createOffset(
+                          nullptr, Instr.Ops[0], DataAlignment * Instr.Ops[1]));
+          break;
+        case DW_CFA_restore_extended:
+        case DW_CFA_restore:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createRestore(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_set_loc:
+          assert(Instr.Ops[0] >= Address && "set_loc out of function bounds");
+          assert(Instr.Ops[0] <= Address + Function.getSize() &&
+                 "set_loc out of function bounds");
+          Offset = Instr.Ops[0] - Address;
+          break;
+
+        case DW_CFA_undefined:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createUndefined(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_same_value:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createSameValue(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_register:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createRegister(nullptr, Instr.Ops[0],
+                                                       Instr.Ops[1]));
+          break;
+        case DW_CFA_remember_state:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createRememberState(nullptr));
+          break;
+        case DW_CFA_restore_state:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::createRestoreState(nullptr));
+          break;
+        case DW_CFA_def_cfa:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::cfiDefCfa(nullptr, Instr.Ops[0],
+                                                  Instr.Ops[1]));
+          break;
+        case DW_CFA_def_cfa_sf:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::cfiDefCfa(
+                          nullptr, Instr.Ops[0],
+                          DataAlignment * int64_t(Instr.Ops[1])));
+          break;
+        case DW_CFA_def_cfa_register:
+          Function.addCFIInstruction(
+              Offset,
+              MCCFIInstruction::createDefCfaRegister(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_def_cfa_offset:
+          Function.addCFIInstruction(
+              Offset,
+              MCCFIInstruction::cfiDefCfaOffset(nullptr, Instr.Ops[0]));
+          break;
+        case DW_CFA_def_cfa_offset_sf:
+          Function.addCFIInstruction(
+              Offset, MCCFIInstruction::cfiDefCfaOffset(
+                          nullptr, DataAlignment * int64_t(Instr.Ops[0])));
+          break;
+        case DW_CFA_GNU_args_size:
+          Function.addCFIInstruction(
+              Offset,
+              MCCFIInstruction::createGnuArgsSize(nullptr, Instr.Ops[0]));
+          Function.setUsesGnuArgsSize();
+          break;
+        case DW_CFA_val_offset_sf:
+        case DW_CFA_val_offset:
+          if (opts::Verbosity >= 1) {
+            errs() << "BOLT-WARNING: DWARF val_offset() unimplemented\n";
+          }
+          return false;
+        case DW_CFA_expression:
+        case DW_CFA_def_cfa_expression:
+        case DW_CFA_val_expression: {
+          MCDwarfExprBuilder Builder;
+          for (DWARFExpression::Operation &ExprOp : *Instr.Expression) {
+            const DWARFExpression::Operation::Description &Desc =
+                ExprOp.getDescription();
+            if (Desc.Op[0] == DWARFExpression::Operation::SizeNA) {
+              Builder.appendOperation(ExprOp.getCode());
+            } else if (Desc.Op[1] == DWARFExpression::Operation::SizeNA) {
+              Builder.appendOperation(ExprOp.getCode(),
+                                      ExprOp.getRawOperand(0));
+            } else {
+              Builder.appendOperation(ExprOp.getCode(), ExprOp.getRawOperand(0),
+                                      ExprOp.getRawOperand(1));
+            }
+          }
+          if (Opcode == DW_CFA_expression) {
+            Function.addCFIInstruction(
+                Offset, MCCFIInstruction::createExpression(
+                            nullptr, Instr.Ops[0], Builder.take()));
+          } else if (Opcode == DW_CFA_def_cfa_expression) {
+            Function.addCFIInstruction(
+                Offset, MCCFIInstruction::createDefCfaExpression(
+                            nullptr, Builder.take()));
+          } else {
+            assert(Opcode == DW_CFA_val_expression && "Unexpected opcode");
+            Function.addCFIInstruction(
+                Offset, MCCFIInstruction::createValExpression(
+                            nullptr, Instr.Ops[0], Builder.take()));
+          }
+          break;
+        }
+        case DW_CFA_MIPS_advance_loc8:
+          if (opts::Verbosity >= 1) {
+            errs() << "BOLT-WARNING: DW_CFA_MIPS_advance_loc unimplemented\n";
+          }
+          return false;
+        case DW_CFA_GNU_window_save:
+        case DW_CFA_lo_user:
+        case DW_CFA_hi_user:
+          if (opts::Verbosity >= 1) {
+            errs() << "BOLT-WARNING: DW_CFA_GNU_* and DW_CFA_*_user "
+                      "unimplemented\n";
+          }
+          return false;
+        default:
+          if (opts::Verbosity >= 1) {
+            errs() << "BOLT-WARNING: Unrecognized CFI instruction: "
+                   << Instr.Opcode << '\n';
+          }
+          return false;
+        }
+
+        return true;
+      };
+
+  for (const CFIProgram::Instruction &Instr : CurFDE.getLinkedCIE()->cfis()) {
+    if (!decodeFrameInstruction(Instr))
+      return false;
+  }
+
+  for (const CFIProgram::Instruction &Instr : CurFDE.cfis()) {
+    if (!decodeFrameInstruction(Instr))
+      return false;
+  }
+
+  return true;
+}
+
+std::vector<char> CFIReaderWriter::generateEHFrameHeader(
+    const DWARFDebugFrame &OldEHFrame,
+    const DWARFDebugFrame &NewEHFrame,
+    uint64_t EHFrameHeaderAddress,
+    std::vector<uint64_t> &FailedAddresses) const {
+  // Common PC -> FDE map to be written into .eh_frame_hdr.
+  std::map<uint64_t, uint64_t> PCToFDE;
+
+  // Presort array for binary search.
+  std::sort(FailedAddresses.begin(), FailedAddresses.end());
+
+  // Initialize PCToFDE using NewEHFrame.
+  for (dwarf::FrameEntry &Entry : NewEHFrame.entries()) {
+    const dwarf::FDE *FDE = dyn_cast<dwarf::FDE>(&Entry);
+    if (FDE == nullptr)
+      continue;
+    const uint64_t FuncAddress = FDE->getInitialLocation();
+    const uint64_t FDEAddress =
+        NewEHFrame.getEHFrameAddress() + FDE->getOffset();
+
+    // Ignore unused FDEs.
+    if (FuncAddress == 0)
+      continue;
+
+    // Add the address to the map unless we failed to write it.
+    if (!std::binary_search(FailedAddresses.begin(), FailedAddresses.end(),
+                            FuncAddress)) {
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: FDE for function at 0x"
+                        << Twine::utohexstr(FuncAddress) << " is at 0x"
+                        << Twine::utohexstr(FDEAddress) << '\n');
+      PCToFDE[FuncAddress] = FDEAddress;
+    }
+  };
+
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: new .eh_frame contains "
+                    << std::distance(NewEHFrame.entries().begin(),
+                                     NewEHFrame.entries().end())
+                    << " entries\n");
+
+  // Add entries from the original .eh_frame corresponding to the functions
+  // that we did not update.
+  for (const dwarf::FrameEntry &Entry : OldEHFrame) {
+    const dwarf::FDE *FDE = dyn_cast<dwarf::FDE>(&Entry);
+    if (FDE == nullptr)
+      continue;
+    const uint64_t FuncAddress = FDE->getInitialLocation();
+    const uint64_t FDEAddress =
+        OldEHFrame.getEHFrameAddress() + FDE->getOffset();
+
+    // Add the address if we failed to write it.
+    if (PCToFDE.count(FuncAddress) == 0) {
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: old FDE for function at 0x"
+                        << Twine::utohexstr(FuncAddress) << " is at 0x"
+                        << Twine::utohexstr(FDEAddress) << '\n');
+      PCToFDE[FuncAddress] = FDEAddress;
+    }
+  };
+
+  LLVM_DEBUG(dbgs() << "BOLT-DEBUG: old .eh_frame contains "
+                    << std::distance(OldEHFrame.entries().begin(),
+                                     OldEHFrame.entries().end())
+                    << " entries\n");
+
+  // Generate a new .eh_frame_hdr based on the new map.
+
+  // Header plus table of entries of size 8 bytes.
+  std::vector<char> EHFrameHeader(12 + PCToFDE.size() * 8);
+
+  // Version is 1.
+  EHFrameHeader[0] = 1;
+  // Encoding of the eh_frame pointer.
+  EHFrameHeader[1] = DW_EH_PE_pcrel | DW_EH_PE_sdata4;
+  // Encoding of the count field to follow.
+  EHFrameHeader[2] = DW_EH_PE_udata4;
+  // Encoding of the table entries - 4-byte offset from the start of the
+  // header.
+  EHFrameHeader[3] = DW_EH_PE_datarel | DW_EH_PE_sdata4;
+
+  // Address of eh_frame. Use the new one.
+  support::ulittle32_t::ref(EHFrameHeader.data() + 4) =
+      NewEHFrame.getEHFrameAddress() - (EHFrameHeaderAddress + 4);
+
+  // Number of entries in the table (FDE count).
+  support::ulittle32_t::ref(EHFrameHeader.data() + 8) = PCToFDE.size();
+
+  // Write the table at offset 12.
+  char *Ptr = EHFrameHeader.data();
+  uint32_t Offset = 12;
+  for (const auto &PCI : PCToFDE) {
+    int64_t InitialPCOffset = PCI.first - EHFrameHeaderAddress;
+    assert(isInt<32>(InitialPCOffset) && "PC offset out of bounds");
+    support::ulittle32_t::ref(Ptr + Offset) = InitialPCOffset;
+    Offset += 4;
+    int64_t FDEOffset = PCI.second - EHFrameHeaderAddress;
+    assert(isInt<32>(FDEOffset) && "FDE offset out of bounds");
+    support::ulittle32_t::ref(Ptr + Offset) = FDEOffset;
+    Offset += 4;
+  }
+
+  return EHFrameHeader;
+}
+
+} // namespace bolt
+} // namespace llvm
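The .eh_frame_hdr produced above is a 12-byte header followed by a binary-search table of (PC offset, FDE offset) pairs, both relative to the header's own address. A standalone sketch that lays out the same structure (plain C++; the addresses are made up, and the DW_EH_PE encoding constants are written as their raw byte values):

#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

// Store a 32-bit little-endian value at Buf[Off..Off+3].
static void putLE32(std::vector<uint8_t> &Buf, size_t Off, uint32_t V) {
  for (int I = 0; I < 4; ++I)
    Buf[Off + I] = (V >> (8 * I)) & 0xff;
}

int main() {
  const uint64_t HdrAddress = 0x500000; // where .eh_frame_hdr will live
  const uint64_t EHFrameAddress = 0x501000;
  // PC -> FDE address map; std::map keeps the table sorted by PC,
  // which the binary-search table format requires.
  const std::map<uint64_t, uint64_t> PCToFDE = {
      {0x400100, 0x501010}, {0x400200, 0x501040}};

  std::vector<uint8_t> Hdr(12 + PCToFDE.size() * 8);
  Hdr[0] = 1;    // version
  Hdr[1] = 0x1b; // DW_EH_PE_pcrel | DW_EH_PE_sdata4: eh_frame pointer
  Hdr[2] = 0x03; // DW_EH_PE_udata4: FDE count encoding
  Hdr[3] = 0x3b; // DW_EH_PE_datarel | DW_EH_PE_sdata4: table entries

  putLE32(Hdr, 4, uint32_t(EHFrameAddress - (HdrAddress + 4))); // pcrel ptr
  putLE32(Hdr, 8, uint32_t(PCToFDE.size()));                    // FDE count

  size_t Off = 12;
  for (const auto &PCI : PCToFDE) { // (PC, FDE), both datarel to the header
    putLE32(Hdr, Off, uint32_t(PCI.first - HdrAddress));
    putLE32(Hdr, Off + 4, uint32_t(PCI.second - HdrAddress));
    Off += 8;
  }
  assert(Off == Hdr.size());
  return 0;
}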
All other entries are taken from the
+  /// \p OldEHFrame.
+  ///
+  /// \p EHFrameHeaderAddress specifies location of .eh_frame_hdr,
+  /// and is required for relative addressing used in the section.
+  std::vector<char> generateEHFrameHeader(
+      const DWARFDebugFrame &OldEHFrame,
+      const DWARFDebugFrame &NewEHFrame,
+      uint64_t EHFrameHeaderAddress,
+      std::vector<uint64_t> &FailedAddresses) const;
+
+  using FDEsMap = std::map<uint64_t, const dwarf::FDE *>;
+
+  const FDEsMap &getFDEs() const {
+    return FDEs;
+  }
+
+private:
+  FDEsMap FDEs;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/ExecutableFileMemoryManager.cpp b/bolt/src/ExecutableFileMemoryManager.cpp
new file mode 100644
index 000000000000..c56cfef094ff
--- /dev/null
+++ b/bolt/src/ExecutableFileMemoryManager.cpp
@@ -0,0 +1,100 @@
+//===--- ExecutableFileMemoryManager.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ExecutableFileMemoryManager.h"
+#include "RewriteInstance.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "efmm"
+
+using namespace llvm;
+using namespace object;
+using namespace bolt;
+
+namespace llvm {
+
+namespace bolt {
+
+uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size,
+                                                      unsigned Alignment,
+                                                      unsigned SectionID,
+                                                      StringRef SectionName,
+                                                      bool IsCode,
+                                                      bool IsReadOnly) {
+  // Register a debug section as a note section.
+  if (!ObjectsLoaded && RewriteInstance::isDebugSection(SectionName)) {
+    uint8_t *DataCopy = new uint8_t[Size];
+    BinarySection &Section =
+        BC.registerOrUpdateNoteSection(SectionName, DataCopy, Size, Alignment);
+    Section.setSectionID(SectionID);
+    assert(!Section.isAllocatable() && "note sections cannot be allocatable");
+    return DataCopy;
+  }
+
+  if (!IsCode &&
+      (SectionName == ".strtab" ||
+       SectionName == ".symtab" ||
+       SectionName == "" ||
+       SectionName.startswith(".rela."))) {
+    return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID,
+                                                     SectionName, IsReadOnly);
+  }
+
+  uint8_t *Ret;
+  if (IsCode) {
+    Ret = SectionMemoryManager::allocateCodeSection(Size, Alignment,
+                                                    SectionID, SectionName);
+  } else {
+    Ret = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID,
+                                                    SectionName, IsReadOnly);
+  }
+
+  SmallVector<char, 256> Buf;
+  if (ObjectsLoaded > 0) {
+    if (BC.isELF()) {
+      SectionName = (Twine(SectionName) + ".bolt.extra." + Twine(ObjectsLoaded))
+                        .toStringRef(Buf);
+    } else if (BC.isMachO()) {
+      assert((SectionName == "__text" || SectionName == "__data" ||
+              SectionName == "__fini" || SectionName == "__setup" ||
+              SectionName == "__cstring" || SectionName == "__literal16") &&
+             "Unexpected section in the instrumentation library");
+      // Sections coming from the instrumentation runtime are prefixed with "I".
+      SectionName = ("I" + Twine(SectionName)).toStringRef(Buf);
+    }
+  }
+
+  BinarySection &Section = BC.registerOrUpdateSection(
+      SectionName, ELF::SHT_PROGBITS,
+      BinarySection::getFlags(IsReadOnly, IsCode, true), Ret, Size, Alignment);
+  Section.setSectionID(SectionID);
+  assert(Section.isAllocatable() &&
+         "verify that allocatable is marked as allocatable");
+
+  LLVM_DEBUG(
+      dbgs() << "BOLT: allocating "
+             << (IsCode ? "code" : (IsReadOnly ?
"read-only data" : "data")) + << " section : " << SectionName << " with size " << Size + << ", alignment " << Alignment << " at 0x" << Ret + << ", ID = " << SectionID << "\n"); + return Ret; +} + +bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { + LLVM_DEBUG(dbgs() << "BOLT: finalizeMemory()\n"); + ++ObjectsLoaded; + return SectionMemoryManager::finalizeMemory(ErrMsg); +} + +ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { } + +} + +} diff --git a/bolt/src/ExecutableFileMemoryManager.h b/bolt/src/ExecutableFileMemoryManager.h new file mode 100644 index 000000000000..1b52162b4173 --- /dev/null +++ b/bolt/src/ExecutableFileMemoryManager.h @@ -0,0 +1,71 @@ +//===--- ExecutableFileMemoryManager.h ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H +#define LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include +#include + +namespace llvm { + +namespace bolt { +class BinaryContext; + +/// Class responsible for allocating and managing code and data sections. +class ExecutableFileMemoryManager : public SectionMemoryManager { +private: + uint8_t *allocateSection(intptr_t Size, + unsigned Alignment, + unsigned SectionID, + StringRef SectionName, + bool IsCode, + bool IsReadOnly); + BinaryContext &BC; + bool AllowStubs; + +public: + // Our linker's main purpose is to handle a single object file, created + // by RewriteInstance after reading the input binary and reordering it. + // After objects finish loading, we increment this. Therefore, whenever + // this is greater than zero, we are dealing with additional objects that + // will not be managed by BinaryContext but only exist to support linking + // user-supplied objects into the main input executable. + uint32_t ObjectsLoaded{0}; + + ExecutableFileMemoryManager(BinaryContext &BC, bool AllowStubs) + : BC(BC), AllowStubs(AllowStubs) {} + + ~ExecutableFileMemoryManager(); + + uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, + StringRef SectionName) override { + return allocateSection(Size, Alignment, SectionID, SectionName, + /*IsCode=*/true, true); + } + + uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, StringRef SectionName, + bool IsReadOnly) override { + return allocateSection(Size, Alignment, SectionID, SectionName, + /*IsCode=*/false, IsReadOnly); + } + bool allowStubAllocation() const override { return AllowStubs; } + + bool finalizeMemory(std::string *ErrMsg = nullptr) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/src/Heatmap.cpp b/bolt/src/Heatmap.cpp new file mode 100644 index 000000000000..c9b497b820df --- /dev/null +++ b/bolt/src/Heatmap.cpp @@ -0,0 +1,280 @@ +//===-- Heatmap.cpp ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Heatmap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#define DEBUG_TYPE "bolt-heatmap"
+
+using namespace llvm;
+
+namespace opts {
+
+extern cl::SubCommand HeatmapCommand;
+
+static cl::opt<unsigned>
+BucketsPerLine("line-size",
+  cl::desc("number of entries per line (default 256)"),
+  cl::init(256),
+  cl::Optional,
+  cl::sub(HeatmapCommand));
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+void Heatmap::registerAddressRange(uint64_t StartAddress, uint64_t EndAddress,
+                                   uint64_t Count) {
+  if (ignoreAddress(StartAddress)) {
+    ++NumSkippedRanges;
+    return;
+  }
+
+  if (StartAddress > EndAddress ||
+      EndAddress - StartAddress > 64 * 1024) {
+    LLVM_DEBUG(dbgs() << "invalid range : 0x" << Twine::utohexstr(StartAddress)
+                      << " -> 0x" << Twine::utohexstr(EndAddress) << '\n');
+    ++NumSkippedRanges;
+    return;
+  }
+
+  for (uint64_t Bucket = StartAddress / BucketSize;
+       Bucket <= EndAddress / BucketSize; ++Bucket) {
+    Map[Bucket] += Count;
+  }
+}
+
+void Heatmap::print(StringRef FileName) const {
+  std::error_code EC;
+  raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::OF_None);
+  if (EC) {
+    errs() << "error opening output file: " << EC.message() << '\n';
+    exit(1);
+  }
+  print(OS);
+}
+
+void Heatmap::print(raw_ostream &OS) const {
+  const char FillChar = '.';
+
+  const auto DefaultColor = raw_ostream::WHITE;
+  auto changeColor = [&](raw_ostream::Colors Color) -> void {
+    static auto CurrentColor = raw_ostream::BLACK;
+    if (CurrentColor == Color)
+      return;
+    OS.changeColor(Color);
+    CurrentColor = Color;
+  };
+
+  const uint64_t BytesPerLine = opts::BucketsPerLine * BucketSize;
+
+  // Calculate the max value for scaling.
+  uint64_t MaxValue = 0;
+  for (const std::pair<const uint64_t, uint64_t> &Entry : Map) {
+    MaxValue = std::max(MaxValue, Entry.second);
+  }
+
+  // Print start of the line and fill it with an empty space right before
+  // the Address.
+  auto startLine = [&](uint64_t Address, bool Empty = false) {
+    changeColor(DefaultColor);
+    const uint64_t LineAddress = Address / BytesPerLine * BytesPerLine;
+
+    if (MaxAddress > 0xffffffff)
+      OS << format("0x%016" PRIx64 ": ", LineAddress);
+    else
+      OS << format("0x%08" PRIx64 ": ", LineAddress);
+
+    if (Empty)
+      Address = LineAddress + BytesPerLine;
+    for (uint64_t Fill = LineAddress; Fill < Address; Fill += BucketSize) {
+      OS << FillChar;
+    }
+  };
+
+  // Finish line after \p Address was printed.
+  auto finishLine = [&](uint64_t Address) {
+    const uint64_t End = alignTo(Address + 1, BytesPerLine);
+    for (uint64_t Fill = Address + BucketSize; Fill < End; Fill += BucketSize)
+      OS << FillChar;
+    OS << '\n';
+  };
+
+  // Fill empty space in (Start, End) range.
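+  // A gap that stays within the current line is padded with FillChar. A gap
+  // spanning whole lines prints them as empty lines, except that a run of
+  // more than 32 empty lines is collapsed into a single blank line.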
+  auto fillRange = [&](uint64_t Start, uint64_t End) {
+    if ((Start / BytesPerLine) == (End / BytesPerLine)) {
+      for (uint64_t Fill = Start + BucketSize; Fill < End; Fill += BucketSize) {
+        changeColor(DefaultColor);
+        OS << FillChar;
+      }
+      return;
+    }
+
+    changeColor(DefaultColor);
+    finishLine(Start);
+    Start = alignTo(Start, BytesPerLine);
+
+    uint64_t NumEmptyLines = (End - Start) / BytesPerLine;
+
+    if (NumEmptyLines > 32) {
+      OS << '\n';
+    } else {
+      while (NumEmptyLines--) {
+        startLine(Start, /*Empty=*/true);
+        OS << '\n';
+        Start += BytesPerLine;
+      }
+    }
+
+    startLine(End);
+  };
+
+  static raw_ostream::Colors Colors[] = {
+    raw_ostream::WHITE,
+    raw_ostream::WHITE,
+    raw_ostream::CYAN,
+    raw_ostream::GREEN,
+    raw_ostream::YELLOW,
+    raw_ostream::RED
+  };
+  constexpr size_t NumRanges = sizeof(Colors) / sizeof(Colors[0]);
+
+  uint64_t Range[NumRanges];
+  for (uint64_t I = 0; I < NumRanges; ++I)
+    Range[I] = std::max(I + 1,
+                        (uint64_t) std::pow((double) MaxValue,
+                                            (double) (I + 1) / NumRanges));
+  Range[NumRanges - 1] = std::max((uint64_t) NumRanges, MaxValue);
+
+  // Print scaled value.
+  auto printValue = [&](uint64_t Value, bool ResetColor = false) {
+    assert(Value && "should only print positive values");
+    for (unsigned I = 0; I < NumRanges; ++I) {
+      if (Value <= Range[I]) {
+        changeColor(Colors[I]);
+        break;
+      }
+    }
+    if (Value <= Range[0]) {
+      OS << 'o';
+    } else {
+      OS << 'O';
+    }
+    if (ResetColor)
+      changeColor(DefaultColor);
+  };
+
+  // Print against black background.
+  OS.changeColor(raw_ostream::BLACK, /*Bold=*/false, /*Background=*/true);
+  changeColor(DefaultColor);
+
+  // Print map legend.
+  OS << "Legend:\n";
+  uint64_t PrevValue = 0;
+  for (unsigned I = 0; I < NumRanges; ++I) {
+    const uint64_t Value = Range[I];
+    OS << "  ";
+    printValue(Value, true);
+    OS << " : (" << PrevValue << ", " << Value << "]\n";
+    PrevValue = Value;
+  }
+
+  // Pos - character position from right in hex form.
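+  // The calls below print a vertical hex ruler over the buckets of a line:
+  // row Pos carries the Pos-th hex digit (counted from the right) of each
+  // bucket's byte offset within the line, and a digit is printed only when
+  // it changes from the previous column.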
+  auto printHeader = [&](unsigned Pos) {
+    OS << "            ";
+    if (MaxAddress > 0xffffffff)
+      OS << "        ";
+    unsigned PrevValue = unsigned(-1);
+    for (unsigned I = 0; I < BytesPerLine; I += BucketSize) {
+      const unsigned Value = (I & ((1 << Pos * 4) - 1)) >> (Pos - 1) * 4;
+      if (Value != PrevValue) {
+        OS << Twine::utohexstr(Value);
+        PrevValue = Value;
+      } else {
+        OS << ' ';
+      }
+    }
+    OS << '\n';
+  };
+  for (unsigned I = 5; I > 0; --I)
+    printHeader(I);
+
+  uint64_t PrevAddress = 0;
+  for (auto MI = Map.begin(), ME = Map.end(); MI != ME; ++MI) {
+    const std::pair<const uint64_t, uint64_t> &Entry = *MI;
+    uint64_t Address = Entry.first * BucketSize;
+
+    if (PrevAddress) {
+      fillRange(PrevAddress, Address);
+    } else {
+      startLine(Address);
+    }
+
+    printValue(Entry.second);
+
+    PrevAddress = Address;
+  }
+
+  if (PrevAddress) {
+    changeColor(DefaultColor);
+    finishLine(PrevAddress);
+  }
+}
+
+void Heatmap::printCDF(StringRef FileName) const {
+  std::error_code EC;
+  raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::OF_None);
+  if (EC) {
+    errs() << "error opening output file: " << EC.message() << '\n';
+    exit(1);
+  }
+  printCDF(OS);
+}
+
+void Heatmap::printCDF(raw_ostream &OS) const {
+  uint64_t NumTotalCounts = 0;
+  std::vector<uint64_t> Counts;
+
+  for (const std::pair<const uint64_t, uint64_t> &KV : Map) {
+    Counts.push_back(KV.second);
+    NumTotalCounts += KV.second;
+  }
+
+  std::sort(Counts.begin(), Counts.end(), std::greater<uint64_t>());
+
+  double RatioLeftInKB = (1.0 * BucketSize) / 1024;
+  assert(NumTotalCounts > 0 &&
+         "total number of heatmap buckets should be greater than 0");
+  double RatioRightInPercent = 100.0 / NumTotalCounts;
+  uint64_t RunningCount = 0;
+
+  OS << "Bucket counts, Size (KB), CDF (%)\n";
+  for (uint64_t I = 0; I < Counts.size(); I++) {
+    RunningCount += Counts[I];
+    OS << format("%llu", (I + 1)) << ", "
+       << format("%.4f", RatioLeftInKB * (I + 1)) << ", "
+       << format("%.4f", RatioRightInPercent * (RunningCount)) << "\n";
+  }
+
+  Counts.clear();
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/src/Heatmap.h b/bolt/src/Heatmap.h
new file mode 100644
index 000000000000..71144119c62e
--- /dev/null
+++ b/bolt/src/Heatmap.h
@@ -0,0 +1,81 @@
+//===-- Heatmap.h -----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_HEATMAP_H
+#define LLVM_TOOLS_LLVM_BOLT_HEATMAP_H
+
+#include "llvm/ADT/StringRef.h"
+#include <limits>
+#include <map>
+
+namespace llvm {
+class raw_ostream;
+
+namespace bolt {
+
+class Heatmap {
+  /// Number of bytes per entry in the heat map.
+  size_t BucketSize;
+
+  /// Minimum address that is considered to be valid.
+  uint64_t MinAddress;
+
+  /// Maximum address that is considered to be valid.
+  uint64_t MaxAddress;
+
+  /// Count invalid ranges.
+  uint64_t NumSkippedRanges{0};
+
+  /// Map buckets to the number of samples.
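+  /// Keyed by Address / BucketSize; the mapped value is the accumulated
+  /// sample count for that bucket.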
+  std::map<uint64_t, uint64_t> Map;
+
+public:
+  explicit Heatmap(uint64_t BucketSize = 4096,
+                   uint64_t MinAddress = 0,
+                   uint64_t MaxAddress = std::numeric_limits<uint64_t>::max())
+      : BucketSize(BucketSize), MinAddress(MinAddress), MaxAddress(MaxAddress)
+  {}
+
+  inline bool ignoreAddress(uint64_t Address) const {
+    return (Address > MaxAddress) || (Address < MinAddress);
+  }
+
+  /// Register a single sample at \p Address.
+  void registerAddress(uint64_t Address) {
+    if (!ignoreAddress(Address))
+      ++Map[Address / BucketSize];
+  }
+
+  /// Register \p Count samples at [\p StartAddress, \p EndAddress].
+  void registerAddressRange(uint64_t StartAddress, uint64_t EndAddress,
+                            uint64_t Count);
+
+  /// Return the number of ranges that failed to register.
+  uint64_t getNumInvalidRanges() const {
+    return NumSkippedRanges;
+  }
+
+  void print(StringRef FileName) const;
+
+  void print(raw_ostream &OS) const;
+
+  void printCDF(StringRef FileName) const;
+
+  void printCDF(raw_ostream &OS) const;
+
+  size_t size() const {
+    return Map.size();
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/src/JumpTable.cpp b/bolt/src/JumpTable.cpp
new file mode 100644
index 000000000000..700c9bf881a2
--- /dev/null
+++ b/bolt/src/JumpTable.cpp
@@ -0,0 +1,128 @@
+//===--- JumpTable.cpp - Representation of a jump table -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "JumpTable.h"
+#include "BinaryFunction.h"
+#include "BinarySection.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "bolt"
+
+using namespace llvm;
+using namespace bolt;
+
+using JumpTable = bolt::JumpTable;
+
+namespace opts {
+extern cl::opt<JumpTableSupportLevel> JumpTables;
+extern cl::opt<unsigned> Verbosity;
+}
+
+bolt::JumpTable::JumpTable(MCSymbol &Symbol, uint64_t Address, size_t EntrySize,
+                           JumpTableType Type, LabelMapType &&Labels,
+                           BinaryFunction &BF, BinarySection &Section)
+    : BinaryData(Symbol, Address, 0, EntrySize, Section), EntrySize(EntrySize),
+      OutputEntrySize(EntrySize), Type(Type), Labels(Labels), Parent(&BF) {}
+
+std::pair<size_t, size_t>
+bolt::JumpTable::getEntriesForAddress(const uint64_t Addr) const {
+  // Check if this is not an address, but a cloned JT id.
+  if ((int64_t)Addr < 0ll)
+    return std::make_pair(0, Entries.size());
+
+  const uint64_t InstOffset = Addr - getAddress();
+  size_t StartIndex = 0, EndIndex = 0;
+  uint64_t Offset = 0;
+
+  for (size_t I = 0; I < Entries.size(); ++I) {
+    auto LI = Labels.find(Offset);
+    if (LI != Labels.end()) {
+      const auto NextLI = std::next(LI);
+      const uint64_t NextOffset =
+          NextLI == Labels.end() ?
getSize() : NextLI->first;
+      if (InstOffset >= LI->first && InstOffset < NextOffset) {
+        StartIndex = I;
+        EndIndex = I;
+        while (Offset < NextOffset) {
+          ++EndIndex;
+          Offset += EntrySize;
+        }
+        break;
+      }
+    }
+    Offset += EntrySize;
+  }
+
+  return std::make_pair(StartIndex, EndIndex);
+}
+
+bool bolt::JumpTable::replaceDestination(uint64_t JTAddress,
+                                         const MCSymbol *OldDest,
+                                         MCSymbol *NewDest) {
+  bool Patched = false;
+  const std::pair<size_t, size_t> Range = getEntriesForAddress(JTAddress);
+  for (auto I = &Entries[Range.first], E = &Entries[Range.second]; I != E;
+       ++I) {
+    MCSymbol *&Entry = *I;
+    if (Entry == OldDest) {
+      Patched = true;
+      Entry = NewDest;
+    }
+  }
+  return Patched;
+}
+
+void bolt::JumpTable::updateOriginal() {
+  BinaryContext &BC = getSection().getBinaryContext();
+  const uint64_t BaseOffset = getAddress() - getSection().getAddress();
+  uint64_t EntryOffset = BaseOffset;
+  for (MCSymbol *Entry : Entries) {
+    const uint64_t RelType =
+        Type == JTT_NORMAL ? ELF::R_X86_64_64 : ELF::R_X86_64_PC32;
+    const uint64_t RelAddend =
+        Type == JTT_NORMAL ? 0 : EntryOffset - BaseOffset;
+    // Replace existing relocation with the new one to allow any modifications
+    // to the original jump table.
+    if (BC.HasRelocations)
+      getOutputSection().removeRelocationAt(EntryOffset);
+    getOutputSection().addRelocation(EntryOffset, Entry, RelType, RelAddend);
+    EntryOffset += EntrySize;
+  }
+}
+
+void bolt::JumpTable::print(raw_ostream &OS) const {
+  uint64_t Offset = 0;
+  if (Type == JTT_PIC)
+    OS << "PIC ";
+  OS << "Jump table " << getName() << " for function " << *Parent << " at 0x"
+     << Twine::utohexstr(getAddress()) << " with a total count of " << Count
+     << ":\n";
+  for (const uint64_t EntryOffset : OffsetEntries) {
+    OS << "  0x" << Twine::utohexstr(EntryOffset) << '\n';
+  }
+  for (const MCSymbol *Entry : Entries) {
+    auto LI = Labels.find(Offset);
+    if (Offset && LI != Labels.end()) {
+      OS << "Jump Table " << LI->second->getName() << " at 0x"
+         << Twine::utohexstr(getAddress() + Offset)
+         << " (possibly part of larger jump table):\n";
+    }
+    OS << format("  0x%04" PRIx64 " : ", Offset) << Entry->getName();
+    if (!Counts.empty()) {
+      OS << " : " << Counts[Offset / EntrySize].Mispreds
+         << "/" << Counts[Offset / EntrySize].Count;
+    }
+    OS << '\n';
+    Offset += EntrySize;
+  }
+  OS << "\n\n";
+}
diff --git a/bolt/src/JumpTable.h b/bolt/src/JumpTable.h
new file mode 100644
index 000000000000..40619972c2fe
--- /dev/null
+++ b/bolt/src/JumpTable.h
@@ -0,0 +1,131 @@
+//===--- JumpTable.h - Representation of a jump table ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_JUMP_TABLE_H
+#define LLVM_TOOLS_LLVM_BOLT_JUMP_TABLE_H
+
+#include "BinaryData.h"
+#include <map>
+#include <vector>
+
+namespace llvm {
+class MCSymbol;
+class raw_ostream;
+
+namespace bolt {
+
+enum JumpTableSupportLevel : char {
+  JTS_NONE = 0,       /// Disable jump tables support.
+  JTS_BASIC = 1,      /// Enable basic jump tables support (in-place).
+  JTS_MOVE = 2,       /// Move jump tables to a separate section.
+  JTS_SPLIT = 3,      /// Enable hot/cold splitting of jump tables.
+  JTS_AGGRESSIVE = 4, /// Aggressive splitting of jump tables.
+};
+
+class BinaryFunction;
+
+/// Representation of a jump table.
+///
+/// The jump table may include other jump tables that are referenced by
+/// a different label at a different offset in this jump table.
+class JumpTable : public BinaryData {
+  friend class BinaryContext;
+
+  JumpTable() = delete;
+  JumpTable(const JumpTable &) = delete;
+  JumpTable &operator=(const JumpTable &) = delete;
+
+public:
+  enum JumpTableType : char {
+    JTT_NORMAL,
+    JTT_PIC,
+  };
+
+  /// Branch statistics for jump table entries.
+  struct JumpInfo {
+    uint64_t Mispreds{0};
+    uint64_t Count{0};
+  };
+
+  /// Size of the entry used for storage.
+  size_t EntrySize;
+
+  /// Size of the entry we will write (we may use a more compact layout).
+  size_t OutputEntrySize;
+
+  /// The type of this jump table.
+  JumpTableType Type;
+
+  /// All the entries as labels.
+  std::vector<MCSymbol *> Entries;
+
+  /// All the entries as offsets into a function. Invalid after CFG is built.
+  using OffsetsType = std::vector<uint64_t>;
+  OffsetsType OffsetEntries;
+
+  /// Map ->