From bd12a8bed3e466480315bf4b9459173cb7d8e34f Mon Sep 17 00:00:00 2001 From: "Dmitry K." Date: Tue, 24 Sep 2024 13:11:55 -0700 Subject: [PATCH 01/13] --wip-- [skip ci] --- go.mod | 6 ++---- .../scripts/upgrades/authz_cancel_upgrade_tx.json | 10 ++++++++++ tools/scripts/upgrades/upgrade_tx.json | 15 +++++++++++++++ x/tokenomics/types/tx.pb.go | 1 - 4 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 tools/scripts/upgrades/authz_cancel_upgrade_tx.json create mode 100644 tools/scripts/upgrades/upgrade_tx.json diff --git a/go.mod b/go.mod index 91de15f1d..9f47b177c 100644 --- a/go.mod +++ b/go.mod @@ -79,10 +79,7 @@ require ( gopkg.in/yaml.v2 v2.4.0 ) -require ( - cosmossdk.io/x/tx v0.13.4 - github.com/jhump/protoreflect v1.16.0 -) +require github.com/jhump/protoreflect v1.16.0 require ( buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.34.2-20240508200655-46a4cf4ba109.2 // indirect @@ -95,6 +92,7 @@ require ( connectrpc.com/connect v1.16.2 // indirect connectrpc.com/otelconnect v0.7.0 // indirect cosmossdk.io/collections v0.4.0 // indirect + cosmossdk.io/x/tx v0.13.4 // indirect filippo.io/edwards25519 v1.0.0 // indirect github.com/99designs/go-keychain v0.0.0-20191008050251-8e49817e8af4 // indirect github.com/99designs/keyring v1.2.1 // indirect diff --git a/tools/scripts/upgrades/authz_cancel_upgrade_tx.json b/tools/scripts/upgrades/authz_cancel_upgrade_tx.json new file mode 100644 index 000000000..014eaac60 --- /dev/null +++ b/tools/scripts/upgrades/authz_cancel_upgrade_tx.json @@ -0,0 +1,10 @@ +{ + "body": { + "messages": [ + { + "@type": "/cosmos.upgrade.v1beta1.MsgCancelUpgrade", + "authority": "pokt10d07y265gmmuvt4z0w9aw880jnsr700j8yv32t" + } + ] + } +} \ No newline at end of file diff --git a/tools/scripts/upgrades/upgrade_tx.json b/tools/scripts/upgrades/upgrade_tx.json new file mode 100644 index 000000000..c945229d9 --- /dev/null +++ b/tools/scripts/upgrades/upgrade_tx.json @@ -0,0 +1,15 @@ +{ + "body": { + "messages": [ + { + 
"@type": "/cosmos.upgrade.v1beta1.MsgSoftwareUpgrade", + "authority": "pokt10d07y265gmmuvt4z0w9aw880jnsr700j8yv32t", + "plan": { + "name": "v0.0.9", + "height": "15510", + "info": "{\"binaries\":{\"linux/amd64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_linux_amd64.tar.gz?checksum=sha256:ab5b99ca0bc4bfbdd7031378d5a01c2a9f040ff310b745866a4dee7e62321c94\",\"linux/arm64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_linux_arm64.tar.gz?checksum=sha256:4b68c2ad326da055d43af1ad1a580158cec0f229d2ec6d9e18280d065260b622\",\"darwin/amd64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_darwin_amd64.tar.gz?checksum=sha256:c81aabddeb190044b979412e5a518bbf5c88305272f72a47e32e13aa765c3330\",\"darwin/arm64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_darwin_arm64.tar.gz?checksum=sha256:e683c55ac13902d107d7a726ed4a5c5affb2af1be3c67dd131ec2072a2cfbcb2\"}}" + } + } + ] + } +} \ No newline at end of file diff --git a/x/tokenomics/types/tx.pb.go b/x/tokenomics/types/tx.pb.go index e4fec264c..9f18a148c 100644 --- a/x/tokenomics/types/tx.pb.go +++ b/x/tokenomics/types/tx.pb.go @@ -125,7 +125,6 @@ type MsgUpdateParam struct { // specified in the `Params` message in `proof/params.proto.` Name string `protobuf:"bytes,2,opt,name=name,proto3" json:"name,omitempty"` // Types that are valid to be assigned to AsType: - // // *MsgUpdateParam_AsString // *MsgUpdateParam_AsInt64 // *MsgUpdateParam_AsBytes From 4ea092e675fddc3577107a998580bda95ac47e03 Mon Sep 17 00:00:00 2001 From: "Dmitry K." 
Date: Tue, 24 Sep 2024 16:45:02 -0700 Subject: [PATCH 02/13] document learnings and more checks --- app/upgrades.go | 1 + app/upgrades/historical.go | 7 +++ .../docs/protocol/upgrades/release_process.md | 7 +++ .../protocol/upgrades/upgrade_procedure.md | 44 ++++++++++++++++++- ...upgrade_tx.json => upgrade_tx_v0.0.9.json} | 0 5 files changed, 57 insertions(+), 2 deletions(-) rename tools/scripts/upgrades/{upgrade_tx.json => upgrade_tx_v0.0.9.json} (100%) diff --git a/app/upgrades.go b/app/upgrades.go index a2af2973e..5043bd078 100644 --- a/app/upgrades.go +++ b/app/upgrades.go @@ -11,6 +11,7 @@ import ( // so `cosmovisor` can automatically pull the binary from GitHub. var allUpgrades = []upgrades.Upgrade{ upgrades.Upgrade_0_0_4, + upgrades.Upgrade_0_0_9, } // setUpgrades sets upgrade handlers for all upgrades and executes KVStore migration if an upgrade plan file exists. diff --git a/app/upgrades/historical.go b/app/upgrades/historical.go index fe06f7096..2e71f0430 100644 --- a/app/upgrades/historical.go +++ b/app/upgrades/historical.go @@ -86,3 +86,10 @@ var Upgrade_0_0_4 = Upgrade{ // No changes to the KVStore in this upgrade. StoreUpgrades: storetypes.StoreUpgrades{}, } + +// Upgrade_0_0_9 is a small upgrade on TestNet. +var Upgrade_0_0_9 = Upgrade{ + PlanName: "v0.0.9", + CreateUpgradeHandler: defaultUpgradeHandler, + StoreUpgrades: storetypes.StoreUpgrades{}, +} diff --git a/docusaurus/docs/protocol/upgrades/release_process.md b/docusaurus/docs/protocol/upgrades/release_process.md index 2845f4c84..556f14fb2 100644 --- a/docusaurus/docs/protocol/upgrades/release_process.md +++ b/docusaurus/docs/protocol/upgrades/release_process.md @@ -59,6 +59,13 @@ You can find an example [here](https://github.com/pokt-network/poktroll/releases ```text ## Protocol Upgrades + + - **Planned Upgrade:** ❌ Not applicable for this release. - **Breaking Change:** ❌ Not applicable for this release. - **Manual Intervention Required:** ✅ Yes, but only for Alpha TestNet participants. 
If you are participating, please follow the [instructions provided here](https://dev.poktroll.com/operate/quickstart/docker_compose_walkthrough#restarting-a-full-node-after-re-genesis-) for restarting your full node after re-genesis. diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index b1098f509..38bbeb5cd 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -12,7 +12,9 @@ This page describes the protocol upgrade process, which is internal to the proto - [When is an Upgrade Warranted?](#when-is-an-upgrade-warranted) - [Implementing the Upgrade](#implementing-the-upgrade) - [Writing an Upgrade Transaction](#writing-an-upgrade-transaction) + - [Validate the URLs](#validate-the-urls) - [Submitting the upgrade on-chain](#submitting-the-upgrade-on-chain) +- [Cancelling the upgrade plan](#cancelling-the-upgrade-plan) - [Testing the Upgrade](#testing-the-upgrade) - [LocalNet](#localnet) - [DevNet](#devnet) @@ -39,7 +41,7 @@ An upgrade is necessary whenever there's an API, State Machine, or other Consens 1. When a new version includes a consensus-breaking change, plan for the next protocol upgrade: - If there's a change to a specific module, bump that module's consensus version. - Note any potential parameter changes to include in the upgrade. -2. Create a new upgrade in `app/upgrades`: +2. Create a new upgrade in `app/upgrades`. **THIS MUST BE DONE** even if there are no state changes. - Refer to `historical.go` for past upgrades and examples. - Consult Cosmos-sdk documentation on upgrades for additional guidance [here](https://docs.cosmos.network/main/build/building-apps/app-upgrade) and [here](https://docs.cosmos.network/main/build/modules/upgrade). 
@@ -69,12 +71,42 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl - `height`: The height at which an upgrade should be executed and the node will be restarted. - `info`: While this field can theoretically contain any information about the upgrade, in practice, `cosmovisor`uses it to obtain information about the binaries. When`cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is optional). +### Validate the URLs + +The URLs of the binaries contain checksums. It is important to make sure they are correct, otherwise Cosmovisor won't be able +to download the binaries and go through the upgrade. Here's a little command that uses `jq` and `go-getter` (same library used by Cosmovisor - so it is a good test). + +:::tip + +Go-getter can be installed using the following command: + +```bash +go install github.com/hashicorp/go-getter/cmd/go-getter@latest +``` + +::: + +```bash +jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do + go-getter "$url" . +done +``` + +The output should look like this: + +```text +2024/09/24 12:40:40 success! +2024/09/24 12:40:42 success! +2024/09/24 12:40:44 success! +2024/09/24 12:40:46 success! +``` + ## Submitting the upgrade on-chain The `MsgSoftwareUpgrade` can be submitted using the following command: ```bash -poktrolld tx authz exec PATH_TO_TRANSACTION_JSON --from pnf +poktrolld tx authz exec PATH_TO_UPGRADE_TRANSACTION_JSON --from pnf ``` If the transaction has been accepted, upgrade plan can be viewed with this command: @@ -83,6 +115,14 @@ If the transaction has been accepted, upgrade plan can be viewed with this comma poktrolld query upgrade plan ``` +## Cancelling the upgrade plan + +It is possible to cancel the upgrade before the upgrade plan height is reached. 
To do so, execute the following transaction: + +```bash +poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --gas=auto --from pnf +``` + ## Testing the Upgrade :::warning diff --git a/tools/scripts/upgrades/upgrade_tx.json b/tools/scripts/upgrades/upgrade_tx_v0.0.9.json similarity index 100% rename from tools/scripts/upgrades/upgrade_tx.json rename to tools/scripts/upgrades/upgrade_tx_v0.0.9.json From 3d86aef7d3d95dc8d815d7f5254d3a640e27ef9c Mon Sep 17 00:00:00 2001 From: "Dmitry K." Date: Wed, 25 Sep 2024 10:44:29 -0700 Subject: [PATCH 03/13] ca-certs are needed for relayminer --- Dockerfile.release | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile.release b/Dockerfile.release index 07ccbb433..efd5d2f44 100644 --- a/Dockerfile.release +++ b/Dockerfile.release @@ -3,6 +3,11 @@ FROM debian:bookworm ARG TARGETARCH +# Install necessary packages. +RUN apt-get update && \ + apt-get install -y --no-install-recommends ca-certificates && \ + rm -rf /var/lib/apt/lists/* + # Use `1025` G/UID so users can switch between this and `heighliner` image without a need to chown the files. RUN groupadd -g 1025 pocket && useradd -u 1025 -g pocket -m -s /sbin/nologin pocket From cb073bd027ff4260c220c61ba7c243ed5f61bfe8 Mon Sep 17 00:00:00 2001 From: DK Date: Mon, 14 Oct 2024 17:57:55 -0700 Subject: [PATCH 04/13] LocalNet upgrade procedure --- .../docs/protocol/upgrades/upgrade_list.md | 14 ++-- .../protocol/upgrades/upgrade_procedure.md | 68 ++++++++++++++----- tools/installer/full-node.sh | 11 +-- 3 files changed, 66 insertions(+), 27 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/upgrade_list.md b/docusaurus/docs/protocol/upgrades/upgrade_list.md index f4b2c8d19..ff88e7584 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_list.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_list.md @@ -26,12 +26,14 @@ This table is currently incomplete and does not include all protocol upgrades. 
O -| Version | Planned | Breaking | Requires Manual Intervention | Upgrade Height | -| ------------------------------------------------------------------------ | :-----: | :------: | :---------------------------------: | -------------- | -| [`v0.0.7`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.7) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.6`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.6) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.5`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.5) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.4`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.4) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| Version | Planned | Breaking | Requires Manual Intervention | Upgrade Height | +| ---------------------------------------------------------------------------- | :-----: | :------: | :---------------------------------: | -------------- | +| [`v0.0.9-3`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9-3) | ❌ | ✅ | ✅ (Alpha TestNet Participants Only) | `17102` | +| [`v0.0.9`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| [`v0.0.8`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.8) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| [`v0.0.6`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.6) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| [`v0.0.5`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.5) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| [`v0.0.4`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.4) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | ## MainNet diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 38bbeb5cd..25c5ffc6e 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md 
+++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -12,11 +12,13 @@ This page describes the protocol upgrade process, which is internal to the proto - [When is an Upgrade Warranted?](#when-is-an-upgrade-warranted) - [Implementing the Upgrade](#implementing-the-upgrade) - [Writing an Upgrade Transaction](#writing-an-upgrade-transaction) - - [Validate the URLs](#validate-the-urls) + - [Validate the URLs (live network only)](#validate-the-urls-live-network-only) - [Submitting the upgrade on-chain](#submitting-the-upgrade-on-chain) - [Cancelling the upgrade plan](#cancelling-the-upgrade-plan) - [Testing the Upgrade](#testing-the-upgrade) - [LocalNet](#localnet) + - [TLDR](#tldr) + - [Full example](#full-example) - [DevNet](#devnet) - [TestNet](#testnet) - [Mainnet](#mainnet) @@ -69,9 +71,9 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl - `name`: Name of the upgrade. It should match the `VersionName` of `upgrades.Upgrade`. - `height`: The height at which an upgrade should be executed and the node will be restarted. -- `info`: While this field can theoretically contain any information about the upgrade, in practice, `cosmovisor`uses it to obtain information about the binaries. When`cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is optional). +- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. When`cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. -### Validate the URLs +### Validate the URLs (live network only) The URLs of the binaries contain checksums. 
It is important to make sure they are correct, otherwise Cosmovisor won't be able to download the binaries and go through the upgrade. Here's a little command that uses `jq` and `go-getter` (same library used by Cosmovisor - so it is a good test). @@ -87,7 +89,7 @@ go install github.com/hashicorp/go-getter/cmd/go-getter@latest ::: ```bash -jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do +jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' $PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do go-getter "$url" . done ``` @@ -106,7 +108,7 @@ The output should look like this: The `MsgSoftwareUpgrade` can be submitted using the following command: ```bash -poktrolld tx authz exec PATH_TO_UPGRADE_TRANSACTION_JSON --from pnf +poktrolld tx authz exec $PATH_TO_UPGRADE_TRANSACTION_JSON --from=pnf ``` If the transaction has been accepted, upgrade plan can be viewed with this command: @@ -120,7 +122,7 @@ poktrolld query upgrade plan It is possible to cancel the upgrade before the upgrade plan height is reached. To do so, execute the following transaction: ```bash -poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --gas=auto --from pnf +poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --gas=auto --from=pnf ``` ## Testing the Upgrade @@ -131,18 +133,50 @@ Note that for local testing, `cosmovisor` won't pull the binary from the info fi ### LocalNet -LocalNet currently does not support `cosmovisor` and automatic upgrades. However, we have provided scripts to facilitate local testing in the `tools/scripts/upgrades` directory: +LocalNet does not support `cosmovisor` and automatic upgrades at the moment. But we don't need it to simulate and test the upgrade procedure. 
+ +#### TLDR + +In short, the procedure is: +- Pull git repo with old version (separate directory) +- Download release binary of the old version +- Wipe localnet data and generate genesis using OLD version +- Start node using OLD binary +- Write and submit an upgrade transaction on-chain +- When the Upgrade Plan height is reached, stop the old node and run the new binary +- Observe the behavior + +#### Full example + +As we are testing an upgrade, we need to have a network that first runs on the old version. So it is a good idea to have a LocalNet running using a binary from the [previous release you wish to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. + +1. Make a note of the version you want to test an upgrade **FROM**. This will be the **OLD** version. For example, let's imagine we're upgrading from `v0.0.9`. +2. Pull a new `poktroll` repo (will be used as an "old" version): + ```bash + git clone https://github.com/pokt-network/poktroll.git poktroll-upgrade-old + cd poktroll-upgrade-old + git checkout v0.0.9 + + # Download the v0.0.9 binary: https://github.com/pokt-network/poktroll/releases + # CHANGE POKTROLLD_VERSION and ARCH + curl -L "https://github.com/pokt-network/poktroll/releases/download/${POKTROLLD_VERSION}/poktroll_linux_${ARCH}.tar.gz" | tar -zxvf - -C . + + # Validate the version + ./poktrolld version + 0.0.9 + ``` +3. Stop LocalNet: `make localnet_down` +4. Reset the data: `./poktrolld comet unsafe-reset-all` +5. Create new genesis using old version (from `poktroll-upgrade-old` dir): `make localnet_regenesis` +6. Start the network: `./poktrolld start` +7. [Write](#writing-an-upgrade-transaction) and [Submit](#submitting-the-upgrade-on-chain) a transaction (e.g. `poktrolld tx authz exec tools/scripts/upgrades/local_test_v0.0.9-2.json --from=pnf`) +8. Verify the plan is active: `poktrolld query upgrade plan` +9.
Wait until the height is reached and the old node dies due to the error: `ERR UPGRADE "v0.0.9-2" NEEDED at height`, which is expected. +10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade network **TO**. +11. In the **NEW VERSION GIT REPO** you can build binaries using `go_develop`, `ignite_release` and `ignite_release_extract_binaries` make targets. +12. Start the new version (from the **NEW VERSION REPO**: `./release_binaries/poktroll_darwin_arm64 start`) +13. Observe the behavior. Your node should go through the upgrade process and start using the new version. -1. Modify `tools/scripts/upgrades/authz_upgrade_tx_example_v0.0.4_height_30.json` to reflect the name of the upgrade and the height at which it should be scheduled. - -2. Check and update the `tools/scripts/upgrades/cosmovisor-start-node.sh` to point to the correct binaries: - - - The old binary should be compiled to work before the upgrade. - - The new binary should contain the upgrade logic to be executed immediately after the node is started using the new binary. - -3. Run `bash tools/scripts/upgrades/cosmovisor-start-node.sh` to wipe the `~/.poktroll` directory and place binaries in the correct locations. - -4. Execute the transaction as shown in [Submitting the upgrade on-chain](#submitting-the-upgrade-on-chain) section above. ### DevNet diff --git a/tools/installer/full-node.sh b/tools/installer/full-node.sh index 6faca639b..7ee542336 100644 --- a/tools/installer/full-node.sh +++ b/tools/installer/full-node.sh @@ -94,7 +94,7 @@ setup_env_vars() { echo "export DAEMON_HOME=\$HOME/.poktroll" >> \$HOME/.profile echo "export DAEMON_RESTART_AFTER_UPGRADE=true" >> \$HOME/.profile echo "export DAEMON_ALLOW_DOWNLOAD_BINARIES=true" >> \$HOME/.profile - echo "export UNSAFE_SKIP_BACKUP=true" >> \$HOME/.profile + echo "export UNSAFE_SKIP_BACKUP=false" >> \$HOME/.profile source \$HOME/.profile EOF print_color $GREEN "Environment variables set up successfully." 
@@ -138,12 +138,15 @@ setup_poktrolld() { exit 1 fi - # Use the direct download link for the latest release - LATEST_RELEASE_URL="https://github.com/pokt-network/poktroll/releases/latest/download/poktroll_linux_${ARCH}.tar.gz" + # Get the version genesis started from + POKTROLLD_VERSION=$(curl -s https://raw.githubusercontent.com/pokt-network/pocket-network-genesis/master/poktrolld/testnet-validated.init-version) + + # Use the direct download link for the correct release + RELEASE_URL="https://github.com/pokt-network/poktroll/releases/download/${POKTROLLD_VERSION}/poktroll_linux_${ARCH}.tar.gz" sudo -u "$POKTROLL_USER" bash << EOF mkdir -p \$HOME/.poktroll/cosmovisor/genesis/bin - curl -L "$LATEST_RELEASE_URL" | tar -zxvf - -C \$HOME/.poktroll/cosmovisor/genesis/bin + curl -L "$RELEASE_URL" | tar -zxvf - -C \$HOME/.poktroll/cosmovisor/genesis/bin chmod +x \$HOME/.poktroll/cosmovisor/genesis/bin/poktrolld ln -sf \$HOME/.poktroll/cosmovisor/genesis/bin/poktrolld \$HOME/bin/poktrolld source \$HOME/.profile From 3fe1feaba7897760e4f6dbf2f5f013c6d9f6d34e Mon Sep 17 00:00:00 2001 From: DK Date: Tue, 15 Oct 2024 16:19:42 -0700 Subject: [PATCH 05/13] --wip-- [skip ci] --- app/upgrades/historical.go | 2 + .../protocol/upgrades/consensus_failure.md | 15 ++++++ .../protocol/upgrades/contigency_plans.md | 49 +++++++++++++++++++ .../docs/protocol/upgrades/release_process.md | 6 --- docusaurus/yarn.lock | 16 +++--- 5 files changed, 74 insertions(+), 14 deletions(-) create mode 100644 docusaurus/docs/protocol/upgrades/consensus_failure.md create mode 100644 docusaurus/docs/protocol/upgrades/contigency_plans.md diff --git a/app/upgrades/historical.go b/app/upgrades/historical.go index 2e71f0430..bb9eccccc 100644 --- a/app/upgrades/historical.go +++ b/app/upgrades/historical.go @@ -11,6 +11,7 @@ package upgrades import ( "context" + "fmt" storetypes "cosmossdk.io/store/types" upgradetypes "cosmossdk.io/x/upgrade/types" @@ -28,6 +29,7 @@ func defaultUpgradeHandler( configurator 
module.Configurator, ) upgradetypes.UpgradeHandler { return func(ctx context.Context, plan upgradetypes.Plan, vm module.VersionMap) (module.VersionMap, error) { + fmt.Println("Starting the migration in defaultUpgradeHandler.") return mm.RunMigrations(ctx, configurator, vm) } } diff --git a/docusaurus/docs/protocol/upgrades/consensus_failure.md b/docusaurus/docs/protocol/upgrades/consensus_failure.md new file mode 100644 index 000000000..229dd4544 --- /dev/null +++ b/docusaurus/docs/protocol/upgrades/consensus_failure.md @@ -0,0 +1,15 @@ +--- +title: Consensus failure recovery plan +sidebar_position: 6 +--- + +# Consensus Failure Recovery Plan + + + +## Common consensus failure errors + + + +- `wrong Block.Header.AppHash` - the data in block is different between nodes. Can be investigated by comparing the data dir - [more information here](../../develop/developer_guide/chain_halt_troubleshooting.md). + diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md new file mode 100644 index 000000000..90b3e8544 --- /dev/null +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -0,0 +1,49 @@ +--- +title: Failed upgrade contingency plan +sidebar_position: 5 +--- + +# Contingency plans + + +There's always a chance the upgrade will fail. We prepared some contingency plans, so we can try to recover without +significant downtime. + +:::tip + +This documentation covers failed upgrade contingency for `poktroll` - `cosmos-sdk` based chain. While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. 
+ +::: + +- [Option 0: the bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) +- [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) +- [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) +- [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) + + +## Option 0: the bug is discovered before the upgrade height is reached + +Cancel the upgrade plan: [how](./upgrade_procedure.md#cancelling-the-upgrade-plan). + +## Option 1: The upgrade height is reached and the migration didn't start + +If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrades=$upgradeHeightNumber` flag. This will skip the upgrade process, but it's important that all nodes on the network do this at the same time. + +`--unsafe-skip-upgrades` simply skips the upgrade handler and store migrations and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. + +:::caution +`--unsafe-skip-upgrades` needs to be documented and added to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. +::: + +## Option 2: The migration is stuck + +If the migration is stuck there's always a chance the state has been mutated for the upgrade but the migration didn't complete. In such a case, we need to: + +- Roll back validators to the backup (snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`).
+- Skip the upgrade handler and store migrations with `--unsafe-skip-upgrades=$upgradeHeightNumber`. +- Document and add `--unsafe-skip-upgrades=$upgradeHeightNumber` to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. +- Resolve the issue with the upgrade and schedule another plan. + +## Option 3: The network is stuck at the future height after the upgrade + +This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [here](./consensus_failure.md) for more information on how to handle such issues. diff --git a/docusaurus/docs/protocol/upgrades/release_process.md b/docusaurus/docs/protocol/upgrades/release_process.md index 556f14fb2..4a756d5a5 100644 --- a/docusaurus/docs/protocol/upgrades/release_process.md +++ b/docusaurus/docs/protocol/upgrades/release_process.md @@ -16,12 +16,6 @@ sidebar_position: 4 This document is for the Pocket Network protocol team's internal use only. ::: -- [1. Determine if the Release is Consensus-Breaking](#1-determine-if-the-release-is-consensus-breaking) -- [2. Create a GitHub Release](#2-create-a-github-release) - - [Legend](#legend) -- [3. Write an Upgrade Plan](#3-write-an-upgrade-plan) -- [4. Issue Upgrade on TestNet](#4-issue-upgrade-on-testnet) -- [5. Issue Upgrade on MainNet](#5-issue-upgrade-on-mainnet) ### 1.
Determine if the Release is Consensus-Breaking diff --git a/docusaurus/yarn.lock b/docusaurus/yarn.lock index 93cee387e..5c2dcf935 100644 --- a/docusaurus/yarn.lock +++ b/docusaurus/yarn.lock @@ -1810,15 +1810,10 @@ dependencies: "@types/mdx" "^2.0.0" -"@node-rs/jieba-linux-x64-gnu@1.10.0": +"@node-rs/jieba-darwin-arm64@1.10.0": version "1.10.0" - resolved "https://registry.npmjs.org/@node-rs/jieba-linux-x64-gnu/-/jieba-linux-x64-gnu-1.10.0.tgz" - integrity sha512-rS5Shs8JITxJjFIjoIZ5a9O+GO21TJgKu03g2qwFE3QaN5ZOvXtz+/AqqyfT4GmmMhCujD83AGqfOGXDmItF9w== - -"@node-rs/jieba-linux-x64-musl@1.10.0": - version "1.10.0" - resolved "https://registry.npmjs.org/@node-rs/jieba-linux-x64-musl/-/jieba-linux-x64-musl-1.10.0.tgz" - integrity sha512-BvSiF2rR8Birh2oEVHcYwq0WGC1cegkEdddWsPrrSmpKmukJE2zyjcxaOOggq2apb8fIRsjyeeUh6X3R5AgjvA== + resolved "https://registry.npmjs.org/@node-rs/jieba-darwin-arm64/-/jieba-darwin-arm64-1.10.0.tgz" + integrity sha512-IhR5r+XxFcfhVsF93zQ3uCJy8ndotRntXzoW/JCyKqOahUo/ITQRT6vTKHKMyD9xNmjl222OZonBSo2+mlI2fQ== "@node-rs/jieba@^1.6.0": version "1.10.0" @@ -4619,6 +4614,11 @@ fs.realpath@^1.0.0: resolved "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz" integrity sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw== +fsevents@~2.3.2: + version "2.3.3" + resolved "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz" + integrity sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw== + function-bind@^1.1.2: version "1.1.2" resolved "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz" From 525eb5783fbf3085d5c21923edce4a9b8b9cccfd Mon Sep 17 00:00:00 2001 From: DK Date: Wed, 16 Oct 2024 18:03:07 -0700 Subject: [PATCH 06/13] --wip-- [skip ci] --- .../chain_halt_troubleshooting.md | 14 +++- .../recovery_from_chain_halt.md | 68 +++++++++++++++++++ .../protocol/upgrades/consensus_failure.md | 15 ---- .../protocol/upgrades/contigency_plans.md | 7 
+- .../docs/protocol/upgrades/upgrade_list.md | 21 ++---- 5 files changed, 92 insertions(+), 33 deletions(-) create mode 100644 docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md delete mode 100644 docusaurus/docs/protocol/upgrades/consensus_failure.md diff --git a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md index 72da1f4f3..3ded9ff73 100644 --- a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md +++ b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md @@ -8,13 +8,15 @@ title: Chain Halt Troubleshooting - [Understanding Chain Halts](#understanding-chain-halts) - [Definition and Causes](#definition-and-causes) - [Impact on Network](#impact-on-network) -- [Troubleshooting Process](#troubleshooting-process) +- [`wrong Block.Header.AppHash` Troubleshooting Process](#wrong-blockheaderapphash-troubleshooting-process) - [Step 1: Identifying the Issue](#step-1-identifying-the-issue) - [Step 2: Collecting Node Data](#step-2-collecting-node-data) - [Step 3: Analyzing Discrepancies](#step-3-analyzing-discrepancies) - [Step 4: Decoding and Interpreting Data](#step-4-decoding-and-interpreting-data) - [Step 5: Comparing Records](#step-5-comparing-records) - [Step 6: Investigation and Resolution](#step-6-investigation-and-resolution) +- [`wrong Block.Header.LastResultsHash`](#wrong-blockheaderlastresultshash) +- [Syncing from genesis](#syncing-from-genesis) ## Understanding Chain Halts @@ -40,7 +42,7 @@ Chain halts can have severe consequences for the network: Given these impacts, swift and effective troubleshooting is crucial to maintain network health and user trust. -## Troubleshooting Process +## `wrong Block.Header.AppHash` Troubleshooting Process ### Step 1: Identifying the Issue @@ -94,3 +96,11 @@ Based on the identified discrepancies: 2. Develop a fix or patch to address the issue. 3. 
If necessary, initiate discussions with the validator community to reach social consensus on how to proceed. 4. Implement the agreed-upon solution and monitor the network closely during and after the fix. + +## `wrong Block.Header.LastResultsHash` + +Errors such as `reactor validation error: wrong Block.Header.LastResultsHash.` are most likely to come from the non-deterministic gas calculation. That can happen when the node runs on a different version. The solution is to use the correct binary version. + +## Syncing from genesis + +If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md new file mode 100644 index 000000000..29cd949d3 --- /dev/null +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -0,0 +1,68 @@ +--- +sidebar_position: 7 +title: Chain Halt Recovery +--- + +## Chain Halt Recovery + +This document describes how to recover from a chain halt. This document assumes the cause of the chain halt has been identified and the new release has been created and verified to work. + +:::tip +See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more information on identifying the cause of a chain halt. +::: + +- [Background](#background) +- [Halt during network upgrade](#halt-during-network-upgrade) +- [Replacing the binary manually (preferred)](#replacing-the-binary-manually-preferred) +- [Rollback, fork and upgrade](#rollback-fork-and-upgrade) + + +## Background + +Pocket network is built on top of `cosmos-sdk` which utilizes CometBFT consensus engine. 
Bezantine Fault Tolerant (BFT) consensus algorithm requires that at least 2/3 of validators are online and voting for the same block to reach a consensus. In order to get the chain moving, we need the majority of validators to participate to get the chain moving. + +## Halt during network upgrade + +If the halt is caused by the network upgrade, it is possible the solution can be as simple as skipping an upgrade (`unsafe-skip-upgrade`) and creating a new (fixed) one. Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_plans.md). + + +## Replacing the binary manually (preferred) + +**This is preferred way of resolving the consensus-breaking issues**. + +Since the chain is not moving, it would be impossible to issue an automatic upgrade with an upgrade plan. Instead, we need to gather a social consensus to manually replace the binary and get the chain moving. + +Currently this means breaking a way to sync the network from genesis without human interaction, but there are some plans to make the process less painful in the future. + + + + + +## Rollback, fork and upgrade + +:::info + +This part is relevant for Pocket Network Shannon release only, as we do not rely on `x/gov` module for upgrades in Shannon. Instead, our DAO can issue upgrade transactions on the Pocket Network chain directly. Conventional `cosmos-sdk` upgrade process would require to go through the voting process to issue an upgrade. + +::: + +Perfrorming a rollback basically means forking the network at the older height. Modern CometBFT versions are incredibly hard to fork. As a result, **it is not recommended to perform rollbacks** unless absolutely necessary. If we do decide to go ahead with a rollback, these are the steps: + +- Prepare and verify the new version that addresses the consensus-breaking issue. +- [Create a release](../../protocol/upgrades/release_process.md). 
+- [Prepare an upgrade transaction](../../protocol/upgrades/upgrade_procedure.md#writing-an-upgrade-transaction) to the new version. +- Get the state of the validators on the network to **three blocks** prior to the consensus-breaking issue. + - For example, if there was an issue at height `103`, we need to get the state to the height of `100`. At `101` we will submit an upgrade transaction so the chain upgrades on `102` and avoids the issue at height `103`. + - Can be done in two ways: + - `poktrolld rollback --hard` until the command responds with the desired block number. **OR,** + - The node can be restored from the snapshot and started with `--halt-height=100` parameter so it only syncs up to ceirtan height and then gracefully shuts down. +- **Make sure all validators use the same data directory** or have been rolled back to the same height. +- **Isolate validators from the other nodes** that have not been rolled back to the older state. If that means using a firewall or isolating from the internet - this is the way. Validators should be able to only gossip blocks between themselves. **Having at least one node that has knowledge of the forking ledger can jeopardize the whole process**. In particular, the following errors are the sign of the nodes populating existing blocks: + - `found conflicting vote from ourselves; did you unsafe_reset a validator?` + - `conflicting votes from validator` +- Start the network and perform an upgrade (following the example aboce): + - We would not be able to submit an transaction at `100` (this needs to be investigated, but for some reason we were not able to) due to `signature verification failed; please verify account number (0) and chain-id (poktroll): (unable to verify single signer signature): unauthorized`. + - On block `101`, we will submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. 
+ - `x/upgrade` performs an upgrade in the `EndBlocker` of the block `102` and waits for the node operator or `cosmovisor` to replace the binary. +- The network should go through successfull upgrade and climb to the next block. +- After the chain has been reached over the hight of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can peform the rollback or use a snapshot as well. diff --git a/docusaurus/docs/protocol/upgrades/consensus_failure.md b/docusaurus/docs/protocol/upgrades/consensus_failure.md deleted file mode 100644 index 229dd4544..000000000 --- a/docusaurus/docs/protocol/upgrades/consensus_failure.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: Consensus failure recovery plan -sidebar_position: 6 ---- - -# Consensus Failure Recovery Plan - - - -## Common consensus failure errors - - - -- `wrong Block.Header.AppHash` - the data in block is different between nodes. Can be investigated by comparing the data dir - [more information here](../../develop/developer_guide/chain_halt_troubleshooting.md). - diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 90b3e8544..7029bf3ef 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -27,12 +27,14 @@ Cancel the upgrade plan: [how](./upgrade_procedure.md#cancelling-the-upgrade-pla ## Option 1: The upgrade height is reached and the migration didn't start -If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, but it's important that all nodes on the network do this at the same time. 
+If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, allowing the chain to continue going and the protocol team to plan another release `--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. :::caution `--unsafe-skip-upgrade` needs to be documented and added to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. + + ::: ## Option 2: The migration is stuck @@ -46,4 +48,5 @@ If the migration is stuck there's always a chance the state has been mutated for ## Option 3: The network is stuck at the future height after the upgrade -This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [here](./consensus_failure.md) for more information on how to handle such issues. +This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. + diff --git a/docusaurus/docs/protocol/upgrades/upgrade_list.md b/docusaurus/docs/protocol/upgrades/upgrade_list.md index ff88e7584..1df1dd52a 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_list.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_list.md @@ -8,7 +8,7 @@ sidebar_position: 1 The tables below provide a list of past and upcoming protocol upgrades. 
For more detailed information about what upgrades are, how they work, and what changes they bring to the protocol, please refer to our [upgrade overview page](./protocol_upgrades.md). - [Legend](#legend) -- [TestNet](#testnet) +- [Alpha TestNet](#alpha-testnet) - [MainNet](#mainnet) ## Legend @@ -18,22 +18,15 @@ The tables below provide a list of past and upcoming protocol upgrades. For more - ❓ - Unknown/To Be Determined - ⚠️ - Warning/Caution Required -## TestNet - -:::warning -This table is currently incomplete and does not include all protocol upgrades. Our recent TestNet upgrades, which were performed via a regenesis, are not listed here. -::: +## Alpha TestNet -| Version | Planned | Breaking | Requires Manual Intervention | Upgrade Height | -| ---------------------------------------------------------------------------- | :-----: | :------: | :---------------------------------: | -------------- | -| [`v0.0.9-3`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9-3) | ❌ | ✅ | ✅ (Alpha TestNet Participants Only) | `17102` | -| [`v0.0.9`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.8`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.8) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.6`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.6) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.5`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.5) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.4`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.4) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| Version | Planned | Breaking | Requires Manual Intervention | Upgrade Height | +| ---------------------------------------------------------------------------- | :-----: | :------: | :-------------------------------: | -------------- | +| 
[`v0.0.9-3`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9-3) | ❌ | ✅ | ⚠️ Alpha TestNet Participants Only | `17102` | +| [`v0.0.9`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9) | ❓ | ❓ | N/A: genesis version | ❓ | + ## MainNet From 4e8c7dda086ae6356c5ba587627288f8e09eeb78 Mon Sep 17 00:00:00 2001 From: DK Date: Wed, 16 Oct 2024 18:15:45 -0700 Subject: [PATCH 07/13] spell checking --- .../chain_halt_troubleshooting.md | 2 +- .../recovery_from_chain_halt.md | 14 +++++------ .../protocol/upgrades/contigency_plans.md | 25 ++++++++----------- .../protocol/upgrades/upgrade_procedure.md | 11 ++++---- 4 files changed, 24 insertions(+), 28 deletions(-) diff --git a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md index 3ded9ff73..e97f535db 100644 --- a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md +++ b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md @@ -103,4 +103,4 @@ Errors such as `reactor validation error: wrong Block.Header.LastResultsHash.` a ## Syncing from genesis -If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). +If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running the correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). 
\ No newline at end of file diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 29cd949d3..b56b0e725 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -19,7 +19,7 @@ See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more infor ## Background -Pocket network is built on top of `cosmos-sdk` which utilizes CometBFT consensus engine. Bezantine Fault Tolerant (BFT) consensus algorithm requires that at least 2/3 of validators are online and voting for the same block to reach a consensus. In order to get the chain moving, we need the majority of validators to participate to get the chain moving. +Pocket network is built on top of `cosmos-sdk` which utilizes CometBFT consensus engine. Byzantine Fault Tolerant (BFT) consensus algorithm requires that at least 2/3 of validators are online and voting for the same block to reach a consensus. In order to get the chain moving, we need the majority of validators to participate to get the chain moving. ## Halt during network upgrade @@ -46,7 +46,7 @@ This part is relevant for Pocket Network Shannon release only, as we do not rely ::: -Perfrorming a rollback basically means forking the network at the older height. Modern CometBFT versions are incredibly hard to fork. As a result, **it is not recommended to perform rollbacks** unless absolutely necessary. If we do decide to go ahead with a rollback, these are the steps: +Performing a rollback basically means forking the network at the older height. Modern CometBFT versions are incredibly hard to fork. As a result, **it is not recommended to perform rollbacks** unless absolutely necessary. If we do decide to go ahead with a rollback, these are the steps: - Prepare and verify the new version that addresses the consensus-breaking issue. 
- [Create a release](../../protocol/upgrades/release_process.md). @@ -55,14 +55,14 @@ Perfrorming a rollback basically means forking the network at the older height. - For example, if there was an issue at height `103`, we need to get the state to the height of `100`. At `101` we will submit an upgrade transaction so the chain upgrades on `102` and avoids the issue at height `103`. - Can be done in two ways: - `poktrolld rollback --hard` until the command responds with the desired block number. **OR,** - - The node can be restored from the snapshot and started with `--halt-height=100` parameter so it only syncs up to ceirtan height and then gracefully shuts down. + - The node can be restored from the snapshot and started with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. - **Make sure all validators use the same data directory** or have been rolled back to the same height. - **Isolate validators from the other nodes** that have not been rolled back to the older state. If that means using a firewall or isolating from the internet - this is the way. Validators should be able to only gossip blocks between themselves. **Having at least one node that has knowledge of the forking ledger can jeopardize the whole process**. In particular, the following errors are the sign of the nodes populating existing blocks: - `found conflicting vote from ourselves; did you unsafe_reset a validator?` - `conflicting votes from validator` -- Start the network and perform an upgrade (following the example aboce): - - We would not be able to submit an transaction at `100` (this needs to be investigated, but for some reason we were not able to) due to `signature verification failed; please verify account number (0) and chain-id (poktroll): (unable to verify single signer signature): unauthorized`. 
+- Start the network and perform an upgrade (following the example above): + - We would not be able to submit a transaction at `100` (this needs to be investigated, but for some reason we were not able to) due to `signature verification failed; please verify account number (0) and chain-id (poktroll): (unable to verify single signer signature): unauthorized`. - On block `101`, we will submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. - `x/upgrade` performs an upgrade in the `EndBlocker` of the block `102` and waits for the node operator or `cosmovisor` to replace the binary. -- The network should go through successfull upgrade and climb to the next block. -- After the chain has been reached over the hight of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can peform the rollback or use a snapshot as well. +- The network should go through successful upgrade and climb to the next block. +- After the chain has been reached over the height of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can perform the rollback or use a snapshot as well. \ No newline at end of file diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 7029bf3ef..75438c9be 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -5,48 +5,45 @@ sidebar_position: 5 # Contingency plans - -There's always a chance the upgrade will fail. We prepared some contingency plans, so we can try to recover without -significant downtime. +There's always a chance the upgrade will fail. We have prepared some contingency plans, so we can recover without significant downtime. :::tip -This documentation covers failed upgrade contingency for `poktroll` - `cosmos-sdk` based chain. 
While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. +This documentation covers failed upgrade contingency for `poktroll` - a `cosmos-sdk` based chain. While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. ::: -- [Option 0: the bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) +- [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) - [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) - [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) - [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) -## Option 0: the bug is discovered before the upgrade height is reached +## Option 0: The bug is discovered before the upgrade height is reached Cancel the upgrade plan: [how](./upgrade_procedure.md#cancelling-the-upgrade-plan). ## Option 1: The upgrade height is reached and the migration didn't start -If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, allowing the chain to continue going and the protocol team to plan another release +If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. 
This will skip the upgrade process, allowing the chain to continue and the protocol team to plan another release. -`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. +`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations, and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. :::caution -`--unsafe-skip-upgrade` needs to be documented and added to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. +`--unsafe-skip-upgrade` needs to be documented and added to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. ::: ## Option 2: The migration is stuck -If the migration is stuck there's always a chance the state has been mutated for the upgrade but the migration didn't complete. In such case, we need to: +If the migration is stuck, there's always a chance the state has been mutated for the upgrade but the migration didn't complete. In such a case, we need to: -- Rollback validators to the backup (snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). +- Roll back validators to the backup (a snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). - Skip the upgrade handler and store migrations with `--unsafe-skip-upgrade=$upgradeHeightNumber`. -- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. 
+- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. - Resolve the issue with an upgrade and schedule another plan. ## Option 3: The network is stuck at the future height after the upgrade -This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. - +This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. \ No newline at end of file diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 25c5ffc6e..a5f990f54 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -71,7 +71,7 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl - `name`: Name of the upgrade. It should match the `VersionName` of `upgrades.Upgrade`. - `height`: The height at which an upgrade should be executed and the node will be restarted. -- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. When`cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. +- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. 
When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. ### Validate the URLs (live network only) @@ -111,7 +111,7 @@ The `MsgSoftwareUpgrade` can be submitted using the following command: poktrolld tx authz exec $PATH_TO_UPGRADE_TRANSACTION_JSON --from=pnf ``` -If the transaction has been accepted, upgrade plan can be viewed with this command: +If the transaction has been accepted, the upgrade plan can be viewed with this command: ```bash poktrolld query upgrade plan @@ -148,7 +148,7 @@ In short, the procedure is: #### Full example -As we are testing an upgrade, we need to have a network that first runs on the old version. So it is a good idea to have a LocalNet running using a binary from the [previous release you with to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. +As we are testing an upgrade, we need to have a network that first runs on the old version. So it is a good idea to have a LocalNet running using a binary from the [previous release you wish to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. 1. Make a note of the version you want to test an upgrade **FROM**. This will be the **OLD** version. For example, let's imagine we're upgrading from `v0.0.9`. 2. Pull a new `poktroll` repo (will be used as an "old" version): @@ -172,12 +172,11 @@ As we are testing an upgrade, we need to have a network that first runs on the o 7. [Write](#writing-an-upgrade-transaction) and [Submit](#submitting-the-upgrade-on-chain) a transaction (e.g. 
`poktrolld tx authz exec tools/scripts/upgrades/local_test_v0.0.9-2.json --from=pnf`) 8. Verify the plan is active: `poktrolld query upgrade plan` 9. Wait until the height is reached and the old node dies due to the error: `ERR UPGRADE "v0.0.9-2" NEEDED at height`, which is expected. -10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade network **TO**. +10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade the network **TO**. 11. In the **NEW VERSION GIT REPO** you can build binaries using `go_develop`, `ignite_release` and `ignite_release_extract_binaries` make targets. 12. Start the new version (from the **NEW VERSION REPO**: `./release_binaries/poktroll_darwin_arm64 start`) 13. Observe the behavior. Your node should go through the upgrade process and start using the new version. - ### DevNet DevNets currently do not support `cosmovisor`. @@ -200,4 +199,4 @@ If you are a member of Grove, you can find the instructions to access the infras ### Mainnet -The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment. +The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment. 
\ No newline at end of file From dd631e60a3501f10f6678a1d140ea8db938aa4d6 Mon Sep 17 00:00:00 2001 From: Dmitry K Date: Thu, 17 Oct 2024 13:35:53 -0700 Subject: [PATCH 08/13] Empty commit From 83a24aa26637a6947e46d3ef59622b32538ca34a Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Mon, 21 Oct 2024 13:00:44 -0400 Subject: [PATCH 09/13] Partial review --- api/poktroll/application/event.pulsar.go | 2 +- .../chain_halt_troubleshooting.md | 21 +++++++--- .../recovery_from_chain_halt.md | 39 ++++++++++++------- go.mod | 6 --- go.sum | 2 - x/tokenomics/types/tx.pb.go | 1 - 6 files changed, 41 insertions(+), 30 deletions(-) diff --git a/api/poktroll/application/event.pulsar.go b/api/poktroll/application/event.pulsar.go index 043aa2264..290bbde55 100644 --- a/api/poktroll/application/event.pulsar.go +++ b/api/poktroll/application/event.pulsar.go @@ -3,11 +3,11 @@ package application import ( _ "cosmossdk.io/api/cosmos/base/v1beta1" + _ "github.com/pokt-network/poktroll/api/poktroll/shared" fmt "fmt" _ "github.com/cosmos/cosmos-proto" runtime "github.com/cosmos/cosmos-proto/runtime" _ "github.com/cosmos/gogoproto/gogoproto" - _ "github.com/pokt-network/poktroll/api/poktroll/shared" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoiface "google.golang.org/protobuf/runtime/protoiface" protoimpl "google.golang.org/protobuf/runtime/protoimpl" diff --git a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md index e97f535db..4f5796793 100644 --- a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md +++ b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md @@ -8,14 +8,14 @@ title: Chain Halt Troubleshooting - [Understanding Chain Halts](#understanding-chain-halts) - [Definition and Causes](#definition-and-causes) - [Impact on Network](#impact-on-network) -- [`wrong Block.Header.AppHash` Troubleshooting 
Process](#wrong-blockheaderapphash-troubleshooting-process) +- [Troubleshooting `wrong Block.Header.AppHash`](#troubleshooting-wrong-blockheaderapphash) - [Step 1: Identifying the Issue](#step-1-identifying-the-issue) - [Step 2: Collecting Node Data](#step-2-collecting-node-data) - [Step 3: Analyzing Discrepancies](#step-3-analyzing-discrepancies) - [Step 4: Decoding and Interpreting Data](#step-4-decoding-and-interpreting-data) - [Step 5: Comparing Records](#step-5-comparing-records) - [Step 6: Investigation and Resolution](#step-6-investigation-and-resolution) -- [`wrong Block.Header.LastResultsHash`](#wrong-blockheaderlastresultshash) +- [Troubleshooting `wrong Block.Header.LastResultsHash`](#troubleshooting-wrong-blockheaderlastresultshash) - [Syncing from genesis](#syncing-from-genesis) ## Understanding Chain Halts @@ -42,7 +42,7 @@ Chain halts can have severe consequences for the network: Given these impacts, swift and effective troubleshooting is crucial to maintain network health and user trust. -## `wrong Block.Header.AppHash` Troubleshooting Process +## Troubleshooting `wrong Block.Header.AppHash` ### Step 1: Identifying the Issue @@ -97,10 +97,19 @@ Based on the identified discrepancies: 3. If necessary, initiate discussions with the validator community to reach social consensus on how to proceed. 4. Implement the agreed-upon solution and monitor the network closely during and after the fix. -## `wrong Block.Header.LastResultsHash` +## Troubleshooting `wrong Block.Header.LastResultsHash` -Errors such as `reactor validation error: wrong Block.Header.LastResultsHash.` are most likely to come from the non-deterministic gas calculation. That can happen when the node runs on a different version. The solution is to use the correct binary version. +Errors like the following can occur from using the incorrect binary version at a certain height. + +```bash +reactor validation error: wrong Block.Header.LastResultsHash. 
+``` + +The solution is to use the correct binary version to sync the full node at the correct height. + +Tools like [cosmovisor](https://docs.cosmos.network/v0.45/run-node/cosmovisor.html) make it easier +to sync a node from genesis, using the appropriate binary for each range of block heights. ## Syncing from genesis -If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running the correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). \ No newline at end of file +If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running the correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index b56b0e725..13e9991cb 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -5,40 +5,51 @@ title: Chain Halt Recovery ## Chain Halt Recovery -This document describes how to recover from a chain halt. This document assumes the cause of the chain halt has been identified and the new release has been created and verified to work. +This document describes how to recover from a chain halt. It assumes the cause of +the chain halt has been identified, the new release has been created, and verified to +function correctly. :::tip See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more information on identifying the cause of a chain halt.
::: - [Background](#background) -- [Halt during network upgrade](#halt-during-network-upgrade) -- [Replacing the binary manually (preferred)](#replacing-the-binary-manually-preferred) -- [Rollback, fork and upgrade](#rollback-fork-and-upgrade) - +- [Resolving halts during a network upgrade](#resolving-halts-during-a-network-upgrade) + - [Manual binary replacement (preferred)](#manual-binary-replacement-preferred) + - [Rollback, fork and upgrade](#rollback-fork-and-upgrade) ## Background -Pocket network is built on top of `cosmos-sdk` which utilizes CometBFT consensus engine. Byzantine Fault Tolerant (BFT) consensus algorithm requires that at least 2/3 of validators are online and voting for the same block to reach a consensus. In order to get the chain moving, we need the majority of validators to participate to get the chain moving. +Pocket network is built on top of `cosmos-sdk`, which utilizes the CometBFT consensus engine. +Byzantine Fault Tolerant (BFT) consensus algorithm requires that **at least** 2/3 of Validators +are online and voting for the same block to reach a consensus. In order to maintain liveness +and avoid a chain-halt, we need the majority (> 2/3) of Validators to participate +and use the same version of the software. + +## Resolving halts during a network upgrade -## Halt during network upgrade +If the halt is caused by the network upgrade, it is possible the solution can be as simple as +skipping an upgrade (i.e. `unsafe-skip-upgrade`) and creating a new (fixed) upgrade. -If the halt is caused by the network upgrade, it is possible the solution can be as simple as skipping an upgrade (`unsafe-skip-upgrade`) and creating a new (fixed) one. Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_plans.md). +Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_plans.md). 
+### Manual binary replacement (preferred) -## Replacing the binary manually (preferred) +:::note **This is preferred way of resolving the consensus-breaking issues**. -Since the chain is not moving, it would be impossible to issue an automatic upgrade with an upgrade plan. Instead, we need to gather a social consensus to manually replace the binary and get the chain moving. +::: -Currently this means breaking a way to sync the network from genesis without human interaction, but there are some plans to make the process less painful in the future. +Since the chain is not moving, **it is impossible** to issue an automatic upgrade with an upgrade plan. +Instead, we need **social consensus** to manually replace the binary and get the chain moving. - +Currently this involves synching the network from genesis breaking a way to sync the network from genesis without human interaction, but there are some plans to make the process less painful in the future. + -## Rollback, fork and upgrade +### Rollback, fork and upgrade :::info @@ -65,4 +76,4 @@ Performing a rollback basically means forking the network at the older height. M - On block `101`, we will submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. - `x/upgrade` performs an upgrade in the `EndBlocker` of the block `102` and waits for the node operator or `cosmovisor` to replace the binary. - The network should go through successful upgrade and climb to the next block. -- After the chain has been reached over the height of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can perform the rollback or use a snapshot as well. \ No newline at end of file +- After the chain has been reached over the height of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can perform the rollback or use a snapshot as well. 
diff --git a/go.mod b/go.mod index c9f3d9997..91de15f1d 100644 --- a/go.mod +++ b/go.mod @@ -79,15 +79,10 @@ require ( gopkg.in/yaml.v2 v2.4.0 ) -<<<<<<< HEAD -require github.com/jhump/protoreflect v1.16.0 -======= require ( cosmossdk.io/x/tx v0.13.4 github.com/jhump/protoreflect v1.16.0 - go.uber.org/mock v0.4.0 ) ->>>>>>> main require ( buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.34.2-20240508200655-46a4cf4ba109.2 // indirect @@ -100,7 +95,6 @@ require ( connectrpc.com/connect v1.16.2 // indirect connectrpc.com/otelconnect v0.7.0 // indirect cosmossdk.io/collections v0.4.0 // indirect - cosmossdk.io/x/tx v0.13.4 // indirect filippo.io/edwards25519 v1.0.0 // indirect github.com/99designs/go-keychain v0.0.0-20191008050251-8e49817e8af4 // indirect github.com/99designs/keyring v1.2.1 // indirect diff --git a/go.sum b/go.sum index b35917516..62dc61bb0 100644 --- a/go.sum +++ b/go.sum @@ -1213,8 +1213,6 @@ go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0 go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= -go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU= -go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= diff --git a/x/tokenomics/types/tx.pb.go b/x/tokenomics/types/tx.pb.go index e4fec264c..9f18a148c 100644 --- a/x/tokenomics/types/tx.pb.go +++ b/x/tokenomics/types/tx.pb.go @@ -125,7 +125,6 @@ type MsgUpdateParam struct { // specified in the `Params` message in `proof/params.proto.` Name string `protobuf:"bytes,2,opt,name=name,proto3" 
json:"name,omitempty"` // Types that are valid to be assigned to AsType: - // // *MsgUpdateParam_AsString // *MsgUpdateParam_AsInt64 // *MsgUpdateParam_AsBytes From 50cc08e0a6c130bbf9c8e939f90fc51402d17d75 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Mon, 21 Oct 2024 13:55:48 -0400 Subject: [PATCH 10/13] Partial review --- .../recovery_from_chain_halt.md | 74 +++++++++++++------ .../docs/protocol/upgrades/release_process.md | 6 +- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 13e9991cb..82c432f7b 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -17,6 +17,8 @@ See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more infor - [Resolving halts during a network upgrade](#resolving-halts-during-a-network-upgrade) - [Manual binary replacement (preferred)](#manual-binary-replacement-preferred) - [Rollback, fork and upgrade](#rollback-fork-and-upgrade) + - [Step 5: Data rollback - retrieving snapshot at a specific height](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height) + - [Step 6: Validator Isolation - risk mitigation](#step-6-validator-isolation---risk-mitigation) ## Background @@ -37,7 +39,7 @@ Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_p :::note -**This is preferred way of resolving the consensus-breaking issues**. +This is preferred way of resolving the consensus-breaking issues. ::: @@ -53,27 +55,55 @@ Currently this involves synching the network from genesis breaking a way to sync :::info -This part is relevant for Pocket Network Shannon release only, as we do not rely on `x/gov` module for upgrades in Shannon. Instead, our DAO can issue upgrade transactions on the Pocket Network chain directly. 
Conventional `cosmos-sdk` upgrade process would require to go through the voting process to issue an upgrade. +These instructions are only relevant to Pocket Network's Shannon release. + +We do not currently use `x/gov` and on-chain voting for upgrades. + +Instead, our DAO votes on upgrades off-chain and the Foundation executes +transactions on their behalf. ::: -Performing a rollback basically means forking the network at the older height. Modern CometBFT versions are incredibly hard to fork. As a result, **it is not recommended to perform rollbacks** unless absolutely necessary. If we do decide to go ahead with a rollback, these are the steps: - -- Prepare and verify the new version that addresses the consensus-breaking issue. -- [Create a release](../../protocol/upgrades/release_process.md). -- [Prepare an upgrade transaction](../../protocol/upgrades/upgrade_procedure.md#writing-an-upgrade-transaction) to the new version. -- Get the state of the validators on the network to **three blocks** prior to the consensus-breaking issue. - - For example, if there was an issue at height `103`, we need to get the state to the height of `100`. At `101` we will submit an upgrade transaction so the chain upgrades on `102` and avoids the issue at height `103`. - - Can be done in two ways: - - `poktrolld rollback --hard` until the command responds with the desired block number. **OR,** - - The node can be restored from the snapshot and started with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. -- **Make sure all validators use the same data directory** or have been rolled back to the same height. -- **Isolate validators from the other nodes** that have not been rolled back to the older state. If that means using a firewall or isolating from the internet - this is the way. Validators should be able to only gossip blocks between themselves. 
**Having at least one node that has knowledge of the forking ledger can jeopardize the whole process**. In particular, the following errors are the sign of the nodes populating existing blocks: - - `found conflicting vote from ourselves; did you unsafe_reset a validator?` - - `conflicting votes from validator` -- Start the network and perform an upgrade (following the example above): - - We would not be able to submit a transaction at `100` (this needs to be investigated, but for some reason we were not able to) due to `signature verification failed; please verify account number (0) and chain-id (poktroll): (unable to verify single signer signature): unauthorized`. - - On block `101`, we will submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. - - `x/upgrade` performs an upgrade in the `EndBlocker` of the block `102` and waits for the node operator or `cosmovisor` to replace the binary. -- The network should go through successful upgrade and climb to the next block. -- After the chain has been reached over the height of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can perform the rollback or use a snapshot as well. +**Performing a rollback is analogous to forking the network at the older height.** + +This should be avoided unless absolutely necessary. + +However, if necessary, the instructions to follow are: + +1. Prepare & verify a new binary that addresses the consensus-breaking issue. +2. [Create a release](../../protocol/upgrades/release_process.md). +3. [Prepare an upgrade transaction](../../protocol/upgrades/upgrade_procedure.md#writing-an-upgrade-transaction) to the new version. +4. Get the Validator set off the network **3 blocks** prior to the height of the chain halt. 
For example: + - Assume an issue at height `103` + - Get the validator set at height `100` + - Submit an upgrade transaction at `101` + - Upgrade the chain at height `102` + - Avoid the issue at height `103` +5. Ensure all validators rolled back to the same height and use the same snapshot + - The snapshot should be imported into each Validator's data directory + - This is necessary to ensure data continuity and prevent forks. +6. Isolate the validator set from full nodes. + - This is necessary to avoid full nodes from gossiping blocks that have been rolled back. + - This may require using a firewall or a private network + - Validators should only be gossip blocks amongst themselves. +7. Start the network and perform the upgrade. For example, reiterating the process above: + - Start all Validators at height `100` + - On block `101`, submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. + - `x/upgrade` will perform the upgrade in the `EndBlocker` of block `102` + - If using `cosmosvisor`, the node will wait to replace the binary +8. Wait for the network to reach the height of the previous ledger (`104`+) +9. Allow validators to open their network to full nodes again. + - Note that full nodes will need to perform the rollback or use a snapshot as well. + +#### Step 5: Data rollback - retrieving snapshot at a specific height + +There are two ways to get a snapshot from a prior height: + +1. Use `poktrolld rollback --hard` repeately until the command responds with the desired block number. +2. Use a snapshot and start the node with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. + +#### Step 6: Validator Isolation - risk mitigation + +- Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. 
In particular, the following errors are the sign of the nodes populating existing blocks: +- `found conflicting vote from ourselves; did you unsafe_reset a validator?` +- `conflicting votes from validator` diff --git a/docusaurus/docs/protocol/upgrades/release_process.md b/docusaurus/docs/protocol/upgrades/release_process.md index 4a756d5a5..398d56c05 100644 --- a/docusaurus/docs/protocol/upgrades/release_process.md +++ b/docusaurus/docs/protocol/upgrades/release_process.md @@ -16,7 +16,6 @@ sidebar_position: 4 This document is for the Pocket Network protocol team's internal use only. ::: - ### 1. Determine if the Release is Consensus-Breaking :::note @@ -54,10 +53,8 @@ You can find an example [here](https://github.com/pokt-network/poktroll/releases ## Protocol Upgrades - **Planned Upgrade:** ❌ Not applicable for this release. @@ -66,6 +63,7 @@ such as https://github.com/pokt-network/poktroll/blob/main/app/upgrades/historic - **Upgrade Height:** ❌ Not applicable for this release. ## What's Changed + ``` From 3bddc15c1eb10b9310edbb39a20968be651c3e98 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Mon, 21 Oct 2024 14:12:16 -0400 Subject: [PATCH 11/13] Partial review --- .../protocol/upgrades/contigency_plans.md | 51 ++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 75438c9be..3ab90d586 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -3,47 +3,64 @@ title: Failed upgrade contingency plan sidebar_position: 5 --- -# Contingency plans - -There's always a chance the upgrade will fail. We have prepared some contingency plans, so we can recover without significant downtime. - :::tip -This documentation covers failed upgrade contingency for `poktroll` - a `cosmos-sdk` based chain. 
While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. +This documentation covers failed upgrade contingency for `poktroll` - a `cosmos-sdk` based chain. + +While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. ::: +## Contingency plans + +There's always a chance the upgrade will fail. + +This document is intended to help you recover without significant downtime. + - [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) - [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) - [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) - [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) +### Option 0: The bug is discovered before the upgrade height is reached + +**Cancel the upgrade plan!!** -## Option 0: The bug is discovered before the upgrade height is reached +See the instructions on [how to do that here](./upgrade_procedure.md#cancelling-the-upgrade-plan). -Cancel the upgrade plan: [how](./upgrade_procedure.md#cancelling-the-upgrade-plan). +### Option 1: The upgrade height is reached and the migration didn't start -## Option 1: The upgrade height is reached and the migration didn't start +If the nodes on the network stopped at the upgrade height and the migration did not +start yet (i.e. there are no logs indicating the upgrade handler and store migrations are being executed), +we must gather social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. 
-If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, allowing the chain to continue and the protocol team to plan another release. +This will skip the upgrade process, allowing the chain to continue and the protocol team to plan another release. -`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations, and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. +`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations. +The chain continues as if the upgrade plan was never set. +The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. :::caution + `--unsafe-skip-upgrade` needs to be documented and added to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. - + + ::: -## Option 2: The migration is stuck +### Option 2: The migration is stuck + +If the migration is stuck, there's always a chance the state has been mutated for +the upgrade but the migration didn't complete. -If the migration is stuck, there's always a chance the state has been mutated for the upgrade but the migration didn't complete. In such a case, we need to: +In such a case, we need to: -- Roll back validators to the backup (a snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). +- Roll back validators to the backup (a snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). 
- Skip the upgrade handler and store migrations with `--unsafe-skip-upgrade=$upgradeHeightNumber`. -- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. +- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody tries to sync the network from genesis + they will automatically skip the failed upgrade. - Resolve the issue with an upgrade and schedule another plan. -## Option 3: The network is stuck at the future height after the upgrade +### Option 3: The network is stuck at the future height after the upgrade -This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. \ No newline at end of file +This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. 
From 498d9d8f53e62e5e8e7e4837833c1977ca1777f8 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Mon, 21 Oct 2024 14:26:41 -0400 Subject: [PATCH 12/13] Partial review --- .../protocol/upgrades/upgrade_procedure.md | 134 ++++++++++++------ 1 file changed, 91 insertions(+), 43 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index a5f990f54..668bee067 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -17,8 +17,8 @@ This page describes the protocol upgrade process, which is internal to the proto - [Cancelling the upgrade plan](#cancelling-the-upgrade-plan) - [Testing the Upgrade](#testing-the-upgrade) - [LocalNet](#localnet) - - [TLDR](#tldr) - - [Full example](#full-example) + - [LocalNet Upgrade tl;dr](#localnet-upgrade-tldr) + - [LocalNet Upgrade Full Example Walkthrough](#localnet-upgrade-full-example-walkthrough) - [DevNet](#devnet) - [TestNet](#testnet) - [Mainnet](#mainnet) @@ -71,22 +71,20 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl - `name`: Name of the upgrade. It should match the `VersionName` of `upgrades.Upgrade`. - `height`: The height at which an upgrade should be executed and the node will be restarted. -- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. +- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. -### Validate the URLs (live network only) +:::tip -The URLs of the binaries contain checksums. 
It is important to make sure they are correct, otherwise Cosmovisor won't be able -to download the binaries and go through the upgrade. Here's a little command that uses `jq` and `go-getter` (same library used by Cosmovisor - so it is a good test). +When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. -:::tip +::: -Go-getter can be installed using the following command: +### Validate the URLs (live network only) -```bash -go install github.com/hashicorp/go-getter/cmd/go-getter@latest -``` +The URLs of the binaries contain checksums. It is critical to ensure they are correct. +Otherwise Cosmovisor won't be able to download the binaries and go through the upgrade. -::: +The command below (using `jq` and `go-getter`, the same library used by Cosmovisor) can be used to achieve the above: ```bash jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' $PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do @@ -103,6 +101,16 @@ The output should look like this: 2024/09/24 12:40:46 success! ``` +:::tip + +`go-getter` can be installed using the following command: + +```bash +go install github.com/hashicorp/go-getter/cmd/go-getter@latest +``` + +::: + ## Submitting the upgrade on-chain The `MsgSoftwareUpgrade` can be submitted using the following command: @@ -133,48 +141,88 @@ Note that for local testing, `cosmovisor` won't pull the binary from the info fi ### LocalNet -LocalNet does not support `cosmovisor` and automatic upgrades at the moment. But we don't need it to simulate and test the upgrade procedure. +LocalNet **DOES NOT** support `cosmovisor` and automatic upgrades at the moment. + +However, **IT IS NOT NEEDED** to simulate and test the upgrade procedure. + +#### LocalNet Upgrade tl;dr -#### TLDR +1. 
Pull git repo with old version (separate directory) +2. Download release binary of the old version +3. Wipe LocalNet data and generate genesis using OLD version +4. Start node using the OLD binary +5. Write and submit an upgrade transaction on-chain +6. When the Upgrade Plan height is reached, stop the old node and run the new binary +7. Observe the behavior -In short, the procedure is: -- Pull git repo with old version (separate directory) -- Download release binary of the old version -- Wipe localnet data and generate genesis using OLD version -- Start node using OLD binary -- Write and submit an upgrade transaction on-chain -- When the Upgrade Plan height is reached, stop the old node and run the new binary -- Observe the behavior +#### LocalNet Upgrade Full Example Walkthrough -#### Full example +Testing an upgrade requires a network running on an old version. -As we are testing an upgrade, we need to have a network that first runs on the old version. So it is a good idea to have a LocalNet running using a binary from the [previous release you wish to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. +Ensure LocalNet is running using a binary from the [previous release you wish to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. 1. Make a note of the version you want to test an upgrade **FROM**. This will be the **OLD** version. For example, let's imagine we're upgrading from `v0.0.9`. 2. 
Pull a new `poktroll` repo (will be used as an "old" version): - ```bash - git clone https://github.com/pokt-network/poktroll.git poktroll-upgrade-old - cd poktroll-upgrade-old - git checkout v0.0.9 - # Download the v0.0.9 binary: https://github.com/pokt-network/poktroll/releases - # CHANGE POKTROLLD_VERSION and ARCH - curl -L "https://github.com/pokt-network/poktroll/releases/download/${POKTROLLD_VERSION}/poktroll_linux_${ARCH}.tar.gz" | tar -zxvf - -C . + ```bash + git clone https://github.com/pokt-network/poktroll.git poktroll-upgrade-old + cd poktroll-upgrade-old + git checkout v0.0.9 + + # Download the v0.0.9 binary: https://github.com/pokt-network/poktroll/releases + # CHANGE POKTROLLD_VERSION and ARCH + curl -L "https://github.com/pokt-network/poktroll/releases/download/${POKTROLLD_VERSION}/poktroll_linux_${ARCH}.tar.gz" | tar -zxvf - -C . + + # Validate the version + ./poktrolld version + 0.0.9 + ``` + +3. Stop LocalNet + + ```bash + make localnet_down + ``` + +4. Reset the data + + ```bash + ./poktrolld comet unsafe-reset-all + ``` + +5. Create new genesis using old version (from `poktroll-upgrade-old` dir) + + ```bash + make localnet_regenesis + ``` + +6. Start the network + + ```bash + ./poktrolld start + ``` + +7. [Write](#writing-an-upgrade-transaction) and [Submit](#submitting-the-upgrade-on-chain) a transaction. For example: + + ```bash + poktrolld tx authz exec tools/scripts/upgrades/local_test_v0.0.9-2.json --from=pnf` + ``` + +8. Verify the plan is active + + ```bash + poktrolld query upgrade plan + ``` - # Validate the version - ./poktrolld version - 0.0.9 - ``` -3. Stop LocalNet: `make localnet_down` -4. Reset the data: `./poktrolld comet unsafe-reset-all` -5. Create new genesis using old version (from `poktroll-upgrade-old` dir): `make localnet_regenesis` -6. Start the network: `./poktrolld start` -7. [Write](#writing-an-upgrade-transaction) and [Submit](#submitting-the-upgrade-on-chain) a transaction (e.g. 
`poktrolld tx authz exec tools/scripts/upgrades/local_test_v0.0.9-2.json --from=pnf`) -8. Verify the plan is active: `poktrolld query upgrade plan` 9. Wait until the height is reached and the old node dies due to the error: `ERR UPGRADE "v0.0.9-2" NEEDED at height`, which is expected. 10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade the network **TO**. 11. In the **NEW VERSION GIT REPO** you can build binaries using `go_develop`, `ignite_release` and `ignite_release_extract_binaries` make targets. -12. Start the new version (from the **NEW VERSION REPO**: `./release_binaries/poktroll_darwin_arm64 start`) +12. Start the new version from the **NEW VERSION REPO**: + + ```bash + ./release_binaries/poktroll_darwin_arm64 start + ``` + 13. Observe the behavior. Your node should go through the upgrade process and start using the new version. ### DevNet @@ -199,4 +247,4 @@ If you are a member of Grove, you can find the instructions to access the infras ### Mainnet -The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment. \ No newline at end of file +The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment. 
From be35f1a80cf9f4c98e9f3a63b5b7889cb0f72b01 Mon Sep 17 00:00:00 2001 From: Dmitry K Date: Wed, 23 Oct 2024 17:41:04 -0700 Subject: [PATCH 13/13] requested changes --- .../recovery_from_chain_halt.md | 100 ++++++++++++++---- .../protocol/upgrades/contigency_plans.md | 23 +++- .../protocol/upgrades/upgrade_procedure.md | 5 +- tools/installer/full-node.sh | 3 +- 4 files changed, 105 insertions(+), 26 deletions(-) diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 82c432f7b..03a399052 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -18,7 +18,7 @@ See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more infor - [Manual binary replacement (preferred)](#manual-binary-replacement-preferred) - [Rollback, fork and upgrade](#rollback-fork-and-upgrade) - [Step 5: Data rollback - retrieving snapshot at a specific height](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height) - - [Step 6: Validator Isolation - risk mitigation](#step-6-validator-isolation---risk-mitigation) + - [Step 6: Validator Isolation - risks](#step-6-validator-isolation---risks) ## Background @@ -39,17 +39,49 @@ Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_p :::note -This is preferred way of resolving the consensus-breaking issues. +This is the preferred way of resolving consensus-breaking issues. -::: - -Since the chain is not moving, **it is impossible** to issue an automatic upgrade with an upgrade plan. +**Significant side effect**: this breaks an ability to sync from genesis **without manual interventions**. +For example, when a consensus-breaking issue occurs on a node that is synching from the first block, node operators need +to manually replace the binary with the new one. 
There are efforts underway to mitigate this issue, including +configuration for `cosmovisor` that could automate the process. -Instead, we need **social consensus** to manually replace the binary and get the chain moving. + -Currently this involves synching the network from genesis breaking a way to sync the network from genesis without human interaction, but there are some plans to make the process less painful in the future. +::: - +Since the chain is not moving, **it is impossible** to issue an automatic upgrade with an upgrade plan. Instead, +we need **social consensus** to manually replace the binary and get the chain moving. + +1. Prepare and verify a new binary that addresses the consensus-breaking issue. +2. Reach out to the community and validators so they can upgrade the binary manually. + :::warning UNKNOWN, NEED TO INVESTIGATE + + We might need to coordinate the timing of when the nodes should be started. In the Tendermint version of Pocket Network + (Morse), this was necessary to sync consensus rounds and steps, getting the chain moving. It might not be a + requirement anymore, but we need to double-check. [More information](https://docs.cometbft.com/v1.0/spec/consensus/consensus). +3. Update [the documentation](../../protocol/upgrades/upgrade_list.md) to include the range of heights when the binary needs + to be replaced. Consider a configuration change for `cosmovisor` so it would automatically replace the binary when + synching from genesis. 
+ + +```mermaid +sequenceDiagram + participant DevTeam + participant Community + participant Validators + participant Documentation + participant Network + + DevTeam->>DevTeam: Prepare and verify new binary + DevTeam->>Community: Announce new binary and instructions + DevTeam->>Validators: Notify validators to upgrade manually + Validators->>Validators: Manually replace the binary + Validators->>Network: Restart nodes with new binary + DevTeam->>Documentation: Update upgrade documentation + Validators->>Network: Network resumes operation + +``` ### Rollback, fork and upgrade @@ -79,10 +111,10 @@ However, if necessary, the instructions to follow are: - Submit an upgrade transaction at `101` - Upgrade the chain at height `102` - Avoid the issue at height `103` -5. Ensure all validators rolled back to the same height and use the same snapshot +5. Ensure all validators rolled back to the same height and use the same snapshot - ([how to get the snapshot](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height)) - The snapshot should be imported into each Validator's data directory - This is necessary to ensure data continuity and prevent forks. -6. Isolate the validator set from full nodes. +6. Isolate the validator set from full nodes - ([why this is necessary](#step-6-validator-isolation---risks)) - This is necessary to avoid full nodes from gossiping blocks that have been rolled back. - This may require using a firewall or a private network - Validators should only be gossip blocks amongst themselves. @@ -95,15 +127,47 @@ However, if necessary, the instructions to follow are: 9. Allow validators to open their network to full nodes again. - Note that full nodes will need to perform the rollback or use a snapshot as well. 
+```mermaid +sequenceDiagram + participant DevTeam + participant Foundation + participant Validators + participant FullNodes + participant Network + + DevTeam->>DevTeam: Prepare & verify new binary + DevTeam->>DevTeam: Create a release + Validators->>Validators: Roll back to height before issue or import snapshot + Validators->>Validators: Isolate from Full Nodes + Foundation->>Validators: Distribute upgrade transaction + Validators->>Network: Start network and perform upgrade + Validators->>Network: Wait until over consensus-breaking height + Validators->>FullNodes: Open network connections + FullNodes->>Network: Sync with updated network + Validators->>Network: Network resumes operation + +``` + #### Step 5: Data rollback - retrieving snapshot at a specific height There are two ways to get a snapshot from a prior height: -1. Use `poktrolld rollback --hard` repeately until the command responds with the desired block number. -2. Use a snapshot and start the node with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. - -#### Step 6: Validator Isolation - risk mitigation - -- Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. In particular, the following errors are the sign of the nodes populating existing blocks: -- `found conflicting vote from ourselves; did you unsafe_reset a validator?` -- `conflicting votes from validator` +1. Execute + ```bash + poktrolld rollback --hard + ``` + repeatedly until the command responds with the desired block number. +2. Use a snapshot and start the node with the `--halt-height=100` parameter so it only syncs up to a certain height and then + gracefully shuts down. Add this argument to `poktrolld start` like this: + ```bash + poktrolld start --halt-height=100 + ``` + + +#### Step 6: Validator Isolation - risks + +Having at least one node that has knowledge of the forking ledger can jeopardize the whole process.
In particular, the +following errors in logs are the sign of the nodes populating existing blocks: + - `found conflicting vote from ourselves; did you unsafe_reset a validator?` + - `conflicting votes from validator` + diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 3ab90d586..32b254cad 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -20,11 +20,12 @@ This document is intended to help you recover without significant downtime. - [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) - [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) - [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) + - [Documentation and scripts to update](#documentation-and-scripts-to-update) - [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) ### Option 0: The bug is discovered before the upgrade height is reached -**Cancel the upgrade plan!!** +**Cancel the upgrade plan!** See the instructions of [how to do that here](./upgrade_procedure.md#cancelling-the-upgrade-plan). @@ -55,12 +56,24 @@ the upgrade but the migration didn't complete. In such a case, we need to: -- Roll back validators to the backup (a snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). -- Skip the upgrade handler and store migrations with `--unsafe-skip-upgrade=$upgradeHeightNumber`. -- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. +- Roll back validators to the backup. 
A snapshot is taken by `cosmovisor` automatically prior to upgrade, + if `UNSAFE_SKIP_BACKUP` is set to `false` (which is the default and recommended value - + [more information](https://docs.cosmos.network/main/build/tooling/cosmovisor#command-line-arguments-and-environment-variables)). +- All full nodes and validators on the network: skip the upgrade handler and store migrations by adding `--unsafe-skip-upgrade=$upgradeHeightNumber` + argument to your `poktrolld start` command. Like this: + ```bash + poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber + ``` +- Protocol team: document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody + tries to sync the network from genesis they will automatically skip the failed upgrade. [Documentation and scripts to update](#documentation-and-scripts-to-update) - Resolve the issue with an upgrade and schedule another plan. +#### Documentation and scripts to update + +- The [upgrade list](./upgrade_list.md) should reflect a failed upgrade and provide a range of heights that are served by each version. +- Systemd service should include the `--unsafe-skip-upgrade=$upgradeHeightNumber` argument in its start command [here](https://github.com/pokt-network/poktroll/blob/main/tools/installer/full-node.sh). +- [Helm chart](https://github.com/pokt-network/helm-charts/blob/main/charts/poktrolld/templates/StatefulSet.yaml) (consider exposing via a `values.yaml` file) + ### Option 3: The network is stuck at the future height after the upgrade This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues.
diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 668bee067..76ec0a824 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -215,7 +215,8 @@ Ensure LocalNet is running using a binary from the [previous release you wish to ``` 9. Wait until the height is reached and the old node dies due to the error: `ERR UPGRADE "v0.0.9-2" NEEDED at height`, which is expected. -10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade the network **TO**. +10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade the network **TO**. It might be a + `poktroll` repo you are working on or a release tag. 11. In the **NEW VERSION GIT REPO** you can build binaries using `go_develop`, `ignite_release` and `ignite_release_extract_binaries` make targets. 12. Start the new version from the **NEW VERSION REPO**: @@ -223,7 +224,7 @@ Ensure LocalNet is running using a binary from the [previous release you wish to ./release_binaries/poktroll_darwin_arm64 start ``` -13. Observe the behavior. Your node should go through the upgrade process and start using the new version. +13. Observe the output. Your node should go through the upgrade process and start using the new version. ### DevNet diff --git a/tools/installer/full-node.sh b/tools/installer/full-node.sh index 7ee542336..610df7a83 100644 --- a/tools/installer/full-node.sh +++ b/tools/installer/full-node.sh @@ -138,7 +138,8 @@ setup_poktrolld() { exit 1 fi - # Get the version genesis started from + # Get the version genesis started from. We can't just use `latest` as the new binary won't sync from genesis. + # We need to start syncing from scratch using the version that was used when the network started.
POKTROLLD_VERSION=$(curl -s https://raw.githubusercontent.com/pokt-network/pocket-network-genesis/master/poktrolld/testnet-validated.init-version) # Use the direct download link for the correct release