Fix load test workflow (#374)

smartcontractkit · Dec 13, 2023 · e3f6571
1 parent 59ae902
commit e3f6571
Show file tree

Hide file tree

Showing 8 changed files with 88 additions and 56 deletions.
diff --git a/.github/workflows/ccip-chaos-tests.yml b/.github/workflows/ccip-chaos-tests.yml
@@ -179,7 +179,7 @@ jobs:
       - name: Run Load With Chaos Tests
         uses: smartcontractkit/chainlink-github-actions/chainlink-testing-framework/run-tests@eccde1970eca69f079d3efb3409938a72ade8497 # v2.2.13
         with:
-          test_command_to_run: make test_need_operator_assets && cd ./integration-tests/ccip-tests && go test -timeout 1h -count=1 -json -test.parallel 4 -run '^TestLoadCCIPStableWithPodChaosDiffCommitAndExec' ./load 2>&1 | tee /tmp/gotest.log | gotestfmt
+          test_command_to_run: make test_need_operator_assets && cd ./integration-tests/ccip-tests && go test -timeout 2h -count=1 -json -test.parallel 4 -run '^TestLoadCCIPStableWithPodChaosDiffCommitAndExec' ./load 2>&1 | tee /tmp/gotest.log | gotestfmt
           test_download_vendor_packages_command: make gomod
           cl_repo: ${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/chainlink
           cl_image_tag: ${{ github.sha }}

diff --git a/.github/workflows/ccip-load-tests.yml b/.github/workflows/ccip-load-tests.yml
@@ -35,15 +35,15 @@ jobs:
         uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       - name: Check if image exists
         id: check-image
-        uses: smartcontractkit/chainlink-github-actions/docker/image-exists@eccde1970eca69f079d3efb3409938a72ade8497 # v2.2.13
+        uses: smartcontractkit/chainlink-github-actions/docker/image-exists@e865e376b8c2d594028c8d645dd6c47169b72974 # v2.2.16
         with:
           repository: chainlink
           tag: ${{ env.INPUT_CHAINLINK_VERSION }}
           AWS_REGION: ${{ secrets.QA_AWS_REGION }}
           AWS_ROLE_TO_ASSUME: ${{ secrets.QA_AWS_ROLE_TO_ASSUME }}
       - name: Build Image
         if: steps.check-image.outputs.exists == 'false'
-        uses: smartcontractkit/chainlink-github-actions/chainlink-testing-framework/build-image@eccde1970eca69f079d3efb3409938a72ade8497 # v2.2.13
+        uses: smartcontractkit/chainlink-github-actions/chainlink-testing-framework/build-image@e865e376b8c2d594028c8d645dd6c47169b72974 # v2.2.16
         env:
           GH_TOKEN: ${{ github.token }}
         with:
@@ -136,8 +136,8 @@ jobs:
         shell: bash
         run: |
           if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
-            echo "BASE64_TEST_CONFIG_OVERRIDE= ${{ inputs.base64_test_input }}" >> $GITHUB_ENV
-            echo "TEST_BASE64_TEST_CONFIG_OVERRIDE= ${{ inputs.base64_test_input }}" >> $GITHUB_ENV
+            echo "BASE64_TEST_CONFIG_OVERRIDE=${{ inputs.base64_test_input }}" >> $GITHUB_ENV
+            echo "TEST_BASE64_TEST_CONFIG_OVERRIDE=${{ inputs.base64_test_input }}" >> $GITHUB_ENV
           fi
           if [[ "${{ env.INPUT_CHAINLINK_IMAGE }}" == "" || "${{ env.INPUT_CHAINLINK_VERSION }}" == "" ]]; then
             echo "INPUT_CHAINLINK_IMAGE=${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/chainlink" >> $GITHUB_ENV
@@ -157,7 +157,7 @@ jobs:
           echo "### test input override :link:" >>$GITHUB_STEP_SUMMARY
           echo "$(base64 -d <<< "${{ inputs.base64_test_input }}")" >>$GITHUB_STEP_SUMMARY
       - name: Run Tests
-        uses: smartcontractkit/chainlink-github-actions/chainlink-testing-framework/run-tests@eccde1970eca69f079d3efb3409938a72ade8497 # v2.2.13
+        uses: smartcontractkit/chainlink-github-actions/chainlink-testing-framework/run-tests@e865e376b8c2d594028c8d645dd6c47169b72974 # v2.2.16
         env:
           TEST_SUITE: load
           TEST_ARGS: -test.timeout 900h
@@ -168,7 +168,7 @@ jobs:
           RR_CPU: 4
           TEST_TRIGGERED_BY: ccip-load-test-ci
         with:
-          test_command_to_run: make test_need_operator_assets && cd ./integration-tests/ccip-tests && go test -v -timeout 900h -count=1 -json -run ^TestLoadCCIPStableRPS$ ./load  2>&1 | tee /tmp/gotest.log | gotestfmt
+          test_command_to_run: make test_need_operator_assets && cd ./integration-tests/ccip-tests && go test -v -timeout 70m -count=1 -json -run ^TestLoadCCIPStableRPS$ ./load 2>&1 | tee /tmp/gotest.log | gotestfmt
           test_download_vendor_packages_command: cd ./integration-tests && go mod download
           cl_repo: ${{ env.INPUT_CHAINLINK_IMAGE }}
           cl_image_tag: ${{ env.INPUT_CHAINLINK_VERSION }}

diff --git a/integration-tests/ccip-tests/contracts/contract_models.go b/integration-tests/ccip-tests/contracts/contract_models.go
@@ -2,6 +2,7 @@ package contracts
 
 import (
 	"context"
+	"fmt"
 	"math/big"
 	"strconv"
 
@@ -519,7 +520,15 @@ func (r *Router) CCIPSend(destChainSelector uint64, msg router.ClientEVM2AnyMess
 	if valueForNative != nil {
 		opts.Value = valueForNative
 	}
-	log.Info().Interface("msg", msg).Msg("Sending msg")
+
+	log.Info().
+		Str("Network", r.client.GetNetworkName()).
+		Str("Router", r.Address()).
+		Interface("TokensAndAmounts", msg.TokenAmounts).
+		Str("FeeToken", msg.FeeToken.Hex()).
+		Str("ExtraArgs", fmt.Sprintf("0x%x", msg.ExtraArgs[:])).
+		Str("Receiver", fmt.Sprintf("0x%x", msg.Receiver[:])).
+		Msg("Sending msg")
 	return r.Instance.CcipSend(opts, destChainSelector, msg)
 }
 

diff --git a/integration-tests/ccip-tests/load/ccip_loadgen.go b/integration-tests/ccip-tests/load/ccip_loadgen.go
@@ -173,20 +173,23 @@ func (c *CCIPE2ELoad) Call(_ *wasp.Generator) *wasp.CallResult {
 	} else {
 		sendTx, err = sourceCCIP.Common.Router.CCIPSend(destChainSelector, msg, fee)
 	}
-	err = sourceCCIP.Common.ChainClient.MarkTxAsSentOnL2(sendTx)
 	if err != nil {
+		stats.UpdateState(lggr, 0, testreporters.TX, time.Since(startTime), testreporters.Failure)
 		res.Error = err.Error()
+		res.Data = stats.StatusByPhase
 		res.Failed = true
 		return res
 	}
+	err = sourceCCIP.Common.ChainClient.MarkTxAsSentOnL2(sendTx)
+
 	if err != nil {
 		stats.UpdateState(lggr, 0, testreporters.TX, time.Since(startTime), testreporters.Failure)
 		res.Error = fmt.Sprintf("ccip-send tx error %+v for msg ID %d", err, msgSerialNo)
 		res.Data = stats.StatusByPhase
 		res.Failed = true
 		return res
 	}
-	lggr = lggr.With().Str("Msg Tx", sendTx.Hash().String()).Logger()
+
 	txConfirmationTime := time.Now().UTC()
 	rcpt, err1 := bind.WaitMined(context.Background(), sourceCCIP.Common.ChainClient.DeployBackend(), sendTx)
 	if err1 == nil {
@@ -195,6 +198,7 @@ func (c *CCIPE2ELoad) Call(_ *wasp.Generator) *wasp.CallResult {
 			txConfirmationTime = hdr.Timestamp
 		}
 	}
+	lggr = lggr.With().Str("Msg Tx", sendTx.Hash().String()).Logger()
 	var gasUsed uint64
 	if rcpt != nil {
 		gasUsed = rcpt.GasUsed

diff --git a/integration-tests/ccip-tests/load/ccip_test.go b/integration-tests/ccip-tests/load/ccip_test.go
@@ -44,7 +44,7 @@ func TestLoadCCIPStableRPSTriggerBySource(t *testing.T) {
 	}
 	t.Cleanup(func() {
 		log.Info().Msg("Tearing down the environment")
-		require.NoError(t, testArgs.TestSetupArgs.TearDown())
+		testArgs.TearDown()
 	})
 	testArgs.TriggerLoadBySource()
 	testArgs.Wait()

diff --git a/integration-tests/ccip-tests/load/helper.go b/integration-tests/ccip-tests/load/helper.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"math"
 	"math/big"
+	"sync"
 	"testing"
 	"time"
 
@@ -31,9 +32,9 @@ type ChaosConfig struct {
 type loadArgs struct {
 	t                *testing.T
 	lggr             zerolog.Logger
-	ctx              context.Context
 	schedules        []*wasp.Segment
 	RunnerWg         *errgroup.Group // to wait on individual load generators run
+	LoadStarterWg    *sync.WaitGroup // waits for all the runners to start
 	TestCfg          *testsetups.CCIPTestConfig
 	TestSetupArgs    *testsetups.CCIPTestSetUpOutputs
 	ChaosExps        []ChaosConfig
@@ -127,9 +128,17 @@ func (l *loadArgs) TriggerLoadByLane() {
 		l.AddToRunnerGroup(loadRunner)
 	}
 	for _, lane := range l.TestSetupArgs.Lanes {
-		startLoad(lane.ForwardLane)
+		l.LoadStarterWg.Add(1)
+		go func() {
+			defer l.LoadStarterWg.Done()
+			startLoad(lane.ForwardLane)
+		}()
 		if pointer.GetBool(l.TestSetupArgs.Cfg.TestGroupInput.BiDirectionalLane) {
-			startLoad(lane.ReverseLane)
+			l.LoadStarterWg.Add(1)
+			go func() {
+				defer l.LoadStarterWg.Done()
+				startLoad(lane.ReverseLane)
+			}()
 		}
 	}
 }
@@ -148,6 +157,9 @@ func (l *loadArgs) AddToRunnerGroup(gen *wasp.Generator) {
 }
 
 func (l *loadArgs) Wait() {
+	l.lggr.Info().Msg("Waiting for load to start on all lanes")
+	// wait for load runner to start
+	l.LoadStarterWg.Wait()
 	l.lggr.Info().Msg("Waiting for load to finish on all lanes")
 	// wait for load runner to finish
 	err := l.RunnerWg.Wait()
@@ -208,52 +220,58 @@ func (l *loadArgs) TriggerLoadBySource() {
 		}
 	}
 	for source, lanes := range laneBySource {
-		l.lggr.Info().
-			Str("Source Network", source).
-			Msg("Starting load for source")
-		if lanes[0].TestEnv != nil && lanes[0].TestEnv.K8Env != nil && lanes[0].TestEnv.K8Env.Cfg != nil {
-			namespace = lanes[0].TestEnv.K8Env.Cfg.Namespace
-		}
-		allLabels := map[string]string{
-			"test_group":   "load",
-			"cluster":      "sdlc",
-			"namespace":    namespace,
-			"test_id":      "ccip",
-			"source_chain": source,
-		}
-		multiCallGen, err := NewMultiCallLoadGenerator(l.TestCfg, lanes, l.TestCfg.TestGroupInput.RequestPerUnitTime[0], allLabels)
-		require.NoError(l.t, err)
+		source := source
+		lanes := lanes
+		l.LoadStarterWg.Add(1)
+		go func() {
+			defer l.LoadStarterWg.Done()
+			l.lggr.Info().
+				Str("Source Network", source).
+				Msg("Starting load for source")
+			if lanes[0].TestEnv != nil && lanes[0].TestEnv.K8Env != nil && lanes[0].TestEnv.K8Env.Cfg != nil {
+				namespace = lanes[0].TestEnv.K8Env.Cfg.Namespace
+			}
+			allLabels := map[string]string{
+				"test_group":   "load",
+				"cluster":      "sdlc",
+				"namespace":    namespace,
+				"test_id":      "ccip",
+				"source_chain": source,
+			}
+			multiCallGen, err := NewMultiCallLoadGenerator(l.TestCfg, lanes, l.TestCfg.TestGroupInput.RequestPerUnitTime[0], allLabels)
+			require.NoError(l.t, err)
 
-		loadRunner, err := wasp.NewGenerator(&wasp.Config{
-			T:                     l.TestCfg.Test,
-			GenName:               fmt.Sprintf("Source %s", source),
-			Schedule:              wasp.Plain(1, l.TestCfg.TestGroupInput.TestDuration.Duration()), // hardcoded request per unit time to 1 as we are using multiCallGen
-			LoadType:              wasp.RPS,
-			RateLimitUnitDuration: l.TestCfg.TestGroupInput.TimeUnit.Duration(),
-			CallResultBufLen:      10, // we keep the last 10 call results for each generator, as the detailed report is generated at the end of the test
-			CallTimeout:           (l.TestCfg.TestGroupInput.PhaseTimeout.Duration()) * 5,
-			Gun:                   multiCallGen,
-			Logger:                multiCallGen.logger,
-			LokiConfig:            wasp.NewEnvLokiConfig(),
-			Labels:                allLabels,
-		})
-		require.NoError(l.TestCfg.Test, err, "initiating loadgen for source %s", source)
-		loadRunner.Run(false)
-		l.AddToRunnerGroup(loadRunner)
-		l.LoadgenTearDowns = append(l.LoadgenTearDowns, func() {
-			require.NoError(l.t, multiCallGen.Stop())
-		})
+			loadRunner, err := wasp.NewGenerator(&wasp.Config{
+				T:                     l.TestCfg.Test,
+				GenName:               fmt.Sprintf("Source %s", source),
+				Schedule:              wasp.Plain(1, l.TestCfg.TestGroupInput.TestDuration.Duration()), // hardcoded request per unit time to 1 as we are using multiCallGen
+				LoadType:              wasp.RPS,
+				RateLimitUnitDuration: l.TestCfg.TestGroupInput.TimeUnit.Duration(),
+				CallResultBufLen:      10, // we keep the last 10 call results for each generator, as the detailed report is generated at the end of the test
+				CallTimeout:           (l.TestCfg.TestGroupInput.PhaseTimeout.Duration()) * 5,
+				Gun:                   multiCallGen,
+				Logger:                multiCallGen.logger,
+				LokiConfig:            wasp.NewEnvLokiConfig(),
+				Labels:                allLabels,
+			})
+			require.NoError(l.TestCfg.Test, err, "initiating loadgen for source %s", source)
+			loadRunner.Run(false)
+			l.AddToRunnerGroup(loadRunner)
+			l.LoadgenTearDowns = append(l.LoadgenTearDowns, func() {
+				require.NoError(l.t, multiCallGen.Stop())
+			})
+		}()
 	}
 }
 
 func NewLoadArgs(t *testing.T, lggr zerolog.Logger, parent context.Context, chaosExps ...ChaosConfig) *loadArgs {
-	wg, ctx := errgroup.WithContext(parent)
+	wg, _ := errgroup.WithContext(parent)
 	return &loadArgs{
-		t:         t,
-		lggr:      lggr,
-		RunnerWg:  wg,
-		ctx:       ctx,
-		TestCfg:   testsetups.NewCCIPTestConfig(t, lggr, testconfig.Load),
-		ChaosExps: chaosExps,
+		t:             t,
+		lggr:          lggr,
+		RunnerWg:      wg,
+		TestCfg:       testsetups.NewCCIPTestConfig(t, lggr, testconfig.Load),
+		ChaosExps:     chaosExps,
+		LoadStarterWg: &sync.WaitGroup{},
 	}
 }
diff --git a/integration-tests/ccip-tests/testconfig/global.go b/integration-tests/ccip-tests/testconfig/global.go
@@ -84,6 +84,7 @@ func NewConfig() (*Config, error) {
 		if err != nil {
 			return nil, errors.Wrap(err, ErrUnmarshalConfig)
 		}
+		log.Info().Interface("override", override).Msg("Applied overrides")
 	}
 	if override != nil {
 		// apply overrides for all products

diff --git a/integration-tests/scripts/entrypoint b/integration-tests/scripts/entrypoint
@@ -21,7 +21,7 @@ exit_code=$?
 
 echo "Test exit code: ${exit_code}"
 
-# 3 is the code for an interrupted test, we only want to restart the test when the test is interrupted and in a state 
+# 3 is the code for an interrupted test, we only want to restart the test when the test is interrupted and in a state
 # that it can recover from. Otherwise we mark the test as "passed" as far as K8s is concerned so it doesn't restart it.
 if [ $exit_code -eq 3 ]; then
   exit 1  # Exiting with non-zero status to trigger pod restart