@@ -3,30 +3,31 @@ name: Continuous Forge Tests - Stable
 # We have various Forge Stable tests here that test out different situations and workloads.
 #
+#
 # Dashboard showing historical results: https://grafana.aptoslabs.com/d/bdnt45ggsg000f/forge-stable-performance?orgId=1
 #
 # Tests are named based on how they are set up; some of the common flavors are:
 # * "realistic-env" - tests with "realistic-env" in their name try to have the network and hardware environment
 #   be more realistic. They use "wrap_with_realistic_env", which sets:
 #   * MultiRegionNetworkEmulationTest, which splits nodes into 4 "regions" that have different
 #     cross-region and in-region latencies and reliability rates
-#   * CpuChaosTest, which tries to make nodes have heterogeneous hardware by loading a few cores fully
-#     on a few nodes. But this is not too helpful, as block execution time variance is minimal
+#   * CpuChaosTest, which tries to make nodes have heterogeneous hardware by loading a few cores fully
+#     on a few nodes. But this is not too helpful, as block execution time variance is minimal
 #     (as we generally have a few idle cores, and real variance mostly comes from variance in CPU speed instead)
-# * sweep - means running multiple tests within a single test, keeping everything the same except for one
+# * sweep - means running multiple tests within a single test, keeping everything the same except for one
 #   thing - i.e. the thing we sweep over. There are two main dimensions we "sweep" over:
-#   * load sweep - this generally uses a const TPS workload, and varies the load across the tests (e.g. 10 vs 100 vs 1000 TPS)
+#   * load sweep - this generally uses a const TPS workload, and varies the load across the tests (e.g. 10 vs 100 vs 1000 TPS)
 #   * workload sweep - this varies the transaction type being submitted, trying to test out how the system behaves
 #     when different parts of the system are stressed (e.g. low vs high output sizes, good vs bad gas calibration, parallel vs sequential, etc.)
 # * graceful - tests where we are overloading the system - i.e. submitting more transactions than we expect the system to handle,
 #   and seeing how it behaves. Overall e2e latency is then high, but we can test that only the validator -> block proposal stage has increased.
 #   Additionally, we generally add small-TPS high-fee traffic in these tests, to confirm it is unaffected by the high load.
 # * changing-working-quorum - tests where we intentionally make nodes unreachable (cut their network), bring them back,
-#   and go on to cut the network on the next set of nodes - requiring state-sync to catch up, and consensus to work with a different set of
+#   and go on to cut the network on the next set of nodes - requiring state-sync to catch up, and consensus to work with a different set of
 #   nodes being required to form consensus. During each iteration, we test that enough progress was made.
 #
 # Main success criteria used across the tests are:
-# * throughput and expiration/rejection rate
+# * throughput and expiration/rejection rate
 # * latency (avg / p50 / p90 / p99)
 # * latency breakdown across the components - currently within a validator alone:
 #   batch->pos->proposal->ordered->committed
        required: true
        type: choice
        description: The specific stable test to run. If 'all', all stable tests will be run
-       default: 'all'
+       default: "all"
        options:
          - all
          - framework-upgrade-test
-         # Test varies the load, i.e. sending 10, 100, 1000, 5000, etc. TPS, and then measuring
+         # Test varies the load, i.e. sending 10, 100, 1000, 5000, etc. TPS, and then measuring
          # onchain TPS, expired rate, as well as p50/p90/p99 latency, among other things,
-         # testing that we don't degrade performance for low, mid, and high loads.
+         # testing that we don't degrade performance for low, mid, and high loads.
          - realistic-env-load-sweep
-         # Test varies the workload, across some basic workloads (i.e. some cheap, some expensive),
+         # Test varies the workload, across some basic workloads (i.e. some cheap, some expensive),
          # and checks throughput and performance across the different stages
          - realistic-env-workload-sweep
          # Test sends a ConstTps workload above what the system can handle, while additionally sending
          # non-small high-fee traffic (1000 TPS), and measures overall system performance.
          - realistic-env-graceful-overload
-         # Test varies the workload (opposite ends of gas calibration, high and low output sizes,
-         # sequential / parallel, etc.), and sends ConstTPS for each above what the system can handle,
-         # while sending a low TPS of high-fee transactions. It primarily confirms that high-fee traffic
+         # Test varies the workload (opposite ends of gas calibration, high and low output sizes,
+         # sequential / parallel, etc.), and sends ConstTPS for each above what the system can handle,
+         # while sending a low TPS of high-fee transactions. It primarily confirms that high-fee traffic
          # has predictably low latency, and that the execution pipeline doesn't get backed up.
          - realistic-env-graceful-workload-sweep
-         # Test varies the workload (user contracts) such that max throughput varies from high to mid to low,
+         # Test varies the workload (user contracts) such that max throughput varies from high to mid to low,
          # while testing that unrelated transactions paying the same gas price are able to go through.
          - realistic-env-fairness-workload-sweep
          # Test which tunes all configurations for the largest throughput possible (potentially sacrificing latency a bit)
          - realistic-network-tuned-for-throughput
-         # Runs a small-ish load, but checks that at all times all nodes are making progress,
+         # Runs a small-ish load, but checks that at all times all nodes are making progress,
          # catching any unexpected unreliability/delays in consensus
          - consensus-stress-test
          # Sends a mix of different workloads, to catch issues with interactions between workloads.
          - fullnode-reboot-stress-test
          - compat
          # Sends low TPS (100 TPS).
-         # Cut network on enough nodes, such that all others are needed for consensus. Then bring a few back, and cut
+         # Cut network on enough nodes, such that all others are needed for consensus. Then bring a few back, and cut
          # the same number of new ones - requiring all that were brought back to state-sync and then continue executing.
          # Check that in each iteration we were able to make meaningful progress.
          - changing-working-quorum-test
          - changing-working-quorum-test-high-load
          - pfn-const-tps-realistic-env
          # Runs a production config (same as the land-blocking run) at max load (via mempool backlog), but runs it for 2 hours,
-         # to check reliability and consistency of the network.
+         # to check reliability and consistency of the network.
          - realistic-env-max-load-long
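For orientation, a single TEST_NAME choice like the above typically fans out to per-test jobs further down in the file (not shown in this diff). A minimal sketch of that pattern, with hypothetical job and step names and a trimmed matrix, might look like:

```yaml
# Hypothetical sketch only - the job name and matrix entries are illustrative,
# not this workflow's actual definitions.
jobs:
  forge-stable:
    strategy:
      matrix:
        TEST_NAME: [realistic-env-load-sweep, realistic-env-workload-sweep]
    runs-on: ubuntu-latest
    steps:
      # matrix context is available in step-level `if`, so each entry runs only
      # when it matches the dispatched TEST_NAME (or when 'all' was selected)
      - if: ${{ inputs.TEST_NAME == 'all' || inputs.TEST_NAME == matrix.TEST_NAME }}
        run: echo "would launch ${{ matrix.TEST_NAME }}"
```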
      JOB_PARALLELISM:
        required: false
        - "testsuite/find_latest_image.py"

 concurrency:
-  group: forge-stable-${{ format('{0}-{1}-{2}-{3}', github.ref_name, inputs.GIT_SHA, inputs.IMAGE_TAG, inputs.TEST_NAME) }}
+  group: forge-stable-${{ format('{0}-{1}-{2}-{3}', github.ref_name, inputs.GIT_SHA || github.sha || github.head_ref || github.ref, inputs.IMAGE_TAG, inputs.TEST_NAME) }}
   cancel-in-progress: true
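In GitHub Actions expressions, `||` evaluates to the first truthy operand, so the new group key uses the dispatched GIT_SHA when one is provided and falls back to commit/branch metadata otherwise. A stripped-down illustration of the same first-truthy-wins pattern (the `demo-` prefix is hypothetical):

```yaml
# On a manual run with GIT_SHA set, the group is demo-<GIT_SHA>;
# otherwise it falls back to demo-<github.sha>.
concurrency:
  group: demo-${{ inputs.GIT_SHA || github.sha }}
  cancel-in-progress: true
```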

 env:
@@ -288,3 +289,4 @@ jobs:
       FORGE_ENABLE_FAILPOINTS: ${{ matrix.FORGE_ENABLE_FAILPOINTS || false }}
       POST_TO_SLACK: true
       SEND_RESULTS_TO_TRUNK: true
+      FORGE_CONTINUOUS_TEST_MODE: true
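The `${{ matrix.FORGE_ENABLE_FAILPOINTS || false }}` expression defaults the flag to false for matrix entries that do not set it, so only tests that opt in get failpoints enabled. A hypothetical matrix entry that opts in might look like:

```yaml
# Hypothetical entry; the real matrix entries for this workflow
# are defined elsewhere in the file.
strategy:
  matrix:
    include:
      - TEST_NAME: changing-working-quorum-test
        FORGE_ENABLE_FAILPOINTS: true
```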