made interleaving more deterministic
pompon0 committed Jun 15, 2024
1 parent a1636b8 commit 6577dcf
Showing 4 changed files with 75 additions and 63 deletions.
1 change: 1 addition & 0 deletions node/Cargo.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion node/actors/bft/Cargo.toml
@@ -24,9 +24,10 @@ tracing.workspace = true
vise.workspace = true

[dev-dependencies]
tokio.workspace = true
tokio = { workspace = true, features = ["full","test-util"]}
assert_matches.workspace = true
pretty_assertions.workspace = true
test-casing.workspace = true

[lints]
workspace = true
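The new `test-util` feature on the tokio dev-dependency is what enables the `tokio::time::pause()` calls added to the tests below: with the clock paused, sleeps and timeouts resolve against a virtual clock that the runtime auto-advances, so interleaving no longer depends on wall-clock scheduling. A minimal sketch of that behaviour (not code from this repository; assumes tokio with the `macros`, `rt`, and `test-util` features):

```rust
use std::time::Duration;

// With the clock paused, the runtime advances virtual time automatically when
// every task is idle, so this hour-long sleep completes immediately while the
// elapsed virtual time is still reported as one hour.
#[tokio::test]
async fn paused_clock_is_deterministic() {
    tokio::time::pause();
    let start = tokio::time::Instant::now();
    tokio::time::sleep(Duration::from_secs(3600)).await;
    assert!(start.elapsed() >= Duration::from_secs(3600));
}
```

Note that `tokio::time::pause()` only works on a current-thread runtime, which is consistent with the switch from `#[tokio::test(flavor = "multi_thread")]` to plain `#[tokio::test]` in tests.rs below.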
45 changes: 32 additions & 13 deletions node/actors/bft/src/testonly/run.rs
@@ -1,5 +1,5 @@
use super::{Behavior, Node};
use anyhow::bail;
use anyhow::Context as _;
use network::{io, Config};
use rand::seq::SliceRandom;
use std::{
@@ -91,9 +91,29 @@ pub(crate) struct Test {
pub(crate) blocks_to_finalize: usize,
}

#[derive(thiserror::Error, Debug)]
pub(crate) enum TestError {
#[error("finalized conflicting blocks")]
BlockConflict,
#[error(transparent)]
Other(#[from] ctx::Error),
}

impl From<anyhow::Error> for TestError {
fn from(err: anyhow::Error) -> Self {
Self::Other(err.into())
}
}

impl From<ctx::Canceled> for TestError {
fn from(err: ctx::Canceled) -> Self {
Self::Other(err.into())
}
}

impl Test {
/// Run a test with the given parameters and a random network setup.
pub(crate) async fn run(&self, ctx: &ctx::Ctx) -> anyhow::Result<()> {
pub(crate) async fn run(&self, ctx: &ctx::Ctx) -> Result<(), TestError> {
let rng = &mut ctx.rng();
let setup = validator::testonly::Setup::new_with_weights(
rng,
@@ -109,13 +129,13 @@ impl Test {
ctx: &ctx::Ctx,
nets: Vec<Config>,
genesis: &validator::Genesis,
) -> anyhow::Result<()> {
) -> Result<(), TestError> {
let mut nodes = vec![];
let mut honest = vec![];
scope::run!(ctx, |ctx, s| async {
for (i, net) in nets.into_iter().enumerate() {
let (store, runner) = new_store(ctx, genesis).await;
s.spawn_bg(runner.run(ctx));
s.spawn_bg(async { Ok(runner.run(ctx).await?) });
if self.nodes[i].0 == Behavior::Honest {
honest.push(store.clone());
}
@@ -126,7 +146,7 @@ impl Test {
});
}
assert!(!honest.is_empty());
s.spawn_bg(run_nodes(ctx, &self.network, &nodes));
s.spawn_bg(async { Ok(run_nodes(ctx, &self.network, &nodes).await?) });

// Run the nodes until all honest nodes store enough finalized blocks.
assert!(self.blocks_to_finalize > 0);
@@ -140,16 +160,15 @@ impl Test {
for i in 0..self.blocks_to_finalize as u64 {
let i = first + i;
// Only comparing the payload; the justification might be different.
let want = honest[0]
.block(ctx, i)
.await?
.expect("checked its existence")
.payload;
let want = honest[0].block(ctx, i).await?.context("missing block")?;
for store in &honest[1..] {
assert_eq!(want, store.block(ctx, i).await?.unwrap().payload);
let got = store.block(ctx, i).await?.context("missing block")?;
if want.payload != got.payload {
return Err(TestError::BlockConflict);
}
}
}
Ok(())
Ok::<_, TestError>(())
})
.await
}
@@ -445,7 +464,7 @@ async fn twins_receive_loop(
let can_send = |to| {
match router.can_send(&message.message.msg, port, to) {
Some(can_send) => Ok(can_send),
None => bail!("ran out of port schedule; we probably wouldn't finalize blocks even if we continued")
None => anyhow::bail!("ran out of port schedule; we probably wouldn't finalize blocks even if we continued")
}
};

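The `TestError` enum, its `From` impls, and the `async { Ok(...?) }` wrappers around `runner.run` and `run_nodes` all serve one purpose: letting `?` coerce `anyhow::Error`, `ctx::Error`, and `ctx::Canceled` into the error type that the test scope now returns. A stripped-down, hypothetical sketch of that coercion pattern, standalone rather than using the `zksync_concurrency` scope API:

```rust
#[allow(dead_code)]
#[derive(Debug)]
enum TestError {
    BlockConflict,
    Other(anyhow::Error),
}

impl From<anyhow::Error> for TestError {
    fn from(err: anyhow::Error) -> Self {
        Self::Other(err)
    }
}

// Stand-in for `runner.run(ctx)`, which returns an anyhow-style result.
async fn runner_run() -> anyhow::Result<()> {
    Ok(())
}

async fn run() -> Result<(), TestError> {
    // Wrapping the call in an async block gives `?` a place to apply the
    // `From<anyhow::Error> for TestError` conversion, so the wrapped future
    // resolves to the same error type as the surrounding scope.
    let wrapped = async { Ok::<_, TestError>(runner_run().await?) };
    wrapped.await
}
```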
89 changes: 40 additions & 49 deletions node/actors/bft/src/tests.rs
@@ -1,10 +1,11 @@
use std::collections::HashMap;

use crate::testonly::{
twins::{Cluster, HasKey, ScenarioGenerator, Twin},
ut_harness::UTHarness,
Behavior, Network, Port, PortRouter, PortSplitSchedule, Test, NUM_PHASES,
Behavior, Network, Port, PortRouter, PortSplitSchedule, Test, TestError, NUM_PHASES,
};
use assert_matches::assert_matches;
use std::collections::HashMap;
use test_casing::test_casing;
use zksync_concurrency::{ctx, scope, time};
use zksync_consensus_network::testonly::new_configs_for_validators;
use zksync_consensus_roles::validator::{
@@ -14,6 +15,7 @@ use zksync_consensus_roles::validator::{
};

async fn run_test(behavior: Behavior, network: Network) {
tokio::time::pause();
let _guard = zksync_concurrency::testonly::set_timeout(time::Duration::seconds(30));
zksync_concurrency::testonly::abort_on_panic();
let ctx = &ctx::test_root(&ctx::RealClock);
@@ -36,42 +38,42 @@ async fn run_test(behavior: Behavior, network: Network) {
.unwrap()
}

#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn honest_mock_network() {
run_test(Behavior::Honest, Network::Mock).await
}

#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn honest_real_network() {
run_test(Behavior::Honest, Network::Real).await
}

#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn offline_mock_network() {
run_test(Behavior::Offline, Network::Mock).await
}

#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn offline_real_network() {
run_test(Behavior::Offline, Network::Real).await
}

#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn random_mock_network() {
run_test(Behavior::Random, Network::Mock).await
}

#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn random_real_network() {
run_test(Behavior::Random, Network::Real).await
}

#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn byzantine_mock_network() {
run_test(Behavior::Byzantine, Network::Mock).await
}

#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn byzantine_real_network() {
run_test(Behavior::Byzantine, Network::Real).await
}
@@ -213,8 +215,9 @@ async fn non_proposing_leader() {
///
/// This should be a simple sanity check that the network works and consensus
/// is achieved under the most favourable conditions.
#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn twins_network_wo_twins_wo_partitions() {
tokio::time::pause();
// n<6 implies f=0 and q=n
run_twins(5, 0, 10).await.unwrap();
}
@@ -225,67 +228,54 @@ async fn twins_network_wo_twins_wo_partitions() {
///
/// This should be a sanity check that without Byzantine behaviour the consensus
/// is resilient to temporary network partitions.
#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn twins_network_wo_twins_w_partitions() {
tokio::time::pause();
// n=6 implies f=1 and q=5; 6 is the minimum where partitions are possible.
run_twins(6, 0, 5).await.unwrap();
}
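The `n`, `f`, `q` comments in these twins tests follow one rule: `f` is the number of tolerated faulty replicas and `q` the quorum size. From the values quoted (n=5 gives f=0 and q=n, n=6 gives f=1 and q=5, n=11 gives f>=2 and q=n-f), the implied relationship is f = (n - 1) / 5 with q = n - f. A hedged sketch of that arithmetic; the production values come from `validator::threshold`, as in the commented-out code further down, not from this helper:

```rust
// Hypothetical helper reproducing the n/f/q arithmetic implied by the comments
// in these tests: f faulty replicas tolerated, quorum of q = n - f.
fn faulty_and_quorum(num_replicas: u64) -> (u64, u64) {
    let max_faulty = (num_replicas - 1) / 5;
    (max_faulty, num_replicas - max_faulty)
}

#[test]
fn matches_the_comments() {
    assert_eq!(faulty_and_quorum(5), (0, 5)); // n<6 implies f=0 and q=n
    assert_eq!(faulty_and_quorum(6), (1, 5)); // n=6 implies f=1 and q=5
    assert_eq!(faulty_and_quorum(11), (2, 9)); // n>=11 implies f>=2 and q=n-f
}
```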

/// Run Twins scenarios with random number of nodes and 1 twin.
#[tokio::test(flavor = "multi_thread")]
async fn twins_network_w1_twins_w_partitions() {
#[test_casing(5, 6..=10)]
#[tokio::test]
async fn twins_network_w1_twins_w_partitions(num_replicas: usize) {
tokio::time::pause();
// n>=6 implies f>=1 and q=n-f
for num_replicas in 6..=10 {
// let num_honest = validator::threshold(num_replicas as u64) as usize;
// let max_faulty = num_replicas - num_honest;
// let num_twins = rng.gen_range(1..=max_faulty);
run_twins(num_replicas, 1, 10).await.unwrap();
}
// let num_honest = validator::threshold(num_replicas as u64) as usize;
// let max_faulty = num_replicas - num_honest;
// let num_twins = rng.gen_range(1..=max_faulty);
run_twins(num_replicas, 1, 10).await.unwrap();
}
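Replacing the in-test `for num_replicas in 6..=10` loop with `#[test_casing(5, 6..=10)]` turns each replica count into its own test case, so a failure reports which count broke instead of aborting the whole loop. A minimal sketch of the pattern, assuming the `test-casing` crate added to the dev-dependencies above:

```rust
use test_casing::test_casing;

// Expands into five separate tests, one per value of `num_replicas` in 6..=10.
#[test_casing(5, 6..=10)]
#[tokio::test]
async fn parameterized(num_replicas: usize) {
    assert!((6..=10).contains(&num_replicas));
}
```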

/// Run Twins scenarios with higher number of nodes and 2 twins.
#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn twins_network_w2_twins_w_partitions() {
tokio::time::pause();
// n>=11 implies f>=2 and q=n-f
run_twins(11, 2, 10).await.unwrap();
run_twins(11, 2, 8).await.unwrap();
}

/// Run Twins scenario with more twins than tolerable and expect it to fail.
#[tokio::test(flavor = "multi_thread")]
#[should_panic]
#[tokio::test]
async fn twins_network_to_fail() {
tokio::time::pause();
// With n=5 f=0, so 1 twin means more faulty nodes than expected.
run_twins(5, 1, 100).await.unwrap();
assert_matches!(run_twins(5, 1, 100).await, Err(TestError::BlockConflict));
}

/// Create network configuration for a given number of replicas and twins and run [Test].
async fn run_twins(
num_replicas: usize,
num_twins: usize,
num_scenarios: usize,
) -> anyhow::Result<()> {
let num_honest = validator::threshold(num_replicas as u64) as usize;
let max_faulty = num_replicas - num_honest;

// If we pass more twins than tolerable faulty replicas then it should fail with an assertion error,
// but if we abort the process on panic then the #[should_panic] attribute doesn't work with `cargo nextest`.
if num_twins <= max_faulty {
zksync_concurrency::testonly::abort_on_panic();
}
zksync_concurrency::testonly::init_tracing();
) -> Result<(), TestError> {
zksync_concurrency::testonly::abort_on_panic();

// Use a single timeout for all scenarios to finish.
// A single scenario with 11 replicas took 3-5 seconds.
// Panic on timeout; works with `cargo nextest` and the `abort_on_panic` above.
// If we are in the mode where we are looking for faults and `abort_on_panic` is disabled,
// then this will not have any effect and the simulation will run for as long as it takes to
// go through all the configured scenarios, and then fail because it didn't panic if no fault was found.
// If it panics for another reason it might be misleading though, so ideally it should finish early.
// It would be nicer to actually inspect the panic and make sure it's the right kind of assertion.
let _guard = zksync_concurrency::testonly::set_timeout(time::Duration::seconds(30));
// Using `ctc.with_timeout` would stop a runaway execution even without `abort_on_panic` but
// it would make the test pass for a different reason, not because it found an error but because it ran long.
let ctx = &ctx::test_root(&ctx::AffineClock::new(10.0));
let _guard = zksync_concurrency::testonly::set_timeout(time::Duration::seconds(60));
let ctx = &ctx::test_root(&ctx::RealClock);

#[derive(PartialEq, Debug)]
struct Replica {
@@ -435,7 +425,7 @@ async fn run_twins(
/// while some other validators have the payload but don't have the HighQC and cannot finalize the block, and therefore
/// don't gossip it, which causes a deadlock unless the one with the HighQC moves on and broadcasts what they have, which
/// should cause the others to finalize the block and gossip the payload to them in turn.
#[tokio::test(flavor = "multi_thread")]
#[tokio::test]
async fn test_wait_for_finalized_deadlock() {
// These are the conditions for the deadlock to occur:
// * The problem happens in the handling of LeaderPrepare where the replica waits for the previous block in the justification.
@@ -551,10 +541,11 @@ async fn run_with_custom_router(
gossip_peers: usize,
blocks_to_finalize: usize,
make_router: impl FnOnce(HashMap<Port, usize>) -> PortRouter,
) -> anyhow::Result<()> {
) -> Result<(), TestError> {
tokio::time::pause();
zksync_concurrency::testonly::abort_on_panic();
let _guard = zksync_concurrency::testonly::set_timeout(time::Duration::seconds(30));
let ctx = &ctx::test_root(&ctx::AffineClock::new(10.0));
let _guard = zksync_concurrency::testonly::set_timeout(time::Duration::seconds(60));
let ctx = &ctx::test_root(&ctx::RealClock);

let rng = &mut ctx.rng();

