Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/build_and_test_gcc_debug.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
compiler:
- "CONCORD_BFT_CONTAINER_CC=gcc CONCORD_BFT_CONTAINER_CXX=g++"
ci_build_type:
- "-DCMAKE_BUILD_TYPE=DEBUG -DBUILD_COMM_TCP_TLS=FALSE"
- "-DCMAKE_BUILD_TYPE=DEBUG -DBUILD_COMM_TCP_TLS=TRUE"
use_s3_obj_store:
- "-DUSE_S3_OBJECT_STORE=ON"
steps:
Expand Down Expand Up @@ -58,10 +58,10 @@ jobs:
-DUSE_OPENTRACING=ON \
-DOMIT_TEST_OUTPUT=OFF\
-DKEEP_APOLLO_LOGS=TRUE\
-DRUN_APOLLO_TESTS=FALSE\
-DRUN_APOLLO_TESTS=TRUE\
-DUSE_FAKE_CLOCK_IN_TIME_SERVICE=TRUE\" "\
&& script -q -e -c "make simple-test" \
&& script -q -e -c "make test"
&& script -q -e -c "make test-single-suite TEST_NAME=skvbc_chaotic_startup NUM_REPEATS=100 BREAK_ON_FAILURE=FALSE"
- name: Prepare artifacts
if: failure()
run: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/build_and_test_gcc_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ jobs:
-DUSE_OPENTRACING=ON \
-DOMIT_TEST_OUTPUT=OFF\
-DKEEP_APOLLO_LOGS=TRUE\
-DRUN_APOLLO_TESTS=FALSE\
-DRUN_APOLLO_TESTS=TRUE\
-DUSE_FAKE_CLOCK_IN_TIME_SERVICE=TRUE\" "\
&& script -q -e -c "make test"
&& script -q -e -c "make test-single-suite TEST_NAME=skvbc_chaotic_startup NUM_REPEATS=100 BREAK_ON_FAILURE=FALSE"
- name: Prepare artifacts
if: failure()
run: |
Expand Down
21 changes: 15 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -248,12 +248,21 @@ test-range: ## Run all tests in the range [START,END], inclusive: `make test-ran
ctest ${CONCORD_BFT_ADDITIONAL_CTEST_RUN_PARAMS} -I ${START},${END}"

.PHONY: test-single-suite
test-single-suite: ## Run a single test `make test-single-suite TEST_NAME=<test name>`
docker run ${BASIC_RUN_PARAMS} \
${CONCORD_BFT_CONTAINER_SHELL} -c \
"mkdir -p ${CONCORD_BFT_CORE_DIR} && \
cd ${CONCORD_BFT_BUILD_DIR} && \
ctest ${CONCORD_BFT_ADDITIONAL_CTEST_RUN_PARAMS} -V -R ${TEST_NAME} --timeout ${CONCORD_BFT_CTEST_TIMEOUT} --output-on-failure"
test-single-suite: SHELL:=/bin/bash
test-single-suite: ## Run a single test `make test-single-suite TEST_NAME=<test name> NUM_REPEATS=<number of repeats,default=1,optional> BREAK_ON_FAILURE=<TRUE|FALSE,optional>`. Example: `make test-single-suite TEST_NAME=timers_tests BREAK_ON_FAILURE=TRUE NUM_REPEATS=3`
num_failures=0; \
for (( i=1; i<=${NUM_REPEATS__}; i++ )); do \
echo "=== Starting iteration $${i}/${NUM_REPEATS__}"; \
docker run ${BASIC_RUN_PARAMS} ${CONCORD_BFT_CONTAINER_SHELL} -c \
"mkdir -p ${CONCORD_BFT_CORE_DIR} && cd ${CONCORD_BFT_BUILD_DIR} && \
ctest ${CONCORD_BFT_ADDITIONAL_CTEST_RUN_PARAMS} -V -R ${TEST_NAME} --timeout ${CONCORD_BFT_CTEST_TIMEOUT} --output-on-failure"; \
RESULT=$$?; \
if [[ $${RESULT} -ne 0 ]];then \
(( num_failures=num_failures+1 )); \
if [[ '${BREAK_ON_FAILURE__}' = 'TRUE' ]];then echo "Breaking on first failure! (iteration $$i)"; exit $${RESULT}; fi; fi; \
done; \
echo "Test ${TEST_NAME} completed ${NUM_REPEATS__} iterations" \
"($$((${NUM_REPEATS__}-num_failures)) succeed, $${num_failures} failed)";

.PHONY: test-single-apollo-case
test-single-apollo-case: ## Run a single Apollo test case: `make test-single-apollo-case TEST_FILE_NAME=<test file name> TEST_CASE_NAME=<test case name> NUM_REPEATS=<number of repeats,default=1,optional> BREAK_ON_FAILURE=<TRUE|FALSE,optional>`. Test suite file name should come without *.py. Test case is expected without a class name, and must be unique. Example: `make test-single-apollo-case BREAK_ON_FAILURE=TRUE NUM_REPEATS=100 TEST_FILE_NAME=test_skvbc_reconfiguration TEST_CASE_NAME=test_tls_exchange_client_replica_with_st`
Expand Down
92 changes: 47 additions & 45 deletions tests/apollo/test_skvbc_chaotic_startup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ class SkvbcChaoticStartupTest(ApolloTest):

__test__ = False # so that PyTest ignores this test scenario

@unittest.skip("After CheckpointMsg-s forwarding, in this situation the late Replica initiates State Transfer.")
@with_trio
@with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: n == 7)
async def test_inactive_window_catchup_up_to_gap(self, bft_network):
Expand Down Expand Up @@ -84,6 +83,8 @@ async def write_req(num_req=1):
for _ in range(num_req):
await skvbc.send_write_kv_set()

await trio.sleep(1)

with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(
bft_network, {late_replica},
bft_network.all_replicas(without={primary, late_replica})) as adversary:
Expand All @@ -107,38 +108,37 @@ async def write_req(num_req=1):
while True:
last_exec = await bft_network.get_metric(late_replica, bft_network, 'Gauges', "lastExecutedSeqNum")
log.log_message(message_type=f"replica = {late_replica}; lase_exec = {last_exec}")
if last_exec == seq_nums_per_checkpoint + num_reqs_after_first_checkpoint:
if last_exec == seq_nums_per_checkpoint + num_reqs_after_first_checkpoint + 1:
break
await trio.sleep(seconds=0.3)

bft_network.stop_replica(late_replica)

# create 2 checkpoints and wait for checkpoint propagation
await skvbc.fill_and_wait_for_checkpoint(
initial_nodes=bft_network.all_replicas(without={late_replica}),
num_of_checkpoints_to_add=checkpoints_to_advance_after_first,
verify_checkpoint_persistency=False
)

await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
bft_network.all_replicas(without={late_replica}),
first_stable_checkpoint_to_reach + checkpoints_to_advance_after_first)
# create 2 checkpoints and wait for checkpoint propagation
await skvbc.fill_and_wait_for_checkpoint(
initial_nodes=bft_network.all_replicas(without={late_replica}),
num_of_checkpoints_to_add=checkpoints_to_advance_after_first,
verify_checkpoint_persistency=False
)

bft_network.start_replica(late_replica)
with trio.fail_after(seconds=30):

late_replica_catch_up = False
while not late_replica_catch_up:
for replica_id in bft_network.get_live_replicas():
last_stable = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastStableSeqNum")
last_exec = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
log.log_message(message_type=f"replica = {replica_id}; last_stable = {last_stable}; lase_exec = {last_exec}")
if replica_id == late_replica and last_exec == 2*seq_nums_per_checkpoint:
late_replica_catch_up = True
await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
bft_network.all_replicas(without={late_replica}),
first_stable_checkpoint_to_reach + checkpoints_to_advance_after_first)

await write_req()
await trio.sleep(seconds=3)
bft_network.start_replica(late_replica)
with trio.fail_after(seconds=30):
late_replica_catch_up = False
while not late_replica_catch_up:
for replica_id in bft_network.get_live_replicas():
last_stable = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastStableSeqNum")
last_exec = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
log.log_message(message_type=f"replica = {replica_id}; last_stable = {last_stable}; lase_exec = {last_exec}")
if replica_id == late_replica and last_exec >= 3*seq_nums_per_checkpoint:
late_replica_catch_up = True
await write_req()
await trio.sleep(seconds=3)

@unittest.skip("Testing in CI/CD")
@skip_for_tls
@with_trio
@with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: n == 7)
Expand Down Expand Up @@ -301,7 +301,6 @@ async def write_req(num_req=1):


# @unittest.skipIf(environ.get('BUILD_COMM_TCP_TLS', "").lower() == "true", "Unstable on CI (TCP/TLS only)")
@unittest.skip("Disabled due to BC-6816")
@with_trio
@with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: n == 7)
@with_constant_load
Expand Down Expand Up @@ -337,25 +336,26 @@ async def write_req():
bft_network.stop_replica(late_replica)

for isolated_replica, views_to_advance in [(0, 1), (1, 2)]:
with net.ReplicaSubsetTwoWayIsolatingAdversary(bft_network, {isolated_replica}) as adversary:
adversary.interfere()
try:
client = bft_network.random_client()
client.primary = None
for _ in range(5):
msg = skvbc.write_req(
[], [(skvbc.random_key(), skvbc.random_value())], 0)
await client.write(msg)
except:
pass

# Wait for View Change initiation to happen
with trio.fail_after(60):
while True:
view_of_connected_replica = await self._get_gauge(connected_replica, bft_network, "currentActiveView")
if view_of_connected_replica == current_view + views_to_advance:
break
await trio.sleep(0.2)
bft_network.stop_replica(isolated_replica)
try:
client = bft_network.random_client()
client.primary = None
for _ in range(5):
msg = skvbc.write_req(
[], [(skvbc.random_key(), skvbc.random_value())], 0)
await client.write(msg)
except:
pass

# Wait for View Change initiation to happen
with trio.fail_after(60):
while True:
view_of_connected_replica = await self._get_gauge(connected_replica, bft_network, "currentActiveView")
if view_of_connected_replica == current_view + views_to_advance:
break
await trio.sleep(0.2)

bft_network.start_replica(isolated_replica)

view = await bft_network.wait_for_view(
replica_id=connected_replica,
Expand All @@ -377,6 +377,7 @@ async def write_req():
await bft_network.wait_for_fast_path_to_be_prevalent(
run_ops=lambda: write_req(), threshold=num_req)

@unittest.skip("Testing in CI/CD")
@with_trio
@with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: n == 7)
@with_constant_load
Expand Down Expand Up @@ -625,6 +626,7 @@ async def test_f_minus_one_staggered_replicas_requesting_vc(self, bft_network, s

await self._wait_for_replicas_to_generate_checkpoint(bft_network, skvbc, expected_next_primary, bft_network.all_replicas(without={initial_primary}))

@unittest.skip("Testing in CI/CD")
@skip_for_tls
@with_trio
@with_bft_network(start_replica_cmd_with_vc_timeout("20000"),
Expand Down