Skip to content

Commit 98f5f3b

Browse files
committed
Fix more errors
1 parent c45052c commit 98f5f3b

File tree

13 files changed

+99
-53
lines changed

13 files changed

+99
-53
lines changed

bftengine/src/bcstatetransfer/AsyncStateTransferCRE.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ std::shared_ptr<ClientReconfigurationEngine> CreFactory::create(
184184
Config cre_config;
185185
cre_config.id_ = repConfig.replicaId;
186186
cre_config.interval_timeout_ms_ = 1000;
187-
IStateClient* pbc = new PollBasedStateClient(bftClient, cre_config.interval_timeout_ms_, 0, cre_config.id_);
187+
// TODO: fix relying on f + 1, so that byzantine replicas are also handled
188+
IStateClient* pbc = new PollBasedStateClient(bftClient, cre_config.interval_timeout_ms_, 0, cre_config.id_, true);
188189
auto cre =
189190
std::make_shared<ClientReconfigurationEngine>(cre_config, pbc, std::make_shared<concordMetrics::Aggregator>());
190191
if (bftEngine::ReplicaConfig::instance().isReadOnly) {

bftengine/src/bftengine/ReplicaForStateTransfer.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ void ReplicaForStateTransfer::start() {
9797
// on other replicas but not on this one, finishing ST does not mean that missed key exchanges are executed)
9898
// This can be done by iterating the saved cryptosystems and updating their private key if their
9999
// public key matches the candidate saved in KeyExchangeManager
100+
// TODO: persist the candidate
100101
CryptoManager::instance().onCheckpoint(checkpoint);
101102
auto [priv, pub] = KeyExchangeManager::instance().getCandidateKeyPair();
102103
CryptoManager::instance().syncPrivateKeyAfterST(priv, pub);
@@ -108,6 +109,8 @@ void ReplicaForStateTransfer::start() {
108109
auto *pbc =
109110
reinterpret_cast<concord::client::reconfiguration::PollBasedStateClient *>(cre_->getStateClient());
110111

112+
// TODO: remove loop so that state transfer doesn't hang if it cannot complete reconfiguration requests
113+
// The current implementation expects f + 1 identical responses
111114
bool succ = false;
112115
while (!succ) {
113116
auto latestHandledUpdate = cre_->getLatestKnownUpdateBlock();

bftengine/src/bftengine/ReplicaImp.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2372,14 +2372,14 @@ void ReplicaImp::onMessage<CheckpointMsg>(std::unique_ptr<CheckpointMsg> message
23722372
static uint32_t maxTimeSinceLastExecutionInMainWindowMs =
23732373
config_.get<uint32_t>("concord.bft.st.maxTimeSinceLastExecutionInMainWindowMs", 5000);
23742374

2375-
Time timeOfLastEcecution = MinTime;
2375+
Time timeOfLastExecution = MinTime;
23762376
if (mainLog->insideActiveWindow(lastExecutedSeqNum))
2377-
timeOfLastEcecution = mainLog->get(lastExecutedSeqNum).lastUpdateTimeOfCommitMsgs();
2378-
if ((getMonotonicTime() - timeOfLastEcecution) > (milliseconds(maxTimeSinceLastExecutionInMainWindowMs))) {
2377+
timeOfLastExecution = mainLog->get(lastExecutedSeqNum).lastUpdateTimeOfCommitMsgs();
2378+
if ((getMonotonicTime() - timeOfLastExecution) > (milliseconds(maxTimeSinceLastExecutionInMainWindowMs))) {
23792379
LOG_INFO(GL,
23802380
"Number of stable checkpoints in current window: "
23812381
<< numRelevant << " time since last execution: "
2382-
<< (getMonotonicTime() - timeOfLastEcecution).count() << " ms");
2382+
<< (getMonotonicTime() - timeOfLastExecution).count() << " ms");
23832383
askForStateTransfer = true;
23842384
startStReason = "Too much time has passed since last execution";
23852385
}
@@ -2911,7 +2911,7 @@ void ReplicaImp::onMessage<ViewChangeMsg>(std::unique_ptr<ViewChangeMsg> message
29112911
ViewNum maxKnownCorrectView = 0;
29122912
ViewNum maxKnownAgreedView = 0;
29132913
viewsManager->computeCorrectRelevantViewNumbers(&maxKnownCorrectView, &maxKnownAgreedView);
2914-
LOG_INFO(VC_LOG, "View Number details: " << KVLOG(maxKnownCorrectView, maxKnownAgreedView));
2914+
LOG_INFO(VC_LOG, "View Number details: " << KVLOG(maxKnownCorrectView, maxKnownAgreedView, getCurrentView()));
29152915

29162916
if (maxKnownCorrectView > getCurrentView()) {
29172917
// we have at least f+1 view-changes with view number >= maxKnownCorrectView

bftengine/src/bftengine/messages/ClientRequestMsg.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ void ClientRequestMsg::validateImp(const ReplicasInfo& repInfo) const {
141141
(repInfo.isIdOfReplica(clientId) || repInfo.isIdOfPeerRoReplica(clientId))) {
142142
// Allow every reconfiguration/internal message from replicas (it will be verified in the reconfiguration handler)
143143
LOG_INFO(CNSUS,
144-
"Reconfig/Internal replica message not validated"
144+
"Reconfig/Internal replica message validation skipped"
145145
<< KVLOG(clientId, header->flags & RECONFIG_FLAG, header->flags & INTERNAL_FLAG));
146146
return;
147147
}

client/reconfiguration/include/client/reconfiguration/poll_based_state_client.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ class PollBasedStateClient : public IStateClient {
2929
PollBasedStateClient(bft::client::Client* client,
3030
uint64_t interval_timeout_ms,
3131
uint64_t last_known_block,
32-
const uint16_t id_);
32+
const uint16_t id_,
33+
bool use_byzantine_quorum = false);
3334
State getNextState() const override;
3435
bool updateState(const WriteState& state) override;
3536
~PollBasedStateClient();
@@ -63,6 +64,8 @@ class PollBasedStateClient : public IStateClient {
6364
bool halted_ = false;
6465
std::condition_variable resume_cond_;
6566
std::mutex resume_lock_;
67+
// At the end of State Transfer we use an f + 1 quorum
68+
bool use_byzantine_quorum_ = false;
6669
};
6770

6871
} // namespace concord::client::reconfiguration

client/reconfiguration/src/poll_based_state_client.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,14 @@ concord::messages::ReconfigurationResponse PollBasedStateClient::sendReconfigura
2929
concord::messages::ReconfigurationResponse rres;
3030
try {
3131
if (read_request) {
32-
bft::client::ReadConfig read_config{request_config, bft::client::LinearizableQuorum{}};
32+
// TODO: State transfer can work with f + 1 as long as there are no byzantine replicas
33+
bft::client::ReadConfig read_config;
34+
if (use_byzantine_quorum_) {
35+
read_config = bft::client::ReadConfig{request_config, bft::client::ByzantineSafeQuorum{}};
36+
} else {
37+
read_config = bft::client::ReadConfig{request_config, bft::client::LinearizableQuorum{}};
38+
}
39+
3340
rep = bftclient_->send(read_config, std::move(msg));
3441
} else {
3542
bft::client::WriteConfig write_config{request_config, bft::client::LinearizableQuorum{}};
@@ -57,12 +64,14 @@ State PollBasedStateClient::getNextState() const {
5764
PollBasedStateClient::PollBasedStateClient(bft::client::Client* client,
5865
uint64_t interval_timeout_ms,
5966
uint64_t last_known_block,
60-
const uint16_t id)
67+
const uint16_t id,
68+
bool use_byzantine_quorum)
6169
: bftclient_{client},
6270
id_{id},
6371
interval_timeout_ms_{interval_timeout_ms},
6472
last_known_block_{last_known_block},
65-
sn_gen_(bft::client::ClientId{id}) {}
73+
sn_gen_(bft::client::ClientId{id}),
74+
use_byzantine_quorum_{use_byzantine_quorum} {}
6675

6776
std::vector<State> PollBasedStateClient::getStateUpdate(bool& succ) const {
6877
concord::messages::ClientReconfigurationStateRequest creq{id_};

tests/apollo/test_skvbc_backup_restore.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,19 @@ class SkvbcBackupRestoreTest(ApolloTest):
5959
@with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: n == 7)
6060
async def test_checkpoint_propagation_after_restarting_replicas(self, bft_network):
6161
"""
62-
Here we trigger a checkpoint, restart all replicas in a random order with 5s delay in-between,
63-
both while stopping and starting. We verify checkpoint persisted upon restart and then trigger
62+
Here we trigger a checkpoint, restart all replicas.
63+
We verify checkpoint persisted upon restart and then trigger
6464
another checkpoint. We make sure checkpoint is propagated to all the replicas.
6565
1) Given a BFT network, we make sure all nodes are up
6666
2) Send sufficient number of client requests to trigger checkpoint protocol
67-
3) Stop all replicas in a random order (with 5s delay in between)
68-
4) Start all replicas in a random order (with 5s delay in between)
67+
3) Stop all replicas in a random order
68+
4) Start all replicas in a random order
6969
5) Make sure the initial view is stable
7070
6) Send sufficient number of client requests to trigger another checkpoint
7171
7) Make sure checkpoint propagates to all the replicas
72+
73+
Note: UDP configuration waits for 5 seconds until it assumes network communication is established.
A replica can thus trigger a view change if it is restarted before communication with its peers
has been re-established (presumably the reason the 5s staggered restart delays were removed — verify intent).
7275
"""
7376
bft_network.start_all_replicas()
7477
skvbc = kvbc.SimpleKVBCProtocol(bft_network)
@@ -87,20 +90,15 @@ async def test_checkpoint_propagation_after_restarting_replicas(self, bft_networ
8790
verify_checkpoint_persistency=False
8891
)
8992

90-
# stop n replicas in a random order with a delay of 5s in between
91-
stopped_replicas = await self._stop_random_replicas_with_delay(bft_network, delay=5,
92-
exclude_replicas={current_primary})
93-
bft_network.stop_replica(current_primary)
94-
# start stopped replicas in a random order with a delay of 5s in between
95-
bft_network.start_replica(current_primary)
96-
await self._start_random_replicas_with_delay(bft_network, stopped_replicas, delay=5)
97-
93+
bft_network.stop_all_replicas()
94+
bft_network.start_all_replicas()
95+
stopped_replicas = bft_network.all_replicas()
9896
# verify checkpoint persistence
9997
log.log_message(message_type=f"Wait for replicas to reach checkpoint", checkpoint=checkpoint_before+1,
100-
replicas=[current_primary] + list(stopped_replicas))
98+
replicas=stopped_replicas)
10199
await bft_network.wait_for_replicas_to_checkpoint(
102100
stopped_replicas,
103-
expected_checkpoint_num=lambda ecn: ecn == checkpoint_before + 1)
101+
expected_checkpoint_num=lambda ecn: ecn >= checkpoint_before + 1)
104102

105103
# verify current view is stable
106104
for replica in bft_network.all_replicas():

tests/apollo/test_skvbc_checkpoints.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -395,12 +395,11 @@ async def test_checkpoint_propagation_after_f_nodes_including_primary_isolated(s
395395
# Once the adversary is gone, the isolated replicas should be able reach the checkpoint
396396
await bft_network.wait_for_replicas_to_checkpoint(
397397
isolated_replicas,
398-
expected_checkpoint_num=lambda ecn: ecn == checkpoint_before + 1)
398+
expected_checkpoint_num=lambda ecn: ecn >= checkpoint_before + 1)
399399

400400
@with_trio
401401
@with_bft_network(start_replica_cmd_with_corrupted_checkpoint_msgs(corrupt_checkpoints_from_replica_ids={ 1 }),
402402
selected_configs=lambda n, f, c: n == 7)
403-
404403
async def test_rvt_conflict_detection_after_corrupting_checkpoint_msg_for_single_replica(self, bft_network):
405404
await self._test_checkpointing_with_corruptions(bft_network, { 1 })
406405

tests/apollo/util/bft.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,7 @@ def monitor_replica_subproc(subproc: subprocess.Popen, replica_id: int, stop_eve
819819
f"return code = {return_code}"
820820
log_message(message_type=f"{error_msg}, aborting test", replica_log=stdout_file.name)
821821
stdout_file.write(f"####### FATAL ERROR: The process has crashed, subproc return value: {return_code}\n")
822+
print(error_msg, file=sys.stderr)
822823
os.kill(os.getpid(), signal.SIGINT)
823824
break
824825

tests/apollo/util/bft_network_partitioning.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from functools import partial
1919
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "pyclient"))
2020
import bft_config
21+
from util import eliot_logging as log
2122

2223
class NetworkPartitioningAdversary(ABC):
2324
"""Represents an adversary capable of inflicting network partitioning"""
@@ -42,6 +43,7 @@ def __enter__(self):
4243
def __exit__(self, *args):
4344
"""context manager method for 'with' statements"""
4445
self._remove_bft_network_rule_chain()
46+
log.log_message(message_type=f"Interference terminated")
4547

4648
@abstractmethod
4749
def interfere(self):
@@ -236,6 +238,7 @@ def __init__(self, bft_network, replicas_to_isolate):
236238
super(ReplicaSubsetIsolatingAdversary, self).__init__(bft_network)
237239

238240
def interfere(self):
241+
log.log_message(message_type=f"Disabling replicas communication", replicas=self.replicas_to_isolate)
239242
other_replicas = set(self.bft_network.all_replicas()) - set(self.replicas_to_isolate)
240243
for ir in self.replicas_to_isolate:
241244
for r in other_replicas:

0 commit comments

Comments
 (0)