Skip to content

Commit dc458f4

Browse files
committed
remove state transfer cre loop
1 parent 3bcec13 commit dc458f4

File tree

3 files changed

+40
-26
lines changed

3 files changed

+40
-26
lines changed

bftengine/src/bcstatetransfer/AsyncStateTransferCRE.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ class InternalSigner : public concord::crypto::ISigner {
8181
std::string getPrivKey() const override { return ""; }
8282
};
8383

84+
// Scaling command may break state transfer itself (if, for example, we scale from n1 to n2 < n1 replicas). For that we
85+
// need a client like mechanism which work asynchronously to the state itself. However, a committer replica, may end
86+
// state transfer and get the scale command in the state, before it was caught by CRE. In this case, the command is
87+
// handled by the reconfiguration state transfer callback (see concordbft/kvbc/include/st_reconfiguraion_sm.hpp). The
88+
// two mechanisms are synchronized via the configurations list. Note that we are unable to synchronize them based on
89+
// epoch number, because CRE is (at the moment) unaware to epochs. Epochs are shared via reserved pages which are
90+
// getting updated at the end of state transfer.
8491
class ScalingReplicaHandler : public IStateHandler {
8592
public:
8693
ScalingReplicaHandler() {}
@@ -100,7 +107,7 @@ class ScalingReplicaHandler : public IStateHandler {
100107
std::stringstream stream;
101108
stream << configurations_file.rdbuf();
102109
std::string configs = stream.str();
103-
return (configs.empty()) || (configs.find(command.config_descriptor) != std::string::npos);
110+
return (configs.empty()) || (configs.find(command.config_descriptor) == std::string::npos);
104111
}
105112
}
106113
return false;

bftengine/src/bftengine/ReplicaForStateTransfer.cpp

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -74,31 +74,13 @@ void ReplicaForStateTransfer::start() {
7474
stateTransfer->setReconfigurationEngine(cre_);
7575
stateTransfer->addOnTransferringCompleteCallback(
7676
[this](std::uint64_t) {
77-
// TODO - The next lines up to comment 'YYY' do not belong here (CRE) - consider refactor or move outside
7877
if (!config_.isReadOnly) {
79-
// At this point, we, if are not going to have another blocks in state transfer. So, we can safely stop CRE.
80-
// if there is a reconfiguration state change that prevents us from starting another state transfer (i.e.
81-
// scaling) then CRE probably won't work as well.
82-
// 1. First, make sure we handled the most recent available updates.
83-
concord::client::reconfiguration::PollBasedStateClient *pbc =
84-
(concord::client::reconfiguration::PollBasedStateClient *)(cre_->getStateClient());
85-
bool succ = false;
86-
while (!succ) {
87-
auto latestHandledUpdate = cre_->getLatestKnownUpdateBlock();
88-
auto latestReconfUpdates = pbc->getStateUpdate(succ);
89-
if (!succ) {
90-
LOG_WARN(GL, "unable to get the latest reconfiguration updates");
91-
}
92-
for (const auto &update : latestReconfUpdates) {
93-
if (update.blockid > latestHandledUpdate) {
94-
succ = false;
95-
break;
96-
} // else if (!isGettingBlocks)
97-
}
98-
} // while (!succ) {
78+
// CRE may not read all the relevant updates. This may be an issue only in a committer node, which do not run
79+
// state transfer continually. If this is the case, we need to execute the the relevant reconfiguration
80+
// commands, based on the state we gained during state transfer. In any such case, it is the developer
81+
// responsibility to synchronize between the two mechanisms. For example, see the AddRemoveWithWedgeCommand
82+
// handler in concordbft/kvbc/include/st_reconfiguraion_sm.hpp
9983
LOG_INFO(GL, "halting cre");
100-
// 2. Now we can safely halt cre. We know for sure that there are no update in the state transffered
101-
// blocks that haven't been handled yet
10284
cre_->halt();
10385
}
10486
},

kvbc/src/st_reconfiguration_sm.cpp

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,33 @@ bool StReconfigurationHandler::handle(const concord::messages::AddRemoveWithWedg
128128
uint64_t bft_seq_num,
129129
uint64_t current_cp_num,
130130
uint64_t bid) {
131-
return handleWedgeCommands(
132-
command, bid, current_cp_num, bft_seq_num, command.bft_support, true, command.restart, command.restart);
131+
// This callback should work together with the asyncCRE scaling handler. If the scale command has broken state
132+
// transfer itself, we won't even get to that point, and the CRE is expected to handle this case. However, if we did
133+
// manage to complete state transfer, CRE is halted and we need to execute the command based on the state we gained
134+
// during state transfer. In order not to execute the command twice, we do check that this configuration was not
135+
// already executed by reading the local configuration list.
136+
std::ofstream configurations_file;
137+
configurations_file.open(bftEngine::ReplicaConfig::instance().configurationViewFilePath + "/" +
138+
concord::reconfiguration::configurationsFileName + "." +
139+
std::to_string(bftEngine::ReplicaConfig::instance().replicaId),
140+
std::ios_base::app);
141+
if (configurations_file.good()) {
142+
std::stringstream stream;
143+
stream << configurations_file.rdbuf();
144+
std::string configs = stream.str();
145+
if (configs.find(command.config_descriptor) != std::string::npos) {
146+
LOG_INFO(GL, "the scale command was already executed by async CRE, we won't execute it again");
147+
return false;
148+
}
149+
}
150+
bool succ = true;
151+
concord::messages::ReconfigurationResponse response;
152+
for (auto &h : orig_reconf_handlers_) {
153+
// If it was written to the blockchain, it means that this is a valid request.
154+
// We do need to execute every relevant reconfiguration handler to complete the scale command.
155+
succ &= h->handle(command, bft_seq_num, UINT32_MAX, std::nullopt, response);
156+
}
157+
return succ;
133158
}
134159

135160
bool StReconfigurationHandler::handle(const concord::messages::RestartCommand &command,

0 commit comments

Comments
 (0)