Skip to content

Commit 0f758cf

Browse files
fluffyponystringhandlerSWvheerden
authored
feat: add connection pool cycling (#7011)
Description --- Allows for 16 outbound peers to be long-running + 12 that are forcibly reaped every 24 hours + 4 that are forcibly reaped every 2 hours. Additionally, new connections are never made to a node we've connected to in the last 7 days. Motivation and Context --- In order to prevent us locked in to a group of 32 nodes (think Spiderman meme) with 1 or more nodes mining, and thus being isolated from the network, we have to cycle some connections. How Has This Been Tested? --- Tested locally. What process can a PR reviewer use to test or verify this change? --- Run minotari_node with these changes for more than 2 hours (or adjust the reap time down to 5 minutes if you want to test faster) and look for "Rotating connection to xx as part of scheduled rotation" messages in the log. <!-- Checklist --> <!-- 1. Is the title of your PR in the form that would make nice release notes? The title, excluding the conventional commit tag, will be included exactly as is in the CHANGELOG, so please think about it carefully. --> Breaking Changes --- - [x] None <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Introduced configurable connection rotation and cooldown periods for outbound peer connections. - Added connection history tracking to manage cooldowns and enforce reconnection delays. - Implemented connection count limits and periodic rotation of outbound connections. - Enhanced error reporting with new connection limit and cooldown-related error variants. - **Bug Fixes** - Enhanced enforcement of maximum outbound connection limits, preventing excess connections. - **Tests** - Added automated test to verify enforcement of outbound connection limits. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Co-authored-by: stringhandler <stringhandler@protonmail.com> Co-authored-by: SW van Heerden <swvheerden@gmail.com>
1 parent 3c6683a commit 0f758cf

File tree

9 files changed

+469
-10
lines changed

9 files changed

+469
-10
lines changed

comms/core/src/builder/mod.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,4 +376,38 @@ impl CommsBuilder {
376376
protocol_extensions: ProtocolExtensions::new(),
377377
})
378378
}
379+
380+
/// Configure connection rotation settings
381+
///
382+
/// This configures how outbound connections are maintained and rotated:
383+
/// - `long_lived`: Number of connections that should remain long-lived (default: 16)
384+
/// - `daily_rotation`: Number of connections to rotate every 24 hours (default: 12)
385+
/// - `frequent_rotation`: Number of connections to rotate more frequently (every 2 hours) (default: 4)
386+
/// - `cooldown_days`: Number of days to wait before reconnecting to a recently disconnected peer (default: 7)
387+
///
388+
/// # Example
389+
///
390+
/// # use tari_comms::CommsBuilder;
391+
/// let builder = CommsBuilder::new()
392+
/// .with_connection_rotation(16, 12, 4, 7);
393+
pub fn with_connection_rotation(
394+
mut self,
395+
long_lived: usize,
396+
daily_rotation: usize,
397+
frequent_rotation: usize,
398+
cooldown_days: u64,
399+
) -> Self {
400+
// Ensure parameters are within reasonable bounds
401+
assert!(
402+
long_lived + daily_rotation + frequent_rotation > 0,
403+
"At least one connection type must have a non-zero count"
404+
);
405+
assert!(cooldown_days > 0, "Cooldown period must be positive");
406+
407+
self.connectivity_config.long_lived_connections = long_lived;
408+
self.connectivity_config.daily_rotation_connections = daily_rotation;
409+
self.connectivity_config.frequent_rotation_connections = frequent_rotation;
410+
self.connectivity_config.node_reconnection_cooldown = Duration::from_secs(cooldown_days * 24 * 60 * 60);
411+
self
412+
}
379413
}

comms/core/src/connection_manager/error.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ use tokio::{sync::mpsc, time::error::Elapsed};
2525

2626
use crate::{
2727
connection_manager::PeerConnectionRequest,
28+
connectivity::ConnectivityError,
2829
multiplexing::YamuxControlError,
2930
noise,
3031
noise::NoiseError,
@@ -94,8 +95,11 @@ pub enum ConnectionManagerError {
9495
AllPeerAddressesAreExcluded(String),
9596
#[error("Yamux error: {0}")]
9697
YamuxControlError(#[from] YamuxControlError),
98+
#[error("Connectivity error: {0}")]
99+
ConnectivityError(#[from] Box<ConnectivityError>),
100+
#[error("Peer is in cooldown period")]
101+
PeerInCooldown,
97102
}
98-
99103
impl From<yamux::ConnectionError> for ConnectionManagerError {
100104
fn from(err: yamux::ConnectionError) -> Self {
101105
ConnectionManagerError::YamuxConnectionError(err.to_string())

comms/core/src/connectivity/config.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,18 @@ pub struct ConnectivityConfig {
5252
/// The closest number of peer connections to maintain; connections above the threshold will be removed
5353
/// (default: disabled)
5454
pub maintain_n_closest_connections_only: Option<usize>,
55+
/// Number of connections that should be long-lived (not subject to periodic reaping)
56+
pub long_lived_connections: usize,
57+
/// Number of connections to rotate every 24 hours
58+
pub daily_rotation_connections: usize,
59+
/// Number of connections to rotate every 2 hours
60+
pub frequent_rotation_connections: usize,
61+
/// Minimum time before reconnecting to a previously connected node (7 days)
62+
pub node_reconnection_cooldown: Duration,
63+
/// Interval for rotating daily connections
64+
pub daily_rotation_interval: Duration,
65+
/// Interval for rotating frequent connections
66+
pub frequent_rotation_interval: Duration,
5567
}
5668

5769
impl Default for ConnectivityConfig {
@@ -66,6 +78,12 @@ impl Default for ConnectivityConfig {
6678
connection_tie_break_linger: Duration::from_secs(2),
6779
expire_peer_last_seen_duration: Duration::from_secs(24 * 60 * 60),
6880
maintain_n_closest_connections_only: None,
81+
long_lived_connections: 16,
82+
daily_rotation_connections: 12,
83+
frequent_rotation_connections: 4,
84+
node_reconnection_cooldown: Duration::from_secs(7 * 24 * 60 * 60), // 7 days
85+
daily_rotation_interval: Duration::from_secs(24 * 60 * 60), // 24 hours
86+
frequent_rotation_interval: Duration::from_secs(2 * 60 * 60), // 2 hours
6987
}
7088
}
7189
}
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
// Copyright 2025, The Tari Project
2+
//
3+
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
4+
// following conditions are met:
5+
//
6+
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following
7+
// disclaimer.
8+
//
9+
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
10+
// following disclaimer in the documentation and/or other materials provided with the distribution.
11+
//
12+
// 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote
13+
// products derived from this software without specific prior written permission.
14+
//
15+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
16+
// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17+
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
18+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19+
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
20+
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
21+
// USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22+
23+
use std::{
24+
collections::HashMap,
25+
time::{Duration, Instant},
26+
};
27+
28+
use crate::peer_manager::NodeId;
29+
30+
/// Tracks connection history to nodes to enforce cooldown periods
31+
pub struct ConnectionHistory {
32+
/// Maps node IDs to the last time we disconnected from them
33+
last_disconnected: HashMap<NodeId, Instant>,
34+
}
35+
36+
impl ConnectionHistory {
37+
pub fn new() -> Self {
38+
Self {
39+
last_disconnected: HashMap::new(),
40+
}
41+
}
42+
43+
/// Record that we disconnected from a node
44+
pub fn record_disconnection(&mut self, node_id: &NodeId) {
45+
self.last_disconnected.insert(node_id.clone(), Instant::now());
46+
}
47+
48+
/// Check if a node is in cooldown period
49+
pub fn is_in_cooldown(&self, node_id: &NodeId, cooldown: Duration) -> bool {
50+
if let Some(last_time) = self.last_disconnected.get(node_id) {
51+
last_time.elapsed() < cooldown
52+
} else {
53+
false
54+
}
55+
}
56+
57+
/// Get the time elapsed since disconnection for a node
58+
pub fn time_since_disconnection(&self, node_id: &NodeId) -> Option<Duration> {
59+
self.last_disconnected.get(node_id).map(|time| time.elapsed())
60+
}
61+
62+
/// Clean up old history entries
63+
pub fn cleanup(&mut self, max_age: Duration) {
64+
self.last_disconnected.retain(|_, time| time.elapsed() < max_age);
65+
}
66+
67+
/// Get nodes that are not in cooldown as an iterator
68+
pub fn available_nodes<'a, I>(&'a self, nodes: I, cooldown: Duration) -> impl Iterator<Item = &'a NodeId> + 'a
69+
where I: Iterator<Item = &'a NodeId> + 'a {
70+
nodes.filter(move |node_id| !self.is_in_cooldown(node_id, cooldown))
71+
}
72+
73+
/// Get nodes that are not in cooldown (returns a Vec)
74+
pub fn get_available_nodes<'a, I>(&'a self, nodes: I, cooldown: Duration) -> Vec<NodeId>
75+
where I: Iterator<Item = &'a NodeId> + 'a {
76+
self.available_nodes(nodes, cooldown).cloned().collect()
77+
}
78+
}
79+
80+
impl Default for ConnectionHistory {
81+
fn default() -> Self {
82+
Self::new()
83+
}
84+
}

comms/core/src/connectivity/connection_pool.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,24 @@ impl ConnectionPool {
173173
})
174174
}
175175

176+
/// Get all outbound connections
177+
pub fn get_outbound_connections_mut(&mut self) -> Vec<&mut PeerConnection> {
178+
self.connections
179+
.values_mut()
180+
.filter_map(|c| c.connection_mut())
181+
.filter(|conn| conn.is_connected() && conn.direction().is_outbound())
182+
.collect()
183+
}
184+
185+
/// Get all outbound connections (non-mutable references)
186+
pub fn get_outbound_connections(&self) -> Vec<&PeerConnection> {
187+
self.connections
188+
.values()
189+
.filter_map(|c| c.connection())
190+
.filter(|conn| conn.is_connected() && conn.direction().is_outbound())
191+
.collect()
192+
}
193+
176194
pub(in crate::connectivity) fn filter_drain<P>(&mut self, mut predicate: P) -> Vec<PeerConnectionState>
177195
where P: FnMut(&PeerConnectionState) -> bool {
178196
let (keep, remove) = self

comms/core/src/connectivity/error.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,8 @@
2323
use thiserror::Error;
2424

2525
use crate::{connection_manager::ConnectionManagerError, peer_manager::PeerManagerError, PeerConnectionError};
26-
2726
/// Errors for the Connectivity actor.
28-
#[derive(Debug, Error)]
27+
#[derive(Debug, Error, Clone)]
2928
pub enum ConnectivityError {
3029
#[error("Cannot send request because ConnectivityActor disconnected")]
3130
ActorDisconnected,
@@ -34,7 +33,7 @@ pub enum ConnectivityError {
3433
#[error("PeerManagerError: {0}")]
3534
PeerManagerError(#[from] PeerManagerError),
3635
#[error("Peer connection error: {0}")]
37-
PeerConnectionError(#[from] PeerConnectionError),
36+
PeerConnectionError(String),
3837
#[error("ConnectionFailed: {0}")]
3938
ConnectionFailed(ConnectionManagerError),
4039
#[error("Connectivity event stream closed unexpectedly")]
@@ -45,6 +44,8 @@ pub enum ConnectivityError {
4544
DialCancelled,
4645
#[error("Client cancelled: '{0}'")]
4746
ClientCancelled(String),
47+
#[error("Connection limit reached ({current}/{max} connections)")]
48+
ConnectionLimitReached { current: usize, max: usize },
4849
}
4950

5051
impl From<ConnectionManagerError> for ConnectivityError {
@@ -55,3 +56,9 @@ impl From<ConnectionManagerError> for ConnectivityError {
5556
}
5657
}
5758
}
59+
60+
impl From<PeerConnectionError> for ConnectivityError {
61+
fn from(err: PeerConnectionError) -> Self {
62+
Self::PeerConnectionError(err.to_string())
63+
}
64+
}

0 commit comments

Comments
 (0)