Skip to content

Commit 15ee160

Browse files
committed
feat(multinode): teardown old containers on remote nodes and improve node management
- Fix old containers not being cleaned up on worker nodes during redeployment by routing teardown through RemoteNodeDeployer when the container has a node_id
- Add encryption_service to MarkDeploymentCompleteJob and DeploymentService for decrypting node tokens during remote teardown
- Improve node health checks, proxy routing, and CLI node management
1 parent 1a7e131 commit 15ee160

File tree

22 files changed

+1626
-231
lines changed

22 files changed

+1626
-231
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
4848
- Docker container names are now used instead of Docker network aliases for cross-node environment variable rewriting, fixing service connectivity on remote worker nodes
4949
- Deployment "marking complete" step could hang for the full 60-second timeout when the job queue was busy: the DB poll fallback (which confirms the route table update via database query) was only checked when the queue receiver timed out, but a steady stream of unrelated queue events prevented the timeout from ever firing; the poll now runs on every loop iteration regardless of queue activity
5050
- Remote environment variables are no longer built when no active worker nodes exist, avoiding unnecessary work in single-node deployments
51+
- **Phantom deployments on node drain/failover**: drain and failover previously called `trigger_pipeline` with no branch/tag/commit, creating broken "preview" deployments with empty git context; now uses smart drain logic that retires containers on the draining node when healthy replicas exist on other nodes, and only triggers a full redeploy (with correct git context from the latest successful deployment) when all replicas are on the affected node
5152

5253
### Added
5354
- Automatic `CRON_SECRET` injection into deployed containers: the deployment token is now set as `CRON_SECRET` in the container environment on every deployment, and the cron scheduler sends `Authorization: Bearer <CRON_SECRET>` when invoking endpoints — no manual configuration needed

crates/temps-cli/src/commands/join.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ impl JoinCommand {
131131
"name": node_name,
132132
"token": agent_token,
133133
"join_token": self.token,
134-
"address": format!("http://{}:{}", private_address, self.agent_address.split(':').next_back().unwrap_or("3100")),
134+
"address": format!("http://{}:{}", private_address.trim(), self.agent_address.split(':').next_back().unwrap_or("3100").trim()),
135135
"private_address": private_address,
136136
"labels": labels,
137137
});
@@ -266,7 +266,12 @@ impl JoinCommand {
266266
relay_response.control_plane_url
267267
);
268268

269-
let agent_port = self.agent_address.split(':').next_back().unwrap_or("3100");
269+
let agent_port = self
270+
.agent_address
271+
.split(':')
272+
.next_back()
273+
.unwrap_or("3100")
274+
.trim();
270275

271276
let register_body = serde_json::json!({
272277
"name": node_name,

crates/temps-cli/src/commands/node.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ pub enum NodeSubcommand {
2323
Show(NodeShowCommand),
2424
/// Drain a node: stop scheduling new containers and redeploy existing ones
2525
Drain(NodeDrainCommand),
26+
/// Undrain a node: reactivate it so it can accept new deployments again
27+
Undrain(NodeUndrainCommand),
2628
/// Remove a node from the cluster (must be drained first)
2729
#[command(alias = "rm")]
2830
Remove(NodeRemoveCommand),
@@ -68,6 +70,18 @@ pub struct NodeDrainCommand {
6870
pub timeout: u64,
6971
}
7072

73+
#[derive(Args)]
74+
pub struct NodeUndrainCommand {
75+
/// Node ID to undrain
76+
pub node_id: i32,
77+
/// API base URL
78+
#[arg(long, env = "TEMPS_API_URL")]
79+
pub api_url: String,
80+
/// API authentication token
81+
#[arg(long, env = "TEMPS_API_TOKEN")]
82+
pub api_token: String,
83+
}
84+
7185
#[derive(Args)]
7286
pub struct NodeRemoveCommand {
7387
/// Node ID to remove
@@ -203,6 +217,7 @@ impl NodeCommand {
203217
NodeSubcommand::List(cmd) => execute_list(cmd).await,
204218
NodeSubcommand::Show(cmd) => execute_show(cmd).await,
205219
NodeSubcommand::Drain(cmd) => execute_drain(cmd).await,
220+
NodeSubcommand::Undrain(cmd) => execute_undrain(cmd).await,
206221
NodeSubcommand::Remove(cmd) => execute_remove(cmd).await,
207222
}
208223
})
@@ -458,6 +473,47 @@ async fn execute_drain(cmd: NodeDrainCommand) -> anyhow::Result<()> {
458473
Ok(())
459474
}
460475

476+
#[derive(Debug, Deserialize)]
477+
struct UndrainNodeResponse {
478+
name: String,
479+
status: String,
480+
message: String,
481+
}
482+
483+
async fn execute_undrain(cmd: NodeUndrainCommand) -> anyhow::Result<()> {
484+
let client = make_client();
485+
let url = api_url(&cmd.api_url, &format!("/nodes/{}/drain", cmd.node_id));
486+
487+
println!(
488+
" {} Undraining node {}...",
489+
"⏳".bright_yellow(),
490+
cmd.node_id
491+
);
492+
493+
let response = client
494+
.delete(&url)
495+
.header("Authorization", format!("Bearer {}", cmd.api_token))
496+
.send()
497+
.await
498+
.map_err(|e| anyhow::anyhow!("Failed to connect to API: {}", e))?;
499+
500+
if !response.status().is_success() {
501+
return Err(handle_api_error(response).await);
502+
}
503+
504+
let data: UndrainNodeResponse = response.json().await?;
505+
506+
println!(
507+
" {} Node '{}' is now {}",
508+
"✓".bright_green(),
509+
data.name.bright_cyan(),
510+
data.status.bright_green()
511+
);
512+
println!(" {}", data.message);
513+
514+
Ok(())
515+
}
516+
461517
async fn execute_remove(cmd: NodeRemoveCommand) -> anyhow::Result<()> {
462518
if !cmd.yes {
463519
// Show node info first

crates/temps-deployments/src/handlers/deployments.rs

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2581,22 +2581,23 @@ mod tests {
25812581
"temps-test".to_string(),
25822582
));
25832583

2584+
let encryption_service = Arc::new(
2585+
temps_core::EncryptionService::new(
2586+
"0000000000000000000000000000000000000000000000000000000000000000",
2587+
)
2588+
.expect("Failed to create test encryption service"),
2589+
);
2590+
25842591
let deployment_service = Arc::new(crate::services::services::DeploymentService::new(
25852592
db.clone(),
25862593
log_service.clone(),
25872594
config_service.clone(),
25882595
queue_service.clone(),
25892596
docker_log_service,
25902597
deployer,
2598+
encryption_service.clone(),
25912599
));
25922600

2593-
let encryption_service = Arc::new(
2594-
temps_core::EncryptionService::new(
2595-
"0000000000000000000000000000000000000000000000000000000000000000",
2596-
)
2597-
.expect("Failed to create test encryption service"),
2598-
);
2599-
26002601
let deployment_token_service = Arc::new(
26012602
crate::services::deployment_token_service::DeploymentTokenService::new(
26022603
db.clone(),
@@ -2701,6 +2702,9 @@ mod tests {
27012702
image_builder: Arc::new(MockImageBuilder) as Arc<dyn temps_deployer::ImageBuilder>,
27022703
audit_service: Arc::new(MockAuditLogger) as Arc<dyn temps_core::AuditLogger>,
27032704
node_service: Arc::new(crate::services::NodeService::new(db.clone())),
2705+
encryption_service: Arc::new(
2706+
temps_core::EncryptionService::new("01234567890123456789012345678901").unwrap(),
2707+
),
27042708
})
27052709
}
27062710

0 commit comments

Comments (0)