Skip to content

Commit 203fbc8

Browse files
committed
fix(agents,workspace,proxy): phase 3 security hardening
- Sandbox kill_processes now takes a typed KillSignal enum (Term|Kill) instead of a raw i32, constraining callers to SIGTERM/SIGKILL. Stops a future caller from smuggling arbitrary signals (SIGSTOP, SIGUSR1, etc.) through the sandbox exec boundary. Unit tests cover the enum-to-signum mapping and Copy-ness. - PreviewAuthLimiter's failure map is now hard-capped at 65_536 entries with opportunistic expired-entry eviction + oldest-entry fallback. Closes the unbounded-memory-growth vector where an attacker sprays unique (ip, session_id) pairs to OOM the proxy. Test sprays 70k unique keys and asserts the map stays within cap.
1 parent bea11ee commit 203fbc8

File tree

6 files changed

+152
-18
lines changed

6 files changed

+152
-18
lines changed

crates/temps-agents/src/sandbox/docker.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,21 +1122,22 @@ impl SandboxProvider for DockerSandboxProvider {
11221122
&self,
11231123
handle: &SandboxHandle,
11241124
pattern: &str,
1125-
signal: i32,
1125+
signal: super::KillSignal,
11261126
) -> Result<(), AgentError> {
11271127
// Fresh exec running pkill. pkill exits 0 if something was killed,
11281128
// 1 if nothing matched — both are success from our POV. Bounded by
11291129
// a 10s timeout so we never replicate the phantom-stream hang.
11301130
//
11311131
// Use `pgrep` + `kill` instead of `pkill -f` to handle both busybox
11321132
// and util-linux pkill variants uniformly.
1133+
let sig_num = signal.as_number();
11331134
let cmd = vec![
11341135
"sh".to_string(),
11351136
"-c".to_string(),
11361137
format!(
11371138
"pgrep -f {pattern_q} 2>/dev/null | xargs -r kill -{sig} 2>/dev/null; exit 0",
11381139
pattern_q = shell_quote(pattern),
1139-
sig = signal,
1140+
sig = sig_num,
11401141
),
11411142
];
11421143

@@ -1145,7 +1146,7 @@ impl SandboxProvider for DockerSandboxProvider {
11451146
Ok(Ok(_)) => {
11461147
tracing::debug!(
11471148
"kill_processes: sent signal {} to '{}' in {}",
1148-
signal,
1149+
sig_num,
11491150
pattern,
11501151
handle.sandbox_name
11511152
);

crates/temps-agents/src/sandbox/local.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,11 +153,11 @@ impl SandboxProvider for LocalSandboxProvider {
153153
&self,
154154
handle: &SandboxHandle,
155155
pattern: &str,
156-
signal: i32,
156+
signal: super::KillSignal,
157157
) -> Result<(), AgentError> {
158158
// Best-effort pkill on the host. Scoped to the current user so we
159159
// don't accidentally kill other things.
160-
let sig_flag = format!("-{}", signal);
160+
let sig_flag = format!("-{}", signal.as_number());
161161
let _ = tokio::process::Command::new("pkill")
162162
.arg(&sig_flag)
163163
.arg("-f")

crates/temps-agents/src/sandbox/mod.rs

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,30 @@ use std::time::Duration;
99
use crate::ai_cli::OnEventCallback;
1010
use crate::error::AgentError;
1111

12+
/// Unix signal to send to processes inside a sandbox. Constrained on purpose
13+
/// — the only two signals the workspace ever needs are SIGTERM (graceful
14+
/// shutdown, give the CLI a chance to flush state) and SIGKILL (hard kill
15+
/// after a grace period). Passing arbitrary integers across the provider
16+
/// boundary would invite untrusted callers to stuff anything from SIGSTOP
17+
/// to SIGUSR1 into the sandbox exec.
18+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
19+
pub enum KillSignal {
20+
/// SIGTERM (15) — graceful termination. The process may trap it.
21+
Term,
22+
/// SIGKILL (9) — immediate kill. Cannot be trapped.
23+
Kill,
24+
}
25+
26+
impl KillSignal {
27+
/// Unix signal number used by `kill(1)` / `pkill -<n>`.
28+
pub fn as_number(self) -> i32 {
29+
match self {
30+
KillSignal::Term => 15,
31+
KillSignal::Kill => 9,
32+
}
33+
}
34+
}
35+
1236
/// A handle to an active sandbox. Opaque to callers — the internal fields
1337
/// are provider-specific (Docker container ID, Vercel sandbox ID, etc.).
1438
#[derive(Debug, Clone)]
@@ -98,18 +122,18 @@ pub trait SandboxProvider: Send + Sync {
98122

99123
/// Kill processes inside the sandbox matching a pgrep/pkill pattern.
100124
///
101-
/// `signal` is a Unix signal number (15 = SIGTERM, 9 = SIGKILL).
102-
/// `pattern` is passed to `pkill -f` — it matches against the full
103-
/// command line, so prefer anchored patterns like `^claude ` to avoid
104-
/// killing unrelated processes.
125+
/// `signal` is constrained to [`KillSignal`] — only SIGTERM/SIGKILL are
126+
/// valid. `pattern` is passed to `pkill -f` — it matches against the
127+
/// full command line, so prefer anchored patterns like `^claude ` to
128+
/// avoid killing unrelated processes.
105129
///
106130
/// Returns Ok(()) whether or not anything was actually killed — the
107131
/// operation is inherently best-effort.
108132
async fn kill_processes(
109133
&self,
110134
handle: &SandboxHandle,
111135
pattern: &str,
112-
signal: i32,
136+
signal: KillSignal,
113137
) -> Result<(), AgentError>;
114138

115139
/// Destroy sandbox and clean up its container.
@@ -172,3 +196,28 @@ pub trait SandboxProvider: Send + Sync {
172196
/// Delete and rebuild the sandbox image. Returns the image name.
173197
async fn rebuild_image(&self) -> Result<String, AgentError>;
174198
}
199+
200+
#[cfg(test)]
201+
mod tests {
202+
use super::*;
203+
204+
#[test]
205+
fn kill_signal_term_is_15() {
206+
assert_eq!(KillSignal::Term.as_number(), 15);
207+
}
208+
209+
#[test]
210+
fn kill_signal_kill_is_9() {
211+
assert_eq!(KillSignal::Kill.as_number(), 9);
212+
}
213+
214+
#[test]
215+
fn kill_signal_is_copy() {
216+
// Guards that KillSignal stays cheap to pass — if someone adds a
217+
// String payload later, this stops compiling and forces a review.
218+
let s = KillSignal::Term;
219+
let a = s;
220+
let b = s;
221+
assert_eq!(a.as_number(), b.as_number());
222+
}
223+
}

crates/temps-proxy/src/preview_auth.rs

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,12 @@ struct FailureState {
206206
window_start: Option<Instant>,
207207
}
208208

209+
/// Hard cap on distinct (ip, session_id) pairs tracked concurrently.
210+
/// An attacker spraying unique IPs/session_ids can no longer grow this map
211+
/// without bound — at the cap we sweep expired entries, and if that fails
212+
/// to free space we drop the oldest entry.
213+
const MAX_TRACKED_ENTRIES: usize = 65_536;
214+
209215
/// In-memory rate limiter for preview auth failures.
210216
#[derive(Debug, Default)]
211217
pub struct PreviewAuthLimiter {
@@ -231,6 +237,27 @@ impl PreviewAuthLimiter {
231237
}
232238

233239
pub fn record_failure(&self, ip: IpAddr, session_id: i32) {
240+
// Cap enforcement: opportunistically evict before insert so the map
241+
// cannot be weaponized as an unbounded memory sink.
242+
if !self.failures.contains_key(&(ip, session_id))
243+
&& self.failures.len() >= MAX_TRACKED_ENTRIES
244+
{
245+
self.evict_expired();
246+
if self.failures.len() >= MAX_TRACKED_ENTRIES {
247+
// Still full after sweep — drop the single oldest entry.
248+
// Picking *an* entry is fine; under attack the whole map
249+
// turns over quickly and legit users are re-inserted.
250+
if let Some(victim) = self
251+
.failures
252+
.iter()
253+
.min_by_key(|e| e.value().window_start)
254+
.map(|e| *e.key())
255+
{
256+
self.failures.remove(&victim);
257+
}
258+
}
259+
}
260+
234261
let mut entry = self.failures.entry((ip, session_id)).or_default();
235262
let now = Instant::now();
236263
match entry.window_start {
@@ -247,6 +274,15 @@ impl PreviewAuthLimiter {
247274
pub fn record_success(&self, ip: IpAddr, session_id: i32) {
248275
self.failures.remove(&(ip, session_id));
249276
}
277+
278+
/// Drop all entries whose window has expired. O(n), but only called when
279+
/// we hit the cap — amortized cost is negligible under normal load.
280+
fn evict_expired(&self) {
281+
self.failures.retain(|_, state| match state.window_start {
282+
Some(start) => start.elapsed() <= RATE_LIMIT_WINDOW,
283+
None => false,
284+
});
285+
}
250286
}
251287

252288
/// Verify a plaintext password against an argon2 PHC hash. Public so the
@@ -437,4 +473,30 @@ mod tests {
437473
limiter.record_success(ip, 2);
438474
assert!(!limiter.is_blocked(ip, 2));
439475
}
476+
477+
#[test]
478+
fn rate_limiter_is_bounded_under_flood() {
479+
// Spray far more unique (ip, session) pairs than the cap allows.
480+
// The map must never exceed MAX_TRACKED_ENTRIES — this is the
481+
// whole point of the eviction logic: an attacker can't use
482+
// record_failure to OOM the proxy.
483+
let limiter = PreviewAuthLimiter::new();
484+
let attacker_count = MAX_TRACKED_ENTRIES + 5_000;
485+
for i in 0..attacker_count {
486+
// Synthesize distinct IPv4 addrs across the /8, cycling session_id
487+
// so each key is unique.
488+
let octet_a = ((i >> 16) & 0xff) as u8;
489+
let octet_b = ((i >> 8) & 0xff) as u8;
490+
let octet_c = (i & 0xff) as u8;
491+
let ip: IpAddr = format!("10.{}.{}.{}", octet_a, octet_b, octet_c)
492+
.parse()
493+
.unwrap();
494+
limiter.record_failure(ip, (i % 1024) as i32);
495+
}
496+
assert!(
497+
limiter.failures.len() <= MAX_TRACKED_ENTRIES,
498+
"limiter grew beyond cap: {}",
499+
limiter.failures.len()
500+
);
501+
}
440502
}

crates/temps-workspace/src/services/message_executor.rs

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,22 @@ impl MessageExecutor {
109109
self.dirty_sessions.write().await.insert(session_id);
110110
// SIGTERM first — give claude ~2s to flush the current turn to jsonl.
111111
self.session_manager
112-
.kill_session_processes(session_id, "^claude ", 15)
112+
.kill_session_processes(
113+
session_id,
114+
"^claude ",
115+
temps_agents::sandbox::KillSignal::Term,
116+
)
113117
.await;
114118
// Spawn the escalation-to-SIGKILL so we don't block the handler.
115119
let sm = self.session_manager.clone();
116120
tokio::spawn(async move {
117121
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
118-
sm.kill_session_processes(session_id, "^claude ", 9).await;
122+
sm.kill_session_processes(
123+
session_id,
124+
"^claude ",
125+
temps_agents::sandbox::KillSignal::Kill,
126+
)
127+
.await;
119128
});
120129
}
121130

@@ -710,7 +719,11 @@ impl MessageExecutor {
710719
// claudes writing to the same jsonl corrupt it. This is always
711720
// safe — if there's nothing to kill, pkill is a no-op.
712721
self.session_manager
713-
.kill_session_processes(session_id, "^claude ", 9)
722+
.kill_session_processes(
723+
session_id,
724+
"^claude ",
725+
temps_agents::sandbox::KillSignal::Kill,
726+
)
714727
.await;
715728

716729
// 2. Repair pass: if this session is marked dirty (prior cancel or
@@ -922,12 +935,12 @@ impl MessageExecutor {
922935
Err(_) => {
923936
self.dirty_sessions.write().await.insert(session_id);
924937
self.session_manager
925-
.kill_session_processes(session_id, "^claude ", 15)
938+
.kill_session_processes(session_id, "^claude ", temps_agents::sandbox::KillSignal::Term)
926939
.await;
927940
let sm = self.session_manager.clone();
928941
tokio::spawn(async move {
929942
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
930-
sm.kill_session_processes(session_id, "^claude ", 9).await;
943+
sm.kill_session_processes(session_id, "^claude ", temps_agents::sandbox::KillSignal::Kill).await;
931944
});
932945
Err(WorkspaceError::AiCliFailed {
933946
session_id,
@@ -1014,7 +1027,11 @@ impl MessageExecutor {
10141027
// Nuke every .jsonl in the claude projects dir for this sandbox.
10151028
// We don't know the exact filename, so shotgun-delete them all.
10161029
self.session_manager
1017-
.kill_session_processes(session_id, "^claude ", 9)
1030+
.kill_session_processes(
1031+
session_id,
1032+
"^claude ",
1033+
temps_agents::sandbox::KillSignal::Kill,
1034+
)
10181035
.await;
10191036
let delete_cmd = vec![
10201037
"sh".to_string(),

crates/temps-workspace/src/services/session_manager.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,12 @@ impl WorkspaceSessionManager {
228228

229229
/// Kill processes inside the session's sandbox matching a pattern.
230230
/// Best-effort: never returns a hard error, just logs.
231-
pub async fn kill_session_processes(&self, session_id: i32, pattern: &str, signal: i32) {
231+
pub async fn kill_session_processes(
232+
&self,
233+
session_id: i32,
234+
pattern: &str,
235+
signal: temps_agents::sandbox::KillSignal,
236+
) {
232237
let sessions = self.sessions.read().await;
233238
if let Some(live) = sessions.get(&session_id) {
234239
let _ = self
@@ -941,7 +946,7 @@ mod tests {
941946
&self,
942947
_handle: &SandboxHandle,
943948
_pattern: &str,
944-
_signal: i32,
949+
_signal: temps_agents::sandbox::KillSignal,
945950
) -> Result<(), AgentError> {
946951
Ok(())
947952
}

0 commit comments

Comments
 (0)