Skip to content

Commit 1ad6786

Browse files
committed
Zero recycled KV blocks for FullAttention models (vllm-project#39146)
Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
1 parent 10f08de commit 1ad6786

2 files changed

Lines changed: 25 additions & 1 deletion

File tree

tests/v1/core/test_kv_cache_utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2094,3 +2094,23 @@ def test_unify_hybrid_kv_cache_specs():
20942094

20952095
with pytest.raises(ValueError):
20962096
kv_cache_utils.unify_hybrid_kv_cache_specs(kv_cache_spec)
2097+
2098+
2099+
def test_needs_kv_cache_zeroing():
    """Regression test for #39146.

    FullAttention models must zero recycled blocks to avoid stale K/V
    leaking through partial-block tail slots. A config whose only group
    is a sliding-window spec is expected not to request zeroing.
    """
    # Single-group config built from the default spec helper; the test
    # treats this as the full-attention case.
    full_group = KVCacheGroupSpec(["layer_0"], new_kv_cache_spec())
    full_attention = KVCacheConfig(
        num_blocks=16,
        kv_cache_tensors=[],
        kv_cache_groups=[full_group],
    )
    assert full_attention.needs_kv_cache_zeroing

    # Single-group config that only contains a sliding-window spec.
    sliding_group = KVCacheGroupSpec(
        ["layer_0"], new_sliding_window_spec(sliding_window=64)
    )
    sliding_only = KVCacheConfig(
        num_blocks=16,
        kv_cache_tensors=[],
        kv_cache_groups=[sliding_group],
    )
    assert not sliding_only.needs_kv_cache_zeroing

vllm/v1/kv_cache_interface.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -496,4 +496,8 @@ def has_mamba_layers(self) -> bool:
496496

497497
@property
def needs_kv_cache_zeroing(self) -> bool:
    """Report whether recycled KV cache blocks need to be zeroed.

    Recycled blocks may hold stale K/V from prior requests; partial-block
    tail slots can leak NaN/Inf into masked softmax (see #39146).

    Returns:
        True when the config has mamba layers, or when at least one KV
        cache group's spec is exactly a ``FullAttentionSpec``.
    """
    if self.has_mamba_layers:
        return True
    for group in self.kv_cache_groups:
        # Exact type match: subclasses of FullAttentionSpec do not count.
        # NOTE(review): confirm that excluding subclasses is intended.
        if type(group.kv_cache_spec) is FullAttentionSpec:
            return True
    return False

0 commit comments

Comments
 (0)