Zero recycled KV blocks for FullAttention models (vllm-project#39146)

AjAnubolu · AjAnubolu · commit 1ad67864c0c2 · 2026-04-08T01:48:55.000-07:00
Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
@@ -2094,3 +2094,23 @@ def test_unify_hybrid_kv_cache_specs():
 
     with pytest.raises(ValueError):
         kv_cache_utils.unify_hybrid_kv_cache_specs(kv_cache_spec)
+
+
+def test_needs_kv_cache_zeroing():
+    # Regression test for #39146: FullAttention models must zero recycled
+    # blocks to avoid stale K/V leaking through partial-block tail slots.
+    full_attention = KVCacheConfig(
+        num_blocks=16,
+        kv_cache_tensors=[],
+        kv_cache_groups=[KVCacheGroupSpec(["layer_0"], new_kv_cache_spec())],
+    )
+    assert full_attention.needs_kv_cache_zeroing
+
+    sliding_only = KVCacheConfig(
+        num_blocks=16,
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer_0"], new_sliding_window_spec(sliding_window=64))
+        ],
+    )
+    assert not sliding_only.needs_kv_cache_zeroing
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
@@ -496,4 +496,8 @@ def has_mamba_layers(self) -> bool:
 
     @property
     def needs_kv_cache_zeroing(self) -> bool:
-        return self.has_mamba_layers
+        # Recycled blocks may hold stale K/V from prior requests; partial-block
+        # tail slots can leak NaN/Inf into masked softmax (see #39146).
+        return self.has_mamba_layers or any(
+            type(g.kv_cache_spec) is FullAttentionSpec for g in self.kv_cache_groups
+        )