File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -2094,3 +2094,23 @@ def test_unify_hybrid_kv_cache_specs():
20942094
20952095 with pytest .raises (ValueError ):
20962096 kv_cache_utils .unify_hybrid_kv_cache_specs (kv_cache_spec )
2097+
2098+
def test_needs_kv_cache_zeroing():
    """Check which KV cache configs report ``needs_kv_cache_zeroing``.

    Regression test for #39146: FullAttention models must zero recycled
    blocks so that stale K/V cannot leak through partial-block tail slots.
    A config whose only group uses a sliding-window spec is expected not to
    request zeroing.
    """
    # Single-group config built from the default spec helper (the
    # full-attention case this regression test targets).
    full_attention_group = KVCacheGroupSpec(["layer_0"], new_kv_cache_spec())
    full_attention = KVCacheConfig(
        num_blocks=16,
        kv_cache_tensors=[],
        kv_cache_groups=[full_attention_group],
    )
    assert full_attention.needs_kv_cache_zeroing

    # Single-group config whose only layer uses a sliding-window spec.
    sliding_window_group = KVCacheGroupSpec(
        ["layer_0"], new_sliding_window_spec(sliding_window=64)
    )
    sliding_only = KVCacheConfig(
        num_blocks=16,
        kv_cache_tensors=[],
        kv_cache_groups=[sliding_window_group],
    )
    assert not sliding_only.needs_kv_cache_zeroing
Original file line number Diff line number Diff line change @@ -496,4 +496,8 @@ def has_mamba_layers(self) -> bool:
496496
@property
def needs_kv_cache_zeroing(self) -> bool:
    """Report whether recycled KV cache blocks need to be zeroed.

    Recycled blocks may hold stale K/V from prior requests; partial-block
    tail slots can leak NaN/Inf into masked softmax (see #39146).

    Returns:
        A truthy value when ``has_mamba_layers`` is truthy, or when any
        entry of ``kv_cache_groups`` has a ``kv_cache_spec`` whose type is
        exactly ``FullAttentionSpec``; ``False`` otherwise.
    """
    mamba_present = self.has_mamba_layers
    if mamba_present:
        # Checked first so the groups are not scanned when already decided.
        return mamba_present

    for group in self.kv_cache_groups:
        # Exact type match: subclasses of FullAttentionSpec do not count.
        # NOTE(review): confirm that excluding subclasses is intended.
        if type(group.kv_cache_spec) is FullAttentionSpec:
            return True
    return False
You can’t perform that action at this time.
0 commit comments