Skip to content

Commit 38e1667

Browse files
authored
[Bugfix] Align block table for TRTLLM MLA edge-case (vllm-project#39324)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
1 parent 27702f6 commit 38e1667

2 files changed

Lines changed: 12 additions & 0 deletions

File tree

vllm/v1/worker/block_table.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,13 @@ def __init__(
257257
f"must match block_sizes length ({len(block_sizes)})"
258258
)
259259

260+
# Align to a multiple of (128 / block_size) as required
261+
# by some attention backends such as TRTLLM (#39324)
262+
max_num_blocks = [
263+
cdiv(n, 128 // bs) * (128 // bs) if bs <= 128 else n
264+
for n, bs in zip(max_num_blocks, block_sizes)
265+
]
266+
260267
self.block_tables = [
261268
BlockTable(
262269
block_size,

vllm/v1/worker/gpu/block_table.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ def __init__(
4141
# As a result, one block on the current rank covers `block_size * cp_size`
4242
# tokens in the full, global (unsharded) sequence.
4343
max_num_blocks = cdiv(self.max_model_len, block_size * self.cp_size)
44+
# Align to a multiple of (128 / block_size) as required
45+
# by some attention backends such as TRTLLM (#39324)
46+
if block_size <= 128:
47+
alignment = 128 // block_size
48+
max_num_blocks = cdiv(max_num_blocks, alignment) * alignment
4449
block_table = StagedWriteTensor(
4550
(self.max_num_reqs, max_num_blocks),
4651
dtype=torch.int32,

0 commit comments

Comments (0)