fix(kv-cache): increase hybrid attention grouping threshold from 1.25 to 1.5 (#36684)

Signed-off-by: Jaime Campos Salas <jaime.campos.salas@gmail.com>

fix(kv-cache): increase hybrid attention grouping threshold from 1.25 to 1.5 (#36684)
Signed-off-by: Jaime Campos Salas <jaime.campos.salas@gmail.com>
891c60dc · jaime campos salas · GitHub · 1ce13cf9 · 891c60dc
Unverified Commit 891c60dc authored Mar 12, 2026 by jaime campos salas Committed by GitHub Mar 12, 2026
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 6 deletions

vllm/v1/core/kv_cache_utils.py vllm/v1/core/kv_cache_utils.py +8 -6

No files found.
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1040,12 +1040,14 @@ def _get_kv_cache_groups_uniform_page_size(
    min_num_layers = min([len(layers) for layers in same_type_layers.values()])
    group_size = min_num_layers
    max_num_layers = max([len(layers) for layers in same_type_layers.values()])
-    if max_num_layers < min_num_layers * 1.25:
+    if max_num_layers < min_num_layers * 1.5:
-        # If the number of layers is not much larger than the minimum number of layers,
+        # If the maximum number of layers is not much larger than the minimum number of
-        # use the maximum number of layers as the group size to avoid too many padding
+        # layers, use the maximum number of layers as the group size to avoid
-        # layers. A typical example is gpt-oss-20b + eagle, with 12 sw + 13 full. We
+        # too many padding layers. A typical example is gpt-oss-20b + eagle,
-        # pad it to (13 sw, 13 full) instead of (12 sw, 24 full). 1.25 is just a
+        # with 12 sw + 13 full. We pad it to (13 sw, 13 full) instead of
-        # magic number to avoid too many padding layers.
+        # (12 sw, 24 full). 1.5 is a heuristic to avoid too many padding
+        # layers while accommodating speculative decoding drafters that add
+        # extra layers to one attention type.
        group_size = max_num_layers
    grouped_layers = []
    for layers in same_type_layers.values():