[CI Fix] Try fixing eagle e2e test OOM by reducing block allocation (#20213)

Signed-off-by: mgoin <mgoin64@gmail.com>

[CI Fix] Try fixing eagle e2e test OOM by reducing block allocation (#20213)
Signed-off-by: mgoin <mgoin64@gmail.com>
7b1895e6 · Michael Goin · GitHub · 4d366936 · 7b1895e6
Unverified Commit 7b1895e6 authored Jun 29, 2025 by Michael Goin Committed by GitHub Jun 29, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 0 deletions

tests/spec_decode/e2e/test_eagle_correctness.py tests/spec_decode/e2e/test_eagle_correctness.py +8 -0

No files found.
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -370,6 +370,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
+        # 2 blocks for the small prompt, 256 // 16 for generated tokens.
+        "num_gpu_blocks_override": 2 + 256 // 16,
+        "max_model_len": (2 + 256 // 16) * 16,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
@@ -420,6 +424,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
+        # 2 blocks for the small prompt, 256 // 16 for generated tokens.
+        "num_gpu_blocks_override": 2 + 256 // 16,
+        "max_model_len": (2 + 256 // 16) * 16,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,