Unverified Commit 7b1895e6 authored by Michael Goin's avatar Michael Goin Committed by GitHub
Browse files

[CI Fix] Try fixing eagle e2e test OOM by reducing block allocation (#20213)


Signed-off-by: default avatarmgoin <mgoin64@gmail.com>
parent 4d366936
...@@ -370,6 +370,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -370,6 +370,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# 2 for small prompt, 256//16 for generated.
"num_gpu_blocks_override": 2 + 256 // 16,
"max_model_len": (2 + 256 // 16) * 16,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -420,6 +424,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -420,6 +424,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# 2 for small prompt, 256//16 for generated.
"num_gpu_blocks_override": 2 + 256 // 16,
"max_model_len": (2 + 256 // 16) * 16,
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment