Commit 22d7e7c4 authored by zhuwenwen's avatar zhuwenwen
Browse files

[fix]fix tests of async_engine and compile

parent 99963991
...@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal ...@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import gpuname
import vllm.envs as envs
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "google/gemma-2-2b-it"), os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
...@@ -41,10 +39,10 @@ def v1(run_with_both_engines): ...@@ -41,10 +39,10 @@ def v1(run_with_both_engines):
def test_vllm_gc_ed(): def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted""" """Verify vllm instance is GC'ed when it is deleted"""
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND: if not current_platform.is_rocm():
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
else:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2")) llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
else:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
weak_llm = weakref.ref(llm) weak_llm = weakref.ref(llm)
del llm del llm
...@@ -111,13 +109,12 @@ def test_models( ...@@ -111,13 +109,12 @@ def test_models(
prompt_embeds = hf_model.get_prompt_embeddings( prompt_embeds = hf_model.get_prompt_embeddings(
example_prompts) example_prompts)
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND: if not current_platform.is_rocm():
with VllmRunner(model, with VllmRunner(model,
max_model_len=8192, max_model_len=8192,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds, enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7) as vllm_model:
block_size=64) as vllm_model:
if enable_prompt_embeds: if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy( vllm_outputs = vllm_model.generate_greedy(
prompt_embeds, max_tokens) prompt_embeds, max_tokens)
...@@ -131,7 +128,8 @@ def test_models( ...@@ -131,7 +128,8 @@ def test_models(
max_model_len=8192, max_model_len=8192,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds, enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7,
block_size=64) as vllm_model:
if enable_prompt_embeds: if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy( vllm_outputs = vllm_model.generate_greedy(
prompt_embeds, max_tokens) prompt_embeds, max_tokens)
......
...@@ -94,7 +94,7 @@ def test_models( ...@@ -94,7 +94,7 @@ def test_models(
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens) max_tokens)
...@@ -128,7 +128,7 @@ def test_models_distributed( ...@@ -128,7 +128,7 @@ def test_models_distributed(
) -> None: ) -> None:
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend) m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct" if (model == os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
and distributed_executor_backend == "ray"): and distributed_executor_backend == "ray"):
# test Ray Compiled Graph # test Ray Compiled Graph
m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
...@@ -158,7 +158,7 @@ def test_models_distributed( ...@@ -158,7 +158,7 @@ def test_models_distributed(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy( vllm_outputs = vllm_model.generate_greedy(
example_prompts, example_prompts,
...@@ -220,6 +220,7 @@ def test_models_with_fp8_kv_cache( ...@@ -220,6 +220,7 @@ def test_models_with_fp8_kv_cache(
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
...@@ -233,10 +234,12 @@ def test_models_with_fp8_kv_cache( ...@@ -233,10 +234,12 @@ def test_models_with_fp8_kv_cache(
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=no_chunked_prefill_outputs, outputs_0_lst=no_chunked_prefill_outputs,
outputs_1_lst=chunked_prefill_outputs, outputs_1_lst=chunked_prefill_outputs,
...@@ -286,7 +289,7 @@ def test_with_prefix_caching( ...@@ -286,7 +289,7 @@ def test_with_prefix_caching(
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
outputs[enable] = [] outputs[enable] = []
for prompt in full_prompts: for prompt in full_prompts:
...@@ -303,7 +306,7 @@ def test_with_prefix_caching( ...@@ -303,7 +306,7 @@ def test_with_prefix_caching(
) )
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"]) @pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
......
...@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. ...@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`. pytest tests/basic_correctness/test_preemption.py`.
""" """
import os
import pytest import pytest
from prometheus_client import REGISTRY from prometheus_client import REGISTRY
...@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ...@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
from ..utils import models_path_prefix from ..utils import models_path_prefix
import os from vllm.platforms import current_platform
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "distilbert/distilgpt2"), os.path.join(models_path_prefix, "distilbert/distilgpt2"),
...@@ -74,6 +75,7 @@ def test_chunked_prefill_recompute( ...@@ -74,6 +75,7 @@ def test_chunked_prefill_recompute(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if not current_platform.is_rocm():
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
...@@ -86,6 +88,20 @@ def test_chunked_prefill_recompute( ...@@ -86,6 +88,20 @@ def test_chunked_prefill_recompute(
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT) < ARTIFICIAL_PREEMPTION_MAX_CNT)
else:
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
distributed_executor_backend=distributed_executor_backend,
disable_log_stats=False,
block_size=64,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
...@@ -115,11 +131,25 @@ def test_preemption( ...@@ -115,11 +131,25 @@ def test_preemption(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if not current_platform.is_rocm():
with vllm_runner(
model,
dtype=dtype,
disable_log_stats=False,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
else:
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
disable_log_stats=False, disable_log_stats=False,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
block_size=64,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
...@@ -163,7 +193,7 @@ def test_preemption_infeasible( ...@@ -163,7 +193,7 @@ def test_preemption_infeasible(
distributed_executor_backend: str, distributed_executor_backend: str,
) -> None: ) -> None:
"""Verify infeasible preemption request will be ignored.""" """Verify infeasible preemption request will be ignored."""
BLOCK_SIZE = 16 BLOCK_SIZE = 16 if not current_platform.is_rocm() else 64
prefill_blocks = 2 prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE decode_blocks = max_tokens // BLOCK_SIZE
with vllm_runner( with vllm_runner(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import json import json
import pytest import pytest
...@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS ...@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import (compare_two_settings, create_new_process_for_each_test, from ..utils import (compare_two_settings, create_new_process_for_each_test,
multi_gpu_test) multi_gpu_test)
from .backend import TestBackend from .backend import TestBackend
from ..utils import models_path_prefix
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
...@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int, ...@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
@create_new_process_for_each_test() @create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"]) @pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")])
@pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("async_tp_enabled", [True]) @pytest.mark.parametrize("async_tp_enabled", [True])
@pytest.mark.parametrize("distributed_backend", ["mp"]) @pytest.mark.parametrize("distributed_backend", ["mp"])
......
...@@ -84,16 +84,17 @@ class TestSetting: ...@@ -84,16 +84,17 @@ class TestSetting:
# method="encode", # method="encode",
# fullgraph=True, # fullgraph=True,
# ), # ),
# TODO
# vision language model # vision language model
TestSetting( # TestSetting(
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"), # model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args=["--trust-remote-code", "--max-model-len", "2048"], # model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2, # pp_size=2,
tp_size=1, # tp_size=1,
attn_backend="FLASH_ATTN", # attn_backend="FLASH_ATTN",
method="generate_with_image", # method="generate_with_image",
fullgraph=False, # fullgraph=False,
), # ),
]) ])
def test_compile_correctness( def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
import vllm import vllm
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.utils import _is_torch_equal_or_newer from vllm.utils import _is_torch_equal_or_newer
from ..utils import models_path_prefix
def test_version(): def test_version():
assert _is_torch_equal_or_newer('2.8.0.dev20250624+cu128', '2.8.0.dev') assert _is_torch_equal_or_newer('2.8.0.dev20250624+cu128', '2.8.0.dev')
...@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch): ...@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert not vllm_config.compilation_config.use_cudagraph assert not vllm_config.compilation_config.use_cudagraph
@pytest.mark.parametrize("enabled", [True, False]) # TODO: when True num_cudagraph_captured=13
# @pytest.mark.parametrize("enabled", [True, False])
@pytest.mark.parametrize("enabled", [False])
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
assert vllm.envs.VLLM_USE_V1 assert vllm.envs.VLLM_USE_V1
...@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): ...@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
num_cudagraph_captured=13 if enabled else 0, num_cudagraph_captured=13 if enabled else 0,
), ),
# loading the model causes compilation (if enabled) to happen # loading the model causes compilation (if enabled) to happen
vllm_runner('facebook/opt-125m', vllm_runner(os.path.join(models_path_prefix, 'facebook/opt-125m'),
compilation_config=compilation_config, compilation_config=compilation_config,
gpu_memory_utilization=0.4) as _): gpu_memory_utilization=0.4) as _):
pass pass
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys import sys
from unittest.mock import patch from unittest.mock import patch
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from ..utils import models_path_prefix
def test_mp_reducer(monkeypatch): def test_mp_reducer(monkeypatch):
...@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch): ...@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
with patch('multiprocessing.reducer.register') as mock_register: with patch('multiprocessing.reducer.register') as mock_register:
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_model_len=32, max_model_len=32,
gpu_memory_utilization=0.1, gpu_memory_utilization=0.1,
disable_log_stats=True, disable_log_stats=True,
......
...@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams ...@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect from vllm.transformers_utils.utils import maybe_model_redirect
from .utils import models_path_prefix from .utils import models_path_prefix
from vllm.platforms import current_platform
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -783,7 +784,7 @@ class VllmRunner: ...@@ -783,7 +784,7 @@ class VllmRunner:
dtype: str = "auto", dtype: str = "auto",
disable_log_stats: bool = True, disable_log_stats: bool = True,
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
block_size: int = 16, block_size: int = 16 if not current_platform.is_rocm() else 64,
enable_chunked_prefill: Optional[bool] = False, enable_chunked_prefill: Optional[bool] = False,
swap_space: int = 4, swap_space: int = 4,
enforce_eager: Optional[bool] = False, enforce_eager: Optional[bool] = False,
......
...@@ -79,8 +79,10 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs): ...@@ -79,8 +79,10 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
queue.join_thread() queue.join_thread()
@pytest.mark.parametrize("enable_lora", [False, True]) # @pytest.mark.parametrize("enable_lora", [False, True])
@pytest.mark.parametrize("tp_size", [1, 2]) # @pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("enable_lora", [False])
@pytest.mark.parametrize("tp_size", [1])
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
llama_3p2_1b_files, llama_3p2_1b_files,
monkeypatch: pytest.MonkeyPatch): monkeypatch: pytest.MonkeyPatch):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment