"tests/vscode:/vscode.git/clone" did not exist on "a78a426065e250a953277a9a5f5247a07312bd1d"
Unverified Commit a73e183e authored by Sibi's avatar Sibi Committed by GitHub
Browse files

[Misc] Replace os environ to monkeypatch in test suite (#14516)


Signed-off-by: default avatarsibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: default avatarAaron Pham <contact@aarnphm.xyz>
Co-authored-by: default avatarCyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: default avatarAaron Pham <contact@aarnphm.xyz>
parent 1e799b7e
...@@ -522,7 +522,7 @@ steps: ...@@ -522,7 +522,7 @@ steps:
# TODO: investigate and fix # TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
......
...@@ -47,6 +47,7 @@ def test_vllm_gc_ed(): ...@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
def test_models( def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
model: str, model: str,
backend: str, backend: str,
...@@ -63,7 +64,8 @@ def test_models( ...@@ -63,7 +64,8 @@ def test_models(
pytest.skip( pytest.skip(
f"{backend} does not support gemma2 with full context length.") f"{backend} does not support gemma2 with full context length.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2 # 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096 # gemma2 has alternating sliding window size of 4096
...@@ -80,7 +82,8 @@ def test_models( ...@@ -80,7 +82,8 @@ def test_models(
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
...@@ -104,6 +107,7 @@ def test_models( ...@@ -104,6 +107,7 @@ def test_models(
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
]) ])
def test_models_distributed( def test_models_distributed(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
vllm_runner, vllm_runner,
example_prompts, example_prompts,
...@@ -116,13 +120,17 @@ def test_models_distributed( ...@@ -116,13 +120,17 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE: if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}") pytest.skip(f"Skip test for {test_suite}")
with monkeypatch.context() as monkeypatch_context:
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test Ray Compiled Graph # test Ray Compiled Graph
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
if attention_backend: if attention_backend:
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend monkeypatch_context.setenv(
"VLLM_ATTENTION_BACKEND",
attention_backend,
)
dtype = "half" dtype = "half"
max_tokens = 5 max_tokens = 5
...@@ -130,13 +138,16 @@ def test_models_distributed( ...@@ -130,13 +138,16 @@ def test_models_distributed(
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method
with vllm_runner(model, # (the default method).
with vllm_runner(
model,
dtype=dtype, dtype=dtype,
tensor_parallel_size=2, tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend distributed_executor_backend=distributed_executor_backend,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
......
...@@ -7,16 +7,22 @@ prefill requests are chunked. ...@@ -7,16 +7,22 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`. Run `pytest tests/models/test_chunked_prefill.py`.
""" """
import os
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
...@@ -24,12 +30,14 @@ MODELS = [ ...@@ -24,12 +30,14 @@ MODELS = [
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
Since this module is V0 only, set VLLM_USE_V1=0 for Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file. all tests in the file.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
...@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch): ...@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models( def test_models(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
...@@ -52,13 +60,14 @@ def test_models( ...@@ -52,13 +60,14 @@ def test_models(
enforce_eager: bool, enforce_eager: bool,
tensor_parallel_size: int, tensor_parallel_size: int,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Checks exact match decode between huggingface model and vllm runner with Checks exact match decode between huggingface model and vllm runner with
chunked prefill. chunked prefill.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
max_num_seqs = chunked_prefill_token_size max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
...@@ -75,7 +84,8 @@ def test_models( ...@@ -75,7 +84,8 @@ def test_models(
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
...@@ -90,21 +100,21 @@ def test_models( ...@@ -90,21 +100,21 @@ def test_models(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed( def test_models_distributed(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
distributed_executor_backend: str, distributed_executor_backend: str,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct" if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"): and distributed_executor_backend == "ray"):
# test Ray Compiled Graph # test Ray Compiled Graph
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
dtype = "half" dtype = "half"
max_tokens = 5 max_tokens = 5
...@@ -119,7 +129,8 @@ def test_models_distributed( ...@@ -119,7 +129,8 @@ def test_models_distributed(
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with
# fork method (the default method).
with vllm_runner( with vllm_runner(
model, model,
...@@ -130,7 +141,10 @@ def test_models_distributed( ...@@ -130,7 +141,10 @@ def test_models_distributed(
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(
example_prompts,
max_tokens,
)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
...@@ -158,7 +172,7 @@ def test_models_distributed( ...@@ -158,7 +172,7 @@ def test_models_distributed(
# the async postprocessor # the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True]) @pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache( def test_models_with_fp8_kv_cache(
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
kv_cache_dtype: str, kv_cache_dtype: str,
model: str, model: str,
...@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache( ...@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching( def test_with_prefix_caching(
vllm_runner, vllm_runner: VllmRunner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
...@@ -254,8 +268,10 @@ def test_with_prefix_caching( ...@@ -254,8 +268,10 @@ def test_with_prefix_caching(
) as vllm_model: ) as vllm_model:
outputs[enable] = [] outputs[enable] = []
for prompt in full_prompts: for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy([prompt], outputs[enable] += vllm_model.generate_greedy(
max_tokens) [prompt],
max_tokens,
)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=outputs[False], outputs_0_lst=outputs[False],
...@@ -274,8 +290,8 @@ def test_with_prefix_caching( ...@@ -274,8 +290,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu( def test_models_cpu(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
...@@ -283,7 +299,7 @@ def test_models_cpu( ...@@ -283,7 +299,7 @@ def test_models_cpu(
chunked_prefill_token_size: int, chunked_prefill_token_size: int,
enforce_eager: bool, enforce_eager: bool,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
test_models( test_models(
hf_runner, hf_runner,
...@@ -307,7 +323,7 @@ def test_models_cpu( ...@@ -307,7 +323,7 @@ def test_models_cpu(
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu( def test_with_prefix_caching_cpu(
vllm_runner, vllm_runner: VllmRunner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
......
...@@ -123,9 +123,9 @@ def test_cumem_with_cudagraph(): ...@@ -123,9 +123,9 @@ def test_cumem_with_cudagraph():
# sleep mode with pytorch checkpoint # sleep mode with pytorch checkpoint
("facebook/opt-125m", False), ("facebook/opt-125m", False),
]) ])
def test_end_to_end(model: str, use_v1: bool): def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
import os with monkeypatch.context() as m:
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
free, total = torch.cuda.mem_get_info() free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True) llm = LLM(model, enable_sleep_mode=True)
...@@ -158,5 +158,3 @@ def test_end_to_end(model: str, use_v1: bool): ...@@ -158,5 +158,3 @@ def test_end_to_end(model: str, use_v1: bool):
# cmp output # cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text assert output[0].outputs[0].text == output2[0].outputs[0].text
del os.environ["VLLM_USE_V1"]
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import dataclasses import dataclasses
from typing import Optional
import pytest import pytest
...@@ -22,8 +22,11 @@ class TestSetting: ...@@ -22,8 +22,11 @@ class TestSetting:
fullgraph: bool fullgraph: bool
# representative settings for testing # we cannot afford testing the full Cartesian product
test_settings = [ # of all models and all levels
@pytest.mark.parametrize(
"test_setting",
[
# basic llama model # basic llama model
TestSetting( TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct", model="meta-llama/Llama-3.2-1B-Instruct",
...@@ -84,13 +87,11 @@ test_settings = [ ...@@ -84,13 +87,11 @@ test_settings = [
method="generate_with_image", method="generate_with_image",
fullgraph=False, fullgraph=False,
), ),
] ])
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
# we cannot afford testing the full Cartesian product test_setting: TestSetting,
# of all models and all levels ):
@pytest.mark.parametrize("test_setting", test_settings)
def test_compile_correctness(test_setting: TestSetting):
# this test is run under multiple suites, with different GPUs. # this test is run under multiple suites, with different GPUs.
# make sure we only run the test with correct CUDA devices. # make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests. # don't use "<", as it will duplicate the tests.
...@@ -103,13 +104,17 @@ def test_compile_correctness(test_setting: TestSetting): ...@@ -103,13 +104,17 @@ def test_compile_correctness(test_setting: TestSetting):
fullgraph = test_setting.fullgraph fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size: if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.") pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend with monkeypatch.context() as m:
final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
["-tp", str(tp_size)] final_args = [
"--enforce-eager", *model_args, "-pp",
str(pp_size), "-tp",
str(tp_size)
]
all_args: list[list[str]] = [] all_args: list[list[str]] = []
all_envs: list[Optional[dict[str, str]]] = [] all_envs: list[dict[str, str] | None] = []
for level in [ for level in [
CompilationLevel.NO_COMPILATION, CompilationLevel.NO_COMPILATION,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from typing import Any
import pytest import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
from vllm.platforms import current_platform
from ..utils import fork_new_process_for_each_test from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support
@pytest.mark.parametrize("model_info", TEST_MODELS) @pytest.fixture(params=None, name="model_info")
def models_list_fixture(request):
    """Build the list of ``(model name, LLM kwargs)`` pairs for this module.

    A fixed set of models is always included. Quantized checkpoints are added
    only when ``is_quant_method_supported`` reports their method as supported.
    ``request`` is accepted but not read.
    """
    compressed = "compressed-tensors"
    models: list[tuple[str, dict[str, Any]]] = [
        ("facebook/opt-125m", {}),
        (
            "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
            dict(dtype=torch.float16, quantization=compressed),
        ),
        (
            "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
            dict(dtype=torch.float16, quantization=compressed),
        ),
        (
            "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
            dict(quantization=compressed),
        ),
        ("meta-llama/Llama-3.2-1B-Instruct", {}),
    ]

    if is_quant_method_supported("aqlm"):
        models += [("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
                    dict(quantization="aqlm"))]

    # TODO: figure out why this fails.
    # The leading ``False`` keeps this entry disabled without deleting it.
    if False and is_quant_method_supported("gguf"):  # noqa: SIM223
        models += [("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                    dict(quantization="gguf"))]

    # These checkpoints share one shape: the quantization kwarg is the
    # method name that gets probed for support.
    gated_checkpoints = (
        ("gptq", "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
        ("gptq_marlin", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"),
        ("gptq_marlin_24", "alexm-nm/tinyllama-24-marlin24-4bit-g128"),
        ("marlin", "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"),
    )
    for method, checkpoint in gated_checkpoints:
        if is_quant_method_supported(method):
            models.append((checkpoint, {"quantization": method}))

    # AWQ is left out on ROCm; the platform check runs first, so the support
    # probe is not called there.
    if not current_platform.is_rocm():
        if is_quant_method_supported("awq"):
            models.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
                           {"quantization": "AWQ"}))

    return models
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "optimization_level",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
)
@pytest.mark.parametrize("model_info", "", indirect=True)
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level): def test_full_graph(
model = model_info[0] monkeypatch: pytest.MonkeyPatch,
model_kwargs = model_info[1] model_info: tuple[str, dict[str, Any]],
check_full_graph_support(model, optimization_level: int,
model_kwargs, ):
optimization_level, model, model_kwargs = model_info
tp_size=1)
with monkeypatch.context() as m:
# make sure these models can be captured in full graph mode
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(
model=model,
enforce_eager=True,
tensor_parallel_size=1,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# SPDX-License-Identifier: Apache-2.0
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
# (model name, extra LLM kwargs) pairs that are always tested.
TEST_MODELS = [
    ("facebook/opt-125m", {}),
    (
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
        dict(dtype=torch.float16, quantization="compressed-tensors"),
    ),
    (
        "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
        dict(dtype=torch.float16, quantization="compressed-tensors"),
    ),
    (
        "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
        dict(quantization="compressed-tensors"),
    ),
    ("meta-llama/Llama-3.2-1B-Instruct", {}),
]

# Quantized checkpoints are appended only when their method is supported.
if is_quant_method_supported("aqlm"):
    TEST_MODELS += [("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
                     dict(quantization="aqlm"))]

# TODO: figure out why this fails.
# The leading ``False`` keeps this entry disabled without deleting it.
if False and is_quant_method_supported("gguf"):  # noqa: SIM223
    TEST_MODELS += [("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                     dict(quantization="gguf"))]

# A generator expression keeps the helper names out of the module namespace
# while probing each method in the same order as before.
TEST_MODELS.extend(
    (checkpoint, {"quantization": method}) for method, checkpoint in (
        ("gptq", "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
        ("gptq_marlin", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"),
        ("gptq_marlin_24", "alexm-nm/tinyllama-24-marlin24-4bit-g128"),
        ("marlin", "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"),
    ) if is_quant_method_supported(method))

# AWQ is left out on ROCm; the platform check runs first, so the support
# probe is not called there.
if not current_platform.is_rocm():
    if is_quant_method_supported("awq"):
        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
            "quantization": "AWQ"
        }))
def check_full_graph_support(model,
                             model_kwargs,
                             optimization_level,
                             tp_size=1):
    """Run a short generation on ``model`` with full-graph capture requested.

    Sets ``VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE=1`` for the duration of the
    call, builds an ``LLM`` and generates from four fixed prompts with
    ``temperature=0``, printing each prompt and its generated text. The
    environment variable is restored to its previous state on exit, even when
    construction or generation raises, so the setting does not leak into
    later tests in the same process.

    Args:
        model: Model name passed to ``LLM(model=...)``.
        model_kwargs: Extra keyword arguments forwarded to ``LLM``.
        optimization_level: Passed to ``LLM`` as ``compilation_config``.
        tp_size: Passed to ``LLM`` as ``tensor_parallel_size``.
    """
    env_key = "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
    previous_value = os.environ.get(env_key)
    # make sure these models can be captured in full graph mode
    os.environ[env_key] = "1"
    try:
        print(f"MODEL={model}")

        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]
        sampling_params = SamplingParams(temperature=0)
        llm = LLM(model=model,
                  enforce_eager=True,
                  tensor_parallel_size=tp_size,
                  disable_custom_all_reduce=True,
                  compilation_config=optimization_level,
                  **model_kwargs)

        outputs = llm.generate(prompts, sampling_params)

        # Print the outputs.
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    finally:
        # Put the environment back exactly as it was before the call.
        if previous_value is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous_value
...@@ -3,7 +3,10 @@ ...@@ -3,7 +3,10 @@
Run `pytest tests/distributed/test_comm_ops.py`. Run `pytest tests/distributed/test_comm_ops.py`.
""" """
import os
from __future__ import annotations
from typing import Any, Callable
import pytest import pytest
import ray import ray
...@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel ...@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, def all_reduce_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, def all_gather_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, def broadcast_tensor_dict_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, def send_recv_tensor_dict_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
os.environ.pop("CUDA_VISIBLE_DEVICES", None) tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, def send_recv_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
os.environ.pop("CUDA_VISIBLE_DEVICES", None) tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
all_reduce_test_worker, all_gather_test_worker, all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker broadcast_tensor_dict_test_worker
]) ])
def test_multi_process_tensor_parallel(tp_size, test_target): def test_multi_process_tensor_parallel(
multi_process_parallel(tp_size, 1, test_target) monkeypatch: pytest.MonkeyPatch,
tp_size: int,
test_target: Callable[..., Any],
):
multi_process_parallel(monkeypatch, tp_size, 1, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
...@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target): ...@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
@pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
def test_multi_process_pipeline_parallel(pp_size, test_target): def test_multi_process_pipeline_parallel(
multi_process_parallel(1, pp_size, test_target) monkeypatch: pytest.MonkeyPatch,
pp_size: int,
test_target: Callable[..., Any],
):
multi_process_parallel(monkeypatch, 1, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4, @pytest.mark.skipif(torch.cuda.device_count() < 4,
...@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target): ...@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
broadcast_tensor_dict_test_worker broadcast_tensor_dict_test_worker
]) ])
def test_multi_process_tensor_parallel_pipeline_parallel( def test_multi_process_tensor_parallel_pipeline_parallel(
tp_size, pp_size, test_target): tp_size: int,
multi_process_parallel(tp_size, pp_size, test_target) pp_size: int,
test_target: Callable[..., Any],
monkeypatch: pytest.MonkeyPatch,
):
multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import random import random
import pytest import pytest
...@@ -23,8 +22,15 @@ for i, v in enumerate(test_sizes): ...@@ -23,8 +22,15 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): def graph_allreduce(
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch: pytest.MonkeyPatch,
tp_size,
pp_size,
rank,
distributed_init_port,
):
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -79,8 +85,15 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): ...@@ -79,8 +85,15 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): def eager_allreduce(
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch: pytest.MonkeyPatch,
tp_size,
pp_size,
rank,
distributed_init_port,
):
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -110,8 +123,14 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): ...@@ -110,8 +123,14 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
@pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2]) @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): def test_custom_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pipeline_parallel_size,
test_target,
):
world_size = tp_size * pipeline_parallel_size world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count(): if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.") pytest.skip("Not enough GPUs to run the test.")
multi_process_parallel(tp_size, pipeline_parallel_size, test_target) multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
test_target)
...@@ -7,15 +7,17 @@ import pytest ...@@ -7,15 +7,17 @@ import pytest
from vllm.distributed.utils import get_pp_indices from vllm.distributed.utils import get_pp_indices
def test_custom_layer_partition(): def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
def _verify(partition_str, num_layers, pp_size, goldens): def _verify(partition_str, num_layers, pp_size, goldens):
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None) bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
for pp_rank, golden in enumerate(goldens): for pp_rank, golden in enumerate(goldens):
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
if bak is not None: if bak is not None:
os.environ["VLLM_PP_LAYER_PARTITION"] = bak m.setenv("VLLM_PP_LAYER_PARTITION", bak)
# Even partition # Even partition
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
...@@ -55,6 +57,10 @@ def test_custom_layer_partition(): ...@@ -55,6 +57,10 @@ def test_custom_layer_partition():
(5, 3, 1, (2, 4)), (5, 3, 1, (2, 4)),
(5, 3, 2, (4, 5)), (5, 3, 2, (4, 5)),
]) ])
def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int, def test_uneven_auto_partition(
pp_rank: int, indices: tuple[int, int]): num_hidden_layers: int,
pp_size: int,
pp_rank: int,
indices: tuple[int, int],
):
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size) assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import os from typing import TYPE_CHECKING
import pytest import pytest
from ..utils import compare_two_settings, fork_new_process_for_each_test from ..utils import compare_two_settings, fork_new_process_for_each_test
if TYPE_CHECKING:
from typing_extensions import LiteralString
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"), (2, "JackFram/llama-160m"),
...@@ -15,7 +19,13 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test ...@@ -15,7 +19,13 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
"FLASHINFER", "FLASHINFER",
]) ])
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): def test_pp_cudagraph(
monkeypatch: pytest.MonkeyPatch,
PP_SIZE: int,
MODEL_NAME: str,
ATTN_BACKEND: LiteralString,
):
with monkeypatch.context() as m:
cudagraph_args = [ cudagraph_args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
...@@ -25,7 +35,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): ...@@ -25,7 +35,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
"--distributed-executor-backend", "--distributed-executor-backend",
"mp", "mp",
] ]
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
eager_args = cudagraph_args + ["--enforce-eager"] eager_args = cudagraph_args + ["--enforce-eager"]
......
...@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4" ...@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
@pytest.mark.skipif(not current_platform.is_cuda() @pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(), and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU") reason="V1 is currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch): def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine.""" """Run with the V1 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
...@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): ...@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
run_test(more_args) run_test(more_args)
def test_lm_eval_accuracy_v0_engine(monkeypatch): def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V0 Engine.""" """Run with the V0 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
......
...@@ -53,21 +53,29 @@ def cache_models(): ...@@ -53,21 +53,29 @@ def cache_models():
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models") @pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch): def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
# Set HF to offline mode and ensure we can still construct an LLM # Set HF to offline mode and ensure we can still construct an LLM
with monkeypatch.context() as m:
try: try:
monkeypatch.setenv("HF_HUB_OFFLINE", "1") m.setenv("HF_HUB_OFFLINE", "1")
monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1") m.setenv("VLLM_NO_USAGE_STATS", "1")
def disable_connect(*args, **kwargs): def disable_connect(*args, **kwargs):
raise RuntimeError("No http calls allowed") raise RuntimeError("No http calls allowed")
monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect", m.setattr(
disable_connect) urllib3.connection.HTTPConnection,
monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect", "connect",
disable_connect) disable_connect,
)
m.setattr(
urllib3.connection.HTTPSConnection,
"connect",
disable_connect,
)
# Need to re-import huggingface_hub and friends to setup offline mode # Need to re-import huggingface_hub
# and friends to setup offline mode
_re_import_modules() _re_import_modules()
# Cached model files should be used in offline mode # Cached model files should be used in offline mode
for model_config in MODEL_CONFIGS: for model_config in MODEL_CONFIGS:
...@@ -75,10 +83,7 @@ def test_offline_mode(monkeypatch): ...@@ -75,10 +83,7 @@ def test_offline_mode(monkeypatch):
finally: finally:
# Reset the environment after the test # Reset the environment after the test
# NB: Assuming tests are run in online mode # NB: Assuming tests are run in online mode
monkeypatch.delenv("HF_HUB_OFFLINE")
monkeypatch.delenv("VLLM_NO_USAGE_STATS")
_re_import_modules() _re_import_modules()
pass
def _re_import_modules(): def _re_import_modules():
......
...@@ -70,7 +70,7 @@ def run_test(more_args): ...@@ -70,7 +70,7 @@ def run_test(more_args):
@pytest.mark.skipif(not current_platform.is_cuda() @pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(), and not current_platform.is_tpu(),
reason="V1 currently only supported on CUDA and TPU") reason="V1 currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch): def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine.""" """Run with the V1 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
...@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): ...@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
more_args):
"""Run with the V0 Engine.""" """Run with the V0 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
......
...@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch ...@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
import pytest import pytest
import torch import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.openvino import OpenVinoPlatform from vllm.platforms.openvino import OpenVinoPlatform
from vllm.platforms.rocm import RocmPlatform from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -25,54 +24,67 @@ def clear_cache(): ...@@ -25,54 +24,67 @@ def clear_cache():
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("use_v1", [True, False])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
def test_env(name: str, use_v1: bool, device: str, monkeypatch): def test_env(
name: str,
use_v1: bool,
device: str,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that the attention selector can be set via environment variable. """Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend. Note that we do not test FlashAttn because it is the default backend.
""" """
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") with monkeypatch.context() as m:
override_backend_env_variable(monkeypatch, name) m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, name)
if device == "cpu": if device == "cpu":
with patch("vllm.attention.selector.current_platform", CpuPlatform()): with patch("vllm.attention.selector.current_platform",
backend = get_attn_backend(16, torch.float16, torch.float16, 16, CpuPlatform()):
False) backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
assert backend.get_name() == "TORCH_SDPA" assert backend.get_name() == "TORCH_SDPA"
elif device == "hip": elif device == "hip":
with patch("vllm.attention.selector.current_platform", RocmPlatform()): with patch("vllm.attention.selector.current_platform",
backend = get_attn_backend(16, torch.float16, torch.float16, 16, RocmPlatform()):
False) backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED assert backend.get_name() == EXPECTED
elif device == "openvino": elif device == "openvino":
with patch("vllm.attention.selector.current_platform", with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules', OpenVinoPlatform()), patch.dict('sys.modules',
{'openvino': Mock()}): {'openvino': Mock()}):
backend = get_attn_backend(16, torch.float16, torch.float16, 16, backend = get_attn_backend(16, torch.float16, torch.float16,
False) 16, False)
assert backend.get_name() == "OPENVINO" assert backend.get_name() == "OPENVINO"
else: else:
if name in ["XFORMERS", "FLASHINFER"]: if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform", with patch("vllm.attention.selector.current_platform",
CudaPlatform()): CudaPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, backend = get_attn_backend(16, torch.float16,
16, False) torch.float16, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == EXPECTED assert backend.get_name() == EXPECTED
def test_flash_attn(monkeypatch): def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation.""" """Test FlashAttn validation."""
# TODO: When testing for v1, pipe in `use_v1` as an argument to # TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend # get_attn_backend
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
# Unsupported CUDA arch # Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=(7, 5)): monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
(7, 5))
backend = get_attn_backend(16, torch.float16, None, 16, False) backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
# Reset the monkeypatch for subsequent tests
monkeypatch.undo()
# Unsupported data type # Unsupported data type
backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
...@@ -86,10 +98,19 @@ def test_flash_attn(monkeypatch): ...@@ -86,10 +98,19 @@ def test_flash_attn(monkeypatch):
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
# flash-attn is not installed # flash-attn is not installed
with patch.dict('sys.modules', {'vllm_flash_attn': None}): import sys
original_module = sys.modules.get('vllm_flash_attn')
monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
backend = get_attn_backend(16, torch.float16, None, 16, False) backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
# Restore the original module if it existed
if original_module is not None:
monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
original_module)
else:
monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
# Unsupported head size # Unsupported head size
backend = get_attn_backend(17, torch.float16, None, 16, False) backend = get_attn_backend(17, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
...@@ -100,12 +121,14 @@ def test_flash_attn(monkeypatch): ...@@ -100,12 +121,14 @@ def test_flash_attn(monkeypatch):
@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("use_v1", [True, False])
def test_invalid_env(use_v1: bool, monkeypatch): def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
"""Ignore the invalid env variable if it is set."""
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") with monkeypatch.context() as m, patch(
override_backend_env_variable(monkeypatch, STR_INVALID_VAL) "vllm.attention.selector.current_platform", CudaPlatform()):
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
with patch("vllm.attention.selector.current_platform", CudaPlatform()): # Test with head size 32
backend = get_attn_backend(32, torch.float16, None, 16, False) backend = get_attn_backend(32, torch.float16, None, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
assert backend.get_name() == EXPECTED assert backend.get_name() == EXPECTED
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
import torch import torch
...@@ -11,8 +9,9 @@ from vllm import _custom_ops as ops # noqa: F401 ...@@ -11,8 +9,9 @@ from vllm import _custom_ops as ops # noqa: F401
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"), @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
reason="AWQ is not supported on this GPU type.") reason="AWQ is not supported on this GPU type.")
def test_awq_dequantize_opcheck(): def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_USE_TRITON_AWQ"] = "0" with monkeypatch.context() as m:
m.setenv("VLLM_USE_TRITON_AWQ", "0")
qweight = torch.randint(-2000000000, qweight = torch.randint(-2000000000,
2000000000, (8192, 256), 2000000000, (8192, 256),
device='cuda', device='cuda',
...@@ -29,8 +28,9 @@ def test_awq_dequantize_opcheck(): ...@@ -29,8 +28,9 @@ def test_awq_dequantize_opcheck():
@pytest.mark.skip(reason="Not working; needs investigation.") @pytest.mark.skip(reason="Not working; needs investigation.")
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"), @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
reason="AWQ is not supported on this GPU type.") reason="AWQ is not supported on this GPU type.")
def test_awq_gemm_opcheck(): def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_USE_TRITON_AWQ"] = "0" with monkeypatch.context() as m:
m.setenv("VLLM_USE_TRITON_AWQ", "0")
input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
qweight = torch.randint(-2000000000, qweight = torch.randint(-2000000000,
2000000000, (8192, 256), 2000000000, (8192, 256),
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch
import pytest import pytest
import torch import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.rocm import RocmPlatform from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_BACKEND_ENV_VAR
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -17,15 +15,19 @@ def clear_cache(): ...@@ -17,15 +15,19 @@ def clear_cache():
_cached_get_attn_backend.cache_clear() _cached_get_attn_backend.cache_clear()
def test_selector(monkeypatch): def test_selector(monkeypatch: pytest.MonkeyPatch):
"""Test that the attention selector for ROCm. with monkeypatch.context() as m:
""" m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
override_backend_env_variable(monkeypatch, "ROCM_FLASH")
with patch("vllm.attention.selector.current_platform", RocmPlatform()): # Set the current platform to ROCm using monkeypatch
monkeypatch.setattr("vllm.attention.selector.current_platform",
RocmPlatform())
# Test standard ROCm attention
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert (backend.get_name() == "ROCM_FLASH" assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "ROCM_ATTN_VLLM_V1") or backend.get_name() == "ROCM_ATTN_VLLM_V1")
# mla test for deepseek related # mla test for deepseek related
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
False, True) False, True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment