Unverified commit a73e183e, authored by Sibi and committed by GitHub
Browse files

[Misc] Replace os environ to monkeypatch in test suite (#14516)


Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
parent 1e799b7e
......@@ -522,7 +522,7 @@ steps:
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
......
......@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False])
def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner,
model: str,
backend: str,
......@@ -63,7 +64,8 @@ def test_models(
pytest.skip(
f"{backend} does not support gemma2 with full context length.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
......@@ -80,7 +82,8 @@ def test_models(
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
......@@ -104,6 +107,7 @@ def test_models(
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
monkeypatch: pytest.MonkeyPatch,
hf_runner,
vllm_runner,
example_prompts,
......@@ -116,13 +120,17 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")
with monkeypatch.context() as monkeypatch_context:
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test Ray Compiled Graph
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
if attention_backend:
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
monkeypatch_context.setenv(
"VLLM_ATTENTION_BACKEND",
attention_backend,
)
dtype = "half"
max_tokens = 5
......@@ -130,13 +138,16 @@ def test_models_distributed(
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
# will hurt multiprocessing backend with fork method
# (the default method).
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
......
......@@ -7,16 +7,22 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`.
"""
import os
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct",
......@@ -24,12 +30,14 @@ MODELS = [
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("model", MODELS)
......@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models(
hf_runner,
vllm_runner,
hf_runner: HfRunner,
vllm_runner: VllmRunner,
example_prompts,
model: str,
dtype: str,
......@@ -52,13 +60,14 @@ def test_models(
enforce_eager: bool,
tensor_parallel_size: int,
attention_backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Checks exact match decode between huggingface model and vllm runner with
chunked prefill.
"""
override_backend_env_variable(monkeypatch, attention_backend)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
......@@ -75,7 +84,8 @@ def test_models(
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
......@@ -90,21 +100,21 @@ def test_models(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed(
hf_runner,
vllm_runner,
hf_runner: HfRunner,
vllm_runner: VllmRunner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
override_backend_env_variable(monkeypatch, attention_backend)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test Ray Compiled Graph
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
dtype = "half"
max_tokens = 5
......@@ -119,7 +129,8 @@ def test_models_distributed(
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# will hurt multiprocessing backend with
# fork method (the default method).
with vllm_runner(
model,
......@@ -130,7 +141,10 @@ def test_models_distributed(
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
max_tokens,
)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
......@@ -158,7 +172,7 @@ def test_models_distributed(
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache(
vllm_runner,
vllm_runner: VllmRunner,
example_prompts,
kv_cache_dtype: str,
model: str,
......@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching(
vllm_runner,
vllm_runner: VllmRunner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
......@@ -254,8 +268,10 @@ def test_with_prefix_caching(
) as vllm_model:
outputs[enable] = []
for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy([prompt],
max_tokens)
outputs[enable] += vllm_model.generate_greedy(
[prompt],
max_tokens,
)
check_outputs_equal(
outputs_0_lst=outputs[False],
......@@ -274,8 +290,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu(
hf_runner,
vllm_runner,
hf_runner: HfRunner,
vllm_runner: VllmRunner,
example_prompts,
model: str,
dtype: str,
......@@ -283,7 +299,7 @@ def test_models_cpu(
chunked_prefill_token_size: int,
enforce_eager: bool,
attention_backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
test_models(
hf_runner,
......@@ -307,7 +323,7 @@ def test_models_cpu(
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu(
vllm_runner,
vllm_runner: VllmRunner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
......
......@@ -123,9 +123,9 @@ def test_cumem_with_cudagraph():
# sleep mode with pytorch checkpoint
("facebook/opt-125m", False),
])
def test_end_to_end(model: str, use_v1: bool):
import os
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
......@@ -158,5 +158,3 @@ def test_end_to_end(model: str, use_v1: bool):
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text
del os.environ["VLLM_USE_V1"]
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import dataclasses
from typing import Optional
import pytest
......@@ -22,8 +22,11 @@ class TestSetting:
fullgraph: bool
# representative settings for testing
test_settings = [
# we cannot afford testing the full Cartesian product
# of all models and all levels
@pytest.mark.parametrize(
"test_setting",
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
......@@ -84,13 +87,11 @@ test_settings = [
method="generate_with_image",
fullgraph=False,
),
]
# we cannot afford testing the full Cartesian product
# of all models and all levels
@pytest.mark.parametrize("test_setting", test_settings)
def test_compile_correctness(test_setting: TestSetting):
])
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple suites, with different GPUs.
# make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests.
......@@ -103,13 +104,17 @@ def test_compile_correctness(test_setting: TestSetting):
fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
["-tp", str(tp_size)]
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
final_args = [
"--enforce-eager", *model_args, "-pp",
str(pp_size), "-tp",
str(tp_size)
]
all_args: list[list[str]] = []
all_envs: list[Optional[dict[str, str]]] = []
all_envs: list[dict[str, str] | None] = []
for level in [
CompilationLevel.NO_COMPILATION,
......
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from typing import Any
import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationLevel
from vllm.platforms import current_platform
from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.fixture(params=None, name="model_info")
def models_list_fixture(request):
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
return TEST_MODELS
@pytest.mark.parametrize(
"optimization_level",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
)
@pytest.mark.parametrize("model_info", "", indirect=True)
@fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level):
model = model_info[0]
model_kwargs = model_info[1]
check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1)
def test_full_graph(
monkeypatch: pytest.MonkeyPatch,
model_info: tuple[str, dict[str, Any]],
optimization_level: int,
):
model, model_kwargs = model_info
with monkeypatch.context() as m:
# make sure these models can be captured in full graph mode
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(
model=model,
enforce_eager=True,
tensor_parallel_size=1,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# SPDX-License-Identifier: Apache-2.0
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
# (model name, extra keyword arguments for ``LLM``) pairs.
# The first group is always included; the entries appended below depend on
# which quantization methods ``is_quant_method_supported`` reports.
TEST_MODELS = [
    ("facebook/opt-125m", {}),
    (
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
        {
            "dtype": torch.float16,
            "quantization": "compressed-tensors",
        },
    ),
    (
        "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
        {
            "dtype": torch.float16,
            "quantization": "compressed-tensors",
        },
    ),
    (
        "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
        {
            "quantization": "compressed-tensors",
        },
    ),
    ("meta-llama/Llama-3.2-1B-Instruct", {}),
]

if is_quant_method_supported("aqlm"):
    TEST_MODELS.append(
        ("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {"quantization": "aqlm"}))

# TODO: figure out why this fails.
# The leading ``False`` keeps this entry disabled without deleting it.
if False and is_quant_method_supported("gguf"):  # noqa: SIM223
    TEST_MODELS.append(
        ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {"quantization": "gguf"}))

if is_quant_method_supported("gptq"):
    TEST_MODELS.append(
        ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {"quantization": "gptq"}))

if is_quant_method_supported("gptq_marlin"):
    TEST_MODELS.append(
        ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
         {"quantization": "gptq_marlin"}))

if is_quant_method_supported("gptq_marlin_24"):
    TEST_MODELS.append(
        ("alexm-nm/tinyllama-24-marlin24-4bit-g128",
         {"quantization": "gptq_marlin_24"}))

if is_quant_method_supported("marlin"):
    TEST_MODELS.append(
        ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
         {"quantization": "marlin"}))

# The AWQ entry is skipped on ROCm; the platform check runs first, so the
# support probe is not called there.
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
    TEST_MODELS.append(
        ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {"quantization": "AWQ"}))
def check_full_graph_support(model,
                             model_kwargs,
                             optimization_level,
                             tp_size=1):
    """Build an ``LLM`` for ``model`` and generate from a few fixed prompts.

    ``VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE`` is set to ``"1"`` only while the
    engine is constructed and used, and the previous environment is restored
    afterwards so the setting does not leak into other tests.

    Args:
        model: Model name passed to ``LLM(model=...)``.
        model_kwargs: Extra keyword arguments forwarded to ``LLM``.
        optimization_level: Value passed as ``compilation_config``.
        tp_size: Value passed as ``tensor_parallel_size``.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    from unittest.mock import patch

    print(f"MODEL={model}")

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)

    # make sure these models can be captured in full graph mode.
    # patch.dict restores os.environ on exit, even if construction or
    # generation raises.
    with patch.dict(os.environ, {"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": "1"}):
        llm = LLM(model=model,
                  enforce_eager=True,
                  tensor_parallel_size=tp_size,
                  disable_custom_all_reduce=True,
                  compilation_config=optimization_level,
                  **model_kwargs)
        outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
......@@ -3,7 +3,10 @@
Run `pytest tests/distributed/test_comm_ops.py`.
"""
import os
from __future__ import annotations
from typing import Any, Callable
import pytest
import ray
......@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
def all_reduce_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
......@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
def all_gather_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
......@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
def broadcast_tensor_dict_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
......@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
def send_recv_tensor_dict_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
......@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
def send_recv_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
......@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel(tp_size, test_target):
multi_process_parallel(tp_size, 1, test_target)
def test_multi_process_tensor_parallel(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
test_target: Callable[..., Any],
):
multi_process_parallel(monkeypatch, tp_size, 1, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
......@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
def test_multi_process_pipeline_parallel(pp_size, test_target):
multi_process_parallel(1, pp_size, test_target)
def test_multi_process_pipeline_parallel(
monkeypatch: pytest.MonkeyPatch,
pp_size: int,
test_target: Callable[..., Any],
):
multi_process_parallel(monkeypatch, 1, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
......@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel_pipeline_parallel(
tp_size, pp_size, test_target):
multi_process_parallel(tp_size, pp_size, test_target)
tp_size: int,
pp_size: int,
test_target: Callable[..., Any],
monkeypatch: pytest.MonkeyPatch,
):
multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
# SPDX-License-Identifier: Apache-2.0
import os
import random
import pytest
......@@ -23,8 +22,15 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
def graph_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pp_size,
rank,
distributed_init_port,
):
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
......@@ -79,8 +85,15 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
@ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
def eager_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pp_size,
rank,
distributed_init_port,
):
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
......@@ -110,8 +123,14 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
def test_custom_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pipeline_parallel_size,
test_target,
):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
test_target)
......@@ -7,15 +7,17 @@ import pytest
from vllm.distributed.utils import get_pp_indices
def test_custom_layer_partition():
def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
def _verify(partition_str, num_layers, pp_size, goldens):
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
for pp_rank, golden in enumerate(goldens):
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
if bak is not None:
os.environ["VLLM_PP_LAYER_PARTITION"] = bak
m.setenv("VLLM_PP_LAYER_PARTITION", bak)
# Even partition
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
......@@ -55,6 +57,10 @@ def test_custom_layer_partition():
(5, 3, 1, (2, 4)),
(5, 3, 2, (4, 5)),
])
def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
pp_rank: int, indices: tuple[int, int]):
def test_uneven_auto_partition(
num_hidden_layers: int,
pp_size: int,
pp_rank: int,
indices: tuple[int, int],
):
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import os
from typing import TYPE_CHECKING
import pytest
from ..utils import compare_two_settings, fork_new_process_for_each_test
if TYPE_CHECKING:
from typing_extensions import LiteralString
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"),
......@@ -15,7 +19,13 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
"FLASHINFER",
])
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
def test_pp_cudagraph(
monkeypatch: pytest.MonkeyPatch,
PP_SIZE: int,
MODEL_NAME: str,
ATTN_BACKEND: LiteralString,
):
with monkeypatch.context() as m:
cudagraph_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
......@@ -25,7 +35,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
"--distributed-executor-backend",
"mp",
]
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
eager_args = cudagraph_args + ["--enforce-eager"]
......
......@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
@pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
......@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
run_test(more_args)
def test_lm_eval_accuracy_v0_engine(monkeypatch):
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V0 Engine."""
with monkeypatch.context() as m:
......
......@@ -53,21 +53,29 @@ def cache_models():
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch):
def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
# Set HF to offline mode and ensure we can still construct an LLM
with monkeypatch.context() as m:
try:
monkeypatch.setenv("HF_HUB_OFFLINE", "1")
monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
m.setenv("HF_HUB_OFFLINE", "1")
m.setenv("VLLM_NO_USAGE_STATS", "1")
def disable_connect(*args, **kwargs):
raise RuntimeError("No http calls allowed")
monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
disable_connect)
monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
disable_connect)
m.setattr(
urllib3.connection.HTTPConnection,
"connect",
disable_connect,
)
m.setattr(
urllib3.connection.HTTPSConnection,
"connect",
disable_connect,
)
# Need to re-import huggingface_hub and friends to setup offline mode
# Need to re-import huggingface_hub
# and friends to setup offline mode
_re_import_modules()
# Cached model files should be used in offline mode
for model_config in MODEL_CONFIGS:
......@@ -75,10 +83,7 @@ def test_offline_mode(monkeypatch):
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
monkeypatch.delenv("HF_HUB_OFFLINE")
monkeypatch.delenv("VLLM_NO_USAGE_STATS")
_re_import_modules()
pass
def _re_import_modules():
......
......@@ -70,7 +70,7 @@ def run_test(more_args):
@pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(),
reason="V1 currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
......@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
more_args):
"""Run with the V0 Engine."""
with monkeypatch.context() as m:
......
......@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
import pytest
import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.openvino import OpenVinoPlatform
from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
@pytest.fixture(autouse=True)
......@@ -25,54 +24,67 @@ def clear_cache():
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("use_v1", [True, False])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
def test_env(name: str, use_v1: bool, device: str, monkeypatch):
def test_env(
name: str,
use_v1: bool,
device: str,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend.
"""
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
override_backend_env_variable(monkeypatch, name)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, name)
if device == "cpu":
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
assert backend.get_name() == "TORCH_SDPA"
elif device == "hip":
with patch("vllm.attention.selector.current_platform", RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
with patch("vllm.attention.selector.current_platform",
RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED
elif device == "openvino":
with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules',
{'openvino': Mock()}):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
assert backend.get_name() == "OPENVINO"
else:
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
backend = get_attn_backend(16, torch.float16,
torch.float16, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == EXPECTED
def test_flash_attn(monkeypatch):
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""
# TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
# Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
(7, 5))
backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# Reset the monkeypatch for subsequent tests
monkeypatch.undo()
# Unsupported data type
backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL
......@@ -86,10 +98,19 @@ def test_flash_attn(monkeypatch):
assert backend.get_name() != STR_FLASH_ATTN_VAL
# flash-attn is not installed
with patch.dict('sys.modules', {'vllm_flash_attn': None}):
import sys
original_module = sys.modules.get('vllm_flash_attn')
monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# Restore the original module if it existed
if original_module is not None:
monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
original_module)
else:
monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
# Unsupported head size
backend = get_attn_backend(17, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL
......@@ -100,12 +121,14 @@ def test_flash_attn(monkeypatch):
@pytest.mark.parametrize("use_v1", [True, False])
def test_invalid_env(use_v1: bool, monkeypatch):
"""Ignore the invalid env variable if it is set."""
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, patch(
"vllm.attention.selector.current_platform", CudaPlatform()):
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
# Test with head size 32
backend = get_attn_backend(32, torch.float16, None, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
assert backend.get_name() == EXPECTED
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
import torch
......@@ -11,8 +9,9 @@ from vllm import _custom_ops as ops # noqa: F401
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
reason="AWQ is not supported on this GPU type.")
def test_awq_dequantize_opcheck():
os.environ["VLLM_USE_TRITON_AWQ"] = "0"
def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_TRITON_AWQ", "0")
qweight = torch.randint(-2000000000,
2000000000, (8192, 256),
device='cuda',
......@@ -29,8 +28,9 @@ def test_awq_dequantize_opcheck():
@pytest.mark.skip(reason="Not working; needs investigation.")
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
reason="AWQ is not supported on this GPU type.")
def test_awq_gemm_opcheck():
os.environ["VLLM_USE_TRITON_AWQ"] = "0"
def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_TRITON_AWQ", "0")
input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
qweight = torch.randint(-2000000000,
2000000000, (8192, 256),
......
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch
import pytest
import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_BACKEND_ENV_VAR
@pytest.fixture(autouse=True)
......@@ -17,15 +15,19 @@ def clear_cache():
_cached_get_attn_backend.cache_clear()
def test_selector(monkeypatch):
"""Test that the attention selector for ROCm.
"""
override_backend_env_variable(monkeypatch, "ROCM_FLASH")
def test_selector(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
with patch("vllm.attention.selector.current_platform", RocmPlatform()):
# Set the current platform to ROCm using monkeypatch
monkeypatch.setattr("vllm.attention.selector.current_platform",
RocmPlatform())
# Test standard ROCm attention
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "ROCM_ATTN_VLLM_V1")
# mla test for deepseek related
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
False, True)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment