Unverified Commit a73e183e authored by Sibi's avatar Sibi Committed by GitHub
Browse files

[Misc] Replace os environ to monkeypatch in test suite (#14516)


Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
parent 1e799b7e
...@@ -522,7 +522,7 @@ steps: ...@@ -522,7 +522,7 @@ steps:
# TODO: investigate and fix # TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
......
...@@ -47,6 +47,7 @@ def test_vllm_gc_ed(): ...@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
def test_models( def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
model: str, model: str,
backend: str, backend: str,
...@@ -63,31 +64,33 @@ def test_models( ...@@ -63,31 +64,33 @@ def test_models(
pytest.skip( pytest.skip(
f"{backend} does not support gemma2 with full context length.") f"{backend} does not support gemma2 with full context length.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2 # 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096 # gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window # we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join( prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:" str(i) for i in range(1024)) + " are:"
example_prompts = [prompt] example_prompts = [prompt]
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model, with VllmRunner(model,
max_model_len=8192, max_model_len=8192,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
...@@ -104,6 +107,7 @@ def test_models( ...@@ -104,6 +107,7 @@ def test_models(
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
]) ])
def test_models_distributed( def test_models_distributed(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
vllm_runner, vllm_runner,
example_prompts, example_prompts,
...@@ -116,34 +120,41 @@ def test_models_distributed( ...@@ -116,34 +120,41 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE: if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}") pytest.skip(f"Skip test for {test_suite}")
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa with monkeypatch.context() as monkeypatch_context:
# test Ray Compiled Graph if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" # test Ray Compiled Graph
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
if attention_backend:
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend if attention_backend:
monkeypatch_context.setenv(
dtype = "half" "VLLM_ATTENTION_BACKEND",
max_tokens = 5 attention_backend,
)
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. dtype = "half"
# if we run HF first, the cuda initialization will be done and it max_tokens = 5
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model, # NOTE: take care of the order. run vLLM first, and then run HF.
dtype=dtype, # vLLM needs a fresh new process without cuda initialization.
tensor_parallel_size=2, # if we run HF first, the cuda initialization will be done and it
distributed_executor_backend=distributed_executor_backend # will hurt multiprocessing backend with fork method
) as vllm_model: # (the default method).
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) with vllm_runner(
model,
with hf_runner(model, dtype=dtype) as hf_model: dtype=dtype,
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
check_outputs_equal( ) as vllm_model:
outputs_0_lst=hf_outputs, vllm_outputs = vllm_model.generate_greedy(example_prompts,
outputs_1_lst=vllm_outputs, max_tokens)
name_0="hf",
name_1="vllm", with hf_runner(model, dtype=dtype) as hf_model:
) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
...@@ -7,16 +7,22 @@ prefill requests are chunked. ...@@ -7,16 +7,22 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`. Run `pytest tests/models/test_chunked_prefill.py`.
""" """
import os
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
...@@ -24,12 +30,14 @@ MODELS = [ ...@@ -24,12 +30,14 @@ MODELS = [
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
Since this module is V0 only, set VLLM_USE_V1=0 for Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file. all tests in the file.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
...@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch): ...@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models( def test_models(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
...@@ -52,37 +60,39 @@ def test_models( ...@@ -52,37 +60,39 @@ def test_models(
enforce_eager: bool, enforce_eager: bool,
tensor_parallel_size: int, tensor_parallel_size: int,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Checks exact match decode between huggingface model and vllm runner with Checks exact match decode between huggingface model and vllm runner with
chunked prefill. chunked prefill.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
max_num_seqs = chunked_prefill_token_size max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True, enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
...@@ -90,57 +100,61 @@ def test_models( ...@@ -90,57 +100,61 @@ def test_models(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed( def test_models_distributed(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
distributed_executor_backend: str, distributed_executor_backend: str,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct" if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"): and distributed_executor_backend == "ray"):
# test Ray Compiled Graph # test Ray Compiled Graph
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
dtype = "half" dtype = "half"
max_tokens = 5 max_tokens = 5
chunked_prefill_token_size = 16 chunked_prefill_token_size = 16
# Add a chunked prefill config. # Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256) max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1 assert chunked_prefill_token_size != -1
enable_chunked_prefill = True enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with
# fork method (the default method).
# NOTE: take care of the order. run vLLM first, and then run HF. with vllm_runner(
# vLLM needs a fresh new process without cuda initialization. model,
# if we run HF first, the cuda initialization will be done and it dtype=dtype,
# will hurt multiprocessing backend with fork method (the default method). tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
with vllm_runner( enable_chunked_prefill=enable_chunked_prefill,
model, max_num_batched_tokens=max_num_batched_tokens,
dtype=dtype, distributed_executor_backend=distributed_executor_backend,
tensor_parallel_size=2, ) as vllm_model:
max_num_seqs=max_num_seqs, vllm_outputs = vllm_model.generate_greedy(
enable_chunked_prefill=enable_chunked_prefill, example_prompts,
max_num_batched_tokens=max_num_batched_tokens, max_tokens,
distributed_executor_backend=distributed_executor_backend, )
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -158,7 +172,7 @@ def test_models_distributed( ...@@ -158,7 +172,7 @@ def test_models_distributed(
# the async postprocessor # the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True]) @pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache( def test_models_with_fp8_kv_cache(
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
kv_cache_dtype: str, kv_cache_dtype: str,
model: str, model: str,
...@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache( ...@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching( def test_with_prefix_caching(
vllm_runner, vllm_runner: VllmRunner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
...@@ -254,8 +268,10 @@ def test_with_prefix_caching( ...@@ -254,8 +268,10 @@ def test_with_prefix_caching(
) as vllm_model: ) as vllm_model:
outputs[enable] = [] outputs[enable] = []
for prompt in full_prompts: for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy([prompt], outputs[enable] += vllm_model.generate_greedy(
max_tokens) [prompt],
max_tokens,
)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=outputs[False], outputs_0_lst=outputs[False],
...@@ -274,8 +290,8 @@ def test_with_prefix_caching( ...@@ -274,8 +290,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu( def test_models_cpu(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
...@@ -283,7 +299,7 @@ def test_models_cpu( ...@@ -283,7 +299,7 @@ def test_models_cpu(
chunked_prefill_token_size: int, chunked_prefill_token_size: int,
enforce_eager: bool, enforce_eager: bool,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
test_models( test_models(
hf_runner, hf_runner,
...@@ -307,7 +323,7 @@ def test_models_cpu( ...@@ -307,7 +323,7 @@ def test_models_cpu(
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu( def test_with_prefix_caching_cpu(
vllm_runner, vllm_runner: VllmRunner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
......
...@@ -123,40 +123,38 @@ def test_cumem_with_cudagraph(): ...@@ -123,40 +123,38 @@ def test_cumem_with_cudagraph():
# sleep mode with pytorch checkpoint # sleep mode with pytorch checkpoint
("facebook/opt-125m", False), ("facebook/opt-125m", False),
]) ])
def test_end_to_end(model: str, use_v1: bool): def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
import os with monkeypatch.context() as m:
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
free, total = torch.cuda.mem_get_info() free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True) llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?" prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10) sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params) output = llm.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only # which is difficult to measure in the test. therefore, we only
# test sleep level 1 here. # test sleep level 1 here.
llm.sleep(level=1) llm.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool, # now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights) # and it should be less than the model weights (1B model, 2GiB weights)
# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be released from PyTorch due to a known bug, # is captured but cannot be released from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected. # therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1. # in V1.
if use_v1: if use_v1:
assert used_bytes < 7 * GiB_bytes assert used_bytes < 7 * GiB_bytes
else: else:
assert used_bytes < 2 * GiB_bytes assert used_bytes < 2 * GiB_bytes
llm.wake_up() llm.wake_up()
output2 = llm.generate(prompt, sampling_params) output2 = llm.generate(prompt, sampling_params)
# cmp output # cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text assert output[0].outputs[0].text == output2[0].outputs[0].text
del os.environ["VLLM_USE_V1"]
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import dataclasses import dataclasses
from typing import Optional
import pytest import pytest
...@@ -22,75 +22,76 @@ class TestSetting: ...@@ -22,75 +22,76 @@ class TestSetting:
fullgraph: bool fullgraph: bool
# representative settings for testing
test_settings = [
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
]
# we cannot afford testing the full Cartesian product # we cannot afford testing the full Cartesian product
# of all models and all levels # of all models and all levels
@pytest.mark.parametrize("test_setting", test_settings) @pytest.mark.parametrize(
def test_compile_correctness(test_setting: TestSetting): "test_setting",
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
])
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple suites, with different GPUs. # this test is run under multiple suites, with different GPUs.
# make sure we only run the test with correct CUDA devices. # make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests. # don't use "<", as it will duplicate the tests.
...@@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting): ...@@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting):
fullgraph = test_setting.fullgraph fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size: if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.") pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
["-tp", str(tp_size)]
all_args: list[list[str]] = [] with monkeypatch.context() as m:
all_envs: list[Optional[dict[str, str]]] = [] m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
final_args = [
"--enforce-eager", *model_args, "-pp",
str(pp_size), "-tp",
str(tp_size)
]
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
for level in [ for level in [
CompilationLevel.NO_COMPILATION, CompilationLevel.NO_COMPILATION,
CompilationLevel.PIECEWISE, CompilationLevel.PIECEWISE,
]: ]:
all_args.append(final_args + [f"-O{level}"]) all_args.append(final_args + [f"-O{level}"])
all_envs.append({}) all_envs.append({})
# inductor will change the output, so we only compare if the output # inductor will change the output, so we only compare if the output
# is close, not exactly the same. # is close, not exactly the same.
compare_all_settings( compare_all_settings(
model, model,
all_args, all_args,
all_envs, all_envs,
method=method if method != "generate" else "generate_close") method=method if method != "generate" else "generate_close")
all_envs.clear() all_envs.clear()
all_args.clear() all_args.clear()
for level in [ for level in [
CompilationLevel.NO_COMPILATION, CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE, CompilationLevel.DYNAMO_ONCE,
]: ]:
all_args.append(final_args + [f"-O{level}"]) all_args.append(final_args + [f"-O{level}"])
all_envs.append({}) all_envs.append({})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph # "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][ all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method) compare_all_settings(model, all_args * 3, all_envs, method=method)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from typing import Any
import pytest import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
from vllm.platforms import current_platform
from ..utils import fork_new_process_for_each_test from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support
@pytest.mark.parametrize("model_info", TEST_MODELS) @pytest.fixture(params=None, name="model_info")
def models_list_fixture(request):
    """Build the list of ``(model name, extra LLM kwargs)`` pairs.

    A fixed set of unquantized and compressed-tensors checkpoints is always
    included. Every other checkpoint is appended only if
    ``is_quant_method_supported`` accepts its quantization method; the AWQ
    checkpoint is additionally left out when ``current_platform.is_rocm()``
    is true.

    NOTE(review): ``request`` is never read and the complete list is
    returned, not a single pair. Confirm this matches what the consuming
    test expects to receive for ``model_info``.
    """
    models: list[tuple[str, dict[str, Any]]] = [
        ("facebook/opt-125m", {}),
        (
            "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
            dict(dtype=torch.float16, quantization="compressed-tensors"),
        ),
        (
            "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
            dict(dtype=torch.float16, quantization="compressed-tensors"),
        ),
        (
            "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
            dict(quantization="compressed-tensors"),
        ),
        ("meta-llama/Llama-3.2-1B-Instruct", {}),
    ]

    # Rows are (enabled, quantization method, checkpoint), probed in order.
    # The method string doubles as the "quantization" kwarg value.
    gated_checkpoints = (
        (True, "aqlm", "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"),
        # TODO: figure out why this fails.
        (False, "gguf", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"),
        (True, "gptq", "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
        (True, "gptq_marlin", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"),
        (True, "gptq_marlin_24", "alexm-nm/tinyllama-24-marlin24-4bit-g128"),
        (True, "marlin", "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"),
    )
    for enabled, method, checkpoint in gated_checkpoints:
        # `enabled` is tested first so a disabled row never probes support.
        if enabled and is_quant_method_supported(method):
            models.append((checkpoint, {"quantization": method}))

    # AWQ is probed with the lowercase name but passed to the engine as "AWQ".
    if not current_platform.is_rocm():
        if is_quant_method_supported("awq"):
            models.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
                "quantization": "AWQ"
            }))

    return models
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "optimization_level",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
)
@pytest.mark.parametrize("model_info", "", indirect=True)
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level): def test_full_graph(
model = model_info[0] monkeypatch: pytest.MonkeyPatch,
model_kwargs = model_info[1] model_info: tuple[str, dict[str, Any]],
check_full_graph_support(model, optimization_level: int,
model_kwargs, ):
optimization_level, model, model_kwargs = model_info
tp_size=1)
with monkeypatch.context() as m:
# make sure these models can be captured in full graph mode
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(
model=model,
enforce_eager=True,
tensor_parallel_size=1,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# SPDX-License-Identifier: Apache-2.0
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
# (model name, extra LLM kwargs) pairs that are always exercised.
TEST_MODELS = [
    ("facebook/opt-125m", {}),
    (
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
        dict(dtype=torch.float16, quantization="compressed-tensors"),
    ),
    (
        "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
        dict(dtype=torch.float16, quantization="compressed-tensors"),
    ),
    (
        "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
        dict(quantization="compressed-tensors"),
    ),
    ("meta-llama/Llama-3.2-1B-Instruct", {}),
]

# Quantized checkpoints below are added only when the corresponding
# quantization method is reported as supported.
if is_quant_method_supported("aqlm"):
    TEST_MODELS.extend([
        ("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
         dict(quantization="aqlm")),
    ])

# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"):  # noqa: SIM223
    TEST_MODELS.extend([
        ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
         dict(quantization="gguf")),
    ])

if is_quant_method_supported("gptq"):
    TEST_MODELS.extend([
        ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
         dict(quantization="gptq")),
    ])

if is_quant_method_supported("gptq_marlin"):
    TEST_MODELS.extend([
        ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
         dict(quantization="gptq_marlin")),
    ])

if is_quant_method_supported("gptq_marlin_24"):
    TEST_MODELS.extend([
        ("alexm-nm/tinyllama-24-marlin24-4bit-g128",
         dict(quantization="gptq_marlin_24")),
    ])

if is_quant_method_supported("marlin"):
    TEST_MODELS.extend([
        ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
         dict(quantization="marlin")),
    ])

# The AWQ checkpoint is skipped on ROCm; support is only probed elsewhere.
if not current_platform.is_rocm():
    if is_quant_method_supported("awq"):
        TEST_MODELS.extend([
            ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
             dict(quantization="AWQ")),
        ])
def check_full_graph_support(model,
                             model_kwargs,
                             optimization_level,
                             tp_size=1):
    """Run a short greedy generation with full-graph capture requested.

    ``VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE`` is set to ``"1"`` while the model
    is built and queried, then restored to its previous state so the setting
    does not leak into other tests running in the same process. Nothing is
    asserted on the generated text; the prompts and completions are printed,
    and the check passes if no exception is raised.

    Args:
        model: Model identifier passed as ``LLM(model=...)``.
        model_kwargs: Extra keyword arguments forwarded to ``LLM``.
        optimization_level: Value passed as ``compilation_config``.
        tp_size: Value passed as ``tensor_parallel_size``.
    """
    env_key = "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
    # Remember the caller's value (None means "unset") so it can be restored.
    previous_value = os.environ.get(env_key)

    # make sure these models can be captured in full graph mode
    os.environ[env_key] = "1"
    try:
        print(f"MODEL={model}")

        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]
        sampling_params = SamplingParams(temperature=0)
        llm = LLM(model=model,
                  enforce_eager=True,
                  tensor_parallel_size=tp_size,
                  disable_custom_all_reduce=True,
                  compilation_config=optimization_level,
                  **model_kwargs)

        outputs = llm.generate(prompts, sampling_params)

        # Print the outputs.
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    finally:
        # Undo the override even if model construction or generation raised.
        if previous_value is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous_value
...@@ -1110,4 +1110,4 @@ def pytest_collection_modifyitems(config, items): ...@@ -1110,4 +1110,4 @@ def pytest_collection_modifyitems(config, items):
skip_optional = pytest.mark.skip(reason="need --optional option to run") skip_optional = pytest.mark.skip(reason="need --optional option to run")
for item in items: for item in items:
if "optional" in item.keywords: if "optional" in item.keywords:
item.add_marker(skip_optional) item.add_marker(skip_optional)
\ No newline at end of file
...@@ -3,7 +3,10 @@ ...@@ -3,7 +3,10 @@
Run `pytest tests/distributed/test_comm_ops.py`. Run `pytest tests/distributed/test_comm_ops.py`.
""" """
import os
from __future__ import annotations
from typing import Any, Callable
import pytest import pytest
import ray import ray
...@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel ...@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, def all_reduce_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, def all_gather_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, def broadcast_tensor_dict_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, def send_recv_tensor_dict_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
os.environ.pop("CUDA_VISIBLE_DEVICES", None) tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, def send_recv_test_worker(
distributed_init_port: str): monkeypatch: pytest.MonkeyPatch,
os.environ.pop("CUDA_VISIBLE_DEVICES", None) tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
all_reduce_test_worker, all_gather_test_worker, all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker broadcast_tensor_dict_test_worker
]) ])
def test_multi_process_tensor_parallel(tp_size, test_target): def test_multi_process_tensor_parallel(
multi_process_parallel(tp_size, 1, test_target) monkeypatch: pytest.MonkeyPatch,
tp_size: int,
test_target: Callable[..., Any],
):
multi_process_parallel(monkeypatch, tp_size, 1, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
...@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target): ...@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
@pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
def test_multi_process_pipeline_parallel(pp_size, test_target): def test_multi_process_pipeline_parallel(
multi_process_parallel(1, pp_size, test_target) monkeypatch: pytest.MonkeyPatch,
pp_size: int,
test_target: Callable[..., Any],
):
multi_process_parallel(monkeypatch, 1, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4, @pytest.mark.skipif(torch.cuda.device_count() < 4,
...@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target): ...@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
broadcast_tensor_dict_test_worker broadcast_tensor_dict_test_worker
]) ])
def test_multi_process_tensor_parallel_pipeline_parallel( def test_multi_process_tensor_parallel_pipeline_parallel(
tp_size, pp_size, test_target): tp_size: int,
multi_process_parallel(tp_size, pp_size, test_target) pp_size: int,
test_target: Callable[..., Any],
monkeypatch: pytest.MonkeyPatch,
):
multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import random import random
import pytest import pytest
...@@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes): ...@@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): def graph_allreduce(
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch: pytest.MonkeyPatch,
device = torch.device(f"cuda:{rank}") tp_size,
torch.cuda.set_device(device) pp_size,
init_test_distributed_environment(tp_size, pp_size, rank, rank,
distributed_init_port) distributed_init_port,
ensure_model_parallel_initialized(tp_size, pp_size) ):
group = get_tensor_model_parallel_group().device_group with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
# A small all_reduce for warmup. device = torch.device(f"cuda:{rank}")
# this is needed because device communicators might be created lazily torch.cuda.set_device(device)
# (e.g. NCCL). This will ensure that the communicator is initialized init_test_distributed_environment(tp_size, pp_size, rank,
# before any communication happens, so that this group can be used for distributed_init_port)
# graph capture immediately. ensure_model_parallel_initialized(tp_size, pp_size)
data = torch.zeros(1) group = get_tensor_model_parallel_group().device_group
data = data.to(device=device)
torch.distributed.all_reduce(data, group=group) # A small all_reduce for warmup.
torch.cuda.synchronize() # this is needed because device communicators might be created lazily
del data # (e.g. NCCL). This will ensure that the communicator is initialized
# before any communication happens, so that this group can be used for
# we use the first group to communicate once # graph capture immediately.
# and the second group to communicate twice data = torch.zeros(1)
# and so on data = data.to(device=device)
# this is used to demonstrate that each group can torch.distributed.all_reduce(data, group=group)
# communicate independently torch.cuda.synchronize()
num_communication = rank // tp_size + 1 del data
for sz in test_sizes: # we use the first group to communicate once
for dtype in [torch.float32, torch.float16, torch.bfloat16]: # and the second group to communicate twice
with graph_capture(device=device) as graph_capture_context: # and so on
# use integers so result matches NCCL exactly # this is used to demonstrate that each group can
inp1 = torch.randint(1, # communicate independently
16, (sz, ), num_communication = rank // tp_size + 1
dtype=dtype,
device=torch.cuda.current_device()) for sz in test_sizes:
inp2 = torch.randint(1, for dtype in [torch.float32, torch.float16, torch.bfloat16]:
16, (sz, ), with graph_capture(device=device) as graph_capture_context:
dtype=dtype, # use integers so result matches NCCL exactly
device=torch.cuda.current_device()) inp1 = torch.randint(1,
torch.cuda.synchronize() 16, (sz, ),
graph = torch.cuda.CUDAGraph() dtype=dtype,
with torch.cuda.graph(graph, device=torch.cuda.current_device())
stream=graph_capture_context.stream): inp2 = torch.randint(1,
for i in range(num_communication): 16, (sz, ),
out1 = tensor_model_parallel_all_reduce(inp1) dtype=dtype,
# the input buffer is immediately modified to test device=torch.cuda.current_device())
# synchronization torch.cuda.synchronize()
dist.all_reduce(inp1, group=group) graph = torch.cuda.CUDAGraph()
out2 = tensor_model_parallel_all_reduce(inp2) with torch.cuda.graph(graph,
dist.all_reduce(inp2, group=group) stream=graph_capture_context.stream):
graph.replay() for i in range(num_communication):
torch.testing.assert_close(out1, inp1) out1 = tensor_model_parallel_all_reduce(inp1)
torch.testing.assert_close(out2, inp2) # the input buffer is immediately modified to test
# synchronization
dist.all_reduce(inp1, group=group)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2, group=group)
graph.replay()
torch.testing.assert_close(out1, inp1)
torch.testing.assert_close(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): def eager_allreduce(
os.environ.pop("CUDA_VISIBLE_DEVICES", None) monkeypatch: pytest.MonkeyPatch,
device = torch.device(f"cuda:{rank}") tp_size,
torch.cuda.set_device(device) pp_size,
init_test_distributed_environment(tp_size, pp_size, rank, rank,
distributed_init_port) distributed_init_port,
):
# we use the first group to communicate once with monkeypatch.context() as m:
# and the second group to communicate twice m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
# and so on device = torch.device(f"cuda:{rank}")
# this is used to demonstrate that each group can torch.cuda.set_device(device)
# communicate independently init_test_distributed_environment(tp_size, pp_size, rank,
num_communication = rank // tp_size + 1 distributed_init_port)
sz = 1024
fa = get_tp_group().ca_comm # we use the first group to communicate once
inp = torch.ones(sz, dtype=torch.float32, device=device) # and the second group to communicate twice
out = inp # and so on
for _ in range(num_communication): # this is used to demonstrate that each group can
out = fa.all_reduce(out, registered=False) # communicate independently
torch.testing.assert_close(out, inp * (tp_size**num_communication)) num_communication = rank // tp_size + 1
sz = 1024
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) fa = get_tp_group().ca_comm
out = inp inp = torch.ones(sz, dtype=torch.float32, device=device)
for _ in range(num_communication): out = inp
out = fa.all_reduce(out, registered=False) for _ in range(num_communication):
torch.testing.assert_close(out, inp * (tp_size**num_communication)) out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = inp
for _ in range(num_communication):
out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
@pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2]) @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): def test_custom_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pipeline_parallel_size,
test_target,
):
world_size = tp_size * pipeline_parallel_size world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count(): if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.") pytest.skip("Not enough GPUs to run the test.")
multi_process_parallel(tp_size, pipeline_parallel_size, test_target) multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
test_target)
...@@ -7,33 +7,35 @@ import pytest ...@@ -7,33 +7,35 @@ import pytest
from vllm.distributed.utils import get_pp_indices from vllm.distributed.utils import get_pp_indices
def test_custom_layer_partition(): def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
def _verify(partition_str, num_layers, pp_size, goldens): with monkeypatch.context() as m:
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str def _verify(partition_str, num_layers, pp_size, goldens):
for pp_rank, golden in enumerate(goldens): bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
if bak is not None: for pp_rank, golden in enumerate(goldens):
os.environ["VLLM_PP_LAYER_PARTITION"] = bak assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
if bak is not None:
# Even partition m.setenv("VLLM_PP_LAYER_PARTITION", bak)
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Balanced partition # Even partition
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Put remainder somewhere # Balanced partition
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
# Invalid partition strings # Put remainder somewhere
with pytest.raises(ValueError): _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) # Invalid partition strings
with pytest.raises(ValueError): with pytest.raises(ValueError):
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Wrong number of partitions with pytest.raises(ValueError):
with pytest.raises(ValueError): _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) # Wrong number of partitions
# Wrong number of layers with pytest.raises(ValueError):
with pytest.raises(ValueError): _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) # Wrong number of layers
with pytest.raises(ValueError):
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -55,6 +57,10 @@ def test_custom_layer_partition(): ...@@ -55,6 +57,10 @@ def test_custom_layer_partition():
(5, 3, 1, (2, 4)), (5, 3, 1, (2, 4)),
(5, 3, 2, (4, 5)), (5, 3, 2, (4, 5)),
]) ])
def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int, def test_uneven_auto_partition(
pp_rank: int, indices: tuple[int, int]): num_hidden_layers: int,
pp_size: int,
pp_rank: int,
indices: tuple[int, int],
):
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size) assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import os from typing import TYPE_CHECKING
import pytest import pytest
from ..utils import compare_two_settings, fork_new_process_for_each_test from ..utils import compare_two_settings, fork_new_process_for_each_test
if TYPE_CHECKING:
from typing_extensions import LiteralString
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"), (2, "JackFram/llama-160m"),
...@@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test ...@@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
"FLASHINFER", "FLASHINFER",
]) ])
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): def test_pp_cudagraph(
cudagraph_args = [ monkeypatch: pytest.MonkeyPatch,
# use half precision for speed and memory savings in CI environment PP_SIZE: int,
"--dtype", MODEL_NAME: str,
"float16", ATTN_BACKEND: LiteralString,
"--pipeline-parallel-size", ):
str(PP_SIZE), with monkeypatch.context() as m:
"--distributed-executor-backend", cudagraph_args = [
"mp", # use half precision for speed and memory savings in CI environment
] "--dtype",
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND "float16",
"--pipeline-parallel-size",
eager_args = cudagraph_args + ["--enforce-eager"] str(PP_SIZE),
"--distributed-executor-backend",
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) "mp",
]
m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
eager_args = cudagraph_args + ["--enforce-eager"]
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
...@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4" ...@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
@pytest.mark.skipif(not current_platform.is_cuda() @pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(), and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU") reason="V1 is currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch): def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine.""" """Run with the V1 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
...@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): ...@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
run_test(more_args) run_test(more_args)
def test_lm_eval_accuracy_v0_engine(monkeypatch): def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V0 Engine.""" """Run with the V0 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
......
...@@ -53,32 +53,37 @@ def cache_models(): ...@@ -53,32 +53,37 @@ def cache_models():
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models") @pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch): def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
# Set HF to offline mode and ensure we can still construct an LLM # Set HF to offline mode and ensure we can still construct an LLM
try: with monkeypatch.context() as m:
monkeypatch.setenv("HF_HUB_OFFLINE", "1") try:
monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1") m.setenv("HF_HUB_OFFLINE", "1")
m.setenv("VLLM_NO_USAGE_STATS", "1")
def disable_connect(*args, **kwargs): def disable_connect(*args, **kwargs):
raise RuntimeError("No http calls allowed") raise RuntimeError("No http calls allowed")
monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect", m.setattr(
disable_connect) urllib3.connection.HTTPConnection,
monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect", "connect",
disable_connect) disable_connect,
)
m.setattr(
urllib3.connection.HTTPSConnection,
"connect",
disable_connect,
)
# Need to re-import huggingface_hub and friends to setup offline mode # Need to re-import huggingface_hub
_re_import_modules() # and friends to setup offline mode
# Cached model files should be used in offline mode _re_import_modules()
for model_config in MODEL_CONFIGS: # Cached model files should be used in offline mode
LLM(**model_config) for model_config in MODEL_CONFIGS:
finally: LLM(**model_config)
# Reset the environment after the test finally:
# NB: Assuming tests are run in online mode # Reset the environment after the test
monkeypatch.delenv("HF_HUB_OFFLINE") # NB: Assuming tests are run in online mode
monkeypatch.delenv("VLLM_NO_USAGE_STATS") _re_import_modules()
_re_import_modules()
pass
def _re_import_modules(): def _re_import_modules():
......
...@@ -70,7 +70,7 @@ def run_test(more_args): ...@@ -70,7 +70,7 @@ def run_test(more_args):
@pytest.mark.skipif(not current_platform.is_cuda() @pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(), and not current_platform.is_tpu(),
reason="V1 currently only supported on CUDA and TPU") reason="V1 currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch): def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine.""" """Run with the V1 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
...@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): ...@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
more_args):
"""Run with the V0 Engine.""" """Run with the V0 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
......
...@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch ...@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
import pytest import pytest
import torch import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.openvino import OpenVinoPlatform from vllm.platforms.openvino import OpenVinoPlatform
from vllm.platforms.rocm import RocmPlatform from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -25,87 +24,111 @@ def clear_cache(): ...@@ -25,87 +24,111 @@ def clear_cache():
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("use_v1", [True, False])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
def test_env(name: str, use_v1: bool, device: str, monkeypatch): def test_env(
name: str,
use_v1: bool,
device: str,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that the attention selector can be set via environment variable. """Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend. Note that we do not test FlashAttn because it is the default backend.
""" """
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") with monkeypatch.context() as m:
override_backend_env_variable(monkeypatch, name) m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, name)
if device == "cpu":
with patch("vllm.attention.selector.current_platform", CpuPlatform()): if device == "cpu":
backend = get_attn_backend(16, torch.float16, torch.float16, 16, with patch("vllm.attention.selector.current_platform",
False) CpuPlatform()):
assert backend.get_name() == "TORCH_SDPA" backend = get_attn_backend(16, torch.float16, torch.float16,
elif device == "hip": 16, False)
with patch("vllm.attention.selector.current_platform", RocmPlatform()): assert backend.get_name() == "TORCH_SDPA"
backend = get_attn_backend(16, torch.float16, torch.float16, 16, elif device == "hip":
False)
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED
elif device == "openvino":
with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules',
{'openvino': Mock()}):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
assert backend.get_name() == "OPENVINO"
else:
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform", with patch("vllm.attention.selector.current_platform",
CudaPlatform()): RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, backend = get_attn_backend(16, torch.float16, torch.float16,
16, False) 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED assert backend.get_name() == EXPECTED
elif device == "openvino":
with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules',
{'openvino': Mock()}):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
assert backend.get_name() == "OPENVINO"
else:
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float16,
torch.float16, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == EXPECTED
def test_flash_attn(monkeypatch): def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation.""" """Test FlashAttn validation."""
# TODO: When testing for v1, pipe in `use_v1` as an argument to # TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend # get_attn_backend
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
# Unsupported CUDA arch # Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=(7, 5)): monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
(7, 5))
backend = get_attn_backend(16, torch.float16, None, 16, False) backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
# Unsupported data type # Reset the monkeypatch for subsequent tests
backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) monkeypatch.undo()
assert backend.get_name() != STR_FLASH_ATTN_VAL
# Unsupported kv cache data type # Unsupported data type
backend = get_attn_backend(16, torch.float16, "fp8", 16, False) backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
# Unsupported block size # Unsupported kv cache data type
backend = get_attn_backend(16, torch.float16, None, 8, False) backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
# flash-attn is not installed # Unsupported block size
with patch.dict('sys.modules', {'vllm_flash_attn': None}): backend = get_attn_backend(16, torch.float16, None, 8, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# flash-attn is not installed
import sys
original_module = sys.modules.get('vllm_flash_attn')
monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
backend = get_attn_backend(16, torch.float16, None, 16, False) backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
# Unsupported head size # Restore the original module if it existed
backend = get_attn_backend(17, torch.float16, None, 16, False) if original_module is not None:
assert backend.get_name() != STR_FLASH_ATTN_VAL monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
original_module)
else:
monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
# Attention-free models should bypass env and use PlaceholderAttention # Unsupported head size
backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) backend = get_attn_backend(17, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
# Attention-free models should bypass env and use PlaceholderAttention
backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
assert backend.get_name() != STR_FLASH_ATTN_VAL
@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("use_v1", [True, False])
def test_invalid_env(use_v1: bool, monkeypatch): def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
"""Ignore the invalid env variable if it is set."""
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") with monkeypatch.context() as m, patch(
override_backend_env_variable(monkeypatch, STR_INVALID_VAL) "vllm.attention.selector.current_platform", CudaPlatform()):
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
with patch("vllm.attention.selector.current_platform", CudaPlatform()): # Test with head size 32
backend = get_attn_backend(32, torch.float16, None, 16, False) backend = get_attn_backend(32, torch.float16, None, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
assert backend.get_name() == EXPECTED assert backend.get_name() == EXPECTED
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
import torch import torch
...@@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401 ...@@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"), @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
reason="AWQ is not supported on this GPU type.") reason="AWQ is not supported on this GPU type.")
def test_awq_dequantize_opcheck(): def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_USE_TRITON_AWQ"] = "0" with monkeypatch.context() as m:
qweight = torch.randint(-2000000000, m.setenv("VLLM_USE_TRITON_AWQ", "0")
2000000000, (8192, 256), qweight = torch.randint(-2000000000,
device='cuda', 2000000000, (8192, 256),
dtype=torch.int32) device='cuda',
scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) dtype=torch.int32)
zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
split_k_iters = 0 zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
thx = 0 split_k_iters = 0
thy = 0 thx = 0
opcheck(torch.ops._C.awq_dequantize, thy = 0
(qweight, scales, zeros, split_k_iters, thx, thy)) opcheck(torch.ops._C.awq_dequantize,
(qweight, scales, zeros, split_k_iters, thx, thy))
@pytest.mark.skip(reason="Not working; needs investigation.") @pytest.mark.skip(reason="Not working; needs investigation.")
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"), @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
reason="AWQ is not supported on this GPU type.") reason="AWQ is not supported on this GPU type.")
def test_awq_gemm_opcheck(): def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_USE_TRITON_AWQ"] = "0" with monkeypatch.context() as m:
input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) m.setenv("VLLM_USE_TRITON_AWQ", "0")
qweight = torch.randint(-2000000000, input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
2000000000, (8192, 256), qweight = torch.randint(-2000000000,
device='cuda', 2000000000, (8192, 256),
dtype=torch.int32) device='cuda',
scales = torch.randint(-2000000000, dtype=torch.int32)
2000000000, (64, 256), scales = torch.randint(-2000000000,
device='cuda', 2000000000, (64, 256),
dtype=torch.int32) device='cuda',
qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) dtype=torch.int32)
split_k_iters = 8 qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
opcheck(torch.ops._C.awq_gemm, split_k_iters = 8
(input, qweight, qzeros, scales, split_k_iters)) opcheck(torch.ops._C.awq_gemm,
(input, qweight, qzeros, scales, split_k_iters))
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch
import pytest import pytest
import torch import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.rocm import RocmPlatform from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_BACKEND_ENV_VAR
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -17,15 +15,19 @@ def clear_cache(): ...@@ -17,15 +15,19 @@ def clear_cache():
_cached_get_attn_backend.cache_clear() _cached_get_attn_backend.cache_clear()
def test_selector(monkeypatch): def test_selector(monkeypatch: pytest.MonkeyPatch):
"""Test that the attention selector for ROCm. with monkeypatch.context() as m:
""" m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
override_backend_env_variable(monkeypatch, "ROCM_FLASH")
with patch("vllm.attention.selector.current_platform", RocmPlatform()): # Set the current platform to ROCm using monkeypatch
monkeypatch.setattr("vllm.attention.selector.current_platform",
RocmPlatform())
# Test standard ROCm attention
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert (backend.get_name() == "ROCM_FLASH" assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "ROCM_ATTN_VLLM_V1") or backend.get_name() == "ROCM_ATTN_VLLM_V1")
# mla test for deepseek related # mla test for deepseek related
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
False, True) False, True)
......
...@@ -36,12 +36,12 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = ( ...@@ -36,12 +36,12 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = (
class QKVInputs(NamedTuple): class QKVInputs(NamedTuple):
''' '''
Data structure for representing unpacked attention inputs, Data structure for representing unpacked attention inputs,
query/key/values and their sequence lengths. query/key/values and their sequence lengths.
Attributes: Attributes:
* {query,key,value}: unpacked (batch_size x padded_seq_len x * {query,key,value}: unpacked (batch_size x padded_seq_len x
num_heads x head_size) attention inputs num_heads x head_size) attention inputs
* q_seq_lens: query sequence lengths list * q_seq_lens: query sequence lengths list
* kv_seq_lens: shared key/value sequence lengths list * kv_seq_lens: shared key/value sequence lengths list
...@@ -56,14 +56,14 @@ class QKVInputs(NamedTuple): ...@@ -56,14 +56,14 @@ class QKVInputs(NamedTuple):
class QKVO(NamedTuple): class QKVO(NamedTuple):
''' '''
Data structure for representing unpacked attention inputs, Data structure for representing unpacked attention inputs,
alongside unpacked known-correct attention output alongside unpacked known-correct attention output
Attributes: Attributes:
* qkv: unpacked (batch_size x padded_seq_len x * qkv: unpacked (batch_size x padded_seq_len x
num_heads x head_size) attention inputs num_heads x head_size) attention inputs
* ideal_output: unpacked (batch_size x padded_seq_len x * ideal_output: unpacked (batch_size x padded_seq_len x
num_heads x head_size) known-correct attention output num_heads x head_size) known-correct attention output
''' '''
...@@ -77,7 +77,7 @@ class PackedQKVInputs(NamedTuple): ...@@ -77,7 +77,7 @@ class PackedQKVInputs(NamedTuple):
Attributes: Attributes:
* {query,key,value}: packed (number_of_tokens x num_heads * {query,key,value}: packed (number_of_tokens x num_heads
x head_size) attention inputs x head_size) attention inputs
* q_start_loc_list: list of query start locations within packed tensor * q_start_loc_list: list of query start locations within packed tensor
* kv_start_loc_list: shared list of key/value start locations within * kv_start_loc_list: shared list of key/value start locations within
...@@ -97,14 +97,14 @@ class PackedQKVInputs(NamedTuple): ...@@ -97,14 +97,14 @@ class PackedQKVInputs(NamedTuple):
class PackedQKVO(NamedTuple): class PackedQKVO(NamedTuple):
''' '''
Data structure for representing packed attention inputs, Data structure for representing packed attention inputs,
alongside packed known-correct attention output alongside packed known-correct attention output
Attributes: Attributes:
* packed_qkv: packed (number_of_tokens x num_heads * packed_qkv: packed (number_of_tokens x num_heads
x head_size) attention inputs x head_size) attention inputs
* ideal_output: packed (number_of_tokens x num_heads * ideal_output: packed (number_of_tokens x num_heads
x head_size) known-correct attention output x head_size) known-correct attention output
''' '''
...@@ -134,7 +134,7 @@ class PhaseTestParameters(NamedTuple): ...@@ -134,7 +134,7 @@ class PhaseTestParameters(NamedTuple):
Attributes: Attributes:
* packed_qkvo: packed (number_of_tokens x num_heads * packed_qkvo: packed (number_of_tokens x num_heads
x head_size) attention inputs & known-correct x head_size) attention inputs & known-correct
output output
* kv_mmap: KV cache memory mapping, specific to this test phase & * kv_mmap: KV cache memory mapping, specific to this test phase &
...@@ -195,7 +195,7 @@ def make_causal_mask( ...@@ -195,7 +195,7 @@ def make_causal_mask(
Create a q_max_seq_len x kv_max_seq_len causal mask Create a q_max_seq_len x kv_max_seq_len causal mask
Arguments: Arguments:
* q_max_seq_len: query max seq len * q_max_seq_len: query max seq len
* kv_max_seq_len: key/value max seq len * kv_max_seq_len: key/value max seq len
...@@ -320,9 +320,9 @@ def make_qkv( ...@@ -320,9 +320,9 @@ def make_qkv(
* max_kv_seq_len: max key/value seq len * max_kv_seq_len: max key/value seq len
* num_heads * num_heads
* head_size * head_size
* is_encoder_decoder_attn: if True, query seqlen may differ from * is_encoder_decoder_attn: if True, query seqlen may differ from
key/value seqlen (as is often the case for cross-attention); key/value seqlen (as is often the case for cross-attention);
o/w, query/key/value seqlens match at each batch index o/w, query/key/value seqlens match at each batch index
(max_kv_seq_len is unused) (max_kv_seq_len is unused)
* force_kv_seq_lens: if not None, overrides kv sequence lengths * force_kv_seq_lens: if not None, overrides kv sequence lengths
* attn_type: encoder, decoder self, or enc/dec cross attention * attn_type: encoder, decoder self, or enc/dec cross attention
...@@ -469,7 +469,7 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device, ...@@ -469,7 +469,7 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device,
Individually pack each of Q, K and V, each with dimensions batch_size x Individually pack each of Q, K and V, each with dimensions batch_size x
padded_seq_len x num_heads x head_size, into respective number_of_tokens x padded_seq_len x num_heads x head_size, into respective number_of_tokens x
num_heads x head_size tensors. num_heads x head_size tensors.
For Q, number_of_tokens = sum(q_seq_lens). For Q, number_of_tokens = sum(q_seq_lens).
For K and V, number_of_tokens = sum(kv_seq_lens) For K and V, number_of_tokens = sum(kv_seq_lens)
...@@ -619,9 +619,9 @@ def make_kv_cache(num_blocks: int, ...@@ -619,9 +619,9 @@ def make_kv_cache(num_blocks: int,
Returns: Returns:
* kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size)
* for backend 'XFORMERS' * for backend 'XFORMERS'
* kv_cache: 2 x num_blocks x block_size x num_heads x head_size * kv_cache: 2 x num_blocks x block_size x num_heads x head_size
* for backend 'FLASH_ATTN' * for backend 'FLASH_ATTN'
''' '''
if backend == 'XFORMERS': if backend == 'XFORMERS':
kv_cache = torch.rand( kv_cache = torch.rand(
...@@ -662,20 +662,20 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], ...@@ -662,20 +662,20 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],
Context: Context:
* Your goal is to test (1) prefill of N prompts, with prompt-lengths * Your goal is to test (1) prefill of N prompts, with prompt-lengths
{K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token {K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token
for all N prompts (N tokens total); the resultant sequence lengths for all N prompts (N tokens total); the resultant sequence lengths
after decode would be {K_i + 1 for i \\in [0,N)} after decode would be {K_i + 1 for i \\in [0,N)}
* The test you want to do requires (1) having the prefill slot mapping * The test you want to do requires (1) having the prefill slot mapping
for all tokens present during prefill, the number of which is for all tokens present during prefill, the number of which is
M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N
decoded tokens decoded tokens
This function consumes a single 1D slot mapping, which is the This function consumes a single 1D slot mapping, which is the
concatenation of N slot mappings each of length K_i + 1 (corresponding concatenation of N slot mappings each of length K_i + 1 (corresponding
to the sequence lengths after decode), with a total length of to the sequence lengths after decode), with a total length of
P = \\sum_i{K_i + 1} = M + N P = \\sum_i{K_i + 1} = M + N
The prefill-phase slot mapping results from excising the (K_i + 1)-th entry The prefill-phase slot mapping results from excising the (K_i + 1)-th entry
from each of the N subsequences in the slot mapping (i.e. omitting the from each of the N subsequences in the slot mapping (i.e. omitting the
decoded token's mapping.) decoded token's mapping.)
The N excised entries are appended to obtain the decode-phase slot mapping The N excised entries are appended to obtain the decode-phase slot mapping
...@@ -684,15 +684,15 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], ...@@ -684,15 +684,15 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],
* slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N * slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N
post-decode sequences post-decode sequences
* seq_lens: list of N post-decode sequence lengths (K_i + 1 in the * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the
description above) description above)
* device: cuda, cpu, etc. * device: cuda, cpu, etc.
Returns: Returns:
* prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor)
reflecting all N prefill prompts reflecting all N prefill prompts
* decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting
all N decoded tokens all N decoded tokens
''' '''
...@@ -725,7 +725,7 @@ def make_block_tables_slot_mapping( ...@@ -725,7 +725,7 @@ def make_block_tables_slot_mapping(
Then the minimum KV cache size in blocks is Then the minimum KV cache size in blocks is
total_cache_blocks = sum(num_blocks for all seqs) total_cache_blocks = sum(num_blocks for all seqs)
Then, the blocktable mapping counts downward from Then, the blocktable mapping counts downward from
...@@ -734,7 +734,7 @@ def make_block_tables_slot_mapping( ...@@ -734,7 +734,7 @@ def make_block_tables_slot_mapping(
to to
block_base_addr block_base_addr
The constructed block-tables and slot-mapping are sized to the The constructed block-tables and slot-mapping are sized to the
lengths of the sequences in their entirety (as reflected by seq_lens), lengths of the sequences in their entirety (as reflected by seq_lens),
...@@ -749,7 +749,7 @@ def make_block_tables_slot_mapping( ...@@ -749,7 +749,7 @@ def make_block_tables_slot_mapping(
Return: Return:
* block_tables_tensor: block table for sequence * block_tables_tensor: block table for sequence
* slot_mapping_list: slot mapping for sequence * slot_mapping_list: slot mapping for sequence
* max_block_idx: the highest block address within this block table * max_block_idx: the highest block address within this block table
''' '''
...@@ -807,7 +807,7 @@ def make_test_metadata( ...@@ -807,7 +807,7 @@ def make_test_metadata(
encoder_test_params and cross_test_params arguments allow encoder encoder_test_params and cross_test_params arguments allow encoder
attention and enc/dec cross-attention (respectively) to use distinct attention and enc/dec cross-attention (respectively) to use distinct
metadata values from decoder self-attention (decoder_test_params.) metadata values from decoder self-attention (decoder_test_params.)
if encoder_test_params and cross_test_params are None, the attention if encoder_test_params and cross_test_params are None, the attention
metadata will support decoder-only scenario. metadata will support decoder-only scenario.
...@@ -820,7 +820,7 @@ def make_test_metadata( ...@@ -820,7 +820,7 @@ def make_test_metadata(
* attn_backend_name: Backend for sourcing attention kernels * attn_backend_name: Backend for sourcing attention kernels
* is_prompt: prefill if True, o/w decode * is_prompt: prefill if True, o/w decode
* seq_lens: list of token counts for each sequence * seq_lens: list of token counts for each sequence
* decoder_test_params: decoder self-attention test params; * decoder_test_params: decoder self-attention test params;
this function requires this function requires
kv_mmap (memory mapping) field kv_mmap (memory mapping) field
* device: CPU or CUDA device * device: CPU or CUDA device
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment