Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -58,7 +58,7 @@ schemathesis==3.39.15
# OpenAI schema test
# Evaluation and benchmarking
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
lm-eval[api]>=0.4.9.2
jiwer==4.0.0
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
......@@ -74,17 +74,21 @@ torchgeo==0.7.0
# MTEB Benchmark Test
mteb==2.1.2
# Data processing
xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
# Test async scheduling
# Utilities
num2words==0.5.14
# via lm-eval
pqdm==0.2.0
# via lm-eval
# Required for fastsafetensors test
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
# Required for suffix decoding test
arctic-inference == 0.1.1
# Required for Nemotron test
open-clip-torch==2.32.0
# Required for isaac Multi-Modal generation test
perceptron==0.1.4
# Required for the multi-modal models test
timm==1.0.17
# Required for plugins test
albumentations==1.4.6
\ No newline at end of file
......@@ -15,7 +15,7 @@ setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.3
# conch-triton-kernels==1.2.1
timm>=1.0.17
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
grpcio-tools>=1.76.0
numa
pytrie
......@@ -23,10 +23,10 @@ setuptools_scm>=8
cmake==3.29
quart
fastrlock==0.8.3
cupy==12.3.0
# cupy==12.3.0
torch >= 2.7.1
triton == 3.1
torch == 2.9.0
triton == 3.3
flash_attn == 2.6.1
flash_mla == 1.0.0
lightop == 0.6.0
......
......@@ -9,6 +9,7 @@ pytest-timeout
pytest-cov
# testing utils
albumentations # required for Nemotron Parse in test_common.py
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl
......@@ -19,23 +20,22 @@ vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test
pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
sentence-transformers # required for embedding tests
sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.9.0
torchaudio==2.9.0
torchvision==0.24.0
timm==1.0.17 # required for internvl and gemma3n-mm test
torch==2.9.1
torchaudio==2.9.1
torchvision==0.24.1
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.5 # required for voxtral test
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test
open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
lm-eval[api]>=0.4.9.2 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.3
tokenizers==0.22.0
......@@ -57,3 +57,5 @@ pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
gpt-oss >= 0.0.7; python_version > '3.11'
perceptron # required for isaac test
......@@ -27,7 +27,9 @@ aiosignal==1.4.0
albucore==0.0.16
# via terratorch
albumentations==1.4.6
# via terratorch
# via
# -r requirements/test.in
# terratorch
alembic==1.16.4
# via mlflow
annotated-types==0.7.0
......@@ -135,6 +137,7 @@ cloudpickle==3.1.1
# via mlflow-skinny
colorama==0.4.6
# via
# perceptron
# sacrebleu
# schemathesis
# tqdm-multiprocess
......@@ -294,7 +297,7 @@ graphql-relay==3.2.0
# via graphene
greenlet==3.2.3
# via sqlalchemy
grpcio==1.71.0
grpcio==1.76.0
# via ray
gunicorn==23.0.0
# via mlflow
......@@ -302,6 +305,8 @@ h11==0.14.0
# via
# httpcore
# uvicorn
h2==4.3.0
# via httpx
h5py==3.13.0
# via terratorch
harfile==0.3.0
......@@ -310,6 +315,8 @@ hf-xet==1.1.7
# via huggingface-hub
hiredis==3.0.0
# via tensorizer
hpack==4.1.0
# via h2
html2text==2025.4.15
# via gpt-oss
httpcore==1.0.6
......@@ -317,6 +324,7 @@ httpcore==1.0.6
httpx==0.27.2
# via
# -r requirements/test.in
# perceptron
# schemathesis
huggingface-hub==0.34.3
# via
......@@ -338,6 +346,8 @@ hydra-core==1.3.2
# via
# lightly
# lightning
hyperframe==6.1.0
# via h2
hypothesis==6.131.0
# via
# hypothesis-graphql
......@@ -441,7 +451,7 @@ lightning-utilities==0.14.3
# torchmetrics
llvmlite==0.44.0
# via numba
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
lm-eval==0.4.9.2
# via -r requirements/test.in
lxml==5.3.0
# via
......@@ -474,7 +484,7 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.8.5
mistral-common==1.8.8
# via -r requirements/test.in
mlflow==2.22.0
# via terratorch
......@@ -549,6 +559,7 @@ numpy==1.26.4
# pandas
# patsy
# peft
# perceptron
# pycocotools
# pyogrio
# rasterio
......@@ -702,6 +713,8 @@ peft==0.16.0
# via
# -r requirements/test.in
# lm-eval
perceptron==0.1.4
# via -r requirements/test.in
pillow==10.4.0
# via
# genai-perf
......@@ -709,9 +722,9 @@ pillow==10.4.0
# lightly-utils
# matplotlib
# mistral-common
# perceptron
# scikit-image
# segmentation-models-pytorch
# sentence-transformers
# torchgeo
# torchvision
platformdirs==4.3.6
......@@ -745,7 +758,7 @@ propcache==0.2.0
# yarl
proto-plus==1.26.1
# via google-api-core
protobuf==5.28.3
protobuf==6.33.2
# via
# google-api-core
# googleapis-common-protos
......@@ -952,6 +965,7 @@ rich==13.9.4
# genai-perf
# lightning
# mteb
# perceptron
# typer
rioxarray==0.19.0
# via terratorch
......@@ -1010,7 +1024,7 @@ segmentation-models-pytorch==0.4.0
# via
# terratorch
# torchgeo
sentence-transformers==3.2.1
sentence-transformers==5.2.0
# via
# -r requirements/test.in
# mteb
......@@ -1024,7 +1038,9 @@ shapely==2.1.1
# geopandas
# torchgeo
shellingham==1.5.4
# via typer
# via
# perceptron
# typer
six==1.16.0
# via
# junit-xml
......@@ -1123,7 +1139,7 @@ tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.9.0+cu129
torch==2.9.1+cu129
# via
# -r requirements/test.in
# accelerate
......@@ -1152,7 +1168,7 @@ torch==2.9.0+cu129
# torchvision
# vector-quantize-pytorch
# vocos
torchaudio==2.9.0+cu129
torchaudio==2.9.1+cu129
# via
# -r requirements/test.in
# encodec
......@@ -1165,7 +1181,7 @@ torchmetrics==1.7.4
# pytorch-lightning
# terratorch
# torchgeo
torchvision==0.24.0+cu129
torchvision==0.24.1+cu129
# via
# -r requirements/test.in
# lightly
......@@ -1206,7 +1222,7 @@ transformers==4.57.3
# transformers-stream-generator
transformers-stream-generator==0.0.5
# via -r requirements/test.in
triton==3.5.0
triton==3.5.1
# via torch
tritonclient==2.51.0
# via
......@@ -1218,7 +1234,9 @@ typepy==1.3.2
# pytablewriter
# tabledata
typer==0.15.2
# via fastsafetensors
# via
# fastsafetensors
# perceptron
types-python-dateutil==2.9.0.20241206
# via arrow
typeshed-client==2.8.2
......@@ -1231,6 +1249,7 @@ typing-extensions==4.15.0
# chz
# fastapi
# graphene
# grpcio
# huggingface-hub
# librosa
# lightning
......@@ -1246,6 +1265,7 @@ typing-extensions==4.15.0
# pydantic-core
# pydantic-extra-types
# pytorch-lightning
# sentence-transformers
# sqlalchemy
# torch
# torchgeo
......
......@@ -18,6 +18,8 @@ import torch
from packaging.version import Version, parse
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
from setuptools.command.build_py import build_py
from setuptools.command.develop import develop
# from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
......@@ -62,15 +64,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
sys.platform,
)
VLLM_TARGET_DEVICE = "empty"
elif (
sys.platform.startswith("linux")
and torch.version.cuda is None
and os.getenv("VLLM_TARGET_DEVICE") is None
and torch.version.hip is None
):
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
# fallback to cpu
VLLM_TARGET_DEVICE = "cpu"
elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
if torch.version.hip is not None:
VLLM_TARGET_DEVICE = "rocm"
logger.info("Auto-detected ROCm")
elif torch.version.cuda is not None:
VLLM_TARGET_DEVICE = "cuda"
logger.info("Auto-detected CUDA")
else:
VLLM_TARGET_DEVICE = "cpu"
def is_sccache_available() -> bool:
......@@ -91,6 +93,81 @@ def is_freethreaded():
return bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
def compile_grpc_protos():
    """Compile the gRPC protobuf definitions during build.

    Runs ``grpc_tools.protoc`` on ``vllm/grpc/vllm_engine.proto`` with
    ``ROOT_DIR`` as both the proto path and the output directory, requesting
    the ``*_pb2.py``, ``*_pb2_grpc.py`` and ``*_pb2.pyi`` outputs. Afterwards
    an SPDX header plus a ``mypy: ignore-errors`` marker is prepended to each
    generated file that does not already start with an SPDX identifier.

    Returns:
        True when compilation succeeded. False when grpcio-tools is not
        installed, the proto file does not exist, or protoc returned a
        non-zero exit code. No exception is raised for these cases.
    """
    try:
        from grpc_tools import protoc
    except ImportError:
        logger.warning(
            "grpcio-tools not installed, skipping gRPC proto compilation. "
            "gRPC server functionality will not be available."
        )
        return False

    proto_file = ROOT_DIR / "vllm" / "grpc" / "vllm_engine.proto"
    if not proto_file.exists():
        logger.warning("Proto file not found at %s, skipping compilation", proto_file)
        return False

    logger.info("Compiling gRPC protobuf: %s", proto_file)

    # The first element stands in for argv[0]; the remaining entries are the
    # regular protoc command-line flags.
    result = protoc.main(
        [
            "grpc_tools.protoc",
            f"--proto_path={ROOT_DIR}",
            f"--python_out={ROOT_DIR}",
            f"--grpc_python_out={ROOT_DIR}",
            f"--pyi_out={ROOT_DIR}",
            str(proto_file),
        ]
    )

    if result != 0:
        logger.error("protoc failed with exit code %s", result)
        return False

    # Add SPDX headers and mypy ignore to generated files
    spdx_header = (
        "# SPDX-License-Identifier: Apache-2.0\n"
        "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
        "# mypy: ignore-errors\n"
    )

    grpc_dir = ROOT_DIR / "vllm" / "grpc"
    for generated_file in [
        grpc_dir / "vllm_engine_pb2.py",
        grpc_dir / "vllm_engine_pb2_grpc.py",
        grpc_dir / "vllm_engine_pb2.pyi",
    ]:
        if generated_file.exists():
            # Pin the encoding: Python source is UTF-8 by default, whereas
            # read_text()/write_text() without an argument follow the locale.
            content = generated_file.read_text(encoding="utf-8")
            # Skip files that already carry the header so that repeated
            # builds do not stack it.
            if not content.startswith("# SPDX-License-Identifier"):
                generated_file.write_text(spdx_header + content, encoding="utf-8")

    logger.info("gRPC protobuf compilation successful")
    return True
class BuildPyAndGenerateGrpc(build_py):
    """Build Python modules and generate gRPC stubs from proto files."""

    def run(self):
        # Stub generation runs before the standard build_py step. Its boolean
        # result is ignored, so a skipped or failed proto compilation does
        # not abort the build.
        compile_grpc_protos()
        super().run()
class DevelopAndGenerateGrpc(develop):
    """Develop mode that also generates gRPC stubs from proto files."""

    def run(self):
        # Stub generation runs before the standard develop step. Its boolean
        # result is ignored, so a skipped or failed proto compilation does
        # not abort the install.
        compile_grpc_protos()
        super().run()
class CMakeExtension(Extension):
def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa)
......@@ -120,20 +197,26 @@ class cmake_build_ext(build_ext):
num_jobs = os.cpu_count()
nvcc_threads = None
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
# `nvcc_threads` is either the value of the NVCC_THREADS
# environment variable (if defined) or 1.
# when it is set, we reduce `num_jobs` to avoid
# overloading the system.
nvcc_threads = envs.NVCC_THREADS
if nvcc_threads is not None:
nvcc_threads = int(nvcc_threads)
logger.info(
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
if _is_cuda() and CUDA_HOME is not None:
try:
nvcc_version = get_nvcc_cuda_version()
if nvcc_version >= Version("11.2"):
# `nvcc_threads` is either the value of the NVCC_THREADS
# environment variable (if defined) or 1.
# when it is set, we reduce `num_jobs` to avoid
# overloading the system.
nvcc_threads = envs.NVCC_THREADS
if nvcc_threads is not None:
nvcc_threads = int(nvcc_threads)
logger.info(
"Using NVCC_THREADS=%d as the number of nvcc threads.",
nvcc_threads,
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
except Exception as e:
logger.warning("Failed to get NVCC version: %s", e)
return num_jobs, nvcc_threads
......@@ -211,9 +294,9 @@ class cmake_build_ext(build_ext):
# Default build tool to whatever cmake picks.
build_tool = []
# Make sure we use the nvcc from CUDA_HOME
if _is_cuda():
if _is_cuda() and CUDA_HOME is not None:
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
elif _is_hip():
elif _is_hip() and ROCM_HOME is not None:
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
other_cmake_args = os.environ.get("CMAKE_ARGS")
......@@ -351,6 +434,89 @@ class precompiled_wheel_utils:
wheels = json.loads(resp.read().decode("utf-8"))
return wheels, repo_url
@staticmethod
def is_rocm_system() -> bool:
    """Report whether this looks like a ROCm build environment.

    Cheap filesystem/environment markers are tried first, in this order: a
    non-empty ``ROCM_PATH`` environment variable, an ``/opt/rocm`` directory,
    and a ``rocminfo`` program found by ``which``. Only when none of them is
    present is ``torch`` imported and ``torch.version.hip`` consulted; if
    torch cannot be imported the answer is False.
    """
    has_rocm_marker = (
        bool(os.getenv("ROCM_PATH"))
        or os.path.isdir("/opt/rocm")
        or which("rocminfo") is not None
    )
    if has_rocm_marker:
        return True

    # Last resort: ask torch which backend it was built for.
    try:
        import torch

        return torch.version.hip is not None
    except ImportError:
        return False
@staticmethod
def find_local_rocm_wheel() -> str | None:
"""Search for a local vllm wheel in common locations."""
import glob
for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
wheels = glob.glob(pattern)
if wheels:
return sorted(wheels)[-1]
return None
@staticmethod
def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
    """Fetch a wheel URL for this machine from a PyPI-style simple index.

    Downloads ``{index_url}/{package}/``, collects every anchor whose
    ``href`` points at a ``.whl`` file, and returns the last listed link
    that contains ``platform.machine()``.

    Args:
        index_url: Base URL of the simple index; trailing slashes are ignored.
        package: Project name appended to ``index_url``.

    Returns:
        The wheel URL. Absolute ``http(s)`` links are returned unchanged
        (including any ``#sha256=...`` fragment); other links are resolved
        against the project page URL.

    Raises:
        ValueError: If no listed wheel contains the machine architecture.
    """
    import platform
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urlsplit
    from urllib.request import urlopen

    arch = platform.machine()

    class WheelLinkParser(HTMLParser):
        """Collect the ``href`` of every anchor that points at a wheel."""

        def __init__(self):
            super().__init__()
            self.wheels = []

        def handle_starttag(self, tag, attrs):
            if tag != "a":
                return
            for name, value in attrs:
                # ``value`` is None for a bare ``href`` attribute. Only the
                # URL path is tested because simple indexes commonly append
                # a ``#sha256=...`` fragment after the file name.
                if (
                    name == "href"
                    and value
                    and urlsplit(value).path.endswith(".whl")
                ):
                    self.wheels.append(value)

    simple_url = f"{index_url.rstrip('/')}/{package}/"
    print(f"Fetching wheel list from {simple_url}")
    with urlopen(simple_url) as resp:
        html = resp.read().decode("utf-8")

    parser = WheelLinkParser()
    parser.feed(html)

    # Walk the listing backwards so the last matching entry wins.
    for wheel in reversed(parser.wheels):
        if arch in wheel:
            if wheel.startswith("http"):
                return wheel
            return urljoin(simple_url, wheel)

    raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
@staticmethod
def determine_wheel_url_rocm() -> tuple[str, str | None]:
    """Pick the precompiled wheel to use on ROCm.

    A wheel found by ``find_local_rocm_wheel`` takes priority and is returned
    as ``(path, None)``. Otherwise the index named by the
    ``VLLM_ROCM_WHEEL_INDEX`` environment variable (AMD's index by default)
    is queried and ``(wheel_url, file_name)`` is returned, where
    ``file_name`` is the last path segment of the URL with any ``#...``
    fragment removed.
    """
    utils = precompiled_wheel_utils

    # A locally available wheel wins over anything on the network.
    on_disk = utils.find_local_rocm_wheel()
    if on_disk is not None:
        print(f"Found local ROCm wheel: {on_disk}")
        return on_disk, None

    # Fall back to AMD's PyPI index
    default_index = "https://pypi.amd.com/vllm-rocm/simple"
    index_url = os.getenv("VLLM_ROCM_WHEEL_INDEX", default_index)
    print(f"Fetching ROCm precompiled wheel from {index_url}")
    wheel_url = utils.fetch_wheel_from_pypi_index(index_url)

    last_segment = wheel_url.rsplit("/", 1)[-1]
    download_filename, _, _ = last_segment.partition("#")
    print(f"Using ROCm precompiled wheel: {wheel_url}")
    return wheel_url, download_filename
@staticmethod
def determine_wheel_url() -> tuple[str, str | None]:
"""
......@@ -371,6 +537,11 @@ class precompiled_wheel_utils:
print(f"Using user-specified precompiled wheel location: {wheel_location}")
return wheel_location, None
else:
# ROCm: use local wheel or AMD's PyPI index
# TODO: When we have ROCm nightly wheels, we can update this logic.
if precompiled_wheel_utils.is_rocm_system():
return precompiled_wheel_utils.determine_wheel_url_rocm()
import platform
arch = platform.machine()
......@@ -477,6 +648,8 @@ class precompiled_wheel_utils:
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
# ROCm-specific libraries
"vllm/_rocm_C.abi3.so",
]
flash_attn_regex = re.compile(
......@@ -614,6 +787,8 @@ def get_rocm_version():
# Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
try:
if ROCM_HOME is None:
return None
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
if not librocm_core_file.is_file():
return None
......@@ -690,9 +865,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f"""
try:
__version__ = "0.13.0"
__version_tuple__ = (0, 13, 0)
__hcu_version__ = f'0.13.0+{version}'
__version__ = "0.14.0"
__version_tuple__ = (0, 14, 0)
__hcu_version__ = f'0.14.0+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e:
......@@ -863,7 +1038,9 @@ if _is_cuda() or _is_hip():
if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
if envs.VLLM_USE_PRECOMPILED or (
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
):
# FA3 requires CUDA 12.3 or later
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
# Optional since this doesn't get built (produce an .so file) when
......@@ -882,9 +1059,10 @@ if skip_vllm_build:
"py.typed",
"model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json",
"perf/*.py",
"attention/backends/configs/*.json",
"entrypoints/serve/instrumentator/static/*.js",
"entrypoints/serve/instrumentator/static/*.css",
"model_executor/layers/quantization/configs/awq/*.json",
"attention/backends/configs/*.json",
"_C.abi3.so",
"_moe_C.abi3.so",
]
......@@ -895,7 +1073,8 @@ else:
"py.typed",
"model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json",
"perf/*.py",
"entrypoints/serve/instrumentator/static/*.js",
"entrypoints/serve/instrumentator/static/*.css",
"attention/backends/configs/*.json",
"model_executor/layers/quantization/configs/awq/*.json",
]
......@@ -915,12 +1094,17 @@ if _no_device() or skip_vllm_build:
ext_modules = []
if not ext_modules:
cmdclass = {}
cmdclass = {
"build_py": BuildPyAndGenerateGrpc,
"develop": DevelopAndGenerateGrpc,
}
else:
cmdclass = {
"build_ext": precompiled_build_ext
if envs.VLLM_USE_PRECOMPILED
else cmake_build_ext
else cmake_build_ext,
"build_py": BuildPyAndGenerateGrpc,
"develop": DevelopAndGenerateGrpc,
}
setup(
......@@ -929,12 +1113,13 @@ setup(
ext_modules=ext_modules,
install_requires=get_requirements(),
extras_require={
"bench": ["pandas", "matplotlib", "seaborn", "datasets"],
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"audio": [
"librosa",
"scipy",
"soundfile",
"mistral_common[audio]",
], # Required for audio processing
......
......@@ -72,7 +72,6 @@ def _fix_prompt_embed_outputs(
@pytest.mark.parametrize("model_executor", ["uni", "mp"])
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner,
model: str,
backend: str,
......@@ -82,82 +81,80 @@ def test_models(
model_executor: str,
enable_prompt_embeds: bool,
) -> None:
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = (
"The following numbers of the sequence "
+ ", ".join(str(i) for i in range(1024))
+ " are:"
)
example_prompts = [prompt]
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if enable_prompt_embeds:
with torch.no_grad():
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = (
"The following numbers of the sequence "
+ ", ".join(str(i) for i in range(1024))
+ " are:"
)
example_prompts = [prompt]
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if not current_platform.is_rocm():
with VllmRunner(
model,
max_model_len=8192,
enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
async_scheduling=async_scheduling,
distributed_executor_backend=model_executor,
attention_config={"backend": backend},
) as vllm_model:
if enable_prompt_embeds:
with torch.no_grad():
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
if not current_platform.is_rocm():
with VllmRunner(
model,
max_model_len=8192,
enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
async_scheduling=async_scheduling,
distributed_executor_backend=model_executor,
) as vllm_model:
if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts
)
else:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
else:
with VllmRunner(
model,
max_model_len=8192,
enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
async_scheduling=async_scheduling,
distributed_executor_backend=model_executor,
block_size=64,
) as vllm_model:
if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts
)
else:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts
)
else:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
else:
with VllmRunner(
model,
max_model_len=8192,
enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
async_scheduling=async_scheduling,
distributed_executor_backend=model_executor,
attention_config={"backend": backend},
block_size=64,
) as vllm_model:
if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts
)
else:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
# @multi_gpu_test(num_gpus=2)
# @pytest.mark.parametrize(
# "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
# [
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4", {}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4", {}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
# (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "ray", "", "L4", {}),
# (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "mp", "", "L4", {}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "A100", {}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "A100", {}),
# ("facebook/opt-125m", "ray", "", "L4", {}),
# ("facebook/opt-125m", "mp", "", "L4", {}),
# ("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
# ("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
# ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
# ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
# ("facebook/opt-125m", "ray", "", "A100", {}),
# ("facebook/opt-125m", "mp", "", "A100", {}),
# ],
# )
# @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
......@@ -186,12 +183,6 @@ def test_models(
# ): # noqa
# pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
# if attention_backend:
# monkeypatch_context.setenv(
# "VLLM_ATTENTION_BACKEND",
# attention_backend,
# )
# for k, v in extra_env.items():
# monkeypatch_context.setenv(k, v)
......@@ -203,6 +194,7 @@ def test_models(
# # if we run HF first, the cuda initialization will be done and it
# # will hurt multiprocessing backend with fork method
# # (the default method).
# attention_config = {"backend": attention_backend} if attention_backend else None
# with vllm_runner(
# model,
# dtype=dtype,
......@@ -210,6 +202,7 @@ def test_models(
# distributed_executor_backend=distributed_executor_backend,
# enable_prompt_embeds=enable_prompt_embeds,
# gpu_memory_utilization=0.7,
# attention_config=attention_config,
# ) as vllm_model:
# if enable_prompt_embeds:
# with hf_runner(model, dtype=dtype) as hf_model:
......@@ -225,90 +218,12 @@ def test_models(
# with hf_runner(model, dtype=dtype) as hf_model:
# hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
# @multi_gpu_test(num_gpus=2)
# @pytest.mark.parametrize(
# "model, distributed_executor_backend, attention_backend, "
# "test_suite, extra_env", [
# ("distilbert/distilgpt2", "ray", "", "L4", {}),
# ("distilbert/distilgpt2", "mp", "", "L4", {}),
# ("distilbert/distilgpt2", "ray", "", "L4", {
# "VLLM_SLEEP_WHEN_IDLE": "1"
# }),
# ("distilbert/distilgpt2", "mp", "", "L4", {
# "VLLM_SLEEP_WHEN_IDLE": "1"
# }),
# ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
# ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
# ("distilbert/distilgpt2", "ray", "", "A100", {}),
# ("distilbert/distilgpt2", "mp", "", "A100", {}),
# ])
# @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
# def test_models_distributed(
# monkeypatch: pytest.MonkeyPatch,
# hf_runner,
# vllm_runner,
# example_prompts,
# model: str,
# distributed_executor_backend: str,
# attention_backend: str,
# test_suite: str,
# extra_env: dict[str, str],
# enable_prompt_embeds: bool,
# ) -> None:
# if test_suite != TARGET_TEST_SUITE:
# pytest.skip(f"Skip test for {test_suite}")
# with monkeypatch.context() as monkeypatch_context:
# if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# if enable_prompt_embeds:
# pytest.skip(
# "enable_prompt_embeds does not work with ray compiled dag."
# )
# monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
# monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
# if attention_backend:
# monkeypatch_context.setenv(
# "VLLM_ATTENTION_BACKEND",
# attention_backend,
# )
# for k, v in extra_env.items():
# monkeypatch_context.setenv(k, v)
# dtype = "half"
# max_tokens = 5
# # NOTE: take care of the order. run vLLM first, and then run HF.
# # vLLM needs a fresh new process without cuda initialization.
# # if we run HF first, the cuda initialization will be done and it
# # will hurt multiprocessing backend with fork method
# # (the default method).
# with vllm_runner(
# model,
# dtype=dtype,
# tensor_parallel_size=2,
# distributed_executor_backend=distributed_executor_backend,
# enable_prompt_embeds=enable_prompt_embeds,
# gpu_memory_utilization=0.7,
# ) as vllm_model:
# if enable_prompt_embeds:
# with hf_runner(model, dtype=dtype) as hf_model:
# with torch.no_grad():
# prompt_embeds = hf_model.get_prompt_embeddings(
# example_prompts)
# vllm_outputs = vllm_model.generate_greedy(
# prompt_embeds, max_tokens)
# vllm_outputs = _fix_prompt_embed_outputs(
# vllm_outputs, hf_model, example_prompts)
# hf_outputs = hf_model.generate_greedy(
# example_prompts, max_tokens)
# else:
# vllm_outputs = vllm_model.generate_greedy(
# example_prompts, max_tokens)
# with hf_runner(model, dtype=dtype) as hf_model:
# hf_outputs = hf_model.generate_greedy(
# example_prompts, max_tokens)
# check_outputs_equal(
# outputs_0_lst=hf_outputs,
# outputs_1_lst=vllm_outputs,
# name_0="hf",
# name_1="vllm",
# )
def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
......
......@@ -248,7 +248,6 @@ def test_deep_sleep_async():
@requires_fp8
def test_deep_sleep_fp8_kvcache():
GiB_bytes = 1 << 30
model = "Qwen/Qwen2-0.5B"
used_bytes_baseline = current_platform.get_current_memory_usage()
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Callable
from pathlib import Path
from unittest.mock import patch
from vllm.benchmarks.sweep.param_sweep import ParameterSweepItem
from vllm.benchmarks.sweep.serve_sla import _get_sla_run_path, solve_sla
from vllm.benchmarks.sweep.server import ServerProcess
from vllm.benchmarks.sweep.sla_sweep import (
SLACriterionBase,
SLALessThan,
SLALessThanOrEqualTo,
SLASweepItem,
)
def _set_return_value(
    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
):
    """
    Build a patch that replaces ``serve_sla.run_sla`` with a fake.

    The fake derives its result from ``var2metric`` applied to the benchmark
    combination (which includes the SLA variable), writes that result as JSON
    to the summary path for the iteration, and returns it.
    """

    def mock_run_sla(
        server: ServerProcess | None,
        bench_cmd: list[str],
        *,
        serve_comb: ParameterSweepItem,
        bench_comb: ParameterSweepItem,
        iter_path: Path,
        num_runs: int,
        dry_run: bool,
    ):
        metrics = var2metric(bench_comb)

        # Persist the summary where _get_sla_run_path places it for
        # run_number=None.
        out_file = _get_sla_run_path(iter_path, run_number=None)
        out_file.parent.mkdir(parents=True, exist_ok=True)
        out_file.write_text(json.dumps(metrics, indent=4))

        return metrics

    return patch("vllm.benchmarks.sweep.serve_sla.run_sla", side_effect=mock_run_sla)
def _var2metric_linear():
    """Return a metric function whose throughput equals the request rate."""

    def wrapped(bench_comb):
        rate = float(bench_comb["request_rate"])
        return [{"request_throughput": rate}]

    return wrapped
def _var2metric_concave(elbow_point: float):
    """Return a piecewise-linear metric function through the elbow point.

    The slope is 0.5 for request rates below ``elbow_point`` and 1.5 at or
    above it; at the elbow the throughput equals ``elbow_point``.
    """
    # NOTE(review): the slope increases past the elbow, a shape usually
    # called convex - confirm the naming is intentional.

    def wrapped(bench_comb):
        rate = float(bench_comb["request_rate"])
        slope = 0.5 if rate < elbow_point else 1.5
        throughput = slope * (rate - elbow_point) + elbow_point
        return [{"request_throughput": throughput}]

    return wrapped
def _var2metric_convex(elbow_point: float):
    """Return a piecewise-linear metric function through the elbow point.

    The slope is 1.5 for request rates below ``elbow_point`` and 0.5 at or
    above it; at the elbow the throughput equals ``elbow_point``.
    """
    # NOTE(review): the slope decreases past the elbow, a shape usually
    # called concave - confirm the naming is intentional.

    def wrapped(bench_comb):
        rate = float(bench_comb["request_rate"])
        slope = 1.5 if rate < elbow_point else 0.5
        throughput = slope * (rate - elbow_point) + elbow_point
        return [{"request_throughput": throughput}]

    return wrapped
def _var2metric_quadratic(y_intercept: float):
    """Return a metric function computing ``y_intercept + 0.1 * rate**2``."""

    def wrapped(bench_comb):
        rate = float(bench_comb["request_rate"])
        throughput = y_intercept + 0.1 * rate**2
        return [{"request_throughput": throughput}]

    return wrapped
def _var2metric_sqrt(y_intercept: float):
    """Return a metric function computing ``y_intercept + 10 * rate**0.5``."""

    def wrapped(bench_comb):
        rate = float(bench_comb["request_rate"])
        throughput = y_intercept + 10 * rate**0.5
        return [{"request_throughput": throughput}]

    return wrapped
def _run_solve_sla(
    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
    criterion: SLACriterionBase,
    base_path: Path,
    min_value: int = 1,
    max_value: int = 100,
):
    """Run ``solve_sla`` over ``request_rate`` with ``run_sla`` faked.

    ``var2metric`` supplies the fake benchmark results and ``criterion`` is
    applied to the ``request_throughput`` metric. The search range is
    ``[min_value, max_value]``. Asserts that ``solve_sla`` returned something
    and passes that result back to the caller.
    """
    with _set_return_value(var2metric):
        solver_kwargs = dict(
            server=None,
            bench_cmd=[],
            serve_comb=ParameterSweepItem(),
            bench_comb=ParameterSweepItem(),
            sla_comb=SLASweepItem({"request_throughput": criterion}),
            base_path=base_path,
            num_runs=1,
            dry_run=False,
            sla_variable="request_rate",
            sla_min_value=min_value,
            sla_max_value=max_value,
        )
        outcome = solve_sla(**solver_kwargs)

    assert outcome is not None
    return outcome
def test_solve_linear_sla_le(tmp_path):
    """Linear metric with a ``<= 32`` criterion: 32 is the largest passing value."""
    _, history = _run_solve_sla(
        _var2metric_linear(),
        SLALessThanOrEqualTo(target=32),
        tmp_path,
    )

    assert history.get_max_passing() == 32

    # Every probed value, mapped to whether its margin is non-positive.
    probed = {value: margin <= 0 for value, margin in history.items()}
    expected = {
        100: False,
        1: True,
        32: True,
        33: False,
    }
    assert probed == expected
def test_solve_linear_sla_lt(tmp_path):
    """Linear metric with a strict ``< 32`` criterion: 31 is the largest passing value."""
    _, history = _run_solve_sla(
        _var2metric_linear(),
        SLALessThan(target=32),
        tmp_path,
    )

    assert history.get_max_passing() == 31

    # Every probed value, mapped to whether its margin is non-positive.
    probed = {value: margin <= 0 for value, margin in history.items()}
    expected = {
        100: False,
        1: True,
        31: True,
        32: False,
    }
    assert probed == expected
def test_solve_linear_sla_oob(tmp_path):
    """Target below the search range: both bounds are reported as the minimum, 64."""
    _, history = _run_solve_sla(
        _var2metric_linear(),
        SLALessThanOrEqualTo(target=32),
        tmp_path,
        min_value=64,
    )

    assert history.get_max_passing() == 64
    assert history.get_min_failing() == 64

    # Every probed value, mapped to whether its margin is non-positive.
    probed = {value: margin <= 0 for value, margin in history.items()}
    expected = {
        100: False,
        64: False,
    }
    assert probed == expected
def test_solve_concave_sla_le(tmp_path):
    """Concave metric (elbow at 32) with a ``<= 24`` SLA: max passing is 16."""
    _, history = _run_solve_sla(
        var2metric=_var2metric_concave(elbow_point=32),
        criterion=SLALessThanOrEqualTo(target=24),
        base_path=tmp_path,
    )

    assert history.get_max_passing() == 16

    # Map every probed value to whether its recorded margin is <= 0.
    expected = {
        100: False,
        1: True,
        7: True,
        13: True,
        15: True,
        16: True,
        17: False,
    }
    observed = {val: margin <= 0 for val, margin in history.items()}
    assert observed == expected
def test_solve_convex_sla_le(tmp_path):
    """Convex metric (elbow at 32) with a ``<= 24`` SLA: max passing is 26."""
    _, history = _run_solve_sla(
        var2metric=_var2metric_convex(elbow_point=32),
        criterion=SLALessThanOrEqualTo(target=24),
        base_path=tmp_path,
    )

    assert history.get_max_passing() == 26

    # Map every probed value to whether its recorded margin is <= 0.
    expected = {
        100: False,
        1: True,
        48: False,
        30: False,
        24: True,
        26: True,
        27: False,
    }
    observed = {val: margin <= 0 for val, margin in history.items()}
    assert observed == expected
def test_solve_quadratic_sla_le(tmp_path):
    """Quadratic metric (intercept 10) with a ``<= 50`` SLA: max passing is 20."""
    _, history = _run_solve_sla(
        var2metric=_var2metric_quadratic(y_intercept=10),
        criterion=SLALessThanOrEqualTo(target=50),
        base_path=tmp_path,
    )

    assert history.get_max_passing() == 20

    # Map every probed value to whether its recorded margin is <= 0.
    expected = {100: False, 1: True, 4: True, 20: True, 21: False}
    observed = {val: margin <= 0 for val, margin in history.items()}
    assert observed == expected
def test_solve_sqrt_sla_le(tmp_path):
    """Square-root metric (intercept 10) with a ``<= 100`` SLA: max passing is 81."""
    _, history = _run_solve_sla(
        var2metric=_var2metric_sqrt(y_intercept=10),
        criterion=SLALessThanOrEqualTo(target=100),
        base_path=tmp_path,
    )

    assert history.get_max_passing() == 81

    # Map every probed value to whether its recorded margin is <= 0.
    expected = {100: False, 1: True, 89: False, 81: True, 82: False}
    observed = {val: margin <= 0 for val, margin in history.items()}
    assert observed == expected
def test_solve_reuse_history(tmp_path):
    """Two solves sharing ``tmp_path``: the second history includes the first.

    The first run searches 1..20 with a ``<= 10`` SLA; the second searches
    21..40 with a ``<= 30`` SLA under the same base path and is expected to
    report the values probed by the first run as well.
    """
    _, first_history = _run_solve_sla(
        var2metric=_var2metric_linear(),
        criterion=SLALessThanOrEqualTo(target=10),
        base_path=tmp_path,
        min_value=1,
        max_value=20,
    )

    assert first_history.get_max_passing() == 10

    first_expected = {20: False, 1: True, 10: True, 11: False}
    first_observed = {val: margin <= 0 for val, margin in first_history.items()}
    assert first_observed == first_expected

    _, second_history = _run_solve_sla(
        var2metric=_var2metric_linear(),
        criterion=SLALessThanOrEqualTo(target=30),
        base_path=tmp_path,
        min_value=21,
        max_value=40,
    )

    assert second_history.get_max_passing() == 30

    second_expected = {
        # Items from the past run
        # (the margins are different because the target changed)
        20: True,
        1: True,
        10: True,
        11: True,
        # Items from this run
        40: False,
        30: True,
        31: False,
    }
    second_observed = {val: margin <= 0 for val, margin in second_history.items()}
    assert second_observed == second_expected
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import pytest
@pytest.mark.benchmark
def test_bench_startup():
    """Run ``vllm bench startup`` and require a zero exit status."""
    completed = subprocess.run(
        ["vllm", "bench", "startup"],
        capture_output=True,
        text=True,
    )

    # Echo both captured streams so they show up in the pytest report.
    print(completed.stdout)
    print(completed.stderr)

    assert completed.returncode == 0, f"Benchmark failed: {completed.stderr}"
......@@ -20,21 +20,18 @@ def server():
@pytest.mark.benchmark
def test_bench_serve(server):
# Test default model detection and input/output len
command = [
"vllm",
"bench",
"serve",
"--model",
MODEL_NAME,
"--host",
server.host,
"--port",
str(server.port),
"--dataset-name",
"random",
"--random-input-len",
"--input-len",
"32",
"--random-output-len",
"--output-len",
"4",
"--num-prompts",
"5",
......
......@@ -15,6 +15,7 @@ from vllm.config import (
ModelConfig,
PassConfig,
VllmConfig,
set_current_vllm_config,
)
from vllm.distributed import (
tensor_model_parallel_all_gather,
......@@ -26,6 +27,7 @@ from vllm.distributed.parallel_state import (
)
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import (
......@@ -301,7 +303,7 @@ def async_tp_pass_on_test_model(
dtype: torch.dtype,
dynamic: bool,
):
current_platform.seed_everything(0)
set_random_seed(0)
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
......@@ -339,38 +341,42 @@ def async_tp_pass_on_test_model(
)
async_tp_pass = AsyncTPPass(vllm_config)
backend = TestBackend(async_tp_pass)
assert (
async_tp_pass.compilation_config.splitting_ops
== vllm_config.compilation_config.splitting_ops
)
assert (
async_tp_pass.compilation_config.use_inductor_graph_partition
== vllm_config.compilation_config.use_inductor_graph_partition
)
# Set the global vllm_config for TestBackend which calls
# get_current_vllm_config()
with set_current_vllm_config(vllm_config):
backend = TestBackend(async_tp_pass)
model = test_model_cls(hidden_size, dtype) # Pass dtype to model constructor
assert (
async_tp_pass.compilation_config.splitting_ops
== vllm_config.compilation_config.splitting_ops
)
assert (
async_tp_pass.compilation_config.use_inductor_graph_partition
== vllm_config.compilation_config.use_inductor_graph_partition
)
hidden_states = torch.randn(
(batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
)
model = test_model_cls(hidden_size, dtype) # Pass dtype to model constructor
hidden_states = torch.randn(
(batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
)
if dynamic:
torch._dynamo.mark_dynamic(hidden_states, 0)
if dynamic:
torch._dynamo.mark_dynamic(hidden_states, 0)
compiled_model = torch.compile(model, backend=backend)
compiled_model(hidden_states)
compiled_model = torch.compile(model, backend=backend)
compiled_model(hidden_states)
assert async_tp_pass.matched_count == 1
assert async_tp_pass.matched_count == 1
# In pre-nodes, all gather or reduce scatter should exist,
# fused_matmul_reduce_scatter or fused_all_gather_matmul should not
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
# In pre-nodes, all gather or reduce scatter should exist,
# fused_matmul_reduce_scatter or fused_all_gather_matmul should not
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
# In post-nodes, fused_matmul_reduce_scatter or \
# fused_all_gather_matmul should exist
backend.check_after_ops(model.ops_in_model_after())
# In post-nodes, fused_matmul_reduce_scatter or \
# fused_all_gather_matmul should exist
backend.check_after_ops(model.ops_in_model_after())
@create_new_process_for_each_test()
......
......@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
)
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...utils import has_module_attribute, multi_gpu_test
from ..backend import TestBackend
......@@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model(
enable_rms_norm_custom_op,
enable_quant_fp8_custom_op,
):
current_platform.seed_everything(0)
set_random_seed(0)
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
......
......@@ -208,7 +208,8 @@ def test_attn_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig(
# Testing properties
......@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig(
# Testing properties
......@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig(
# Testing properties
......@@ -554,7 +557,8 @@ def test_rms_group_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig(
# Testing properties
......@@ -564,7 +568,9 @@ def test_rms_group_quant(
splitting_ops=splitting_ops,
# Common
mode=CompilationMode.VLLM_COMPILE,
pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True),
pass_config=PassConfig(
fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
),
# Inductor caches custom passes by default as well via uuid
inductor_compile_config={"force_disable_caches": True},
)
......
......@@ -31,6 +31,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...utils import multi_gpu_test
from ..backend import TestBackend
......@@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model(
fuse_norm_quant: bool,
dynamic: bool,
):
current_platform.seed_everything(0)
set_random_seed(0)
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
......
......@@ -6,10 +6,13 @@ import pytest
import os
from vllm.config import CompilationMode
from vllm.platforms import current_platform
from vllm.utils.torch_utils import cuda_device_count_stateless
from ...utils import compare_all_settings, models_path_prefix
ATTN_BACKEND = "FLASH_ATTN" if not current_platform.is_rocm() else "ROCM_ATTN"
@dataclasses.dataclass
class TestSetting:
......@@ -32,7 +35,7 @@ class TestSetting:
model_args=["--max-model-len", "2048"],
pp_size=2,
tp_size=2,
attn_backend="FLASH_ATTN",
attn_backend=ATTN_BACKEND,
method="generate",
),
# llama model with quantization
......@@ -41,7 +44,7 @@ class TestSetting:
model_args=["--quantization", "gptq", "--max-model-len", "2048"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
attn_backend=ATTN_BACKEND,
method="generate",
),
# MoE model
......@@ -50,7 +53,7 @@ class TestSetting:
model_args=["--max-model-len", "2048"],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
attn_backend=ATTN_BACKEND,
method="generate",
),
# embedding model
......@@ -66,18 +69,23 @@ class TestSetting:
],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
attn_backend=ATTN_BACKEND,
method="encode",
),
# # TODO
# TestSetting(
# model="BAAI/bge-base-en-v1.5",
# model_args=["--runner", "pooling"],
# pp_size=1,
# tp_size=1,
# attn_backend="FLASH_ATTN",
# method="encode",
# ),
pytest.param(
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--runner", "pooling"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
),
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="Encoder self-attention is not implemented for ROCm",
),
),
# vision language model
# See https://github.com/vllm-project/vllm/issues/26716.
# TestSetting(
......@@ -91,7 +99,6 @@ class TestSetting:
],
)
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple suits, with different GPUs.
......@@ -109,49 +116,48 @@ def test_compile_correctness(
f"{cuda_device_count_stateless()}"
)
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
final_args = [
*model_args,
"-pp",
str(pp_size),
"-tp",
str(tp_size),
"-cc.cudagraph_mode=none",
]
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
final_args = [
*model_args,
"-pp",
str(pp_size),
"-tp",
str(tp_size),
"-cc.cudagraph_mode=none",
f"--attention-backend={attn_backend}",
]
for comp_mode in [
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
for mode in [CompilationMode.NONE, comp_mode]:
all_args.append(
final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
)
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close",
for comp_mode in [
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
for mode in [CompilationMode.NONE, comp_mode]:
all_args.append(
final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
)
all_envs.clear()
all_args.clear()
for mode in [
CompilationMode.NONE,
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
all_envs.append({})
all_envs.append({})
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close",
)
all_envs.clear()
all_args.clear()
for mode in [
CompilationMode.NONE,
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
all_envs.append({})
all_envs.append({})
compare_all_settings(model, all_args * 3, all_envs, method=method)
\ No newline at end of file
compare_all_settings(model, all_args * 3, all_envs, method=method)
......@@ -12,6 +12,7 @@ from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from vllm.v1.attention.backends.registry import AttentionBackendEnum
@contextlib.contextmanager
......@@ -70,11 +71,14 @@ def llm_pair(request):
elif backend_config.specific_gpu_arch == (10, 0):
pytest.skip("Only Blackwell GPUs support Cutlass MLA")
# FlashInfer is not supported on ROCm
if backend_config == AttentionBackendEnum.FLASHINFER and current_platform.is_rocm():
pytest.skip("FlashInfer is not supported on ROCm")
env_vars = {
# Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER": "0",
**backend_config.env_vars,
}
with temporary_environ(env_vars):
full = LLM(
......@@ -170,16 +174,10 @@ class TestFullCUDAGraph:
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_full_cudagraph_with_invalid_backend():
with (
temporary_environ(
{
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
# Flex_Attention is not supported with full cuda graph
}
),
pytest.raises(RuntimeError),
):
# Flex_Attention is not supported with full cuda graph
with pytest.raises(RuntimeError):
LLM(
model="Qwen/Qwen2-1.5B-Instruct",
compilation_config=CompilationConfig(cudagraph_mode="FULL"),
attention_config={"backend": "FLEX_ATTENTION"},
)
......@@ -10,10 +10,10 @@ import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from ...utils import create_new_process_for_each_test
......@@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
TEST_MODELS.append(
(
"alexm-nm/tinyllama-24-marlin24-4bit-g128",
{"quantization": "gptq_marlin_24"},
{
"quantization": "gptq_marlin_24",
"allow_deprecated_quantization": True,
},
)
)
......@@ -156,6 +159,20 @@ def test_full_graph(
)
for model_info in models_list(all=False)
if is_torch_equal_or_newer("2.9.0.dev")
]
+ [
# Test get_raw_stream patch with compile_sizes
# This tests that TorchInductor autotune works correctly with get_raw_stream
# patch in torch 2.9 and without patch in torch 2.10+
(
CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
compile_sizes=[1, 2], # Triggers autotune which uses get_raw_stream
cudagraph_mode=CUDAGraphMode.NONE,
),
"facebook/opt-125m",
{},
),
],
)
# only test some of the models
......@@ -197,20 +214,19 @@ def test_custom_compile_config(
],
)
def test_fp8_kv_scale_compile(
monkeypatch: pytest.MonkeyPatch,
compilation_mode: int,
model: str,
backend: AttentionBackendEnum | None,
):
if backend:
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs = {
"quantization": "fp8",
"kv_cache_dtype": "fp8_e4m3",
"calculate_kv_scales": True,
"max_model_len": 512,
}
if backend:
model_kwargs["attention_config"] = {"backend": backend.name}
run_model(compilation_mode, model, **model_kwargs)
......
......@@ -71,3 +71,40 @@ def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch):
) as _,
):
pass
# Forked to work around https://github.com/vllm-project/vllm/issues/21073.
# Also requires CUDA and 8 GPUs.
@pytest.mark.forked
@pytest.mark.skip(reason="Skipping due to CI resource constraints")
def test_mllama4_vit_compilation(vllm_runner, monkeypatch):
    """Smoke-test compilation of the Mllama4 vision submodules.

    The intent is to verify that the two vision submodules
    (Llama4VisionEncoder, Llama4VisionPixelShuffleMLP) are tagged for
    compilation, which would show up as ``num_models_seen`` reaching 3.
    Because the model is run with TP=8, ``compilation_counter`` does not
    work properly, so for now the test only checks that the run succeeds
    (and expects ``num_models_seen=0``).
    """
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    mm_compilation_config = {
        "mode": CompilationMode.VLLM_COMPILE,
        "compile_mm_encoder": True,
    }

    with (
        monkeypatch.context(),
        # TODO: Since we require TP=8, this messes with the compilation
        # counter. We should fix this in the future, but leave for now
        # to make sure that compilation runs (no crash) with llama vision encoder
        compilation_counter.expect(num_models_seen=0),
        vllm_runner(
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            max_model_len=512,
            gpu_memory_utilization=0.8,
            tensor_parallel_size=8,
            compilation_config=mm_compilation_config,
        ),
    ):
        pass
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment