Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
...@@ -58,7 +58,7 @@ schemathesis==3.39.15 ...@@ -58,7 +58,7 @@ schemathesis==3.39.15
# OpenAI schema test # OpenAI schema test
# Evaluation and benchmarking # Evaluation and benchmarking
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d lm-eval[api]>=0.4.9.2
jiwer==4.0.0 jiwer==4.0.0
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
...@@ -74,17 +74,21 @@ torchgeo==0.7.0 ...@@ -74,17 +74,21 @@ torchgeo==0.7.0
# MTEB Benchmark Test # MTEB Benchmark Test
mteb==2.1.2 mteb==2.1.2
# Data processing
xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
# Test async scheduling
# Utilities # Utilities
num2words==0.5.14 num2words==0.5.14
# via lm-eval # via lm-eval
pqdm==0.2.0 pqdm==0.2.0
# via lm-eval # via lm-eval
# Required for fastsafetensors test
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
# Required for suffix decoding test # Required for suffix decoding test
arctic-inference == 0.1.1 arctic-inference == 0.1.1
# Required for Nemotron test # Required for Nemotron test
open-clip-torch==2.32.0 open-clip-torch==2.32.0
# Required for isaac Multi-Modal generation test
perceptron==0.1.4
# Required for the multi-modal models test
timm==1.0.17
# Required for plugins test
albumentations==1.4.6
\ No newline at end of file
...@@ -15,7 +15,7 @@ setuptools-scm>=8 ...@@ -15,7 +15,7 @@ setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.3 runai-model-streamer[s3,gcs]==0.15.3
# conch-triton-kernels==1.2.1 # conch-triton-kernels==1.2.1
timm>=1.0.17 timm>=1.0.17
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459 grpcio-tools>=1.76.0
numa numa
pytrie pytrie
...@@ -23,10 +23,10 @@ setuptools_scm>=8 ...@@ -23,10 +23,10 @@ setuptools_scm>=8
cmake==3.29 cmake==3.29
quart quart
fastrlock==0.8.3 fastrlock==0.8.3
cupy==12.3.0 # cupy==12.3.0
torch >= 2.7.1 torch == 2.9.0
triton == 3.1 triton == 3.3
flash_attn == 2.6.1 flash_attn == 2.6.1
flash_mla == 1.0.0 flash_mla == 1.0.0
lightop == 0.6.0 lightop == 0.6.0
......
...@@ -9,6 +9,7 @@ pytest-timeout ...@@ -9,6 +9,7 @@ pytest-timeout
pytest-cov pytest-cov
# testing utils # testing utils
albumentations # required for Nemotron Parse in test_common.py
backoff # required for phi4mm test backoff # required for phi4mm test
blobfile # required for kimi-vl test blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl einops # required for MPT, qwen-vl
...@@ -19,23 +20,22 @@ vocos # required for minicpmo_26 test ...@@ -19,23 +20,22 @@ vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test peft>=0.15.0 # required for phi-4-mm test
pqdm pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
sentence-transformers # required for embedding tests sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
tblib # for pickling test exceptions tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test timm==1.0.17 # required for internvl and gemma3n-mm test
torch==2.9.0 torch==2.9.1
torchaudio==2.9.0 torchaudio==2.9.1
torchvision==0.24.0 torchvision==0.24.1
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.5 # required for voxtral test mistral_common[image,audio] >= 1.8.8 # required for voxtral test
num2words # required for smolvlm test num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released lm-eval[api]>=0.4.9.2 # required for model evaluation test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.3 transformers==4.57.3
tokenizers==0.22.0 tokenizers==0.22.0
...@@ -57,3 +57,5 @@ pydantic>=2.12 # 2.11 leads to error on python 3.13 ...@@ -57,3 +57,5 @@ pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0 decord==0.6.0
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
gpt-oss >= 0.0.7; python_version > '3.11' gpt-oss >= 0.0.7; python_version > '3.11'
perceptron # required for isaac test
...@@ -27,7 +27,9 @@ aiosignal==1.4.0 ...@@ -27,7 +27,9 @@ aiosignal==1.4.0
albucore==0.0.16 albucore==0.0.16
# via terratorch # via terratorch
albumentations==1.4.6 albumentations==1.4.6
# via terratorch # via
# -r requirements/test.in
# terratorch
alembic==1.16.4 alembic==1.16.4
# via mlflow # via mlflow
annotated-types==0.7.0 annotated-types==0.7.0
...@@ -135,6 +137,7 @@ cloudpickle==3.1.1 ...@@ -135,6 +137,7 @@ cloudpickle==3.1.1
# via mlflow-skinny # via mlflow-skinny
colorama==0.4.6 colorama==0.4.6
# via # via
# perceptron
# sacrebleu # sacrebleu
# schemathesis # schemathesis
# tqdm-multiprocess # tqdm-multiprocess
...@@ -294,7 +297,7 @@ graphql-relay==3.2.0 ...@@ -294,7 +297,7 @@ graphql-relay==3.2.0
# via graphene # via graphene
greenlet==3.2.3 greenlet==3.2.3
# via sqlalchemy # via sqlalchemy
grpcio==1.71.0 grpcio==1.76.0
# via ray # via ray
gunicorn==23.0.0 gunicorn==23.0.0
# via mlflow # via mlflow
...@@ -302,6 +305,8 @@ h11==0.14.0 ...@@ -302,6 +305,8 @@ h11==0.14.0
# via # via
# httpcore # httpcore
# uvicorn # uvicorn
h2==4.3.0
# via httpx
h5py==3.13.0 h5py==3.13.0
# via terratorch # via terratorch
harfile==0.3.0 harfile==0.3.0
...@@ -310,6 +315,8 @@ hf-xet==1.1.7 ...@@ -310,6 +315,8 @@ hf-xet==1.1.7
# via huggingface-hub # via huggingface-hub
hiredis==3.0.0 hiredis==3.0.0
# via tensorizer # via tensorizer
hpack==4.1.0
# via h2
html2text==2025.4.15 html2text==2025.4.15
# via gpt-oss # via gpt-oss
httpcore==1.0.6 httpcore==1.0.6
...@@ -317,6 +324,7 @@ httpcore==1.0.6 ...@@ -317,6 +324,7 @@ httpcore==1.0.6
httpx==0.27.2 httpx==0.27.2
# via # via
# -r requirements/test.in # -r requirements/test.in
# perceptron
# schemathesis # schemathesis
huggingface-hub==0.34.3 huggingface-hub==0.34.3
# via # via
...@@ -338,6 +346,8 @@ hydra-core==1.3.2 ...@@ -338,6 +346,8 @@ hydra-core==1.3.2
# via # via
# lightly # lightly
# lightning # lightning
hyperframe==6.1.0
# via h2
hypothesis==6.131.0 hypothesis==6.131.0
# via # via
# hypothesis-graphql # hypothesis-graphql
...@@ -441,7 +451,7 @@ lightning-utilities==0.14.3 ...@@ -441,7 +451,7 @@ lightning-utilities==0.14.3
# torchmetrics # torchmetrics
llvmlite==0.44.0 llvmlite==0.44.0
# via numba # via numba
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d lm-eval==0.4.9.2
# via -r requirements/test.in # via -r requirements/test.in
lxml==5.3.0 lxml==5.3.0
# via # via
...@@ -474,7 +484,7 @@ mbstrdecoder==1.1.3 ...@@ -474,7 +484,7 @@ mbstrdecoder==1.1.3
# typepy # typepy
mdurl==0.1.2 mdurl==0.1.2
# via markdown-it-py # via markdown-it-py
mistral-common==1.8.5 mistral-common==1.8.8
# via -r requirements/test.in # via -r requirements/test.in
mlflow==2.22.0 mlflow==2.22.0
# via terratorch # via terratorch
...@@ -549,6 +559,7 @@ numpy==1.26.4 ...@@ -549,6 +559,7 @@ numpy==1.26.4
# pandas # pandas
# patsy # patsy
# peft # peft
# perceptron
# pycocotools # pycocotools
# pyogrio # pyogrio
# rasterio # rasterio
...@@ -702,6 +713,8 @@ peft==0.16.0 ...@@ -702,6 +713,8 @@ peft==0.16.0
# via # via
# -r requirements/test.in # -r requirements/test.in
# lm-eval # lm-eval
perceptron==0.1.4
# via -r requirements/test.in
pillow==10.4.0 pillow==10.4.0
# via # via
# genai-perf # genai-perf
...@@ -709,9 +722,9 @@ pillow==10.4.0 ...@@ -709,9 +722,9 @@ pillow==10.4.0
# lightly-utils # lightly-utils
# matplotlib # matplotlib
# mistral-common # mistral-common
# perceptron
# scikit-image # scikit-image
# segmentation-models-pytorch # segmentation-models-pytorch
# sentence-transformers
# torchgeo # torchgeo
# torchvision # torchvision
platformdirs==4.3.6 platformdirs==4.3.6
...@@ -745,7 +758,7 @@ propcache==0.2.0 ...@@ -745,7 +758,7 @@ propcache==0.2.0
# yarl # yarl
proto-plus==1.26.1 proto-plus==1.26.1
# via google-api-core # via google-api-core
protobuf==5.28.3 protobuf==6.33.2
# via # via
# google-api-core # google-api-core
# googleapis-common-protos # googleapis-common-protos
...@@ -952,6 +965,7 @@ rich==13.9.4 ...@@ -952,6 +965,7 @@ rich==13.9.4
# genai-perf # genai-perf
# lightning # lightning
# mteb # mteb
# perceptron
# typer # typer
rioxarray==0.19.0 rioxarray==0.19.0
# via terratorch # via terratorch
...@@ -1010,7 +1024,7 @@ segmentation-models-pytorch==0.4.0 ...@@ -1010,7 +1024,7 @@ segmentation-models-pytorch==0.4.0
# via # via
# terratorch # terratorch
# torchgeo # torchgeo
sentence-transformers==3.2.1 sentence-transformers==5.2.0
# via # via
# -r requirements/test.in # -r requirements/test.in
# mteb # mteb
...@@ -1024,7 +1038,9 @@ shapely==2.1.1 ...@@ -1024,7 +1038,9 @@ shapely==2.1.1
# geopandas # geopandas
# torchgeo # torchgeo
shellingham==1.5.4 shellingham==1.5.4
# via typer # via
# perceptron
# typer
six==1.16.0 six==1.16.0
# via # via
# junit-xml # junit-xml
...@@ -1123,7 +1139,7 @@ tomli==2.2.1 ...@@ -1123,7 +1139,7 @@ tomli==2.2.1
# via schemathesis # via schemathesis
tomli-w==1.2.0 tomli-w==1.2.0
# via schemathesis # via schemathesis
torch==2.9.0+cu129 torch==2.9.1+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# accelerate # accelerate
...@@ -1152,7 +1168,7 @@ torch==2.9.0+cu129 ...@@ -1152,7 +1168,7 @@ torch==2.9.0+cu129
# torchvision # torchvision
# vector-quantize-pytorch # vector-quantize-pytorch
# vocos # vocos
torchaudio==2.9.0+cu129 torchaudio==2.9.1+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# encodec # encodec
...@@ -1165,7 +1181,7 @@ torchmetrics==1.7.4 ...@@ -1165,7 +1181,7 @@ torchmetrics==1.7.4
# pytorch-lightning # pytorch-lightning
# terratorch # terratorch
# torchgeo # torchgeo
torchvision==0.24.0+cu129 torchvision==0.24.1+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# lightly # lightly
...@@ -1206,7 +1222,7 @@ transformers==4.57.3 ...@@ -1206,7 +1222,7 @@ transformers==4.57.3
# transformers-stream-generator # transformers-stream-generator
transformers-stream-generator==0.0.5 transformers-stream-generator==0.0.5
# via -r requirements/test.in # via -r requirements/test.in
triton==3.5.0 triton==3.5.1
# via torch # via torch
tritonclient==2.51.0 tritonclient==2.51.0
# via # via
...@@ -1218,7 +1234,9 @@ typepy==1.3.2 ...@@ -1218,7 +1234,9 @@ typepy==1.3.2
# pytablewriter # pytablewriter
# tabledata # tabledata
typer==0.15.2 typer==0.15.2
# via fastsafetensors # via
# fastsafetensors
# perceptron
types-python-dateutil==2.9.0.20241206 types-python-dateutil==2.9.0.20241206
# via arrow # via arrow
typeshed-client==2.8.2 typeshed-client==2.8.2
...@@ -1231,6 +1249,7 @@ typing-extensions==4.15.0 ...@@ -1231,6 +1249,7 @@ typing-extensions==4.15.0
# chz # chz
# fastapi # fastapi
# graphene # graphene
# grpcio
# huggingface-hub # huggingface-hub
# librosa # librosa
# lightning # lightning
...@@ -1246,6 +1265,7 @@ typing-extensions==4.15.0 ...@@ -1246,6 +1265,7 @@ typing-extensions==4.15.0
# pydantic-core # pydantic-core
# pydantic-extra-types # pydantic-extra-types
# pytorch-lightning # pytorch-lightning
# sentence-transformers
# sqlalchemy # sqlalchemy
# torch # torch
# torchgeo # torchgeo
......
...@@ -18,6 +18,8 @@ import torch ...@@ -18,6 +18,8 @@ import torch
from packaging.version import Version, parse from packaging.version import Version, parse
from setuptools import Extension, setup from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext from setuptools.command.build_ext import build_ext
from setuptools.command.build_py import build_py
from setuptools.command.develop import develop
# from setuptools_scm import get_version # from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
...@@ -62,15 +64,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")) ...@@ -62,15 +64,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
sys.platform, sys.platform,
) )
VLLM_TARGET_DEVICE = "empty" VLLM_TARGET_DEVICE = "empty"
elif ( elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
sys.platform.startswith("linux") if torch.version.hip is not None:
and torch.version.cuda is None VLLM_TARGET_DEVICE = "rocm"
and os.getenv("VLLM_TARGET_DEVICE") is None logger.info("Auto-detected ROCm")
and torch.version.hip is None elif torch.version.cuda is not None:
): VLLM_TARGET_DEVICE = "cuda"
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set, logger.info("Auto-detected CUDA")
# fallback to cpu else:
VLLM_TARGET_DEVICE = "cpu" VLLM_TARGET_DEVICE = "cpu"
def is_sccache_available() -> bool: def is_sccache_available() -> bool:
...@@ -91,6 +93,81 @@ def is_freethreaded(): ...@@ -91,6 +93,81 @@ def is_freethreaded():
return bool(sysconfig.get_config_var("Py_GIL_DISABLED")) return bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
def compile_grpc_protos():
    """Compile gRPC protobuf definitions during build.

    This generates *_pb2.py, *_pb2_grpc.py, and *_pb2.pyi files from
    the vllm_engine.proto definition, then prepends SPDX headers and a
    ``mypy: ignore-errors`` marker to each generated file.

    Returns:
        True when protoc ran successfully. False when grpcio-tools cannot be
        imported, the proto file is missing, or protoc exits non-zero; in
        those cases a warning/error is logged and nothing is raised.
    """
    try:
        from grpc_tools import protoc
    except ImportError:
        logger.warning(
            "grpcio-tools not installed, skipping gRPC proto compilation. "
            "gRPC server functionality will not be available."
        )
        return False

    proto_file = ROOT_DIR / "vllm" / "grpc" / "vllm_engine.proto"
    if not proto_file.exists():
        logger.warning("Proto file not found at %s, skipping compilation", proto_file)
        return False

    logger.info("Compiling gRPC protobuf: %s", proto_file)

    # The first element stands in for argv[0]; the remaining entries are the
    # protoc command-line flags. All outputs are rooted at ROOT_DIR.
    result = protoc.main(
        [
            "grpc_tools.protoc",
            f"--proto_path={ROOT_DIR}",
            f"--python_out={ROOT_DIR}",
            f"--grpc_python_out={ROOT_DIR}",
            f"--pyi_out={ROOT_DIR}",
            str(proto_file),
        ]
    )
    if result != 0:
        logger.error("protoc failed with exit code %s", result)
        return False

    # Add SPDX headers and mypy ignore to generated files
    spdx_header = (
        "# SPDX-License-Identifier: Apache-2.0\n"
        "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
        "# mypy: ignore-errors\n"
    )
    grpc_dir = ROOT_DIR / "vllm" / "grpc"
    for generated_file in [
        grpc_dir / "vllm_engine_pb2.py",
        grpc_dir / "vllm_engine_pb2_grpc.py",
        grpc_dir / "vllm_engine_pb2.pyi",
    ]:
        if generated_file.exists():
            # Read and write with an explicit encoding so the rewrite does not
            # depend on the build machine's locale.
            content = generated_file.read_text(encoding="utf-8")
            # Skip files that already carry the header so repeated builds do
            # not stack it.
            if not content.startswith("# SPDX-License-Identifier"):
                generated_file.write_text(spdx_header + content, encoding="utf-8")

    logger.info("gRPC protobuf compilation successful")
    return True
class BuildPyAndGenerateGrpc(build_py):
    """``build_py`` command that first generates the gRPC stubs."""

    def run(self):
        """Generate the gRPC stubs, then run the regular ``build_py`` step."""
        # The result of the proto compilation is intentionally not checked
        # here: compile_grpc_protos() reports its own problems via the logger.
        compile_grpc_protos()
        super().run()
class DevelopAndGenerateGrpc(develop):
    """``develop`` command that first generates the gRPC stubs."""

    def run(self):
        """Generate the gRPC stubs, then run the regular ``develop`` step."""
        # The result of the proto compilation is intentionally not checked
        # here: compile_grpc_protos() reports its own problems via the logger.
        compile_grpc_protos()
        super().run()
class CMakeExtension(Extension): class CMakeExtension(Extension):
def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None: def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa) super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa)
...@@ -120,20 +197,26 @@ class cmake_build_ext(build_ext): ...@@ -120,20 +197,26 @@ class cmake_build_ext(build_ext):
num_jobs = os.cpu_count() num_jobs = os.cpu_count()
nvcc_threads = None nvcc_threads = None
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"): if _is_cuda() and CUDA_HOME is not None:
# `nvcc_threads` is either the value of the NVCC_THREADS try:
# environment variable (if defined) or 1. nvcc_version = get_nvcc_cuda_version()
# when it is set, we reduce `num_jobs` to avoid if nvcc_version >= Version("11.2"):
# overloading the system. # `nvcc_threads` is either the value of the NVCC_THREADS
nvcc_threads = envs.NVCC_THREADS # environment variable (if defined) or 1.
if nvcc_threads is not None: # when it is set, we reduce `num_jobs` to avoid
nvcc_threads = int(nvcc_threads) # overloading the system.
logger.info( nvcc_threads = envs.NVCC_THREADS
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads if nvcc_threads is not None:
) nvcc_threads = int(nvcc_threads)
else: logger.info(
nvcc_threads = 1 "Using NVCC_THREADS=%d as the number of nvcc threads.",
num_jobs = max(1, num_jobs // nvcc_threads) nvcc_threads,
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
except Exception as e:
logger.warning("Failed to get NVCC version: %s", e)
return num_jobs, nvcc_threads return num_jobs, nvcc_threads
...@@ -211,9 +294,9 @@ class cmake_build_ext(build_ext): ...@@ -211,9 +294,9 @@ class cmake_build_ext(build_ext):
# Default build tool to whatever cmake picks. # Default build tool to whatever cmake picks.
build_tool = [] build_tool = []
# Make sure we use the nvcc from CUDA_HOME # Make sure we use the nvcc from CUDA_HOME
if _is_cuda(): if _is_cuda() and CUDA_HOME is not None:
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"] cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
elif _is_hip(): elif _is_hip() and ROCM_HOME is not None:
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"] cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
other_cmake_args = os.environ.get("CMAKE_ARGS") other_cmake_args = os.environ.get("CMAKE_ARGS")
...@@ -351,6 +434,89 @@ class precompiled_wheel_utils: ...@@ -351,6 +434,89 @@ class precompiled_wheel_utils:
wheels = json.loads(resp.read().decode("utf-8")) wheels = json.loads(resp.read().decode("utf-8"))
return wheels, repo_url return wheels, repo_url
@staticmethod
def is_rocm_system() -> bool:
"""Detect ROCm without relying on torch (for build environment)."""
if os.getenv("ROCM_PATH"):
return True
if os.path.isdir("/opt/rocm"):
return True
if which("rocminfo") is not None:
return True
try:
import torch
return torch.version.hip is not None
except ImportError:
return False
@staticmethod
def find_local_rocm_wheel() -> str | None:
"""Search for a local vllm wheel in common locations."""
import glob
for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
wheels = glob.glob(pattern)
if wheels:
return sorted(wheels)[-1]
return None
@staticmethod
def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
"""Fetch the latest wheel URL from a PyPI-style simple index."""
import platform
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen
arch = platform.machine()
class WheelLinkParser(HTMLParser):
def __init__(self):
super().__init__()
self.wheels = []
def handle_starttag(self, tag, attrs):
if tag == "a":
for name, value in attrs:
if name == "href" and value.endswith(".whl"):
self.wheels.append(value)
simple_url = f"{index_url.rstrip('/')}/{package}/"
print(f"Fetching wheel list from {simple_url}")
with urlopen(simple_url) as resp:
html = resp.read().decode("utf-8")
parser = WheelLinkParser()
parser.feed(html)
for wheel in reversed(parser.wheels):
if arch in wheel:
if wheel.startswith("http"):
return wheel
return urljoin(simple_url, wheel)
raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
@staticmethod
def determine_wheel_url_rocm() -> tuple[str, str | None]:
    """Pick the precompiled wheel to use on ROCm.

    A wheel found on the local filesystem takes priority. Otherwise the
    wheel is looked up on the index named by ``VLLM_ROCM_WHEEL_INDEX``
    (default: ``https://pypi.amd.com/vllm-rocm/simple``).

    Returns:
        ``(location, download_filename)``. For a local wheel the filename is
        None; for a remote wheel it is the last path segment of the URL with
        any ``#...`` fragment removed.
    """
    utils = precompiled_wheel_utils

    local_path = utils.find_local_rocm_wheel()
    if local_path is None:
        # Nothing on disk: fall back to AMD's PyPI index.
        rocm_index = os.getenv(
            "VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
        )
        print(f"Fetching ROCm precompiled wheel from {rocm_index}")
        remote_url = utils.fetch_wheel_from_pypi_index(rocm_index)
        # File name = final path segment, minus any "#sha256=..." fragment.
        last_segment = remote_url.rsplit("/", 1)[-1]
        filename = last_segment.split("#", 1)[0]
        print(f"Using ROCm precompiled wheel: {remote_url}")
        return remote_url, filename

    print(f"Found local ROCm wheel: {local_path}")
    return local_path, None
@staticmethod @staticmethod
def determine_wheel_url() -> tuple[str, str | None]: def determine_wheel_url() -> tuple[str, str | None]:
""" """
...@@ -371,6 +537,11 @@ class precompiled_wheel_utils: ...@@ -371,6 +537,11 @@ class precompiled_wheel_utils:
print(f"Using user-specified precompiled wheel location: {wheel_location}") print(f"Using user-specified precompiled wheel location: {wheel_location}")
return wheel_location, None return wheel_location, None
else: else:
# ROCm: use local wheel or AMD's PyPI index
# TODO: When we have ROCm nightly wheels, we can update this logic.
if precompiled_wheel_utils.is_rocm_system():
return precompiled_wheel_utils.determine_wheel_url_rocm()
import platform import platform
arch = platform.machine() arch = platform.machine()
...@@ -477,6 +648,8 @@ class precompiled_wheel_utils: ...@@ -477,6 +648,8 @@ class precompiled_wheel_utils:
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so", "vllm/cumem_allocator.abi3.so",
# ROCm-specific libraries
"vllm/_rocm_C.abi3.so",
] ]
flash_attn_regex = re.compile( flash_attn_regex = re.compile(
...@@ -614,6 +787,8 @@ def get_rocm_version(): ...@@ -614,6 +787,8 @@ def get_rocm_version():
# Get the Rocm version from the ROCM_HOME/bin/librocm-core.so # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21 # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
try: try:
if ROCM_HOME is None:
return None
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so" librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
if not librocm_core_file.is_file(): if not librocm_core_file.is_file():
return None return None
...@@ -690,9 +865,9 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -690,9 +865,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f""" new_version_content = f"""
try: try:
__version__ = "0.13.0" __version__ = "0.14.0"
__version_tuple__ = (0, 13, 0) __version_tuple__ = (0, 14, 0)
__hcu_version__ = f'0.13.0+{version}' __hcu_version__ = f'0.14.0+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__ from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e: except Exception as e:
...@@ -863,7 +1038,9 @@ if _is_cuda() or _is_hip(): ...@@ -863,7 +1038,9 @@ if _is_cuda() or _is_hip():
if _is_cuda(): if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): if envs.VLLM_USE_PRECOMPILED or (
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
):
# FA3 requires CUDA 12.3 or later # FA3 requires CUDA 12.3 or later
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
# Optional since this doesn't get built (produce an .so file) when # Optional since this doesn't get built (produce an .so file) when
...@@ -882,9 +1059,10 @@ if skip_vllm_build: ...@@ -882,9 +1059,10 @@ if skip_vllm_build:
"py.typed", "py.typed",
"model_executor/layers/fused_moe/configs/*.json", "model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json", "model_executor/layers/quantization/utils/configs/*.json",
"perf/*.py", "entrypoints/serve/instrumentator/static/*.js",
"attention/backends/configs/*.json", "entrypoints/serve/instrumentator/static/*.css",
"model_executor/layers/quantization/configs/awq/*.json", "model_executor/layers/quantization/configs/awq/*.json",
"attention/backends/configs/*.json",
"_C.abi3.so", "_C.abi3.so",
"_moe_C.abi3.so", "_moe_C.abi3.so",
] ]
...@@ -895,7 +1073,8 @@ else: ...@@ -895,7 +1073,8 @@ else:
"py.typed", "py.typed",
"model_executor/layers/fused_moe/configs/*.json", "model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json", "model_executor/layers/quantization/utils/configs/*.json",
"perf/*.py", "entrypoints/serve/instrumentator/static/*.js",
"entrypoints/serve/instrumentator/static/*.css",
"attention/backends/configs/*.json", "attention/backends/configs/*.json",
"model_executor/layers/quantization/configs/awq/*.json", "model_executor/layers/quantization/configs/awq/*.json",
] ]
...@@ -915,12 +1094,17 @@ if _no_device() or skip_vllm_build: ...@@ -915,12 +1094,17 @@ if _no_device() or skip_vllm_build:
ext_modules = [] ext_modules = []
if not ext_modules: if not ext_modules:
cmdclass = {} cmdclass = {
"build_py": BuildPyAndGenerateGrpc,
"develop": DevelopAndGenerateGrpc,
}
else: else:
cmdclass = { cmdclass = {
"build_ext": precompiled_build_ext "build_ext": precompiled_build_ext
if envs.VLLM_USE_PRECOMPILED if envs.VLLM_USE_PRECOMPILED
else cmake_build_ext else cmake_build_ext,
"build_py": BuildPyAndGenerateGrpc,
"develop": DevelopAndGenerateGrpc,
} }
setup( setup(
...@@ -929,12 +1113,13 @@ setup( ...@@ -929,12 +1113,13 @@ setup(
ext_modules=ext_modules, ext_modules=ext_modules,
install_requires=get_requirements(), install_requires=get_requirements(),
extras_require={ extras_require={
"bench": ["pandas", "matplotlib", "seaborn", "datasets"], "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
"tensorizer": ["tensorizer==2.10.1"], "tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"], "fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"], "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"audio": [ "audio": [
"librosa", "librosa",
"scipy",
"soundfile", "soundfile",
"mistral_common[audio]", "mistral_common[audio]",
], # Required for audio processing ], # Required for audio processing
......
...@@ -72,7 +72,6 @@ def _fix_prompt_embed_outputs( ...@@ -72,7 +72,6 @@ def _fix_prompt_embed_outputs(
@pytest.mark.parametrize("model_executor", ["uni", "mp"]) @pytest.mark.parametrize("model_executor", ["uni", "mp"])
@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
def test_models( def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
model: str, model: str,
backend: str, backend: str,
...@@ -82,82 +81,80 @@ def test_models( ...@@ -82,82 +81,80 @@ def test_models(
model_executor: str, model_executor: str,
enable_prompt_embeds: bool, enable_prompt_embeds: bool,
) -> None: ) -> None:
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = (
"The following numbers of the sequence "
+ ", ".join(str(i) for i in range(1024))
+ " are:"
)
example_prompts = [prompt]
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if enable_prompt_embeds:
with torch.no_grad():
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
with monkeypatch.context() as m: if not current_platform.is_rocm():
m.setenv("VLLM_ATTENTION_BACKEND", backend) with VllmRunner(
model,
# 5042 tokens for gemma2 max_model_len=8192,
# gemma2 has alternating sliding window size of 4096 enforce_eager=enforce_eager,
# we need a prompt with more than 4096 tokens to test the sliding window enable_prompt_embeds=enable_prompt_embeds,
prompt = ( gpu_memory_utilization=0.7,
"The following numbers of the sequence " async_scheduling=async_scheduling,
+ ", ".join(str(i) for i in range(1024)) distributed_executor_backend=model_executor,
+ " are:" attention_config={"backend": backend},
) ) as vllm_model:
example_prompts = [prompt]
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if enable_prompt_embeds: if enable_prompt_embeds:
with torch.no_grad(): vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts) vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts
if not current_platform.is_rocm(): )
with VllmRunner( else:
model, vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
max_model_len=8192, else:
enforce_eager=enforce_eager, with VllmRunner(
enable_prompt_embeds=enable_prompt_embeds, model,
gpu_memory_utilization=0.7, max_model_len=8192,
async_scheduling=async_scheduling, enforce_eager=enforce_eager,
distributed_executor_backend=model_executor, enable_prompt_embeds=enable_prompt_embeds,
) as vllm_model: gpu_memory_utilization=0.7,
if enable_prompt_embeds: async_scheduling=async_scheduling,
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens) distributed_executor_backend=model_executor,
vllm_outputs = _fix_prompt_embed_outputs( attention_config={"backend": backend},
vllm_outputs, hf_model, example_prompts block_size=64,
) ) as vllm_model:
else: if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
else: vllm_outputs = _fix_prompt_embed_outputs(
with VllmRunner( vllm_outputs, hf_model, example_prompts
model, )
max_model_len=8192, else:
enforce_eager=enforce_eager, vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
async_scheduling=async_scheduling,
distributed_executor_backend=model_executor,
block_size=64,
) as vllm_model:
if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts
)
else:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
# @multi_gpu_test(num_gpus=2) # @multi_gpu_test(num_gpus=2)
# @pytest.mark.parametrize( # @pytest.mark.parametrize(
# "model, distributed_executor_backend, attention_backend, test_suite, extra_env", # "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
# [ # [
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4", {}), # ("facebook/opt-125m", "ray", "", "L4", {}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4", {}), # ("facebook/opt-125m", "mp", "", "L4", {}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}), # ("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}), # ("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
# (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "ray", "", "L4", {}), # ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
# (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "mp", "", "L4", {}), # ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "A100", {}), # ("facebook/opt-125m", "ray", "", "A100", {}),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "A100", {}), # ("facebook/opt-125m", "mp", "", "A100", {}),
# ], # ],
# ) # )
# @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) # @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
...@@ -186,12 +183,6 @@ def test_models( ...@@ -186,12 +183,6 @@ def test_models(
# ): # noqa # ): # noqa
# pytest.skip("enable_prompt_embeds does not work with ray compiled dag.") # pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
# if attention_backend:
# monkeypatch_context.setenv(
# "VLLM_ATTENTION_BACKEND",
# attention_backend,
# )
# for k, v in extra_env.items(): # for k, v in extra_env.items():
# monkeypatch_context.setenv(k, v) # monkeypatch_context.setenv(k, v)
...@@ -203,6 +194,7 @@ def test_models( ...@@ -203,6 +194,7 @@ def test_models(
# # if we run HF first, the cuda initialization will be done and it # # if we run HF first, the cuda initialization will be done and it
# # will hurt multiprocessing backend with fork method # # will hurt multiprocessing backend with fork method
# # (the default method). # # (the default method).
# attention_config = {"backend": attention_backend} if attention_backend else None
# with vllm_runner( # with vllm_runner(
# model, # model,
# dtype=dtype, # dtype=dtype,
...@@ -210,6 +202,7 @@ def test_models( ...@@ -210,6 +202,7 @@ def test_models(
# distributed_executor_backend=distributed_executor_backend, # distributed_executor_backend=distributed_executor_backend,
# enable_prompt_embeds=enable_prompt_embeds, # enable_prompt_embeds=enable_prompt_embeds,
# gpu_memory_utilization=0.7, # gpu_memory_utilization=0.7,
# attention_config=attention_config,
# ) as vllm_model: # ) as vllm_model:
# if enable_prompt_embeds: # if enable_prompt_embeds:
# with hf_runner(model, dtype=dtype) as hf_model: # with hf_runner(model, dtype=dtype) as hf_model:
...@@ -225,90 +218,12 @@ def test_models( ...@@ -225,90 +218,12 @@ def test_models(
# with hf_runner(model, dtype=dtype) as hf_model: # with hf_runner(model, dtype=dtype) as hf_model:
# hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) # hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
# @multi_gpu_test(num_gpus=2) # check_outputs_equal(
# @pytest.mark.parametrize( # outputs_0_lst=hf_outputs,
# "model, distributed_executor_backend, attention_backend, " # outputs_1_lst=vllm_outputs,
# "test_suite, extra_env", [ # name_0="hf",
# ("distilbert/distilgpt2", "ray", "", "L4", {}), # name_1="vllm",
# ("distilbert/distilgpt2", "mp", "", "L4", {}), # )
# ("distilbert/distilgpt2", "ray", "", "L4", {
# "VLLM_SLEEP_WHEN_IDLE": "1"
# }),
# ("distilbert/distilgpt2", "mp", "", "L4", {
# "VLLM_SLEEP_WHEN_IDLE": "1"
# }),
# ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
# ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
# ("distilbert/distilgpt2", "ray", "", "A100", {}),
# ("distilbert/distilgpt2", "mp", "", "A100", {}),
# ])
# @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
# def test_models_distributed(
# monkeypatch: pytest.MonkeyPatch,
# hf_runner,
# vllm_runner,
# example_prompts,
# model: str,
# distributed_executor_backend: str,
# attention_backend: str,
# test_suite: str,
# extra_env: dict[str, str],
# enable_prompt_embeds: bool,
# ) -> None:
# if test_suite != TARGET_TEST_SUITE:
# pytest.skip(f"Skip test for {test_suite}")
# with monkeypatch.context() as monkeypatch_context:
# if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# if enable_prompt_embeds:
# pytest.skip(
# "enable_prompt_embeds does not work with ray compiled dag."
# )
# monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
# monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
# if attention_backend:
# monkeypatch_context.setenv(
# "VLLM_ATTENTION_BACKEND",
# attention_backend,
# )
# for k, v in extra_env.items():
# monkeypatch_context.setenv(k, v)
# dtype = "half"
# max_tokens = 5
# # NOTE: take care of the order. run vLLM first, and then run HF.
# # vLLM needs a fresh new process without cuda initialization.
# # if we run HF first, the cuda initialization will be done and it
# # will hurt multiprocessing backend with fork method
# # (the default method).
# with vllm_runner(
# model,
# dtype=dtype,
# tensor_parallel_size=2,
# distributed_executor_backend=distributed_executor_backend,
# enable_prompt_embeds=enable_prompt_embeds,
# gpu_memory_utilization=0.7,
# ) as vllm_model:
# if enable_prompt_embeds:
# with hf_runner(model, dtype=dtype) as hf_model:
# with torch.no_grad():
# prompt_embeds = hf_model.get_prompt_embeddings(
# example_prompts)
# vllm_outputs = vllm_model.generate_greedy(
# prompt_embeds, max_tokens)
# vllm_outputs = _fix_prompt_embed_outputs(
# vllm_outputs, hf_model, example_prompts)
# hf_outputs = hf_model.generate_greedy(
# example_prompts, max_tokens)
# else:
# vllm_outputs = vllm_model.generate_greedy(
# example_prompts, max_tokens)
# with hf_runner(model, dtype=dtype) as hf_model:
# hf_outputs = hf_model.generate_greedy(
# example_prompts, max_tokens)
def test_failed_model_execution(vllm_runner, monkeypatch) -> None: def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
......
...@@ -248,7 +248,6 @@ def test_deep_sleep_async(): ...@@ -248,7 +248,6 @@ def test_deep_sleep_async():
@requires_fp8 @requires_fp8
def test_deep_sleep_fp8_kvcache(): def test_deep_sleep_fp8_kvcache():
GiB_bytes = 1 << 30
model = "Qwen/Qwen2-0.5B" model = "Qwen/Qwen2-0.5B"
used_bytes_baseline = current_platform.get_current_memory_usage() used_bytes_baseline = current_platform.get_current_memory_usage()
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Callable
from pathlib import Path
from unittest.mock import patch
from vllm.benchmarks.sweep.param_sweep import ParameterSweepItem
from vllm.benchmarks.sweep.serve_sla import _get_sla_run_path, solve_sla
from vllm.benchmarks.sweep.server import ServerProcess
from vllm.benchmarks.sweep.sla_sweep import (
SLACriterionBase,
SLALessThan,
SLALessThanOrEqualTo,
SLASweepItem,
)
def _set_return_value(
    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
):
    """
    Build a patcher that replaces ``serve_sla.run_sla`` with a fake.

    The fake derives its metrics from ``var2metric``, which maps the
    benchmark combination (including the SLA variable) to the metric
    records that the SLA criterion is evaluated against.

    Returns the ``unittest.mock.patch`` object; the caller is expected
    to enter it (e.g. with a ``with`` statement).
    """

    def mock_run_sla(
        server: ServerProcess | None,
        bench_cmd: list[str],
        *,
        serve_comb: ParameterSweepItem,
        bench_comb: ParameterSweepItem,
        iter_path: Path,
        num_runs: int,
        dry_run: bool,
    ):
        # Only `bench_comb` and `iter_path` are used; the remaining
        # parameters exist so the fake accepts the same call as `run_sla`.
        metrics = var2metric(bench_comb)

        # Persist the metrics at the summary location for this iteration.
        out_file = _get_sla_run_path(iter_path, run_number=None)
        out_file.parent.mkdir(parents=True, exist_ok=True)
        out_file.write_text(json.dumps(metrics, indent=4))

        return metrics

    return patch(
        "vllm.benchmarks.sweep.serve_sla.run_sla",
        side_effect=mock_run_sla,
    )
def _var2metric_linear():
def wrapped(bench_comb):
x = float(bench_comb["request_rate"])
y = x
return [{"request_throughput": y}]
return wrapped
def _var2metric_concave(elbow_point: float):
def wrapped(bench_comb):
x = float(bench_comb["request_rate"])
if x < elbow_point:
y = 0.5 * (x - elbow_point) + elbow_point
else:
y = 1.5 * (x - elbow_point) + elbow_point
return [{"request_throughput": y}]
return wrapped
def _var2metric_convex(elbow_point: float):
def wrapped(bench_comb):
x = float(bench_comb["request_rate"])
if x < elbow_point:
y = 1.5 * (x - elbow_point) + elbow_point
else:
y = 0.5 * (x - elbow_point) + elbow_point
return [{"request_throughput": y}]
return wrapped
def _var2metric_quadratic(y_intercept: float):
def wrapped(bench_comb):
x = float(bench_comb["request_rate"])
y = y_intercept + 0.1 * x**2
return [{"request_throughput": y}]
return wrapped
def _var2metric_sqrt(y_intercept: float):
def wrapped(bench_comb):
x = float(bench_comb["request_rate"])
y = y_intercept + 10 * x**0.5
return [{"request_throughput": y}]
return wrapped
def _run_solve_sla(
    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
    criterion: SLACriterionBase,
    base_path: Path,
    min_value: int = 1,
    max_value: int = 100,
):
    """
    Call ``solve_sla`` over ``request_rate`` with ``run_sla`` mocked out.

    Args:
        var2metric: Maps a benchmark combination to the metric records the
            mocked ``run_sla`` should report.
        criterion: SLA criterion applied to ``request_throughput``.
        base_path: Forwarded to ``solve_sla`` as ``base_path``.
        min_value: Forwarded as ``sla_min_value``.
        max_value: Forwarded as ``sla_max_value``.

    Returns:
        Whatever ``solve_sla`` returns, after asserting it is not ``None``.
    """
    sla_item = SLASweepItem({"request_throughput": criterion})

    with _set_return_value(var2metric):
        outcome = solve_sla(
            server=None,
            bench_cmd=[],
            serve_comb=ParameterSweepItem(),
            bench_comb=ParameterSweepItem(),
            sla_comb=sla_item,
            base_path=base_path,
            num_runs=1,
            dry_run=False,
            sla_variable="request_rate",
            sla_min_value=min_value,
            sla_max_value=max_value,
        )

    assert outcome is not None
    return outcome
def test_solve_linear_sla_le(tmp_path):
    """Linear metric with a ``<= 32`` SLA: the largest passing value is 32."""
    _, history = _run_solve_sla(
        _var2metric_linear(),
        SLALessThanOrEqualTo(target=32),
        tmp_path,
    )

    assert history.get_max_passing() == 32

    # A margin <= 0 means the recorded value satisfied the criterion.
    passed = {rate: margin <= 0 for rate, margin in history.items()}
    expected = {
        100: False,
        1: True,
        32: True,
        33: False,
    }
    assert passed == expected
def test_solve_linear_sla_lt(tmp_path):
    """Linear metric with a strict ``< 32`` SLA: the largest passing value is 31."""
    _, history = _run_solve_sla(
        _var2metric_linear(),
        SLALessThan(target=32),
        tmp_path,
    )

    assert history.get_max_passing() == 31

    # A margin <= 0 means the recorded value satisfied the criterion.
    passed = {rate: margin <= 0 for rate, margin in history.items()}
    expected = {
        100: False,
        1: True,
        31: True,
        32: False,
    }
    assert passed == expected
def test_solve_linear_sla_oob(tmp_path):
    """
    Linear metric with a ``<= 32`` SLA but a search range starting at 64.

    Every value in the range fails, and both ``get_max_passing`` and
    ``get_min_failing`` are expected to report the range minimum (64).
    """
    _, history = _run_solve_sla(
        _var2metric_linear(),
        SLALessThanOrEqualTo(target=32),
        tmp_path,
        min_value=64,
    )

    assert history.get_max_passing() == 64
    assert history.get_min_failing() == 64

    # A margin <= 0 means the recorded value satisfied the criterion.
    passed = {rate: margin <= 0 for rate, margin in history.items()}
    expected = {
        100: False,
        64: False,
    }
    assert passed == expected
def test_solve_concave_sla_le(tmp_path):
    """``_var2metric_concave`` (elbow 32) with a ``<= 24`` SLA: max passing is 16."""
    _, history = _run_solve_sla(
        _var2metric_concave(elbow_point=32),
        SLALessThanOrEqualTo(target=24),
        tmp_path,
    )

    assert history.get_max_passing() == 16

    # A margin <= 0 means the recorded value satisfied the criterion.
    passed = {rate: margin <= 0 for rate, margin in history.items()}
    expected = {
        100: False,
        1: True,
        7: True,
        13: True,
        15: True,
        16: True,
        17: False,
    }
    assert passed == expected
def test_solve_convex_sla_le(tmp_path):
    """``_var2metric_convex`` (elbow 32) with a ``<= 24`` SLA: max passing is 26."""
    _, history = _run_solve_sla(
        _var2metric_convex(elbow_point=32),
        SLALessThanOrEqualTo(target=24),
        tmp_path,
    )

    assert history.get_max_passing() == 26

    # A margin <= 0 means the recorded value satisfied the criterion.
    passed = {rate: margin <= 0 for rate, margin in history.items()}
    expected = {
        100: False,
        1: True,
        48: False,
        30: False,
        24: True,
        26: True,
        27: False,
    }
    assert passed == expected
def test_solve_quadratic_sla_le(tmp_path):
    """Quadratic metric (intercept 10) with a ``<= 50`` SLA: max passing is 20."""
    _, history = _run_solve_sla(
        _var2metric_quadratic(y_intercept=10),
        SLALessThanOrEqualTo(target=50),
        tmp_path,
    )

    assert history.get_max_passing() == 20

    # A margin <= 0 means the recorded value satisfied the criterion.
    passed = {rate: margin <= 0 for rate, margin in history.items()}
    expected = {
        100: False,
        1: True,
        4: True,
        20: True,
        21: False,
    }
    assert passed == expected
def test_solve_sqrt_sla_le(tmp_path):
    """Square-root metric (intercept 10) with a ``<= 100`` SLA: max passing is 81."""
    _, history = _run_solve_sla(
        _var2metric_sqrt(y_intercept=10),
        SLALessThanOrEqualTo(target=100),
        tmp_path,
    )

    assert history.get_max_passing() == 81

    # A margin <= 0 means the recorded value satisfied the criterion.
    passed = {rate: margin <= 0 for rate, margin in history.items()}
    expected = {
        100: False,
        1: True,
        89: False,
        81: True,
        82: False,
    }
    assert passed == expected
def test_solve_reuse_history(tmp_path):
    """
    Solve twice against the same ``tmp_path`` with different targets and
    ranges, and check that the second history also contains the values
    recorded by the first run.
    """
    # First run: target 10 over the range [1, 20].
    _, first_history = _run_solve_sla(
        _var2metric_linear(),
        SLALessThanOrEqualTo(target=10),
        tmp_path,
        min_value=1,
        max_value=20,
    )

    assert first_history.get_max_passing() == 10

    first_passed = {
        rate: margin <= 0 for rate, margin in first_history.items()
    }
    assert first_passed == {
        20: False,
        1: True,
        10: True,
        11: False,
    }

    # Second run: same base path, target 30 over the range [21, 40].
    _, second_history = _run_solve_sla(
        _var2metric_linear(),
        SLALessThanOrEqualTo(target=30),
        tmp_path,
        min_value=21,
        max_value=40,
    )

    assert second_history.get_max_passing() == 30

    second_passed = {
        rate: margin <= 0 for rate, margin in second_history.items()
    }
    assert second_passed == {
        # Items from the past run
        # (the margins are different because the target changed)
        20: True,
        1: True,
        10: True,
        11: True,
        # Items from this run
        40: False,
        30: True,
        31: False,
    }
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import pytest
@pytest.mark.benchmark
def test_bench_startup():
    """Run ``vllm bench startup`` as a subprocess and require exit code 0."""
    proc = subprocess.run(
        ["vllm", "bench", "startup"],
        capture_output=True,
        text=True,
    )

    # Echo both captured streams so they are visible in the test output.
    print(proc.stdout)
    print(proc.stderr)

    assert proc.returncode == 0, f"Benchmark failed: {proc.stderr}"
...@@ -20,21 +20,18 @@ def server(): ...@@ -20,21 +20,18 @@ def server():
@pytest.mark.benchmark @pytest.mark.benchmark
def test_bench_serve(server): def test_bench_serve(server):
# Test default model detection and input/output len
command = [ command = [
"vllm", "vllm",
"bench", "bench",
"serve", "serve",
"--model",
MODEL_NAME,
"--host", "--host",
server.host, server.host,
"--port", "--port",
str(server.port), str(server.port),
"--dataset-name", "--input-len",
"random",
"--random-input-len",
"32", "32",
"--random-output-len", "--output-len",
"4", "4",
"--num-prompts", "--num-prompts",
"5", "5",
......
...@@ -15,6 +15,7 @@ from vllm.config import ( ...@@ -15,6 +15,7 @@ from vllm.config import (
ModelConfig, ModelConfig,
PassConfig, PassConfig,
VllmConfig, VllmConfig,
set_current_vllm_config,
) )
from vllm.distributed import ( from vllm.distributed import (
tensor_model_parallel_all_gather, tensor_model_parallel_all_gather,
...@@ -26,6 +27,7 @@ from vllm.distributed.parallel_state import ( ...@@ -26,6 +27,7 @@ from vllm.distributed.parallel_state import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...models.registry import HF_EXAMPLE_MODELS from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import ( from ...utils import (
...@@ -301,7 +303,7 @@ def async_tp_pass_on_test_model( ...@@ -301,7 +303,7 @@ def async_tp_pass_on_test_model(
dtype: torch.dtype, dtype: torch.dtype,
dynamic: bool, dynamic: bool,
): ):
current_platform.seed_everything(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
...@@ -339,38 +341,42 @@ def async_tp_pass_on_test_model( ...@@ -339,38 +341,42 @@ def async_tp_pass_on_test_model(
) )
async_tp_pass = AsyncTPPass(vllm_config) async_tp_pass = AsyncTPPass(vllm_config)
backend = TestBackend(async_tp_pass)
assert ( # Set the global vllm_config for TestBackend which calls
async_tp_pass.compilation_config.splitting_ops # get_current_vllm_config()
== vllm_config.compilation_config.splitting_ops with set_current_vllm_config(vllm_config):
) backend = TestBackend(async_tp_pass)
assert (
async_tp_pass.compilation_config.use_inductor_graph_partition
== vllm_config.compilation_config.use_inductor_graph_partition
)
model = test_model_cls(hidden_size, dtype) # Pass dtype to model constructor assert (
async_tp_pass.compilation_config.splitting_ops
== vllm_config.compilation_config.splitting_ops
)
assert (
async_tp_pass.compilation_config.use_inductor_graph_partition
== vllm_config.compilation_config.use_inductor_graph_partition
)
hidden_states = torch.randn( model = test_model_cls(hidden_size, dtype) # Pass dtype to model constructor
(batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
) hidden_states = torch.randn(
(batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
)
if dynamic: if dynamic:
torch._dynamo.mark_dynamic(hidden_states, 0) torch._dynamo.mark_dynamic(hidden_states, 0)
compiled_model = torch.compile(model, backend=backend) compiled_model = torch.compile(model, backend=backend)
compiled_model(hidden_states) compiled_model(hidden_states)
assert async_tp_pass.matched_count == 1 assert async_tp_pass.matched_count == 1
# In pre-nodes, all gather or reduce scatter should exist, # In pre-nodes, all gather or reduce scatter should exist,
# fused_matmul_reduce_scatter or fused_all_gather_matmul should not # fused_matmul_reduce_scatter or fused_all_gather_matmul should not
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False) backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
# In post-nodes, fused_matmul_reduce_scatter or \ # In post-nodes, fused_matmul_reduce_scatter or \
# fused_all_gather_matmul should exist # fused_all_gather_matmul should exist
backend.check_after_ops(model.ops_in_model_after()) backend.check_after_ops(model.ops_in_model_after())
@create_new_process_for_each_test() @create_new_process_for_each_test()
......
...@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( ...@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...utils import has_module_attribute, multi_gpu_test from ...utils import has_module_attribute, multi_gpu_test
from ..backend import TestBackend from ..backend import TestBackend
...@@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model( ...@@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model(
enable_rms_norm_custom_op, enable_rms_norm_custom_op,
enable_quant_fp8_custom_op, enable_quant_fp8_custom_op,
): ):
current_platform.seed_everything(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
......
...@@ -208,7 +208,8 @@ def test_attn_quant( ...@@ -208,7 +208,8 @@ def test_attn_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used. # To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general. # Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
# Testing properties # Testing properties
...@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm( ...@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
# To capture subprocess logs, we need to know whether spawn or fork is used. # To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general. # Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
# Testing properties # Testing properties
...@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp( ...@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp(
# To capture subprocess logs, we need to know whether spawn or fork is used. # To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general. # Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
# Testing properties # Testing properties
...@@ -554,7 +557,8 @@ def test_rms_group_quant( ...@@ -554,7 +557,8 @@ def test_rms_group_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used. # To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general. # Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
# Testing properties # Testing properties
...@@ -564,7 +568,9 @@ def test_rms_group_quant( ...@@ -564,7 +568,9 @@ def test_rms_group_quant(
splitting_ops=splitting_ops, splitting_ops=splitting_ops,
# Common # Common
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True), pass_config=PassConfig(
fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
),
# Inductor caches custom passes by default as well via uuid # Inductor caches custom passes by default as well via uuid
inductor_compile_config={"force_disable_caches": True}, inductor_compile_config={"force_disable_caches": True},
) )
......
...@@ -31,6 +31,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape ...@@ -31,6 +31,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
from ..backend import TestBackend from ..backend import TestBackend
...@@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model( ...@@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model(
fuse_norm_quant: bool, fuse_norm_quant: bool,
dynamic: bool, dynamic: bool,
): ):
current_platform.seed_everything(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
......
...@@ -6,10 +6,13 @@ import pytest ...@@ -6,10 +6,13 @@ import pytest
import os import os
from vllm.config import CompilationMode from vllm.config import CompilationMode
from vllm.platforms import current_platform
from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.utils.torch_utils import cuda_device_count_stateless
from ...utils import compare_all_settings, models_path_prefix from ...utils import compare_all_settings, models_path_prefix
ATTN_BACKEND = "FLASH_ATTN" if not current_platform.is_rocm() else "ROCM_ATTN"
@dataclasses.dataclass @dataclasses.dataclass
class TestSetting: class TestSetting:
...@@ -32,7 +35,7 @@ class TestSetting: ...@@ -32,7 +35,7 @@ class TestSetting:
model_args=["--max-model-len", "2048"], model_args=["--max-model-len", "2048"],
pp_size=2, pp_size=2,
tp_size=2, tp_size=2,
attn_backend="FLASH_ATTN", attn_backend=ATTN_BACKEND,
method="generate", method="generate",
), ),
# llama model with quantization # llama model with quantization
...@@ -41,7 +44,7 @@ class TestSetting: ...@@ -41,7 +44,7 @@ class TestSetting:
model_args=["--quantization", "gptq", "--max-model-len", "2048"], model_args=["--quantization", "gptq", "--max-model-len", "2048"],
pp_size=1, pp_size=1,
tp_size=1, tp_size=1,
attn_backend="FLASH_ATTN", attn_backend=ATTN_BACKEND,
method="generate", method="generate",
), ),
# MoE model # MoE model
...@@ -50,7 +53,7 @@ class TestSetting: ...@@ -50,7 +53,7 @@ class TestSetting:
model_args=["--max-model-len", "2048"], model_args=["--max-model-len", "2048"],
pp_size=1, pp_size=1,
tp_size=2, tp_size=2,
attn_backend="FLASH_ATTN", attn_backend=ATTN_BACKEND,
method="generate", method="generate",
), ),
# embedding model # embedding model
...@@ -66,18 +69,23 @@ class TestSetting: ...@@ -66,18 +69,23 @@ class TestSetting:
], ],
pp_size=1, pp_size=1,
tp_size=1, tp_size=1,
attn_backend="FLASH_ATTN", attn_backend=ATTN_BACKEND,
method="encode", method="encode",
), ),
# # TODO pytest.param(
# TestSetting( TestSetting(
# model="BAAI/bge-base-en-v1.5", model="BAAI/bge-base-en-v1.5",
# model_args=["--runner", "pooling"], model_args=["--runner", "pooling"],
# pp_size=1, pp_size=1,
# tp_size=1, tp_size=1,
# attn_backend="FLASH_ATTN", attn_backend="FLASH_ATTN",
# method="encode", method="encode",
# ), ),
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="Encoder self-attention is not implemented for ROCm",
),
),
# vision language model # vision language model
# See https://github.com/vllm-project/vllm/issues/26716. # See https://github.com/vllm-project/vllm/issues/26716.
# TestSetting( # TestSetting(
...@@ -91,7 +99,6 @@ class TestSetting: ...@@ -91,7 +99,6 @@ class TestSetting:
], ],
) )
def test_compile_correctness( def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting, test_setting: TestSetting,
): ):
# this test is run under multiple suits, with different GPUs. # this test is run under multiple suits, with different GPUs.
...@@ -109,49 +116,48 @@ def test_compile_correctness( ...@@ -109,49 +116,48 @@ def test_compile_correctness(
f"{cuda_device_count_stateless()}" f"{cuda_device_count_stateless()}"
) )
with monkeypatch.context() as m: final_args = [
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) *model_args,
final_args = [ "-pp",
*model_args, str(pp_size),
"-pp", "-tp",
str(pp_size), str(tp_size),
"-tp", "-cc.cudagraph_mode=none",
str(tp_size), f"--attention-backend={attn_backend}",
"-cc.cudagraph_mode=none", ]
]
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
for comp_mode in [ all_args: list[list[str]] = []
CompilationMode.STOCK_TORCH_COMPILE, all_envs: list[dict[str, str] | None] = []
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
for mode in [CompilationMode.NONE, comp_mode]:
all_args.append(
final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
)
# inductor will change the output, so we only compare if the output for comp_mode in [
# is close, not exactly the same. CompilationMode.STOCK_TORCH_COMPILE,
compare_all_settings( CompilationMode.DYNAMO_TRACE_ONCE,
model, CompilationMode.VLLM_COMPILE,
all_args, ]:
all_envs, for mode in [CompilationMode.NONE, comp_mode]:
method=method if method != "generate" else "generate_close", all_args.append(
final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
) )
all_envs.clear()
all_args.clear()
for mode in [ # inductor will change the output, so we only compare if the output
CompilationMode.NONE, # is close, not exactly the same.
CompilationMode.STOCK_TORCH_COMPILE, compare_all_settings(
CompilationMode.DYNAMO_TRACE_ONCE, model,
CompilationMode.VLLM_COMPILE, all_args,
]: all_envs,
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"]) method=method if method != "generate" else "generate_close",
all_envs.append({}) )
all_envs.append({}) all_envs.clear()
all_args.clear()
for mode in [
CompilationMode.NONE,
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
all_envs.append({})
all_envs.append({})
compare_all_settings(model, all_args * 3, all_envs, method=method) compare_all_settings(model, all_args * 3, all_envs, method=method)
\ No newline at end of file
...@@ -12,6 +12,7 @@ from vllm import LLM, SamplingParams ...@@ -12,6 +12,7 @@ from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig from vllm.config import CompilationConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.utils.torch_utils import is_torch_equal_or_newer
from vllm.v1.attention.backends.registry import AttentionBackendEnum
@contextlib.contextmanager @contextlib.contextmanager
...@@ -70,11 +71,14 @@ def llm_pair(request): ...@@ -70,11 +71,14 @@ def llm_pair(request):
elif backend_config.specific_gpu_arch == (10, 0): elif backend_config.specific_gpu_arch == (10, 0):
pytest.skip("Only Blackwell GPUs support Cutlass MLA") pytest.skip("Only Blackwell GPUs support Cutlass MLA")
# FlashInfer is not supported on ROCm
if backend_config == AttentionBackendEnum.FLASHINFER and current_platform.is_rocm():
pytest.skip("FlashInfer is not supported on ROCm")
env_vars = { env_vars = {
# Force native sampler to avoid potential nondeterminism in FlashInfer # Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1. # when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER": "0", "VLLM_USE_FLASHINFER_SAMPLER": "0",
**backend_config.env_vars,
} }
with temporary_environ(env_vars): with temporary_environ(env_vars):
full = LLM( full = LLM(
...@@ -170,16 +174,10 @@ class TestFullCUDAGraph: ...@@ -170,16 +174,10 @@ class TestFullCUDAGraph:
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_full_cudagraph_with_invalid_backend(): def test_full_cudagraph_with_invalid_backend():
with ( # Flex_Attention is not supported with full cuda graph
temporary_environ( with pytest.raises(RuntimeError):
{
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
# Flex_Attention is not supported with full cuda graph
}
),
pytest.raises(RuntimeError),
):
LLM( LLM(
model="Qwen/Qwen2-1.5B-Instruct", model="Qwen/Qwen2-1.5B-Instruct",
compilation_config=CompilationConfig(cudagraph_mode="FULL"), compilation_config=CompilationConfig(cudagraph_mode="FULL"),
attention_config={"backend": "FLEX_ATTENTION"},
) )
...@@ -10,10 +10,10 @@ import torch ...@@ -10,10 +10,10 @@ import torch
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.utils.torch_utils import is_torch_equal_or_newer
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from ...utils import create_new_process_for_each_test from ...utils import create_new_process_for_each_test
...@@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None): ...@@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
TEST_MODELS.append( TEST_MODELS.append(
( (
"alexm-nm/tinyllama-24-marlin24-4bit-g128", "alexm-nm/tinyllama-24-marlin24-4bit-g128",
{"quantization": "gptq_marlin_24"}, {
"quantization": "gptq_marlin_24",
"allow_deprecated_quantization": True,
},
) )
) )
...@@ -156,6 +159,20 @@ def test_full_graph( ...@@ -156,6 +159,20 @@ def test_full_graph(
) )
for model_info in models_list(all=False) for model_info in models_list(all=False)
if is_torch_equal_or_newer("2.9.0.dev") if is_torch_equal_or_newer("2.9.0.dev")
]
+ [
# Test get_raw_stream patch with compile_sizes
# This tests that TorchInductor autotune works correctly with get_raw_stream
# patch in torch 2.9 and without patch in torch 2.10+
(
CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
compile_sizes=[1, 2], # Triggers autotune which uses get_raw_stream
cudagraph_mode=CUDAGraphMode.NONE,
),
"facebook/opt-125m",
{},
),
], ],
) )
# only test some of the models # only test some of the models
...@@ -197,20 +214,19 @@ def test_custom_compile_config( ...@@ -197,20 +214,19 @@ def test_custom_compile_config(
], ],
) )
def test_fp8_kv_scale_compile( def test_fp8_kv_scale_compile(
monkeypatch: pytest.MonkeyPatch,
compilation_mode: int, compilation_mode: int,
model: str, model: str,
backend: AttentionBackendEnum | None, backend: AttentionBackendEnum | None,
): ):
if backend:
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs = { model_kwargs = {
"quantization": "fp8", "quantization": "fp8",
"kv_cache_dtype": "fp8_e4m3", "kv_cache_dtype": "fp8_e4m3",
"calculate_kv_scales": True, "calculate_kv_scales": True,
"max_model_len": 512, "max_model_len": 512,
} }
if backend:
model_kwargs["attention_config"] = {"backend": backend.name}
run_model(compilation_mode, model, **model_kwargs) run_model(compilation_mode, model, **model_kwargs)
......
...@@ -71,3 +71,40 @@ def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch): ...@@ -71,3 +71,40 @@ def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch):
) as _, ) as _,
): ):
pass pass
# Forked execution is needed to work around https://github.com/vllm-project/vllm/issues/21073
# This test also requires CUDA and 8 GPUs.
@pytest.mark.forked
@pytest.mark.skip(reason="Skipping due to CI resource constraints")
def test_mllama4_vit_compilation(vllm_runner, monkeypatch):
    """Smoke-test compilation of the Mllama4 vision submodules.

    The two vision submodules (Llama4VisionEncoder and
    Llama4VisionPixelShuffleMLP) are meant to be tagged for compilation,
    which would be verified by num_models_seen increasing to 3. Because
    this test uses TP=8, compilation_counter does not work properly, so
    for now the test expects num_models_seen=0 and only checks that the
    run succeeds.
    """
    # Disable multiprocessing so that the counter is in the same process.
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
    encoder_compile_config = {
        "mode": CompilationMode.VLLM_COMPILE,
        "compile_mm_encoder": True,
    }

    with monkeypatch.context():
        # TODO: Since we require TP=8, this messes with the compilation
        # counter. We should fix this in the future, but leave it for now
        # to make sure that compilation runs (no crash) with the llama
        # vision encoder.
        with compilation_counter.expect(num_models_seen=0):
            with vllm_runner(
                model_name,
                max_model_len=512,
                gpu_memory_utilization=0.8,
                tensor_parallel_size=8,
                compilation_config=encoder_compile_config,
            ):
                pass
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment