Commit afd0da21 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.1' into v0.7.1-dev

parents 1a11f127 4f4d427a
......@@ -5,7 +5,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers == 4.47.0 # Required for Llama 3.2 and Qwen2-VL.
transformers >= 4.48.2 # Required for Bamba.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
......@@ -19,7 +19,7 @@ pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.9, < 0.11
outlines == 0.1.11 # Requires pytorch
outlines == 0.1.11
lark == 1.2.2
xgrammar >= 0.1.6; platform_machine == "x86_64"
typing_extensions >= 4.10
......@@ -34,6 +34,6 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch
compressed-tensors == 0.9.0 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
......@@ -2,7 +2,14 @@
-r requirements-common.txt
# Dependencies for CPUs
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64"
torch==2.5.1; platform_machine == "aarch64"
torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
datasets # for benchmark scripts
\ No newline at end of file
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le"
torchaudio==2.5.1; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le"
torchvision==0.20.1; platform_machine == "ppc64le"
datasets # for benchmark scripts
......@@ -5,6 +5,7 @@
ray[default] >= 2.9
nvidia-ml-py >= 12.560.30 # for pynvml package
torch == 2.5.1
torchaudio==2.5.1
# These must be updated alongside torch
torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1
......@@ -3,7 +3,7 @@
# Dependencies for HPU code
ray
triton
triton==3.1.0
pandas
tabulate
setuptools>=61
......
# formatting
yapf==0.32.0
toml==0.10.2
tomli==2.0.2
ruff==0.6.5
codespell==2.3.0
isort==5.13.2
clang-format==18.1.5
sphinx-lint==1.0.0
# type checking
mypy==1.11.1
types-PyYAML
types-requests
types-setuptools
pre-commit==4.0.1
......@@ -2,6 +2,6 @@
-r requirements-common.txt
# Dependencies for Neuron devices
transformers-neuronx >= 0.12.0
torch-neuronx >= 2.1.2
transformers-neuronx >= 0.13.0
torch-neuronx >= 2.5.0
neuronx-cc
......@@ -12,20 +12,27 @@ decord # required for video tests
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
vector_quantize_pytorch # required for minicpmo_26 test
vocos # required for minicpmo_26 test
peft
pqdm
ray[adag]==2.40.0
sentence-transformers # required for embedding tests
soundfile # required for audio tests
timm # required for internvl test
torch==2.5.1
torchaudio==2.5.1
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.0 # required for pixtral test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test
transformers==4.48.2
# quantization
bitsandbytes>=0.45.0
buildkite-test-collector==0.1.9
genai_perf==0.0.8
tritonclient==2.51.0
numpy < 2.0.0
......@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
# python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
#
absl-py==2.1.0
# via rouge-score
......@@ -37,7 +37,7 @@ audioread==3.0.1
# via librosa
awscli==1.35.23
# via -r requirements-test.in
bitsandbytes>=0.45.0
bitsandbytes==0.45.0
# via -r requirements-test.in
black==24.10.0
# via datamodel-code-generator
......@@ -48,6 +48,8 @@ botocore==1.35.57
# awscli
# boto3
# s3transfer
bounded-pool-executor==0.0.3
# via pqdm
buildkite-test-collector==0.1.9
# via -r requirements-test.in
certifi==2024.8.30
......@@ -73,6 +75,8 @@ colorama==0.4.6
# tqdm-multiprocess
contourpy==1.3.0
# via matplotlib
cramjam==2.9.0
# via fastparquet
cupy-cuda12x==13.3.0
# via ray
cycler==0.12.1
......@@ -102,11 +106,21 @@ dnspython==2.7.0
docutils==0.16
# via awscli
einops==0.8.0
# via -r requirements-test.in
# via
# -r requirements-test.in
# encodec
# vector-quantize-pytorch
# vocos
einx==0.3.0
# via vector-quantize-pytorch
email-validator==2.2.0
# via pydantic
encodec==0.1.1
# via vocos
evaluate==0.4.3
# via lm-eval
fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2
# via cupy-cuda12x
filelock==3.16.1
......@@ -119,6 +133,8 @@ filelock==3.16.1
# triton
fonttools==4.54.1
# via matplotlib
frozendict==2.4.6
# via einx
frozenlist==1.5.0
# via
# aiohttp
......@@ -128,8 +144,11 @@ fsspec[http]==2024.9.0
# via
# datasets
# evaluate
# fastparquet
# huggingface-hub
# torch
genai-perf==0.0.8
# via -r requirements-test.in
genson==1.3.0
# via datamodel-code-generator
h11==0.14.0
......@@ -150,6 +169,7 @@ huggingface-hub==0.26.2
# timm
# tokenizers
# transformers
# vocos
idna==3.10
# via
# anyio
......@@ -184,6 +204,8 @@ jsonschema==4.23.0
# ray
jsonschema-specifications==2024.10.1
# via jsonschema
kaleido==0.2.1
# via genai-perf
kiwisolver==1.4.7
# via matplotlib
lazy-loader==0.4
......@@ -198,6 +220,8 @@ lm-eval[api]==0.4.4
# via -r requirements-test.in
lxml==5.3.0
# via sacrebleu
markdown-it-py==3.0.0
# via rich
markupsafe==3.0.2
# via jinja2
matplotlib==3.9.2
......@@ -207,6 +231,8 @@ mbstrdecoder==1.1.3
# dataproperty
# pytablewriter
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common[opencv]==1.5.1
# via
# -r requirements-test.in
......@@ -246,7 +272,11 @@ numpy==1.26.4
# cupy-cuda12x
# datasets
# decord
# einx
# encodec
# evaluate
# fastparquet
# genai-perf
# librosa
# matplotlib
# mistral-common
......@@ -254,15 +284,19 @@ numpy==1.26.4
# numexpr
# opencv-python-headless
# pandas
# patsy
# peft
# rouge-score
# sacrebleu
# scikit-learn
# scipy
# soxr
# statsmodels
# tensorizer
# torchvision
# transformers
# tritonclient
# vocos
nvidia-cublas-cu12==12.4.5.8
# via
# nvidia-cudnn-cu12
......@@ -304,30 +338,39 @@ packaging==24.1
# datamodel-code-generator
# datasets
# evaluate
# fastparquet
# huggingface-hub
# lazy-loader
# matplotlib
# peft
# plotly
# pooch
# pytest
# pytest-rerunfailures
# ray
# statsmodels
# transformers
# typepy
pandas==2.2.3
# via
# datasets
# evaluate
# fastparquet
# genai-perf
# statsmodels
pathspec==0.12.1
# via black
pathvalidate==3.2.1
# via pytablewriter
patsy==1.0.1
# via statsmodels
peft==0.13.2
# via
# -r requirements-test.in
# lm-eval
pillow==10.4.0
# via
# genai-perf
# matplotlib
# mistral-common
# sentence-transformers
......@@ -336,12 +379,16 @@ platformdirs==4.3.6
# via
# black
# pooch
plotly==5.24.1
# via genai-perf
pluggy==1.5.0
# via pytest
pooch==1.8.2
# via librosa
portalocker==2.10.1
# via sacrebleu
pqdm==0.2.0
# via -r requirements-test.in
propcache==0.2.0
# via yarl
protobuf==5.28.3
......@@ -356,7 +403,9 @@ psutil==6.1.0
py==1.11.0
# via pytest-forked
pyarrow==18.0.0
# via datasets
# via
# datasets
# genai-perf
pyasn1==0.6.1
# via rsa
pybind11==2.13.6
......@@ -369,6 +418,8 @@ pydantic[email]==2.9.2
# mistral-common
pydantic-core==2.23.4
# via pydantic
pygments==2.18.0
# via rich
pyparsing==3.2.0
# via matplotlib
pytablewriter==1.2.0
......@@ -377,14 +428,18 @@ pytest==8.3.3
# via
# -r requirements-test.in
# buildkite-test-collector
# genai-perf
# pytest-asyncio
# pytest-forked
# pytest-mock
# pytest-rerunfailures
# pytest-shard
pytest-asyncio==0.24.0
# via -r requirements-test.in
pytest-forked==1.6.0
# via -r requirements-test.in
pytest-mock==3.14.0
# via genai-perf
pytest-rerunfailures==14.0
# via -r requirements-test.in
pytest-shard==0.1.2
......@@ -395,6 +450,8 @@ python-dateutil==2.9.0.post0
# matplotlib
# pandas
# typepy
python-rapidjson==1.20
# via tritonclient
pytz==2024.2
# via
# pandas
......@@ -405,11 +462,14 @@ pyyaml==6.0.2
# awscli
# datamodel-code-generator
# datasets
# genai-perf
# huggingface-hub
# peft
# ray
# responses
# timm
# transformers
# vocos
ray[adag]==2.40.0
# via -r requirements-test.in
redis==5.2.0
......@@ -434,8 +494,13 @@ requests==2.32.3
# mistral-common
# pooch
# ray
# responses
# tiktoken
# transformers
responses==0.25.3
# via genai-perf
rich==13.9.4
# via genai-perf
rouge-score==0.1.2
# via lm-eval
rpds-py==0.20.1
......@@ -466,6 +531,8 @@ scipy==1.13.1
# librosa
# scikit-learn
# sentence-transformers
# statsmodels
# vocos
sentence-transformers==3.2.1
# via -r requirements-test.in
sentencepiece==0.2.0
......@@ -486,8 +553,12 @@ soxr==0.5.0.post1
# via librosa
sqlitedict==2.1.0
# via lm-eval
statsmodels==0.14.4
# via genai-perf
sympy==1.13.1
# via torch
# via
# einx
# torch
tabledata==1.3.3
# via pytablewriter
tabulate==0.9.0
......@@ -495,7 +566,9 @@ tabulate==0.9.0
tcolorpy==0.1.6
# via pytablewriter
tenacity==9.0.0
# via lm-eval
# via
# lm-eval
# plotly
tensorizer==2.9.0
# via -r requirements-test.in
threadpoolctl==3.5.0
......@@ -513,12 +586,21 @@ torch==2.5.1
# -r requirements-test.in
# accelerate
# bitsandbytes
# encodec
# lm-eval
# peft
# sentence-transformers
# tensorizer
# timm
# torchaudio
# torchvision
# vector-quantize-pytorch
# vocos
torchaudio==2.5.1
# via
# -r requirements-test.in
# encodec
# vocos
torchvision==0.20.1
# via timm
tqdm==4.66.6
......@@ -529,13 +611,16 @@ tqdm==4.66.6
# lm-eval
# nltk
# peft
# pqdm
# sentence-transformers
# tqdm-multiprocess
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.47.0
transformers==4.48.2
# via
# -r requirements-test.in
# genai-perf
# lm-eval
# peft
# sentence-transformers
......@@ -544,6 +629,10 @@ transformers-stream-generator==0.0.5
# via -r requirements-test.in
triton==3.1.0
# via torch
tritonclient==2.51.0
# via
# -r requirements-test.in
# genai-perf
typepy[datetime]==1.3.2
# via
# dataproperty
......@@ -551,18 +640,26 @@ typepy[datetime]==1.3.2
# tabledata
typing-extensions==4.12.2
# via
# bitsandbytes
# huggingface-hub
# librosa
# mistral-common
# pqdm
# pydantic
# pydantic-core
# torch
tzdata==2024.2
# via pandas
urllib3==1.26.20
urllib3==2.2.3
# via
# botocore
# requests
# responses
# tritonclient
vector-quantize-pytorch==1.21.2
# via -r requirements-test.in
vocos==0.1.0
# via -r requirements-test.in
word2number==1.1
# via lm-eval
xxhash==3.5.0
......
......@@ -13,11 +13,11 @@ ray[default]
# Install torch_xla
--pre
--extra-index-url https://download.pytorch.org/whl/nightly/cpu
--find-links https://storage.googleapis.com/libtpu-wheels/index.html
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.6.0.dev20241126+cpu
torchvision==0.20.0.dev20241126+cpu
torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl
jaxlib==0.4.36.dev20241122
jax==0.4.36.dev20241122
torch==2.6.0.dev20241216+cpu
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
import ctypes
import importlib.util
import logging
import os
......@@ -13,7 +14,7 @@ from packaging.version import Version, parse
from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
from typing import Optional, Union
import subprocess
......@@ -40,9 +41,14 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
if not sys.platform.startswith("linux"):
if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
logger.warning(
"vLLM only supports Linux platform (including WSL). "
"VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
VLLM_TARGET_DEVICE = "cpu"
elif not (sys.platform.startswith("linux")
or sys.platform.startswith("darwin")):
logger.warning(
"vLLM only supports Linux platform (including WSL) and MacOS."
"Building on %s, "
"so vLLM may not be able to run correctly", sys.platform)
VLLM_TARGET_DEVICE = "empty"
......@@ -229,8 +235,11 @@ class cmake_build_ext(build_ext):
# CMake appends the extension prefix to the install path,
# and outdir already contains that prefix, so we need to remove it.
# We assume only the final component of extension prefix is added by
# CMake, this is currently true for current extensions but may not
# always be the case.
prefix = outdir
for i in range(ext.name.count('.')):
if '.' in ext.name:
prefix = prefix.parent
# prefix here should actually be the same for all components
......@@ -258,7 +267,7 @@ class cmake_build_ext(build_ext):
class repackage_wheel(build_ext):
"""Extracts libraries and other files from an existing wheel."""
default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
def run(self) -> None:
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",
......@@ -299,9 +308,11 @@ class repackage_wheel(build_ext):
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py",
"vllm/vllm_flash_attn/__init__.py",
"vllm/cumem_allocator.abi3.so",
# "vllm/_version.py", # not available in nightly wheels yet
]
file_members = filter(lambda x: x.filename in files_to_copy,
......@@ -325,21 +336,26 @@ class repackage_wheel(build_ext):
def _is_hpu() -> bool:
is_hpu_available = True
# if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
return VLLM_TARGET_DEVICE == "hpu"
# if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
# and if it doesn't, check if habanalabs driver is loaded
is_hpu_available = False
try:
subprocess.run(["hl-smi"], capture_output=True, check=True)
out = subprocess.run(["hl-smi"], capture_output=True, check=True)
is_hpu_available = out.returncode == 0
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
if not os.path.exists('/dev/accel/accel0') and not os.path.exists(
'/dev/accel/accel_controlD0'):
# last resort...
if sys.platform.startswith("linux"):
try:
output = subprocess.check_output(
'lsmod | grep habanalabs | wc -l', shell=True)
is_hpu_available = int(output) > 0
except (ValueError, FileNotFoundError, PermissionError,
subprocess.CalledProcessError):
is_hpu_available = False
return is_hpu_available or VLLM_TARGET_DEVICE == "hpu"
pass
return is_hpu_available
def _no_device() -> bool:
......@@ -386,25 +402,31 @@ def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu()
def get_hipcc_rocm_version():
# Run the hipcc --version command
result = subprocess.run(['hipcc', '--version'],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True)
def get_rocm_version():
# Get the ROCm version from ROCM_HOME/lib/librocm-core.so
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
try:
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
if not librocm_core_file.is_file():
return None
librocm_core = ctypes.CDLL(librocm_core_file)
VerErrors = ctypes.c_uint32
get_rocm_core_version = librocm_core.getROCmVersion
get_rocm_core_version.restype = VerErrors
get_rocm_core_version.argtypes = [
ctypes.POINTER(ctypes.c_uint32),
ctypes.POINTER(ctypes.c_uint32),
ctypes.POINTER(ctypes.c_uint32),
]
major = ctypes.c_uint32()
minor = ctypes.c_uint32()
patch = ctypes.c_uint32()
# Check if the command was executed successfully
if result.returncode != 0:
print("Error running 'hipcc --version'")
if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
ctypes.byref(patch)) == 0):
return f"{major.value}.{minor.value}.{patch.value}"
return None
# Extract the version using a regular expression
match = re.search(r'HIP version: (\S+)', result.stdout)
if match:
# Return the version string
return match.group(1)
else:
print("Could not find HIP version in the output")
except Exception:
return None
......@@ -482,9 +504,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f"""
try:
__version__ = "0.6.6.post1"
__version_tuple__ = (0, 6, 6)
__hcu_version__ = f'0.6.6.post1+{version}'
__version__ = "0.7.1"
__version_tuple__ = (0, 7, 1)
__hcu_version__ = f'0.7.1+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e:
......@@ -527,14 +549,10 @@ def get_gaudi_sw_version():
def get_vllm_version() -> str:
# TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236
try:
if not _is_hip():
version = get_version(
write_to="vllm/_version.py", # TODO: move this to pyproject.toml
)
except LookupError:
version = "0.0.0"
if not _is_hip():
version = get_version(
write_to="vllm/_version.py", # TODO: move this to pyproject.toml
)
sep = "+" if "+" not in version else "." # dev versions might contain +
......@@ -552,11 +570,10 @@ def get_vllm_version() -> str:
if "sdist" not in sys.argv:
version += f"{sep}cu{cuda_version_str}"
elif _is_hip():
# Get the HIP version
# hipcc_version = get_hipcc_rocm_version()
# if hipcc_version != MAIN_CUDA_VERSION:
# rocm_version_str = hipcc_version.replace(".", "")[:3]
# version += f"{sep}rocm{rocm_version_str}"
# Get the Rocm Version
# rocm_version = get_rocm_version() or torch.version.hip
# if rocm_version and rocm_version != MAIN_CUDA_VERSION:
# version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
version = get_version()
elif _is_neuron():
# Get the Neuron version
......@@ -611,7 +628,7 @@ def get_requirements() -> List[str]:
return resolved_requirements
if _no_device():
requirements = _read_requirements("requirements-cuda.txt")
requirements = _read_requirements("requirements-cpu.txt")
elif _is_cuda():
requirements = _read_requirements("requirements-cuda.txt")
cuda_major, cuda_minor = torch.version.cuda.split(".")
......@@ -654,14 +671,24 @@ if _is_cuda() or _is_hip():
# ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
if _is_cuda():
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"):
# FA3 requires CUDA 12.0 or later
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
package_data = {
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "benchmarks/*.py","model_executor/layers/quantization/configs/w8a8/*.json"]
"vllm": [
"py.typed",
"model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json",
"benchmarks/*.py",
"model_executor/layers/quantization/configs/w8a8/*.json"
]
}
if _no_device():
......
......@@ -27,27 +27,32 @@ def _query_server_long(prompt: str) -> dict:
@pytest.fixture
def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
commands = [
sys.executable, "-u",
str(script_path), "--model", os.path.join(models_path_prefix, "facebook/opt-125m"), "--host",
"127.0.0.1", "--tokenizer-pool-size",
str(tokenizer_pool_size)
sys.executable,
"-u",
str(script_path),
"--model",
os.path.join(models_path_prefix, "facebook/opt-125m"),
"--host",
"127.0.0.1",
"--tokenizer-pool-size",
str(tokenizer_pool_size),
"--distributed-executor-backend",
distributed_executor_backend,
]
if worker_use_ray:
commands.append("--worker-use-ray")
uvicorn_process = subprocess.Popen(commands)
yield
uvicorn_process.terminate()
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("worker_use_ray", [False, True])
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, tokenizer_pool_size: int,
worker_use_ray: bool):
distributed_executor_backend: str):
"""
Run the API server and test it.
......
......@@ -46,7 +46,6 @@ def test_vllm_gc_ed():
assert weak_llm() is None
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", MODELS)
# @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
......@@ -65,9 +64,10 @@ def test_models(
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
if backend in ("XFORMERS",
"FLASHINFER") and model == "google/gemma-2-2b-it":
pytest.skip(
"XFORMERS does not support gemma2 with full context length.")
f"{backend} does not support gemma2 with full context length.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend
......
import torch
from vllm import LLM, SamplingParams
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes
from ..utils import fork_new_process_for_each_test
@fork_new_process_for_each_test
def test_basic_cumem():
    """Mix default-pool and custom-pool tensors across a sleep/wake-up cycle.

    The test checks that tensors allocated inside the allocator's memory
    pool interoperate with an ordinary CUDA tensor, that ``sleep()``
    increases the free device memory reported by CUDA, and that the same
    arithmetic still gives the expected result after ``wake_up()``.
    """
    dims = (1024, 1024)

    # Tensor allocated outside of the custom memory pool.
    x = torch.empty(dims, device='cuda')
    x.zero_()

    # Tensors allocated while the custom memory pool is active.
    pool = CuMemAllocator.get_instance()
    with pool.use_memory_pool():
        y = torch.empty(dims, device='cuda')
        y.zero_()
        y += 1
        z = torch.empty(dims, device='cuda')
        z.zero_()
        z += 2

    # Tensors from both pools can be combined: 0 + 1 + 2 == 3 everywhere.
    total = x + y + z
    assert torch.allclose(total, torch.ones_like(total) * 3)

    # Sleeping must hand memory back to the device.
    free_before = torch.cuda.mem_get_info()[0]
    pool.sleep()
    free_during_sleep = torch.cuda.mem_get_info()[0]
    assert free_during_sleep > free_before
    pool.wake_up()

    # The same computation must still hold after waking up.
    total = x + y + z
    assert torch.allclose(total, torch.ones_like(total) * 3)
@fork_new_process_for_each_test
def test_cumem_with_cudagraph():
    """Replay a captured CUDA graph after a sleep/wake-up cycle.

    ``weight`` is allocated in the allocator's pool with the default tag and
    ``cache`` with the ``"discard"`` tag. A small model using both is
    captured in a CUDA graph, the allocator is put to sleep and woken up,
    and the graph is replayed on fresh input to check both the cache
    contents and the graph output.
    """
    pool = CuMemAllocator.get_instance()
    with pool.use_memory_pool():
        weight = torch.eye(1024, device='cuda')
    with pool.use_memory_pool(tag="discard"):
        cache = torch.empty(1024, 1024, device='cuda')

    def model(inp):
        # weight is the identity matrix, so projected equals inp.
        projected = inp @ weight
        cache[:projected.size(0)].copy_(projected)
        return projected + 1

    x = torch.empty(128, 1024, device='cuda')

    # Warm-up run before capture.
    model(x)

    # Capture the model into a CUDA graph.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        y = model(x)

    # Sleeping must hand memory back to the device.
    free_before = torch.cuda.mem_get_info()[0]
    pool.sleep()
    free_during_sleep = torch.cuda.mem_get_info()[0]
    assert free_during_sleep > free_before
    pool.wake_up()

    # After waking up, the content of the weight tensor should be restored,
    # while the content of the cache tensor should be discarded.
    # Replaying the captured graph must still work in this state.
    x.random_()
    graph.replay()

    # The cache holds the freshly written rows.
    assert torch.allclose(x, cache[:x.size(0)])

    # The graph output matches the model definition.
    assert torch.allclose(y, x + 1)
@fork_new_process_for_each_test
def test_end_to_end():
    """Generate, sleep at level 1, wake up, and generate again with an LLM.

    Asserts that GPU memory used while asleep (relative to the baseline
    measured before the engine was created) is below 2 GiB, and that greedy
    generation yields the same text before and after the sleep cycle.
    """
    free_at_start, total = torch.cuda.mem_get_info()
    # Memory already in use before the engine exists, in case another
    # process is running on the device.
    baseline_used = total - free_at_start

    llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
    prompt = "How are you?"
    params = SamplingParams(temperature=0, max_tokens=10)
    before_sleep = llm.generate(prompt, params)

    # The benefit of `llm.sleep(level=2)` is mainly CPU memory usage, which
    # is difficult to measure in the test, so only sleep level 1 is
    # exercised here.
    llm.sleep(level=1)

    free_while_asleep, total = torch.cuda.mem_get_info()
    used_while_asleep = total - free_while_asleep - baseline_used
    # What remains is mostly the cudagraph memory pool; it should be less
    # than the model weights (1B model, 2GiB weights).
    assert used_while_asleep < 2 * GiB_bytes

    llm.wake_up()
    after_wake_up = llm.generate(prompt, params)

    # Outputs must be identical across the sleep/wake-up cycle.
    assert before_sleep[0].outputs[0].text == after_wake_up[0].outputs[0].text
......@@ -32,10 +32,10 @@ def check_settings():
@pytest.fixture
def worker_use_ray() -> bool:
# When SPMD worker is used, use ray_use_worker=True
def distributed_executor_backend() -> str:
# When SPMD worker is used, use distributed_executor_backend="ray"
# to test delta input optimization works with preemption.
return envs.VLLM_USE_RAY_SPMD_WORKER
return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
@pytest.mark.parametrize("model", MODELS)
......@@ -50,7 +50,7 @@ def test_chunked_prefill_recompute(
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
worker_use_ray: bool,
distributed_executor_backend: str,
) -> None:
"""Ensure that chunked prefill works with preemption."""
max_num_seqs = min(chunked_prefill_token_size, 256)
......@@ -69,7 +69,7 @@ def test_chunked_prefill_recompute(
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray,
distributed_executor_backend=distributed_executor_backend,
disable_log_stats=False,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
......@@ -97,7 +97,7 @@ def test_preemption(
model: str,
dtype: str,
max_tokens: int,
worker_use_ray: bool,
distributed_executor_backend: str,
) -> None:
"""By default, recompute preemption is enabled"""
......@@ -108,7 +108,7 @@ def test_preemption(
model,
dtype=dtype,
disable_log_stats=False,
worker_use_ray=worker_use_ray,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
......@@ -149,7 +149,7 @@ def test_preemption_infeasible(
model: str,
dtype: str,
max_tokens: int,
worker_use_ray: bool,
distributed_executor_backend: str,
) -> None:
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE = 16
......@@ -164,7 +164,7 @@ def test_preemption_infeasible(
# ignored instead of hanging forever.
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
worker_use_ray=worker_use_ray,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True)
......
......@@ -7,7 +7,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
initialized randomly with a fixed seed.
"""
from dataclasses import dataclass
from typing import Optional, Tuple
from typing import Any, List, Optional, Tuple
import torch
from torch import nn
......@@ -54,6 +54,16 @@ class LlamaConfig:
tractable_init: bool = False
random_seed: int = 0
def compute_hash(self) -> str:
    """Return an MD5 hex digest of this object's attributes.

    Every ``(name, value)`` pair in ``self.__dict__`` except
    ``random_seed`` is collected, the pairs are sorted, and the string form
    of the sorted list is hashed.
    """
    import hashlib

    factors: List[Any] = sorted(
        (name, value) for name, value in self.__dict__.items()
        if name != "random_seed")
    digest = hashlib.md5(str(factors).encode())
    return digest.hexdigest()
def __post_init__(self):
assert self.mlp_size >= self.hidden_size
......@@ -263,7 +273,8 @@ def run_model(llama_config,
compilation_config = CompilationConfig(
level=CompilationLevel.NO_COMPILATION, )
vllm_config = VllmConfig(compilation_config=compilation_config)
vllm_config = VllmConfig(compilation_config=compilation_config,
additional_config=llama_config)
with set_current_vllm_config(vllm_config):
model = LlamaModel(config=llama_config,
vllm_config=vllm_config,
......
......@@ -59,7 +59,7 @@ test_settings = [
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="FLASHINFER",
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
......
......@@ -30,13 +30,13 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts)
TokensPrompt, to_enc_dec_tuple_list,
zip_enc_dec_prompts)
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.sampling_params import BeamSearchParams
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity)
identity, is_list_of)
from .utils import models_path_prefix
logger = init_logger(__name__)
......@@ -44,6 +44,7 @@ logger = init_logger(__name__)
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
_M = TypeVar("_M")
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
......@@ -181,6 +182,12 @@ def example_prompts() -> List[str]:
return prompts
@pytest.fixture
def example_system_message() -> str:
    """Return the full text of the system-message file at ``_SYS_MSG``."""
    # Decode explicitly as UTF-8 so the result does not depend on the host
    # locale's default encoding.
    with open(_SYS_MSG, encoding="utf-8") as f:
        return f.read()
class DecoderPromptType(Enum):
"""For encoder/decoder models only."""
CUSTOM = 1
......@@ -240,11 +247,13 @@ def video_assets() -> _VideoAssets:
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
_R = TypeVar("_R")
class HfRunner:
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
from vllm.platforms import current_platform
if x is None or isinstance(x, (bool, )):
return x
......@@ -882,6 +891,12 @@ class VllmRunner:
beam_width: int,
max_tokens: int,
) -> List[Tuple[List[List[int]], List[str]]]:
if is_list_of(prompts, str, check="all"):
prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
else:
prompts = [
TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
]
outputs = self.model.beam_search(
prompts,
BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
......@@ -919,6 +934,10 @@ class VllmRunner:
req_outputs = self.model.score(text_1, text_2)
return [req_output.outputs.score for req_output in req_outputs]
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
    """Pass ``func`` to the engine's model executor and return its result.

    This delegates to ``self.model.llm_engine.model_executor.apply_model``.
    """
    return self.model.llm_engine.model_executor.apply_model(func)
def __enter__(self):
return self
......
......@@ -796,6 +796,44 @@ class TestPrefixCachingBlockAllocator:
block_hashes=block_hashes_seq1)
assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks
# Test reset prefix cache
@staticmethod
@pytest.mark.parametrize("num_blocks", [10])
@pytest.mark.parametrize("block_size", [16])
def test_reset_prefix_cache(num_blocks: int, block_size: int):
    """Resetting the prefix cache only succeeds once all blocks are freed.

    Two immutable chains are built over the same token ids. With only the
    first chain freed, the reset is expected to be refused and the hit rate
    to be positive; after the second chain is freed as well, the reset is
    expected to succeed and the hit rate to be zero.
    """
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    token_ids = list(range(3 * block_size))

    # Build two chains, in order, over identical token ids.
    first_chain, second_chain = (
        TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=allocator,
        ) for _ in range(2))

    # Release the first chain only.
    for block in first_chain:
        allocator.free(block)

    # The reset must be refused while some blocks are still allocated.
    assert not allocator.reset_prefix_cache()
    assert allocator.get_prefix_cache_hit_rate() > 0.0

    # Release the second chain as well.
    for block in second_chain:
        allocator.free(block)

    # With everything freed, the reset goes through.
    assert allocator.reset_prefix_cache()
    assert allocator.get_prefix_cache_hit_rate() == 0.0
@staticmethod
def create_immutable_chain(
block_size: int,
......
......@@ -50,7 +50,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with graph_capture() as graph_capture_context:
with graph_capture(device=device) as graph_capture_context:
# use integers so result matches NCCL exactly
inp1 = torch.randint(1,
16, (sz, ),
......
......@@ -59,8 +59,7 @@ def worker_fn():
device=get_world_group().device)
tensor = torch.ones(16, 1024, 1024,
dtype=torch.float32).cuda(pynccl_comm.rank)
with pynccl_comm.change_state(enable=True):
tensor = pynccl_comm.all_reduce(tensor)
tensor = pynccl_comm.all_reduce(tensor)
torch.cuda.synchronize()
assert torch.all(tensor == pynccl_comm.world_size).cpu().item()
......@@ -81,17 +80,16 @@ def multiple_allreduce_worker_fn():
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
pynccl_comm = PyNcclCommunicator(group=group, device=device)
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
with pynccl_comm.change_state(enable=True):
# two groups can communicate independently
if torch.distributed.get_rank() in [0, 1]:
tensor = pynccl_comm.all_reduce(tensor)
tensor = pynccl_comm.all_reduce(tensor)
torch.cuda.synchronize()
assert torch.all(tensor == 4).cpu().item()
else:
tensor = pynccl_comm.all_reduce(tensor)
torch.cuda.synchronize()
assert torch.all(tensor == 2).cpu().item()
# two groups can communicate independently
if torch.distributed.get_rank() in [0, 1]:
tensor = pynccl_comm.all_reduce(tensor)
tensor = pynccl_comm.all_reduce(tensor)
torch.cuda.synchronize()
assert torch.all(tensor == 4).cpu().item()
else:
tensor = pynccl_comm.all_reduce(tensor)
torch.cuda.synchronize()
assert torch.all(tensor == 2).cpu().item()
@pytest.mark.skipif(torch.cuda.device_count() < 4,
......@@ -107,7 +105,7 @@ def multiple_allreduce_with_vllm_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
ensure_model_parallel_initialized(2, 2)
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
with graph_capture():
with graph_capture(device=device):
# two tp groups can communicate independently
if torch.distributed.get_rank() in [0, 1]:
tensor = tensor_model_parallel_all_reduce(tensor)
......@@ -137,9 +135,7 @@ def worker_fn_with_cudagraph():
# run something in the default stream to initialize torch engine
a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
torch.cuda.synchronize()
with torch.cuda.graph(
graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
enable=True):
with torch.cuda.graph(graph):
a_out = pynccl_comm.all_reduce(a)
torch.cuda.synchronize()
graph.replay()
......@@ -168,8 +164,7 @@ def all_gather_worker_fn():
for r in range(world_size)
]).to(device)
with pynccl_comm.change_state(enable=True):
pynccl_comm.all_gather(result, tensor)
pynccl_comm.all_gather(result, tensor)
torch.cuda.synchronize()
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
......@@ -206,8 +201,7 @@ def reduce_scatter_worker_fn():
expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size]
for tensor in all_tensors).to(device)
with pynccl_comm.change_state(enable=True):
pynccl_comm.reduce_scatter(result, tensor)
pynccl_comm.reduce_scatter(result, tensor)
torch.cuda.synchronize()
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
......@@ -234,15 +228,13 @@ def send_recv_worker_fn():
else:
tensor = torch.empty(16, 1024, 1024,
dtype=torch.float32).cuda(pynccl_comm.rank)
with pynccl_comm.change_state(enable=True):
if pynccl_comm.rank == 0:
pynccl_comm.send(tensor,
dst=(pynccl_comm.rank + 1) %
pynccl_comm.world_size)
else:
pynccl_comm.recv(tensor,
src=(pynccl_comm.rank - 1) %
pynccl_comm.world_size)
if pynccl_comm.rank == 0:
pynccl_comm.send(tensor,
dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
else:
pynccl_comm.recv(tensor,
src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
torch.cuda.synchronize()
assert torch.all(tensor == 1).cpu().item()
......@@ -273,15 +265,12 @@ def multiple_send_recv_worker_fn():
1024,
dtype=torch.float32,
device=device)
with pynccl_comm.change_state(enable=True):
if torch.distributed.get_rank() in [0, 1]:
pynccl_comm.send(tensor,
dst=(pynccl_comm.rank + 1) %
pynccl_comm.world_size)
else:
pynccl_comm.recv(tensor,
src=(pynccl_comm.rank - 1) %
pynccl_comm.world_size)
if torch.distributed.get_rank() in [0, 1]:
pynccl_comm.send(tensor,
dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
else:
pynccl_comm.recv(tensor,
src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
torch.cuda.synchronize()
if torch.distributed.get_rank() in [0, 2]:
assert torch.all(tensor == 1).cpu().item()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment