Commit 4d3a2c28 authored by zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
# Should be mirrored in pyproject.toml # Should be mirrored in pyproject.toml
cmake>=3.26 cmake>=3.26
ninja ninja
packaging packaging
setuptools>=61 setuptools>=61
setuptools-scm>=8 setuptools-scm>=8
torch==2.4.0 torch==2.5.1; platform_machine != 'aarch64'
wheel wheel
jinja2 jinja2
psutil psutil
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0 numpy < 2.0.0
requests requests >= 2.26.0
tqdm tqdm
blake3
py-cpuinfo py-cpuinfo
transformers == 4.45.0 # Required for Llama 3.2. transformers >= 4.45.2 # Required for Llama 3.2 and Qwen2-VL.
tokenizers >= 0.19.1 # Required for Llama 3. tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer. protobuf # Required by LlamaTokenizer.
fastapi < 0.113.0; python_version < '3.9' fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
fastapi >= 0.114.1; python_version >= '3.9' fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
aiohttp aiohttp
openai >= 1.40.0 # Ensure modern openai package (ensure types module present) openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
uvicorn[standard] uvicorn[standard]
pydantic >= 2.9 # Required for fastapi >= 0.113.0 pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing pillow # Required for image processing
prometheus_client >= 0.18.0 prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0 prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.6 lm-format-enforcer >= 0.10.9, < 0.11
outlines >= 0.0.43, < 0.1 outlines == 0.1.11
xgrammar >= 0.1.6; platform_machine == "x86_64"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs
pyzmq pyzmq
msgspec msgspec
gguf == 0.10.0 gguf == 0.10.0
importlib_metadata importlib_metadata
mistral_common >= 1.4.3 mistral_common[opencv] >= 1.5.0
pyyaml pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.8.1 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging torch.compile
# Common dependencies # Common dependencies
-r requirements-common.txt -r requirements-common.txt
# Dependencies for x86_64 CPUs # Dependencies for CPUs
torch == 2.4.0+cpu; platform_machine != "ppc64le" torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64"
torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torch==2.5.1; platform_machine == "aarch64"
torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
datasets # for benchmark scripts
\ No newline at end of file
--index-url https://download.pytorch.org/whl/nightly/cu124
torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray >= 2.9 ray >= 2.9
nvidia-ml-py # for pynvml package nvidia-ml-py >= 12.560.30 # for pynvml package
torch == 2.4.0 torch == 2.5.1; platform_machine != 'aarch64'
# These must be updated alongside torch # These must be updated alongside torch
torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1
# Common dependencies
-r requirements-common.txt
# Dependencies for HPU code
ray
triton
pandas
tabulate
setuptools>=61
setuptools-scm>=8
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@4312768
# formatting # formatting
yapf==0.32.0 yapf==0.32.0
toml==0.10.2 toml==0.10.2
tomli==2.0.1 tomli==2.0.2
ruff==0.6.5 ruff==0.6.5
codespell==2.3.0 codespell==2.3.0
isort==5.13.2 isort==5.13.2
clang-format==18.1.5 clang-format==18.1.5
sphinx-lint==1.0.0
# type checking # type checking
mypy==1.11.1 mypy==1.11.1
......
# Common dependencies # Common dependencies
-r requirements-common.txt -r requirements-common.txt
# OpenVINO dependencies torch == 2.5.1 # should be aligned with "common" vLLM torch version
torch >= 2.1.2 openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
openvino ~= 2024.3.0
optimum-intel[openvino] >= 1.18.2 optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
awscli awscli
boto3 boto3
botocore botocore
datasets
ray >= 2.10.0 ray >= 2.10.0
peft peft
pytest-asyncio pytest-asyncio
...@@ -15,4 +16,4 @@ torch == 2.4.1 ...@@ -15,4 +16,4 @@ torch == 2.4.1
triton == 3.0.0 triton == 3.0.0
flash_attn == 2.6.1 flash_attn == 2.6.1
lmslim == 0.2.0 lmslim == 0.2.0
numa numa
\ No newline at end of file
# testing
pytest
tensorizer>=2.9.0
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
# testing utils
awscli
decord # required for video tests
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
peft
ray[adag]==2.40.0
sentence-transformers # required for embedding tests
soundfile # required for audio tests
timm # required for internvl test
torch==2.5.1
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.0 # required for pixtral test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test
# quantization
bitsandbytes>=0.45.0
buildkite-test-collector==0.1.9
numpy < 2.0.0
# testing #
pytest # This file is autogenerated by pip-compile with Python 3.12
tensorizer>=2.9.0 # by the following command:
pytest-forked #
pytest-asyncio # python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
pytest-rerunfailures #
pytest-shard absl-py==2.1.0
pytest-html # via rouge-score
pytest-timeout accelerate==1.0.1
# via
# lm-eval
# peft
aiohappyeyeballs==2.4.3
# via aiohttp
aiohttp==3.10.10
# via
# datasets
# fsspec
# lm-eval
aiosignal==1.3.1
# via
# aiohttp
# ray
annotated-types==0.7.0
# via pydantic
anyio==4.6.2.post1
# via httpx
argcomplete==3.5.1
# via datamodel-code-generator
attrs==24.2.0
# via
# aiohttp
# jsonlines
# jsonschema
# referencing
audioread==3.0.1
# via librosa
awscli==1.35.23
# via -r requirements-test.in
bitsandbytes>=0.45.0
# via -r requirements-test.in
black==24.10.0
# via datamodel-code-generator
boto3==1.35.57
# via tensorizer
botocore==1.35.57
# via
# awscli
# boto3
# s3transfer
buildkite-test-collector==0.1.9
# via -r requirements-test.in
certifi==2024.8.30
# via
# httpcore
# httpx
# requests
cffi==1.17.1
# via soundfile
chardet==5.2.0
# via mbstrdecoder
charset-normalizer==3.4.0
# via requests
click==8.1.7
# via
# black
# nltk
# ray
colorama==0.4.6
# via
# awscli
# sacrebleu
# tqdm-multiprocess
contourpy==1.3.0
# via matplotlib
cupy-cuda12x==13.3.0
# via ray
cycler==0.12.1
# via matplotlib
datamodel-code-generator==0.26.3
# via -r requirements-test.in
dataproperty==1.0.1
# via
# pytablewriter
# tabledata
datasets==3.0.2
# via
# evaluate
# lm-eval
decorator==5.1.1
# via librosa
decord==0.6.0
# via -r requirements-test.in
dill==0.3.8
# via
# datasets
# evaluate
# lm-eval
# multiprocess
dnspython==2.7.0
# via email-validator
docutils==0.16
# via awscli
einops==0.8.0
# via -r requirements-test.in
email-validator==2.2.0
# via pydantic
evaluate==0.4.3
# via lm-eval
fastrlock==0.8.2
# via cupy-cuda12x
filelock==3.16.1
# via
# datasets
# huggingface-hub
# ray
# torch
# transformers
# triton
fonttools==4.54.1
# via matplotlib
frozenlist==1.5.0
# via
# aiohttp
# aiosignal
# ray
fsspec[http]==2024.9.0
# via
# datasets
# evaluate
# huggingface-hub
# torch
genson==1.3.0
# via datamodel-code-generator
h11==0.14.0
# via httpcore
hiredis==3.0.0
# via tensorizer
httpcore==1.0.6
# via httpx
httpx==0.27.2
# via -r requirements-test.in
huggingface-hub==0.26.2
# via
# accelerate
# datasets
# evaluate
# peft
# sentence-transformers
# timm
# tokenizers
# transformers
idna==3.10
# via
# anyio
# email-validator
# httpx
# requests
# yarl
inflect==5.6.2
# via datamodel-code-generator
iniconfig==2.0.0
# via pytest
isort==5.13.2
# via datamodel-code-generator
jinja2==3.1.4
# via
# datamodel-code-generator
# torch
jmespath==1.0.1
# via
# boto3
# botocore
joblib==1.4.2
# via
# librosa
# nltk
# scikit-learn
jsonlines==4.0.0
# via lm-eval
jsonschema==4.23.0
# via
# mistral-common
# ray
jsonschema-specifications==2024.10.1
# via jsonschema
kiwisolver==1.4.7
# via matplotlib
lazy-loader==0.4
# via librosa
libnacl==2.1.0
# via tensorizer
librosa==0.10.2.post1
# via -r requirements-test.in
llvmlite==0.43.0
# via numba
lm-eval[api]==0.4.4
# via -r requirements-test.in
lxml==5.3.0
# via sacrebleu
markupsafe==3.0.2
# via jinja2
matplotlib==3.9.2
# via -r requirements-test.in
mbstrdecoder==1.1.3
# via
# dataproperty
# pytablewriter
# typepy
mistral-common[opencv]==1.5.1
# via
# -r requirements-test.in
# mistral-common
more-itertools==10.5.0
# via lm-eval
mpmath==1.3.0
# via sympy
msgpack==1.1.0
# via
# librosa
# ray
multidict==6.1.0
# via
# aiohttp
# yarl
multiprocess==0.70.16
# via
# datasets
# evaluate
mypy-extensions==1.0.0
# via black
networkx==3.2.1
# via torch
nltk==3.9.1
# via rouge-score
numba==0.60.0
# via librosa
numexpr==2.10.1
# via lm-eval
numpy==1.26.4
# via
# -r requirements-test.in
# accelerate
# bitsandbytes
# contourpy
# cupy-cuda12x
# datasets
# decord
# evaluate
# librosa
# matplotlib
# mistral-common
# numba
# numexpr
# opencv-python-headless
# pandas
# peft
# rouge-score
# sacrebleu
# scikit-learn
# scipy
# soxr
# tensorizer
# torchvision
# transformers
nvidia-cublas-cu12==12.4.5.8
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.4.127
# via torch
nvidia-cuda-nvrtc-cu12==12.4.127
# via torch
nvidia-cuda-runtime-cu12==12.4.127
# via torch
nvidia-cudnn-cu12==9.1.0.70
# via torch
nvidia-cufft-cu12==11.2.1.3
# via torch
nvidia-curand-cu12==10.3.5.147
# via torch
nvidia-cusolver-cu12==11.6.1.9
# via torch
nvidia-cusparse-cu12==12.3.1.170
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.21.5
# via torch
nvidia-nvjitlink-cu12==12.4.127
# via
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
nvidia-nvtx-cu12==12.4.127
# via torch
opencv-python-headless==4.10.0.84
# via mistral-common
packaging==24.1
# via
# accelerate
# black
# datamodel-code-generator
# datasets
# evaluate
# huggingface-hub
# lazy-loader
# matplotlib
# peft
# pooch
# pytest
# pytest-rerunfailures
# ray
# transformers
# typepy
pandas==2.2.3
# via
# datasets
# evaluate
pathspec==0.12.1
# via black
pathvalidate==3.2.1
# via pytablewriter
peft==0.13.2
# via
# -r requirements-test.in
# lm-eval
pillow==10.4.0
# via
# matplotlib
# mistral-common
# sentence-transformers
# torchvision
platformdirs==4.3.6
# via
# black
# pooch
pluggy==1.5.0
# via pytest
pooch==1.8.2
# via librosa
portalocker==2.10.1
# via sacrebleu
propcache==0.2.0
# via yarl
protobuf==5.28.3
# via
# ray
# tensorizer
psutil==6.1.0
# via
# accelerate
# peft
# tensorizer
py==1.11.0
# via pytest-forked
pyarrow==18.0.0
# via datasets
pyasn1==0.6.1
# via rsa
pybind11==2.13.6
# via lm-eval
pycparser==2.22
# via cffi
pydantic[email]==2.9.2
# via
# datamodel-code-generator
# mistral-common
pydantic-core==2.23.4
# via pydantic
pyparsing==3.2.0
# via matplotlib
pytablewriter==1.2.0
# via lm-eval
pytest==8.3.3
# via
# -r requirements-test.in
# buildkite-test-collector
# pytest-asyncio
# pytest-forked
# pytest-rerunfailures
# pytest-shard
pytest-asyncio==0.24.0
# via -r requirements-test.in
pytest-forked==1.6.0
# via -r requirements-test.in
pytest-rerunfailures==14.0
# via -r requirements-test.in
pytest-shard==0.1.2
# via -r requirements-test.in
python-dateutil==2.9.0.post0
# via
# botocore
# matplotlib
# pandas
# typepy
pytz==2024.2
# via
# pandas
# typepy
pyyaml==6.0.2
# via
# accelerate
# awscli
# datamodel-code-generator
# datasets
# huggingface-hub
# peft
# ray
# timm
# transformers
ray[adag]==2.40.0
# via -r requirements-test.in
redis==5.2.0
# via tensorizer
referencing==0.35.1
# via
# jsonschema
# jsonschema-specifications
regex==2024.9.11
# via
# nltk
# sacrebleu
# tiktoken
# transformers
requests==2.32.3
# via
# buildkite-test-collector
# datasets
# evaluate
# huggingface-hub
# lm-eval
# mistral-common
# pooch
# ray
# tiktoken
# transformers
rouge-score==0.1.2
# via lm-eval
rpds-py==0.20.1
# via
# jsonschema
# referencing
rsa==4.7.2
# via awscli
s3transfer==0.10.3
# via
# awscli
# boto3
sacrebleu==2.4.3
# via lm-eval
safetensors==0.4.5
# via
# accelerate
# peft
# timm
# transformers
scikit-learn==1.5.2
# via
# librosa
# lm-eval
# sentence-transformers
scipy==1.13.1
# via
# librosa
# scikit-learn
# sentence-transformers
sentence-transformers==3.2.1
# via -r requirements-test.in
sentencepiece==0.2.0
# via mistral-common
six==1.16.0
# via
# python-dateutil
# rouge-score
sniffio==1.3.1
# via
# anyio
# httpx
soundfile==0.12.1
# via
# -r requirements-test.in
# librosa
soxr==0.5.0.post1
# via librosa
sqlitedict==2.1.0
# via lm-eval
sympy==1.13.1
# via torch
tabledata==1.3.3
# via pytablewriter
tabulate==0.9.0
# via sacrebleu
tcolorpy==0.1.6
# via pytablewriter
tenacity==9.0.0
# via lm-eval
tensorizer==2.9.0
# via -r requirements-test.in
threadpoolctl==3.5.0
# via scikit-learn
tiktoken==0.7.0
# via
# lm-eval
# mistral-common
timm==1.0.11
# via -r requirements-test.in
tokenizers==0.21.0
# via transformers
torch==2.5.1
# via
# -r requirements-test.in
# accelerate
# bitsandbytes
# lm-eval
# peft
# sentence-transformers
# tensorizer
# timm
# torchvision
torchvision==0.20.1
# via timm
tqdm==4.66.6
# via
# datasets
# evaluate
# huggingface-hub
# lm-eval
# nltk
# peft
# sentence-transformers
# tqdm-multiprocess
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.47.0
# via
# lm-eval
# peft
# sentence-transformers
# transformers-stream-generator
transformers-stream-generator==0.0.5
# via -r requirements-test.in
triton==3.1.0
# via torch
typepy[datetime]==1.3.2
# via
# dataproperty
# pytablewriter
# tabledata
typing-extensions==4.12.2
# via
# huggingface-hub
# librosa
# mistral-common
# pydantic
# pydantic-core
# torch
tzdata==2024.2
# via pandas
urllib3==1.26.20
# via
# botocore
# requests
word2number==1.1
# via lm-eval
xxhash==3.5.0
# via
# datasets
# evaluate
yarl==1.17.1
# via aiohttp
zstandard==0.23.0
# via lm-eval
# testing utils # The following packages are considered to be unsafe in a requirements file:
awscli # setuptools
einops # required for MPT, qwen-vl and Mamba \ No newline at end of file
httpx
librosa # required for audio test
opencv-python # required for video test
peft
requests
ray[adag]==2.35
sentence-transformers # required for embedding
soundfile # required for audio test
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
datamodel_code_generator # required for minicpm3 test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
# Benchmarking
aiohttp
# quantization
# bitsandbytes>=0.44.0
buildkite-test-collector==0.1.8
...@@ -2,6 +2,22 @@ ...@@ -2,6 +2,22 @@
-r requirements-common.txt -r requirements-common.txt
# Dependencies for TPU # Dependencies for TPU
# Currently, the TPU backend uses a nightly version of PyTorch XLA. cmake>=3.26
# You can install the dependencies in Dockerfile.tpu. ninja
packaging
setuptools-scm>=8
wheel
jinja2
ray[default] ray[default]
# Install torch_xla
--pre
--extra-index-url https://download.pytorch.org/whl/nightly/cpu
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.6.0.dev20241126+cpu
torchvision==0.20.0.dev20241126+cpu
torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl
jaxlib==0.4.36.dev20241122
jax==0.4.36.dev20241122
# Common dependencies # Common dependencies
-r requirements-common.txt -r requirements-common.txt
setuptools < 70.0.0 # IPEX's torch build depends on an older setuptools; to be removed.
ray >= 2.9 ray >= 2.9
# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ cmake>=3.26
torch == 2.3.1+cxx11.abi ninja
intel-extension-for-pytorch == 2.3.110+xpu packaging
oneccl_bind_pt == 2.3.100+xpu setuptools-scm>=8
wheel
jinja2
torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl
oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl
triton-xpu == 3.0.0b2 triton-xpu == 3.0.0b1
import importlib.util import importlib.util
import io
import logging import logging
import os import os
import re import re
...@@ -63,12 +62,6 @@ def is_ninja_available() -> bool: ...@@ -63,12 +62,6 @@ def is_ninja_available() -> bool:
return which("ninja") is not None return which("ninja") is not None
def remove_prefix(text: str, prefix: str) -> str:
    """Return ``text`` without a leading ``prefix``.

    Args:
        text: The string to trim.
        prefix: The prefix to strip when ``text`` starts with it.

    Returns:
        ``text`` with a single leading occurrence of ``prefix`` removed, or
        ``text`` unchanged when it does not start with ``prefix``.
    """
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
class CMakeExtension(Extension): class CMakeExtension(Extension):
def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
...@@ -164,6 +157,14 @@ class cmake_build_ext(build_ext): ...@@ -164,6 +157,14 @@ class cmake_build_ext(build_ext):
# on subsequent calls to python. # on subsequent calls to python.
cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))] cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
# Override the base directory for FetchContent downloads to $ROOT/.deps
# This allows sharing dependencies between profiles,
# and plays more nicely with sccache.
# To override this, set the FETCHCONTENT_BASE_DIR environment variable.
fc_base_dir = os.path.join(ROOT_DIR, ".deps")
fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]
# #
# Setup parallelism and build tool # Setup parallelism and build tool
# #
...@@ -197,8 +198,10 @@ class cmake_build_ext(build_ext): ...@@ -197,8 +198,10 @@ class cmake_build_ext(build_ext):
os.makedirs(self.build_temp) os.makedirs(self.build_temp)
targets = [] targets = []
target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."),
"vllm_flash_attn.") def target_name(s: str) -> str:
return s.removeprefix("vllm.").removeprefix("vllm_flash_attn.")
# Build all the extensions # Build all the extensions
for ext in self.extensions: for ext in self.extensions:
self.configure(ext) self.configure(ext)
...@@ -253,6 +256,92 @@ class cmake_build_ext(build_ext): ...@@ -253,6 +256,92 @@ class cmake_build_ext(build_ext):
self.copy_file(file, dst_file) self.copy_file(file, dst_file)
class repackage_wheel(build_ext):
    """Extracts libraries and other files from an existing wheel.

    The wheel is taken from the ``VLLM_PRECOMPILED_WHEEL_LOCATION``
    environment variable (a local file path or a URL), falling back to
    ``default_wheel`` when the variable is not set.
    """
    # Wheel used when VLLM_PRECOMPILED_WHEEL_LOCATION is not set.
    default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"

    def run(self) -> None:
        """Extract the precompiled files and register them in ``package_data``.

        Raises:
            AssertionError: If the build is not a CUDA build.
            SetupError: If the wheel cannot be downloaded.
        """
        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",
                                   self.default_wheel)

        assert _is_cuda(
        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"

        import shutil
        import tempfile
        import zipfile

        # Only set when the wheel has to be downloaded; the directory is
        # removed in the ``finally`` clause so it is not leaked on success
        # or on a failed download.
        temp_dir = None
        try:
            if os.path.isfile(wheel_location):
                wheel_path = wheel_location
                print(f"Using existing wheel={wheel_path}")
            else:
                # Download the wheel from a given URL, assume
                # the filename is the last part of the URL
                wheel_filename = wheel_location.split("/")[-1]

                # create a temporary directory to store the wheel
                temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
                wheel_path = os.path.join(temp_dir, wheel_filename)

                print(f"Downloading wheel from {wheel_location} to {wheel_path}")

                from urllib.request import urlretrieve

                try:
                    urlretrieve(wheel_location, filename=wheel_path)
                except Exception as e:
                    from setuptools.errors import SetupError

                    raise SetupError(
                        f"Failed to get vLLM wheel from {wheel_location}") from e

            with zipfile.ZipFile(wheel_path) as wheel:
                files_to_copy = {
                    "vllm/_C.abi3.so",
                    "vllm/_moe_C.abi3.so",
                    "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
                    "vllm/vllm_flash_attn/flash_attn_interface.py",
                    "vllm/vllm_flash_attn/__init__.py",
                    # "vllm/_version.py", # not available in nightly wheels yet
                }

                for file in wheel.filelist:
                    if file.filename not in files_to_copy:
                        continue

                    print(f"Extracting and including {file.filename} "
                          "from existing wheel")
                    package_name = os.path.dirname(file.filename).replace(
                        "/", ".")
                    file_name = os.path.basename(file.filename)

                    # The package entry is created even for .py members,
                    # matching the original behaviour.
                    members = package_data.setdefault(package_name, [])

                    # Extracted relative to the current working directory.
                    wheel.extract(file)
                    if file_name.endswith(".py"):
                        # python files shouldn't be added to package_data
                        continue

                    members.append(file_name)
        finally:
            if temp_dir is not None:
                shutil.rmtree(temp_dir, ignore_errors=True)
def _is_hpu() -> bool:
    """Report whether the build should target HPU.

    Returns ``True`` when ``VLLM_TARGET_DEVICE`` is ``"hpu"`` or when an HPU
    is detected. Detection tries, in order:

    1. running ``hl-smi`` successfully;
    2. the presence of ``/dev/accel/accel0`` or
       ``/dev/accel/accel_controlD0``;
    3. a non-zero count of ``habanalabs`` lines in the ``lsmod`` output.
    """
    # An explicit target makes the result True regardless of the probes
    # below, so skip spawning subprocesses in that case.
    if VLLM_TARGET_DEVICE == "hpu":
        return True

    is_hpu_available = True
    try:
        subprocess.run(["hl-smi"], capture_output=True, check=True)
    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
        # hl-smi failed; if either device node exists the flag stays True.
        if not os.path.exists('/dev/accel/accel0') and not os.path.exists(
                '/dev/accel/accel_controlD0'):
            # last resort...
            try:
                output = subprocess.check_output(
                    'lsmod | grep habanalabs | wc -l', shell=True)
                is_hpu_available = int(output) > 0
            except (ValueError, FileNotFoundError, PermissionError,
                    subprocess.CalledProcessError):
                is_hpu_available = False
    return is_hpu_available
def _no_device() -> bool: def _no_device() -> bool:
return VLLM_TARGET_DEVICE == "empty" return VLLM_TARGET_DEVICE == "empty"
...@@ -260,7 +349,7 @@ def _no_device() -> bool: ...@@ -260,7 +349,7 @@ def _no_device() -> bool:
def _is_cuda() -> bool: def _is_cuda() -> bool:
has_cuda = torch.version.cuda is not None has_cuda = torch.version.cuda is not None
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
and not (_is_neuron() or _is_tpu())) and not (_is_neuron() or _is_tpu() or _is_hpu()))
def _is_hip() -> bool: def _is_hip() -> bool:
...@@ -297,10 +386,6 @@ def _build_custom_ops() -> bool: ...@@ -297,10 +386,6 @@ def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu() return _is_cuda() or _is_hip() or _is_cpu()
def _build_core_ext() -> bool:
    """Return False for Neuron, TPU, OpenVINO and XPU targets, True otherwise."""
    is_excluded_target = (_is_neuron() or _is_tpu() or _is_openvino()
                          or _is_xpu())
    return not is_excluded_target
def get_hipcc_rocm_version(): def get_hipcc_rocm_version():
# Run the hipcc --version command # Run the hipcc --version command
result = subprocess.run(['hipcc', '--version'], result = subprocess.run(['hipcc', '--version'],
...@@ -330,7 +415,7 @@ def get_neuronxcc_version(): ...@@ -330,7 +415,7 @@ def get_neuronxcc_version():
"__init__.py") "__init__.py")
# Check if the command was executed successfully # Check if the command was executed successfully
with open(version_file, "rt") as fp: with open(version_file) as fp:
content = fp.read() content = fp.read()
# Extract the version using a regular expression # Extract the version using a regular expression
...@@ -339,7 +424,7 @@ def get_neuronxcc_version(): ...@@ -339,7 +424,7 @@ def get_neuronxcc_version():
# Return the version string # Return the version string
return match.group(1) return match.group(1)
else: else:
raise RuntimeError("Could not find HIP version in the output") raise RuntimeError("Could not find Neuron version in the output")
def get_nvcc_cuda_version() -> Version: def get_nvcc_cuda_version() -> Version:
...@@ -375,15 +460,15 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -375,15 +460,15 @@ def get_version_add(sha: Optional[str] = None) -> str:
if sha != 'Unknown': if sha != 'Unknown':
if sha is None: if sha is None:
sha = get_sha(vllm_root) sha = get_sha(vllm_root)
if (major, minor) == ('2', '3'): # if (major, minor) == ('2', '3'):
version = 'das.opt1.' + sha[:7] # version = 'das.opt1.' + sha[:7]
if (major, minor) == ('2', '4'): if (major, minor) == ('2', '4'):
version = 'das.opt2.' + sha[:7] version = 'das.opt1.' + sha[:7]
else: else:
if (major, minor) == ('2', '3'): # if (major, minor) == ('2', '3'):
version = 'das.opt1' # version = 'das.opt1'
if (major, minor) == ('2', '4'): if (major, minor) == ('2', '4'):
version = 'das.opt2' version = 'das.opt1'
# dtk version # dtk version
...@@ -397,9 +482,9 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -397,9 +482,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f""" new_version_content = f"""
try: try:
__version__ = "0.6.2" __version__ = "0.6.5"
__version_tuple__ = (0, 6, 2) __version_tuple__ = (0, 6, 5)
__hcu_version__ = f'0.6.2+{version}' __hcu_version__ = f'0.6.5+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__ from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e: except Exception as e:
...@@ -423,6 +508,22 @@ def get_version(): ...@@ -423,6 +508,22 @@ def get_version():
return locals()['__hcu_version__'] return locals()['__hcu_version__']
def _parse_gaudi_sw_version(stdout: str) -> str:
    """Pull the version token out of ``hl-smi`` console output.

    Takes the third line, strips spaces, keeps the text after the first
    ``:``, drops its last character and cuts at the first ``-``.
    Returns ``"0.0.0"`` when the output does not have that layout.
    """
    # NOTE(review): presumably the third line looks like
    # "| Driver Version: 1.18.0-<hash> |" -- confirm against real output.
    try:
        return stdout.split("\n")[2].replace(
            " ", "").split(":")[1][:-1].split("-")[0]
    except IndexError:
        return "0.0.0"


def get_gaudi_sw_version():
    """
    Returns the driver version.

    Runs ``hl-smi`` with ``ENABLE_CONSOLE=true`` added to the current
    environment and parses its output. Returns ``"0.0.0"`` when ``hl-smi``
    cannot be run, exits with a non-zero status, prints nothing, or prints
    output in an unexpected layout.
    """
    # Enable console printing for `hl-smi` check
    try:
        output = subprocess.run(
            ["hl-smi"],
            text=True,
            capture_output=True,
            # Extend, rather than replace, the environment so PATH and
            # other inherited variables are still visible to hl-smi.
            env={
                **os.environ, "ENABLE_CONSOLE": "true"
            })
    except OSError:
        return "0.0.0"  # when hl-smi is not available
    if output.returncode == 0 and output.stdout:
        return _parse_gaudi_sw_version(output.stdout)
    return "0.0.0"  # when hl-smi is not available
def get_vllm_version() -> str: def get_vllm_version() -> str:
if not _is_hip(): if not _is_hip():
version = get_version( version = get_version(
...@@ -435,12 +536,15 @@ def get_vllm_version() -> str: ...@@ -435,12 +536,15 @@ def get_vllm_version() -> str:
if envs.VLLM_TARGET_DEVICE == "empty": if envs.VLLM_TARGET_DEVICE == "empty":
version += f"{sep}empty" version += f"{sep}empty"
elif _is_cuda(): elif _is_cuda():
cuda_version = str(get_nvcc_cuda_version()) if envs.VLLM_USE_PRECOMPILED:
if cuda_version != MAIN_CUDA_VERSION: version += ".precompiled"
cuda_version_str = cuda_version.replace(".", "")[:3] else:
# skip this for source tarball, required for pypi cuda_version = str(get_nvcc_cuda_version())
if "sdist" not in sys.argv: if cuda_version != MAIN_CUDA_VERSION:
version += f"{sep}cu{cuda_version_str}" cuda_version_str = cuda_version.replace(".", "")[:3]
# skip this for source tarball, required for pypi
if "sdist" not in sys.argv:
version += f"{sep}cu{cuda_version_str}"
elif _is_hip(): elif _is_hip():
# Get the HIP version # Get the HIP version
# hipcc_version = get_hipcc_rocm_version() # hipcc_version = get_hipcc_rocm_version()
...@@ -454,6 +558,12 @@ def get_vllm_version() -> str: ...@@ -454,6 +558,12 @@ def get_vllm_version() -> str:
if neuron_version != MAIN_CUDA_VERSION: if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3] neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"{sep}neuron{neuron_version_str}" version += f"{sep}neuron{neuron_version_str}"
elif _is_hpu():
# Get the Intel Gaudi Software Suite version
gaudi_sw_version = str(get_gaudi_sw_version())
if gaudi_sw_version != MAIN_CUDA_VERSION:
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
version += f"{sep}gaudi{gaudi_sw_version}"
elif _is_openvino(): elif _is_openvino():
version += f"{sep}openvino" version += f"{sep}openvino"
elif _is_tpu(): elif _is_tpu():
...@@ -472,7 +582,8 @@ def read_readme() -> str: ...@@ -472,7 +582,8 @@ def read_readme() -> str:
"""Read the README file if present.""" """Read the README file if present."""
p = get_path("README.md") p = get_path("README.md")
if os.path.isfile(p): if os.path.isfile(p):
return io.open(get_path("README.md"), "r", encoding="utf-8").read() with open(get_path("README.md"), encoding="utf-8") as f:
return f.read()
else: else:
return "" return ""
...@@ -487,6 +598,8 @@ def get_requirements() -> List[str]: ...@@ -487,6 +598,8 @@ def get_requirements() -> List[str]:
for line in requirements: for line in requirements:
if line.startswith("-r "): if line.startswith("-r "):
resolved_requirements += _read_requirements(line.split()[1]) resolved_requirements += _read_requirements(line.split()[1])
elif line.startswith("--"):
continue
else: else:
resolved_requirements.append(line) resolved_requirements.append(line)
return resolved_requirements return resolved_requirements
...@@ -509,6 +622,8 @@ def get_requirements() -> List[str]: ...@@ -509,6 +622,8 @@ def get_requirements() -> List[str]:
requirements = _read_requirements("requirements-rocm.txt") requirements = _read_requirements("requirements-rocm.txt")
elif _is_neuron(): elif _is_neuron():
requirements = _read_requirements("requirements-neuron.txt") requirements = _read_requirements("requirements-neuron.txt")
elif _is_hpu():
requirements = _read_requirements("requirements-hpu.txt")
elif _is_openvino(): elif _is_openvino():
requirements = _read_requirements("requirements-openvino.txt") requirements = _read_requirements("requirements-openvino.txt")
elif _is_tpu(): elif _is_tpu():
...@@ -519,16 +634,13 @@ def get_requirements() -> List[str]: ...@@ -519,16 +634,13 @@ def get_requirements() -> List[str]:
requirements = _read_requirements("requirements-xpu.txt") requirements = _read_requirements("requirements-xpu.txt")
else: else:
raise ValueError( raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, " "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
"OpenVINO, or CPU.") "OpenVINO, or CPU.")
return requirements return requirements
ext_modules = [] ext_modules = []
if _build_core_ext():
ext_modules.append(CMakeExtension(name="vllm._core_C"))
if _is_cuda() or _is_hip(): if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C")) ext_modules.append(CMakeExtension(name="vllm._moe_C"))
...@@ -545,13 +657,18 @@ if _build_custom_ops(): ...@@ -545,13 +657,18 @@ if _build_custom_ops():
package_data = { package_data = {
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "benchmarks/*.py","model_executor/layers/quantization/configs/*.json"] "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "benchmarks/*.py","model_executor/layers/quantization/configs/*.json"]
} }
if envs.VLLM_USE_PRECOMPILED:
ext_modules = []
package_data["vllm"].append("*.so")
if _no_device(): if _no_device():
ext_modules = [] ext_modules = []
if not ext_modules:
cmdclass = {}
else:
cmdclass = {
"build_ext":
repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
}
setup( setup(
name="vllm", name="vllm",
version=get_vllm_version(), version=get_vllm_version(),
...@@ -567,25 +684,28 @@ setup( ...@@ -567,25 +684,28 @@ setup(
"Documentation": "https://vllm.readthedocs.io/en/latest/", "Documentation": "https://vllm.readthedocs.io/en/latest/",
}, },
classifiers=[ classifiers=[
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.12",
"License :: OSI Approved :: Apache Software License", "License :: OSI Approved :: Apache Software License",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
], ],
packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
"tests*")), "tests*")),
python_requires=">=3.8", python_requires=">=3.9",
install_requires=get_requirements(), install_requires=get_requirements(),
ext_modules=ext_modules, ext_modules=ext_modules,
extras_require={ extras_require={
"tensorizer": ["tensorizer>=2.9.0"], "tensorizer": ["tensorizer>=2.9.0"],
"video": ["opencv-python"], # Required for video processing "audio": ["librosa", "soundfile"], # Required for audio processing
"audio": ["librosa", "soundfile"] # Required for audio processing "video": ["decord"] # Required for video processing
}, },
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, cmdclass=cmdclass,
package_data=package_data, package_data=package_data,
entry_points={ entry_points={
"console_scripts": [ "console_scripts": [
......
...@@ -12,11 +12,11 @@ import torch ...@@ -12,11 +12,11 @@ import torch
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import ParallelConfig from vllm.config import ParallelConfig
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from vllm.outputs import RequestOutput as RealRequestOutput from vllm.outputs import RequestOutput as RealRequestOutput
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from ..conftest import cleanup
from ..utils import wait_for_gpu_memory_to_clear from ..utils import wait_for_gpu_memory_to_clear
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
...@@ -87,17 +87,19 @@ class MockAsyncLLMEngine(AsyncLLMEngine): ...@@ -87,17 +87,19 @@ class MockAsyncLLMEngine(AsyncLLMEngine):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_new_requests_event(): async def test_new_requests_event():
params = SamplingParams()
engine = MockAsyncLLMEngine() engine = MockAsyncLLMEngine()
engine.start_background_loop() engine.start_background_loop()
await asyncio.sleep(0.01) await asyncio.sleep(0.01)
assert engine.engine.step_calls == 0 assert engine.engine.step_calls == 0
await engine.add_request("1", "", None) await engine.add_request("1", "", params)
await asyncio.sleep(0.01) await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 1 assert engine.engine.add_request_calls == 1
assert engine.engine.step_calls == 1 assert engine.engine.step_calls == 1
await engine.add_request("2", "", None) await engine.add_request("2", "", params)
engine.engine.generate("2") engine.engine.generate("2")
await asyncio.sleep(0) await asyncio.sleep(0)
await asyncio.sleep(0) await asyncio.sleep(0)
...@@ -112,7 +114,7 @@ async def test_new_requests_event(): ...@@ -112,7 +114,7 @@ async def test_new_requests_event():
await asyncio.sleep(0.001) await asyncio.sleep(0.001)
assert engine.engine.step_calls == old_step_calls assert engine.engine.step_calls == old_step_calls
await engine.add_request("3", "", None) await engine.add_request("3", "", params)
await asyncio.sleep(0.01) await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 3 assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == old_step_calls + 1 assert engine.engine.step_calls == old_step_calls + 1
...@@ -156,7 +158,7 @@ async def async_engine(): ...@@ -156,7 +158,7 @@ async def async_engine():
engine.shutdown_background_loop() engine.shutdown_background_loop()
del engine del engine
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
cleanup() cleanup_dist_env_and_memory()
@pytest.fixture() @pytest.fixture()
......
...@@ -11,22 +11,31 @@ from unittest.mock import patch ...@@ -11,22 +11,31 @@ from unittest.mock import patch
import pytest import pytest
from vllm import LLM from vllm import LLM
from vllm.utils import is_hip from vllm.platforms import current_platform
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
from ..conftest import VllmRunner
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "facebook/opt-125m"), os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
] ]
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse shim that requests ``run_with_both_engines`` for each test.

    The fixture does nothing itself; depending on ``run_with_both_engines``
    is what makes every test in this module run with both engines. It could
    be moved up into ``conftest.py`` to apply to a whole test package.
    """
def test_vllm_gc_ed(): def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted""" """Verify vllm instance is GC'ed when it is deleted"""
llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m")) llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"))
...@@ -37,6 +46,7 @@ def test_vllm_gc_ed(): ...@@ -37,6 +46,7 @@ def test_vllm_gc_ed():
assert weak_llm() is None assert weak_llm() is None
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
# @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) # @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("backend", ["FLASH_ATTN"]) @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
...@@ -45,8 +55,6 @@ def test_vllm_gc_ed(): ...@@ -45,8 +55,6 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("enforce_eager", [False, True]) @pytest.mark.parametrize("enforce_eager", [False, True])
def test_models( def test_models(
hf_runner, hf_runner,
vllm_runner,
example_prompts,
model: str, model: str,
backend: str, backend: str,
dtype: str, dtype: str,
...@@ -54,18 +62,30 @@ def test_models( ...@@ -54,18 +62,30 @@ def test_models(
enforce_eager: bool, enforce_eager: bool,
) -> None: ) -> None:
if backend == "FLASHINFER" and is_hip(): if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
pytest.skip(
"XFORMERS does not support gemma2 with full context length.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend os.environ["VLLM_ATTENTION_BACKEND"] = backend
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:"
example_prompts = [prompt]
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(model, with VllmRunner(model,
dtype=dtype, max_model_len=8192,
enforce_eager=enforce_eager, dtype=dtype,
gpu_memory_utilization=0.7) as vllm_model: enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( check_outputs_equal(
...@@ -110,6 +130,11 @@ def test_models( ...@@ -110,6 +130,11 @@ def test_models(
# if attention_backend: # if attention_backend:
# os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend # os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
# # Import VLLM_USE_V1 dynamically to handle patching
# from vllm.envs import VLLM_USE_V1
# if VLLM_USE_V1 and distributed_executor_backend != "mp":
# pytest.skip(f"Skip {distributed_executor_backend} for V1")
# dtype = "half" # dtype = "half"
# max_tokens = 5 # max_tokens = 5
...@@ -135,6 +160,7 @@ def test_models( ...@@ -135,6 +160,7 @@ def test_models(
# ) # )
@pytest.mark.skip_v1
def test_model_with_failure(vllm_runner) -> None: def test_model_with_failure(vllm_runner) -> None:
try: try:
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
...@@ -159,3 +185,30 @@ def test_model_with_failure(vllm_runner) -> None: ...@@ -159,3 +185,30 @@ def test_model_with_failure(vllm_runner) -> None:
ModelInputForGPUWithSamplingMetadata) ModelInputForGPUWithSamplingMetadata)
finally: finally:
os.remove(filename) os.remove(filename)
@pytest.mark.skip_v1
def test_failure_with_async_out_proc(vllm_runner) -> None:
    """Check the input-dump error path with async output processing enabled.

    The OPT forward pass is patched to raise ``ValueError``. The test asserts
    that the engine config has ``use_async_output_proc`` set, that generation
    raises ``ValueError``, and that the error message names the ``.pkl`` file
    the input was dumped to. That file is removed afterwards.
    """
    filename = None
    try:
        # NOTE(review): other tests in this file build model paths with
        # os.path.join(models_path_prefix, ...); confirm whether this bare
        # hub id is intended here.
        with vllm_runner("facebook/opt-125m",
                         dtype="half",
                         enforce_eager=False,
                         gpu_memory_utilization=0.7) as vllm_model,\
             patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
                   side_effect=ValueError()):
            model_config = vllm_model.model.llm_engine.model_config
            assert model_config.use_async_output_proc
            with pytest.raises(ValueError) as exc_info:
                vllm_model.generate_greedy('how to make pizza?', 250)
            # The dot is escaped so only a literal ".pkl" suffix matches.
            matches = re.search(r"input dumped to (.+)\.pkl",
                                str(exc_info.value))
            assert matches is not None
            filename = f"{matches.group(1)}.pkl"
    finally:
        # Remove the dump file only if the error message actually named one.
        if filename is not None:
            os.remove(filename)
...@@ -11,6 +11,9 @@ from contextlib import nullcontext ...@@ -11,6 +11,9 @@ from contextlib import nullcontext
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
import os import os
...@@ -18,7 +21,7 @@ from ..utils import models_path_prefix ...@@ -18,7 +21,7 @@ from ..utils import models_path_prefix
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "facebook/opt-125m"), os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
] ]
...@@ -30,6 +33,7 @@ MODELS = [ ...@@ -30,6 +33,7 @@ MODELS = [
# NOTE: Increasing this in this suite will fail CI because we currently cannot # NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test. # reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models( def test_models(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
...@@ -40,11 +44,15 @@ def test_models( ...@@ -40,11 +44,15 @@ def test_models(
chunked_prefill_token_size: int, chunked_prefill_token_size: int,
enforce_eager: bool, enforce_eager: bool,
tensor_parallel_size: int, tensor_parallel_size: int,
attention_backend: str,
monkeypatch,
) -> None: ) -> None:
""" """
Checks exact match decode between huggingface model and vllm runner with Checks exact match decode between huggingface model and vllm runner with
chunked prefill. chunked prefill.
""" """
override_backend_env_variable(monkeypatch, attention_backend)
max_num_seqs = chunked_prefill_token_size max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
...@@ -73,13 +81,18 @@ def test_models( ...@@ -73,13 +81,18 @@ def test_models(
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed( def test_models_distributed(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
example_prompts, example_prompts,
model: str, model: str,
distributed_executor_backend: str, distributed_executor_backend: str,
attention_backend: str,
monkeypatch,
) -> None: ) -> None:
override_backend_env_variable(monkeypatch, attention_backend)
if (model == "meta-llama/Llama-2-7b-hf" if (model == "meta-llama/Llama-2-7b-hf"
and distributed_executor_backend == "ray"): and distributed_executor_backend == "ray"):
# test ray adag # test ray adag
...@@ -193,17 +206,17 @@ def test_models_distributed( ...@@ -193,17 +206,17 @@ def test_models_distributed(
@pytest.mark.parametrize("max_tokens", [16]) @pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32]) @pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("use_v2_block_manager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot # NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test. # reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching( def test_with_prefix_caching(
vllm_runner, vllm_runner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
use_v2_block_manager: bool,
tensor_parallel_size: int, tensor_parallel_size: int,
dtype: str,
) -> None: ) -> None:
""" """
Checks exact match decode with and without prefix caching Checks exact match decode with and without prefix caching
...@@ -225,12 +238,11 @@ def test_with_prefix_caching( ...@@ -225,12 +238,11 @@ def test_with_prefix_caching(
for enable in (True, False): for enable in (True, False):
with vllm_runner( with vllm_runner(
model, model,
dtype="half", dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True, enable_chunked_prefill=True,
enable_prefix_caching=enable, enable_prefix_caching=enable,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
use_v2_block_manager=use_v2_block_manager,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) as vllm_model: ) as vllm_model:
...@@ -253,3 +265,61 @@ def test_with_prefix_caching( ...@@ -253,3 +265,61 @@ def test_with_prefix_caching(
name_0="w/o prefix caching", name_0="w/o prefix caching",
name_1="with prefix caching", name_1="with prefix caching",
) )
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    attention_backend: str,
    monkeypatch,
) -> None:
    """CPU-only entry point that delegates to ``test_models``.

    Skipped unless the current platform is CPU. All arguments are forwarded
    unchanged, with the tensor parallel size fixed to 1.
    """
    # Fills test_models' tensor_parallel_size positional slot.
    tensor_parallel_size = 1
    test_models(hf_runner, vllm_runner, example_prompts, model, dtype,
                max_tokens, chunked_prefill_token_size, enforce_eager,
                tensor_parallel_size, attention_backend, monkeypatch)
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu(
    vllm_runner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
    dtype: str,
) -> None:
    """CPU-only entry point that delegates to ``test_with_prefix_caching``.

    Skipped unless the current platform is CPU. Arguments are forwarded
    unchanged, with the tensor parallel size fixed to 1.
    """
    test_with_prefix_caching(
        vllm_runner=vllm_runner,
        max_tokens=max_tokens,
        enforce_eager=enforce_eager,
        chunk_size=chunk_size,
        tensor_parallel_size=1,
        dtype=dtype,
    )
...@@ -3,5 +3,5 @@ from ..utils import compare_two_settings, models_path_prefix ...@@ -3,5 +3,5 @@ from ..utils import compare_two_settings, models_path_prefix
def test_cpu_offload(): def test_cpu_offload():
compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), [], compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"), [],
["--cpu-offload-gb", "4"]) ["--cpu-offload-gb", "1"])
...@@ -26,9 +26,9 @@ MODELS = [ ...@@ -26,9 +26,9 @@ MODELS = [
@pytest.fixture(scope="module", autouse=True) @pytest.fixture(scope="module", autouse=True)
def check_settings(): def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, ( assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. " "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1."
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest " "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
"tests/basic_correctness/test_preemption.py`") "pytest tests/basic_correctness/test_preemption.py`")
@pytest.fixture @pytest.fixture
...@@ -139,113 +139,6 @@ def test_preemption( ...@@ -139,113 +139,6 @@ def test_preemption(
assert total_preemption == total_recorded_preemption assert total_preemption == total_recorded_preemption
@pytest.mark.parametrize("model", MODELS)
# @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("beam_width", [4])
def test_swap(
    caplog_vllm,
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    beam_width: int,
    worker_use_ray: bool,
) -> None:
    """Use beam search to trigger swapping and compare against HF.

    Checks that vLLM beam-search output ids equal the HF ones, that a SWAP
    preemption was logged, and that the scheduler's cumulative preemption
    count equals the total of the ``vllm:num_preemptions`` metric samples.
    """
    # Only the first prompt is exercised.
    prompts = example_prompts[:1]

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_beam_search(prompts, beam_width,
                                                   max_tokens)

    with vllm_runner(
            model,
            dtype=dtype,
            swap_space=10,
            disable_log_stats=False,
            worker_use_ray=worker_use_ray,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_beam_search(prompts, beam_width,
                                                       max_tokens)
        scheduler = vllm_model.model.llm_engine.scheduler[0]
        assert (scheduler.artificial_preempt_cnt
                < ARTIFICIAL_PREEMPTION_MAX_CNT)
        total_preemption = scheduler.num_cumulative_preemption

    for i, _prompt in enumerate(prompts):
        hf_output_ids, _ = hf_outputs[i]
        vllm_output_ids, _ = vllm_outputs[i]
        assert len(hf_output_ids) == len(vllm_output_ids)
        # Lengths are equal at this point, so zip visits every position.
        for j, (hf_ids, vllm_ids) in enumerate(
                zip(hf_output_ids, vllm_output_ids)):
            assert hf_ids == vllm_ids, (
                f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
                f"vLLM: {vllm_output_ids}")

    assert ("is preempted by PreemptionMode.SWAP mode because there "
            "is not enough KV cache space." in caplog_vllm.text)

    # Sanity-check that the preemption metric was generated and that its
    # recorded total agrees with the scheduler's own counter.
    matching = [
        m for m in REGISTRY.collect() if m.name == "vllm:num_preemptions"
    ]
    # As with a plain scan, the last matching metric wins.
    preemption_metrics = matching[-1] if matching else None
    assert preemption_metrics is not None
    total_recorded_preemption = sum(sample.value
                                    for sample in preemption_metrics.samples)
    assert total_preemption == total_recorded_preemption
@pytest.mark.parametrize("model", MODELS)
# @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("beam_width", [4])
def test_swap_infeasible(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    beam_width: int,
    worker_use_ray: bool,
) -> None:
    """Verify that an infeasible swap request is ignored instead of hanging.

    The engine is given only as many GPU blocks as one sequence needs
    (prefill + decode) while beam search asks for ``beam_width`` sequences.
    The request is expected to finish with reason ``"length"``.
    """
    BLOCK_SIZE = 16
    prefill_blocks = 2
    decode_blocks = max_tokens // BLOCK_SIZE
    # Block budget for a single sequence. Beam search runs more than one
    # sequence, so this many blocks is not enough to finish.
    total_blocks = prefill_blocks + decode_blocks
    prompts = example_prompts[:1]

    with vllm_runner(
            model,
            dtype=dtype,
            swap_space=10,
            block_size=BLOCK_SIZE,
            num_gpu_blocks_override=total_blocks,
            max_model_len=total_blocks * BLOCK_SIZE,
            worker_use_ray=worker_use_ray,
    ) as vllm_model:
        sampling_params = SamplingParams(n=beam_width,
                                         use_beam_search=True,
                                         temperature=0.0,
                                         max_tokens=max_tokens,
                                         ignore_eos=True)
        req_outputs = vllm_model.model.generate(
            prompts,
            sampling_params=sampling_params,
        )

        scheduler = vllm_model.model.llm_engine.scheduler[0]
        assert (scheduler.artificial_preempt_cnt
                < ARTIFICIAL_PREEMPTION_MAX_CNT)

    # The request must be reported as finished rather than left hanging.
    assert req_outputs[0].outputs[0].finish_reason == "length"
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
# @pytest.mark.parametrize("dtype", ["float"]) # @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
......
from copy import deepcopy
from typing import Callable, Union
from torch import fx
from vllm.compilation.inductor_pass import InductorPass
class TestBackend:
    """
    Minimal Inductor backend for exercising custom passes in tests.

    The given passes are installed as Inductor's
    ``post_grad_custom_post_pass`` hook. When the hook fires, a copy of the
    graph is stored before and after the custom passes run, so tests can
    inspect both states.
    """

    def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
                                                             None]]):
        """Store the custom passes and build the Inductor config patches."""
        from torch._inductor import config as inductor_config

        self.custom_passes = [*passes]

        # Start from a copy of the current Inductor config, then disable
        # caches and register our hook.
        patches = inductor_config.shallow_copy_dict()
        patches.update(
            force_disable_caches=True,
            post_grad_custom_post_pass=self.post_pass,
        )
        self.current_config = patches

    def __call__(self, graph: fx.GraphModule, example_inputs):
        """Compile ``graph`` with Inductor using the patched config."""
        from torch._inductor.compile_fx import compile_fx

        compiled = compile_fx(graph,
                              example_inputs,
                              config_patches=self.current_config)
        return compiled

    def post_pass(self, graph: fx.Graph):
        """Run every custom pass on ``graph`` in place, saving snapshots."""
        self.graph_pre_pass = deepcopy(graph)

        for custom_pass in self.custom_passes:
            custom_pass(graph)

        self.graph_post_pass = deepcopy(graph)
        # Kept by reference, not copied, so it reflects the final state of
        # the graph.
        self.final_graph = graph
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment