Commit 006693ed authored by zhuwenwen
Browse files

Merge tag 'v0.11.2' into v0.11.2-ori

parents 4b51e6f1 275de341
{#-
  Chat template for embedding requests.
  Rejects conversations with more than one message, then renders that
  message's content parts as a single string joined by spaces: text parts
  are emitted verbatim, image parts become the '<|image_pad|>' placeholder,
  and parts of any other type are ignored.
-#}
{%- if messages | length > 1 -%}
{{ raise_exception('Embedding models should only embed one message at a time') }}
{%- endif -%}
{#- A namespace object is used so that assignments made inside the loops persist. -#}
{% set vars = namespace(parts=[]) %}
{%- for message in messages -%}
{%- for content in message['content'] -%}
{%- if content['type'] == 'text' -%}
{%- set vars.parts = vars.parts + [content['text']] %}
{%- elif content['type'] == 'image' -%}
{%- set vars.parts = vars.parts + ['<|image_pad|>'] %}
{%- endif -%}
{%- endfor -%}
{%- endfor -%}
{{ vars.parts | join(' ') }}
#!/bin/bash
# Print a notice that linting has moved from format.sh to pre-commit hooks,
# together with the commands needed to install those hooks.
printf '%s\n' \
    "vLLM linting system has been moved from format.sh to pre-commit hooks." \
    "Please run 'pip install -r requirements/lint.txt', followed by" \
    "'pre-commit install' to install the pre-commit hooks." \
    "Then linters will run automatically before each commit."
\ No newline at end of file
......@@ -102,7 +102,6 @@ plugins:
- https://numpy.org/doc/stable/objects.inv
- https://pytorch.org/docs/stable/objects.inv
- https://psutil.readthedocs.io/en/stable/objects.inv
- https://huggingface.co/docs/transformers/main/en/objects.inv
markdown_extensions:
- attr_list
......@@ -143,8 +142,3 @@ extra_javascript:
- https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
- mkdocs/javascript/edit_and_feedback.js
- mkdocs/javascript/slack_and_forum.js
# Makes the url format end in .html rather than act as a dir
# So index.md generates as index.html and is available under URL /index.html
# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
use_directory_urls: false
......@@ -4,9 +4,9 @@ requires = [
"cmake>=3.26.1",
"ninja",
"packaging>=24.2",
"setuptools>=77.0.3,<80.0.0",
"setuptools>=77.0.3,<81.0.0",
"setuptools-scm>=8.0",
"torch == 2.8.0",
"torch == 2.9.0",
"wheel",
"jinja2",
]
......@@ -20,7 +20,6 @@ license-files = ["LICENSE"]
readme = "README.md"
description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
......@@ -31,7 +30,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
]
requires-python = ">=3.9,<3.14"
requires-python = ">=3.10,<3.14"
dynamic = [ "version", "dependencies", "optional-dependencies"]
[project.urls]
......@@ -52,27 +51,10 @@ lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:regi
where = ["."]
include = ["vllm*"]
[tool.yapfignore]
ignore_patterns = [
".buildkite/**",
"benchmarks/**",
"build/**",
"examples/**",
]
[tool.ruff]
# Allow lines to be as long as 80.
line-length = 80
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# Python 3.8 typing - skip V0 code
"vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
[tool.ruff.lint]
select = [
......@@ -87,7 +69,7 @@ select = [
# flake8-simplify
"SIM",
# isort
# "I",
"I",
# flake8-logging-format
"G",
]
......@@ -96,29 +78,23 @@ ignore = [
"F405", "F403",
# lambda expression assignment
"E731",
# zip without `strict=`
"B905",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true
[tool.mypy]
plugins = ['pydantic.mypy']
ignore_missing_imports = true
check_untyped_defs = true
follow_imports = "silent"
[tool.isort]
skip_glob = [
".buildkite/*",
"benchmarks/*",
"examples/*",
]
use_parentheses = true
skip_gitignore = true
[tool.pytest.ini_options]
markers = [
"slow_test",
......@@ -126,6 +102,7 @@ markers = [
"core_model: enable this model test in each PR instead of only nightly",
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
"cpu_model: enable this model test in CPU tests",
"cpu_test: mark test as CPU-only test",
"split: run this test as part of a split",
"distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1",
......@@ -206,6 +183,7 @@ ba = "ba"
[tool.typos.type.py.extend-words]
ba = "ba"
nd = "nd"
[tool.typos.type.cpp]
extend-glob = ["*.cu"]
......
......@@ -2,9 +2,9 @@
cmake>=3.26.1
ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
torch==2.8.0
torch==2.9.0
wheel
jinja2>=3.1.6
regex
......
......@@ -7,39 +7,38 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.55.2
transformers >= 4.56.0, < 5
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
openai >= 1.99.1 # For Responses API with reasoning content
pydantic >= 2.11.7
pydantic >= 2.12.0
prometheus_client >= 0.18.0
pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.11.3
llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
llguidance >= 1.3.0, < 1.4.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x"
outlines_core == 0.2.11
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.13.0
importlib_metadata; python_version < '3.10'
mistral_common[image,audio] >= 1.5.4 # requires numpy>=1.25 #1.8.2
mistral_common[image] >= 1.8.5
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.11.0 # required for compressed-tensors
depyf==0.19.0 # required for profiling and debugging with compilation config
compressed-tensors == 0.12.2 # required for compressed-tensors
depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/others/logging_configuration.md
......@@ -49,3 +48,5 @@ pybase64 # fast base64 implementation
cbor2 # Required for cross-language serialization of hashable objects
setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0
model-hosting-container-standards < 1.0.0
\ No newline at end of file
cmake>=3.26.1
ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)
wheel
jinja2>=3.1.6
regex
# Common dependencies
-r common.txt
numba == 0.60.0; python_version == '3.9' and platform_machine != "s390x" # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
# Dependencies for CPUs
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools>=77.0.3,<81.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64"
torch==2.8.0; platform_system == "Darwin"
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
......
# Common dependencies
-r common.txt
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'
numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.8.0
torchaudio==2.8.0
torch==2.9.0
torchaudio==2.9.0
# These must be updated alongside torch
torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.2
......@@ -9,10 +9,8 @@ mkdocs-git-revision-date-localized-plugin
mkdocs-minify-plugin
regex
ruff
# Required for argparse hook only
-f https://download.pytorch.org/whl/cpu
cachetools
msgspec
pydantic
torch
# For generating argparse docs.
# Adding requirements here should only be used as a last resort.
msgspec # Needed for multiple inheritance involving msgspec.Struct
\ No newline at end of file
lmcache
nixl >= 0.5.1 # Required for disaggregated prefill
nixl >= 0.6.0 # Required for disaggregated prefill
......@@ -23,14 +23,14 @@ jiwer # required for audio tests
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.2 # required for voxtral test
mistral_common[image,audio] >= 1.8.5 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.52.4
tokenizers==0.21.1
transformers==4.57.1
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.46.1
......@@ -40,10 +40,8 @@ buildkite-test-collector==0.1.9
genai_perf==0.0.8
tritonclient==2.51.0
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
runai-model-streamer[s3,gcs]==0.15.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
pydantic>=2.12 # 2.11 leads to error on python 3.13
# Common dependencies
-r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm6.3
torch==2.8.0
torchvision==0.23.0
torchaudio==2.8.0
--extra-index-url https://download.pytorch.org/whl/rocm6.4
torch==2.9.0
torchvision==0.24.0
torchaudio==2.9.0
triton==3.3.0
triton==3.5.0
cmake>=3.26.1,<4
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
wheel
jinja2>=3.1.6
amdsmi==6.2.4
amdsmi==6.4.3
timm>=1.0.17
# Common dependencies
-r common.txt
tblib==3.1.0
bm25s==0.2.13
pystemmer==3.0.0
# entrypoints test
# Entrypoints test
# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
audioread==3.0.1
cffi==1.17.1
......@@ -15,11 +17,11 @@ soundfile==0.13.1
soxr==0.5.0.post1
librosa==0.10.2.post1
# entrypoints test
# Entrypoints test
#vllm[video] # required by entrypoints/openai/test_video.py
decord==0.6.0
# entrypoints test
# Entrypoints test
#sentence-transformers # required by entrypoints/openai/test_score.py
sentence-transformers==3.4.1
......@@ -29,4 +31,11 @@ matplotlib==3.10.3
# Multi-Modal Models Test (Extended) 3
blobfile==3.0.0
# Required for openai schema test.
schemathesis==3.39.15
# Required for mteb test
mteb[bm25s]>=1.38.11, <2
# Required for eval tests
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# Common dependencies
-r common.txt
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'
numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for AMD GPUs
boto3
botocore
datasets
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
peft
......@@ -15,7 +12,6 @@ tensorizer==2.10.1
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
runai-model-streamer[s3,gcs]==0.15.0
# conch-triton-kernels==1.2.1
timm>=1.0.17
......@@ -24,12 +24,12 @@ soundfile # required for audio tests
jiwer # required for audio tests
tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.8.0
torchaudio==2.8.0
torchvision==0.23.0
torch==2.9.0
torchaudio==2.9.0
torchvision==0.24.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.2 # required for voxtral test
mistral_common[image,audio] >= 1.8.5 # required for voxtral test
num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test
opencv-python-headless >= 4.11.0 # required for video test
......@@ -37,8 +37,8 @@ datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test
transformers==4.55.2
tokenizers==0.21.1
transformers==4.57.1
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes==0.46.1
......@@ -48,12 +48,12 @@ buildkite-test-collector==0.1.9
genai_perf==0.0.8
tritonclient==2.51.0
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'
arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
runai-model-streamer[s3,gcs]==0.15.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
gpt-oss >= 0.0.7; python_version > '3.11'
# This file was autogenerated by uv via the following command:
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
absl-py==2.1.0
# via rouge-score
accelerate==1.0.1
......@@ -10,18 +10,19 @@ aenum==3.1.16
# via lightly
affine==2.4.0
# via rasterio
aiohappyeyeballs==2.4.3
aiohappyeyeballs==2.6.1
# via aiohttp
aiohttp==3.10.11
aiohttp==3.13.0
# via
# aiohttp-cors
# datasets
# fsspec
# gpt-oss
# lm-eval
# ray
aiohttp-cors==0.8.1
# via ray
aiosignal==1.3.1
aiosignal==1.4.0
# via aiohttp
albucore==0.0.16
# via terratorch
......@@ -39,6 +40,8 @@ anyio==4.6.2.post1
# via
# httpx
# starlette
arctic-inference==0.1.1
# via -r requirements/test.in
argcomplete==3.5.1
# via datamodel-code-generator
arrow==1.3.0
......@@ -72,7 +75,9 @@ blobfile==3.0.0
bm25s==0.2.13
# via mteb
boto3==1.35.57
# via tensorizer
# via
# runai-model-streamer-s3
# tensorizer
botocore==1.35.57
# via
# boto3
......@@ -101,6 +106,8 @@ chardet==5.2.0
# via mbstrdecoder
charset-normalizer==3.4.0
# via requests
chz==0.3.0
# via gpt-oss
click==8.1.7
# via
# black
......@@ -171,7 +178,9 @@ distlib==0.3.9
dnspython==2.7.0
# via email-validator
docker==7.1.0
# via mlflow
# via
# gpt-oss
# mlflow
docopt==0.6.2
# via num2words
docstring-parser==0.17.0
......@@ -197,7 +206,9 @@ eval-type-backport==0.2.2
evaluate==0.4.3
# via lm-eval
fastapi==0.116.1
# via mlflow-skinny
# via
# gpt-oss
# mlflow-skinny
fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2
......@@ -249,13 +260,31 @@ gitdb==4.0.12
gitpython==3.1.44
# via mlflow-skinny
google-api-core==2.24.2
# via opencensus
# via
# google-cloud-core
# google-cloud-storage
# opencensus
google-auth==2.40.2
# via
# databricks-sdk
# google-api-core
# google-cloud-core
# google-cloud-storage
# runai-model-streamer-gcs
google-cloud-core==2.4.3
# via google-cloud-storage
google-cloud-storage==3.4.0
# via runai-model-streamer-gcs
google-crc32c==1.7.1
# via
# google-cloud-storage
# google-resumable-media
google-resumable-media==2.7.2
# via google-cloud-storage
googleapis-common-protos==1.70.0
# via google-api-core
gpt-oss==0.0.8
# via -r requirements/test.in
graphene==3.4.3
# via mlflow
graphql-core==3.2.6
......@@ -283,6 +312,8 @@ hf-xet==1.1.7
# via huggingface-hub
hiredis==3.0.0
# via tensorizer
html2text==2025.4.15
# via gpt-oss
httpcore==1.0.6
# via httpx
httpx==0.27.2
......@@ -417,6 +448,7 @@ lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b772215
lxml==5.3.0
# via
# blobfile
# gpt-oss
# sacrebleu
mako==1.3.10
# via alembic
......@@ -444,7 +476,7 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.8.2
mistral-common==1.8.5
# via -r requirements/test.in
mlflow==2.22.0
# via terratorch
......@@ -543,42 +575,44 @@ numpy==1.26.4
# tritonclient
# vocos
# xarray
nvidia-cublas-cu12==12.8.4.1
nvidia-cublas-cu12==12.9.1.4
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-cupti-cu12==12.9.79
# via torch
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-nvrtc-cu12==12.9.86
# via torch
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cuda-runtime-cu12==12.9.79
# via torch
nvidia-cudnn-cu12==9.10.2.21
# via torch
nvidia-cufft-cu12==11.3.3.83
nvidia-cufft-cu12==11.4.1.4
# via torch
nvidia-cufile-cu12==1.13.1.3
nvidia-cufile-cu12==1.14.1.1
# via torch
nvidia-curand-cu12==10.3.9.90
nvidia-curand-cu12==10.3.10.19
# via torch
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusolver-cu12==11.7.5.82
# via torch
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparse-cu12==12.5.10.65
# via
# nvidia-cusolver-cu12
# torch
nvidia-cusparselt-cu12==0.7.1
# via torch
nvidia-nccl-cu12==2.27.3
nvidia-nccl-cu12==2.27.5
# via torch
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvjitlink-cu12==12.9.86
# via
# nvidia-cufft-cu12
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
nvidia-nvtx-cu12==12.8.90
nvidia-nvshmem-cu12==3.3.20
# via torch
nvidia-nvtx-cu12==12.9.79
# via torch
omegaconf==2.3.0
# via
......@@ -586,6 +620,8 @@ omegaconf==2.3.0
# lightning
open-clip-torch==2.32.0
# via -r requirements/test.in
openai-harmony==0.0.4
# via gpt-oss
opencensus==0.11.4
# via ray
opencensus-context==0.1.3
......@@ -706,7 +742,9 @@ prometheus-client==0.22.0
# opentelemetry-exporter-prometheus
# ray
propcache==0.2.0
# via yarl
# via
# aiohttp
# yarl
proto-plus==1.26.1
# via google-api-core
protobuf==5.28.3
......@@ -749,19 +787,21 @@ pycparser==2.22
# via cffi
pycryptodomex==3.22.0
# via blobfile
pydantic==2.11.7
pydantic==2.12.0
# via
# -r requirements/test.in
# albumentations
# datamodel-code-generator
# fastapi
# gpt-oss
# lightly
# mistral-common
# mlflow-skinny
# mteb
# openai-harmony
# pydantic-extra-types
# ray
pydantic-core==2.33.2
pydantic-core==2.41.1
# via pydantic
pydantic-extra-types==2.10.5
# via mistral-common
......@@ -888,6 +928,8 @@ requests==2.32.3
# docker
# evaluate
# google-api-core
# google-cloud-storage
# gpt-oss
# huggingface-hub
# lightly
# lm-eval
......@@ -925,10 +967,12 @@ rsa==4.9.1
# via google-auth
rtree==1.4.0
# via torchgeo
runai-model-streamer==0.11.0
# via -r requirements/test.in
runai-model-streamer-s3==0.11.0
runai-model-streamer==0.15.0
# via -r requirements/test.in
runai-model-streamer-gcs==0.15.0
# via runai-model-streamer
runai-model-streamer-s3==0.15.0
# via runai-model-streamer
s3transfer==0.10.3
# via boto3
sacrebleu==2.4.3
......@@ -972,14 +1016,11 @@ sentence-transformers==3.2.1
# via
# -r requirements/test.in
# mteb
sentencepiece==0.2.0
# via mistral-common
setuptools==77.0.3
# via
# lightning-utilities
# pytablewriter
# torch
# triton
shapely==2.1.1
# via
# geopandas
......@@ -1031,6 +1072,8 @@ starlette-testclient==0.4.1
# via schemathesis
statsmodels==0.14.4
# via genai-perf
structlog==25.4.0
# via gpt-oss
sympy==1.13.3
# via
# einx
......@@ -1043,14 +1086,17 @@ tblib==3.1.0
# via -r requirements/test.in
tcolorpy==0.1.6
# via pytablewriter
tenacity==9.0.0
tenacity==9.1.2
# via
# gpt-oss
# lm-eval
# plotly
tensorboardx==2.6.4
# via lightning
tensorizer==2.10.1
# via -r requirements/test.in
termcolor==3.1.0
# via gpt-oss
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
# via -r requirements/test.in
threadpoolctl==3.5.0
......@@ -1059,8 +1105,9 @@ tifffile==2025.3.30
# via
# scikit-image
# terratorch
tiktoken==0.7.0
tiktoken==0.12.0
# via
# gpt-oss
# lm-eval
# mistral-common
timm==1.0.17
......@@ -1070,7 +1117,7 @@ timm==1.0.17
# segmentation-models-pytorch
# terratorch
# torchgeo
tokenizers==0.21.1
tokenizers==0.22.0
# via
# -r requirements/test.in
# transformers
......@@ -1078,7 +1125,7 @@ tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.8.0+cu128
torch==2.9.0+cu129
# via
# -r requirements/test.in
# accelerate
......@@ -1107,7 +1154,7 @@ torch==2.8.0+cu128
# torchvision
# vector-quantize-pytorch
# vocos
torchaudio==2.8.0+cu128
torchaudio==2.9.0+cu129
# via
# -r requirements/test.in
# encodec
......@@ -1120,7 +1167,7 @@ torchmetrics==1.7.4
# pytorch-lightning
# terratorch
# torchgeo
torchvision==0.23.0+cu128
torchvision==0.24.0+cu129
# via
# -r requirements/test.in
# lightly
......@@ -1151,7 +1198,7 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.55.2
transformers==4.57.1
# via
# -r requirements/test.in
# genai-perf
......@@ -1161,7 +1208,7 @@ transformers==4.55.2
# transformers-stream-generator
transformers-stream-generator==0.0.5
# via -r requirements/test.in
triton==3.4.0
triton==3.5.0
# via torch
tritonclient==2.51.0
# via
......@@ -1178,10 +1225,12 @@ types-python-dateutil==2.9.0.20241206
# via arrow
typeshed-client==2.8.2
# via jsonargparse
typing-extensions==4.12.2
typing-extensions==4.15.0
# via
# aiosignal
# albumentations
# alembic
# chz
# fastapi
# graphene
# huggingface-hub
......@@ -1205,7 +1254,7 @@ typing-extensions==4.12.2
# typer
# typeshed-client
# typing-inspection
typing-inspection==0.4.1
typing-inspection==0.4.2
# via pydantic
tzdata==2024.2
# via pandas
......@@ -1221,7 +1270,9 @@ urllib3==2.2.3
# responses
# tritonclient
uvicorn==0.35.0
# via mlflow-skinny
# via
# gpt-oss
# mlflow-skinny
vector-quantize-pytorch==1.21.2
# via -r requirements/test.in
virtualenv==20.31.2
......
......@@ -5,15 +5,14 @@ ray>=2.9
cmake>=3.26.1
packaging>=24.2
setuptools-scm>=8
setuptools>=77.0.3,<80.0.0
setuptools>=77.0.3,<81.0.0
wheel
jinja2>=3.1.6
datasets # for benchmark scripts
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
nixl==0.3.0 # for PD disaggregation
numba == 0.61.2 # Required for N-gram speculative decoding
torch==2.8.0+xpu
torchaudio
torchvision
--extra-index-url=https://download.pytorch.org/whl/xpu
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
......@@ -34,32 +34,36 @@ logger = logging.getLogger(__name__)
# cannot import envs directly because it depends on vllm,
# which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py"))
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
logger.warning(
"VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
VLLM_TARGET_DEVICE = "cpu"
elif not (sys.platform.startswith("linux")
or sys.platform.startswith("darwin")):
elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")):
logger.warning(
"vLLM only supports Linux platform (including WSL) and MacOS."
"Building on %s, "
"so vLLM may not be able to run correctly", sys.platform)
"so vLLM may not be able to run correctly",
sys.platform,
)
VLLM_TARGET_DEVICE = "empty"
elif (sys.platform.startswith("linux") and torch.version.cuda is None
and os.getenv("VLLM_TARGET_DEVICE") is None
and torch.version.hip is None):
elif (
sys.platform.startswith("linux")
and torch.version.cuda is None
and os.getenv("VLLM_TARGET_DEVICE") is None
and torch.version.hip is None
):
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
# fall back to cpu
VLLM_TARGET_DEVICE = "cpu"
def is_sccache_available() -> bool:
return which("sccache") is not None and \
not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0")))
return which("sccache") is not None and not bool(
int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))
)
def is_ccache_available() -> bool:
......@@ -83,8 +87,7 @@ def is_url_available(url: str) -> bool:
class CMakeExtension(Extension):
def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
super().__init__(name, sources=[], py_limited_api=True, **kwa)
self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)
......@@ -121,8 +124,8 @@ class cmake_build_ext(build_ext):
if nvcc_threads is not None:
nvcc_threads = int(nvcc_threads)
logger.info(
"Using NVCC_THREADS=%d as the number of nvcc threads.",
nvcc_threads)
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
......@@ -146,36 +149,36 @@ class cmake_build_ext(build_ext):
cfg = envs.CMAKE_BUILD_TYPE or default_cfg
cmake_args = [
'-DCMAKE_BUILD_TYPE={}'.format(cfg),
'-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
"-DCMAKE_BUILD_TYPE={}".format(cfg),
"-DVLLM_TARGET_DEVICE={}".format(VLLM_TARGET_DEVICE),
]
verbose = envs.VERBOSE
if verbose:
cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
cmake_args += ["-DCMAKE_VERBOSE_MAKEFILE=ON"]
if is_sccache_available():
cmake_args += [
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
'-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
"-DCMAKE_C_COMPILER_LAUNCHER=sccache",
"-DCMAKE_CXX_COMPILER_LAUNCHER=sccache",
"-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache",
"-DCMAKE_HIP_COMPILER_LAUNCHER=sccache",
]
elif is_ccache_available():
cmake_args += [
'-DCMAKE_C_COMPILER_LAUNCHER=ccache',
'-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
'-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
'-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
"-DCMAKE_C_COMPILER_LAUNCHER=ccache",
"-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
"-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache",
"-DCMAKE_HIP_COMPILER_LAUNCHER=ccache",
]
# Pass the python executable to cmake so it can find an exact
# match.
cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
cmake_args += ["-DVLLM_PYTHON_EXECUTABLE={}".format(sys.executable)]
# Pass the python path to cmake so it can reuse the build dependencies
# on subsequent calls to python.
cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
cmake_args += ["-DVLLM_PYTHON_PATH={}".format(":".join(sys.path))]
# Override the base directory for FetchContent downloads to $ROOT/.deps
# This allows sharing dependencies between profiles,
......@@ -183,7 +186,7 @@ class cmake_build_ext(build_ext):
# To override this, set the FETCHCONTENT_BASE_DIR environment variable.
fc_base_dir = os.path.join(ROOT_DIR, ".deps")
fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]
cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]
#
# Setup parallelism and build tool
......@@ -191,30 +194,38 @@ class cmake_build_ext(build_ext):
num_jobs, nvcc_threads = self.compute_num_jobs()
if nvcc_threads:
cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)]
cmake_args += ["-DNVCC_THREADS={}".format(nvcc_threads)]
if is_ninja_available():
build_tool = ['-G', 'Ninja']
build_tool = ["-G", "Ninja"]
cmake_args += [
'-DCMAKE_JOB_POOL_COMPILE:STRING=compile',
'-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs),
"-DCMAKE_JOB_POOL_COMPILE:STRING=compile",
"-DCMAKE_JOB_POOLS:STRING=compile={}".format(num_jobs),
]
else:
# Default build tool to whatever cmake picks.
build_tool = []
# Make sure we use the nvcc from CUDA_HOME
if _is_cuda():
cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc']
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
elif _is_hip():
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
other_cmake_args = os.environ.get("CMAKE_ARGS")
if other_cmake_args:
cmake_args += other_cmake_args.split()
subprocess.check_call(
['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
cwd=self.build_temp)
["cmake", ext.cmake_lists_dir, *build_tool, *cmake_args],
cwd=self.build_temp,
)
def build_extensions(self) -> None:
# Ensure that CMake is present and working
try:
subprocess.check_output(['cmake', '--version'])
subprocess.check_output(["cmake", "--version"])
except OSError as e:
raise RuntimeError('Cannot find CMake executable') from e
raise RuntimeError("Cannot find CMake executable") from e
# Create build directory if it does not exist.
if not os.path.exists(self.build_temp):
......@@ -253,13 +264,18 @@ class cmake_build_ext(build_ext):
# CMake appends the extension prefix to the install path,
# and outdir already contains that prefix, so we need to remove it.
prefix = outdir
for _ in range(ext.name.count('.')):
for _ in range(ext.name.count(".")):
prefix = prefix.parent
# prefix here should actually be the same for all components
install_args = [
"cmake", "--install", ".", "--prefix", prefix, "--component",
target_name(ext.name)
"cmake",
"--install",
".",
"--prefix",
prefix,
"--component",
target_name(ext.name),
]
subprocess.check_call(install_args, cwd=self.build_temp)
......@@ -270,12 +286,15 @@ class cmake_build_ext(build_ext):
# copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
# directory so that they can be included in the editable build
import glob
files = glob.glob(os.path.join(self.build_lib, "vllm",
"vllm_flash_attn", "**", "*.py"),
recursive=True)
files = glob.glob(
os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "**", "*.py"),
recursive=True,
)
for file in files:
dst_file = os.path.join("vllm/vllm_flash_attn",
file.split("vllm/vllm_flash_attn/")[-1])
dst_file = os.path.join(
"vllm/vllm_flash_attn", file.split("vllm/vllm_flash_attn/")[-1]
)
print(f"Copying {file} to {dst_file}")
os.makedirs(os.path.dirname(dst_file), exist_ok=True)
self.copy_file(file, dst_file)
......@@ -285,8 +304,7 @@ class precompiled_build_ext(build_ext):
"""Disables extension building when using precompiled binaries."""
def run(self) -> None:
    """Check that this is a CUDA build, then do nothing else.

    This override only validates the platform; it does not invoke the
    parent ``build_ext.run``.

    Raises:
        AssertionError: if ``_is_cuda()`` is false.
    """
    # Raise explicitly instead of using `assert`: assert statements are
    # removed under `python -O`, which would silently disable this guard.
    # AssertionError is kept so the failure type is unchanged.
    if not _is_cuda():
        raise AssertionError(
            "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
        )
def build_extensions(self) -> None:
print("Skipping build_ext: using precompiled extensions.")
......@@ -307,9 +325,9 @@ class precompiled_wheel_utils:
wheel_filename = wheel_url_or_path.split("/")[-1]
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_url_or_path} "
f"to {wheel_path}")
print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}")
from urllib.request import urlretrieve
urlretrieve(wheel_url_or_path, filename=wheel_path)
else:
wheel_path = wheel_url_or_path
......@@ -330,25 +348,29 @@ class precompiled_wheel_utils:
]
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members = list(
filter(lambda x: x.filename in files_to_copy,
wheel.filelist))
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
)
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))
filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
)
for file in file_members:
print(f"[extract] {file.filename}")
target_path = os.path.join(".", file.filename)
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with wheel.open(file.filename) as src, open(
target_path, "wb") as dst:
with (
wheel.open(file.filename) as src,
open(target_path, "wb") as dst,
):
shutil.copyfileobj(src, dst)
pkg = os.path.dirname(file.filename).replace("/", ".")
package_data_patch.setdefault(pkg, []).append(
os.path.basename(file.filename))
os.path.basename(file.filename)
)
return package_data_patch
finally:
......@@ -364,10 +386,13 @@ class precompiled_wheel_utils:
try:
# Get the latest commit hash of the upstream main branch.
resp_json = subprocess.check_output([
"curl", "-s",
"https://api.github.com/repos/vllm-project/vllm/commits/main"
]).decode("utf-8")
resp_json = subprocess.check_output(
[
"curl",
"-s",
"https://api.github.com/repos/vllm-project/vllm/commits/main",
]
).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
# In Docker build context, .git may be immutable or missing.
......@@ -377,25 +402,32 @@ class precompiled_wheel_utils:
# Check if the upstream_main_commit exists in the local repo
try:
subprocess.check_output(
["git", "cat-file", "-e", f"{upstream_main_commit}"])
["git", "cat-file", "-e", f"{upstream_main_commit}"]
)
except subprocess.CalledProcessError:
# If not present, fetch it from the remote repository.
# Note that this does not update any local branches,
# but ensures that this commit ref and its history are
# available in our local repo.
subprocess.check_call([
"git", "fetch", "https://github.com/vllm-project/vllm",
"main"
])
subprocess.check_call(
["git", "fetch", "https://github.com/vllm-project/vllm", "main"]
)
# Then get the commit hash of the current branch that is the same as
# the upstream main commit.
current_branch = subprocess.check_output(
["git", "branch", "--show-current"]).decode("utf-8").strip()
current_branch = (
subprocess.check_output(["git", "branch", "--show-current"])
.decode("utf-8")
.strip()
)
base_commit = subprocess.check_output([
"git", "merge-base", f"{upstream_main_commit}", current_branch
]).decode("utf-8").strip()
base_commit = (
subprocess.check_output(
["git", "merge-base", f"{upstream_main_commit}", current_branch]
)
.decode("utf-8")
.strip()
)
return base_commit
except ValueError as err:
raise ValueError(err) from None
......@@ -403,7 +435,9 @@ class precompiled_wheel_utils:
logger.warning(
"Failed to get the base commit in the main branch. "
"Using the nightly wheel. The libraries in this "
"wheel may not be compatible with your dev branch: %s", err)
"wheel may not be compatible with your dev branch: %s",
err,
)
return "nightly"
......@@ -413,12 +447,13 @@ def _no_device() -> bool:
def _is_cuda() -> bool:
    """Return whether this build targets CUDA.

    True only when ``VLLM_TARGET_DEVICE`` is ``"cuda"``, the installed
    torch reports a CUDA version (``torch.version.cuda`` is not None), and
    ``_is_tpu()`` is false.
    """
    has_cuda = torch.version.cuda is not None
    return VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu()
def _is_hip() -> bool:
    """Return whether this build targets ROCm/HIP.

    True when ``VLLM_TARGET_DEVICE`` is ``"cuda"`` or ``"rocm"`` and the
    installed torch reports a HIP version (``torch.version.hip`` is not
    None).
    """
    # NOTE(review): "cuda" is accepted here as well as "rocm" -- presumably
    # because "cuda" is the default target and a ROCm torch build is what
    # actually distinguishes the platform; confirm against where
    # VLLM_TARGET_DEVICE is defined.
    return (
        VLLM_TARGET_DEVICE in ("cuda", "rocm")
        and torch.version.hip is not None
    )
def _is_tpu() -> bool:
......@@ -457,8 +492,12 @@ def get_rocm_version():
minor = ctypes.c_uint32()
patch = ctypes.c_uint32()
if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
ctypes.byref(patch)) == 0):
if (
get_rocm_core_version(
ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)
)
== 0
):
return f"{major.value}.{minor.value}.{patch.value}"
return None
except Exception:
......@@ -471,8 +510,9 @@ def get_nvcc_cuda_version() -> Version:
Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
"""
assert CUDA_HOME is not None, "CUDA_HOME is not set"
nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"],
universal_newlines=True)
nvcc_output = subprocess.check_output(
[CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True
)
output = nvcc_output.split()
release_idx = output.index("release") + 1
nvcc_cuda_version = parse(output[release_idx].split(",")[0])
......@@ -484,18 +524,31 @@ def get_gaudi_sw_version():
Returns the driver version.
"""
# Enable console printing for `hl-smi` check
output = subprocess.run("hl-smi",
shell=True,
text=True,
capture_output=True,
env={"ENABLE_CONSOLE": "true"})
output = subprocess.run(
"hl-smi",
shell=True,
text=True,
capture_output=True,
env={"ENABLE_CONSOLE": "true"},
)
if output.returncode == 0 and output.stdout:
return output.stdout.split("\n")[2].replace(
" ", "").split(":")[1][:-1].split("-")[0]
return (
output.stdout.split("\n")[2]
.replace(" ", "")
.split(":")[1][:-1]
.split("-")[0]
)
return "0.0.0" # when hl-smi is not available
def get_vllm_version() -> str:
# Allow overriding the version. This is useful to build platform-specific
# wheels (e.g. CPU, TPU) without modifying the source.
if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
print(f"Overriding VLLM version with {env_version} from VLLM_VERSION_OVERRIDE")
os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = env_version
return get_version(write_to="vllm/_version.py")
version = get_version(write_to="vllm/_version.py")
sep = "+" if "+" not in version else "." # dev versions might contain +
......@@ -541,8 +594,11 @@ def get_requirements() -> list[str]:
for line in requirements:
if line.startswith("-r "):
resolved_requirements += _read_requirements(line.split()[1])
elif not line.startswith("--") and not line.startswith(
"#") and line.strip() != "":
elif (
not line.startswith("--")
and not line.startswith("#")
and line.strip() != ""
):
resolved_requirements.append(line)
return resolved_requirements
......@@ -553,7 +609,7 @@ def get_requirements() -> list[str]:
cuda_major, cuda_minor = torch.version.cuda.split(".")
modified_requirements = []
for req in requirements:
if ("vllm-flash-attn" in req and cuda_major != "12"):
if "vllm-flash-attn" in req and cuda_major != "12":
# vllm-flash-attn is built only for CUDA 12.x.
# Skip for other versions.
continue
......@@ -568,8 +624,7 @@ def get_requirements() -> list[str]:
elif _is_xpu():
requirements = _read_requirements("xpu.txt")
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCm, or CPU.")
raise ValueError("Unsupported platform, please use CUDA, ROCm, or CPU.")
return requirements
......@@ -577,6 +632,7 @@ ext_modules = []
if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C"))
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
# if _is_hip():
# ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
......@@ -585,15 +641,13 @@ if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
# FA3 requires CUDA 12.3 or later
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
# Optional since this doesn't get built (produce an .so file) when
# not targeting a hopper system
ext_modules.append(CMakeExtension(name="vllm._flashmla_C", optional=True))
ext_modules.append(
CMakeExtension(name="vllm._flashmla_C", optional=True))
ext_modules.append(
CMakeExtension(name="vllm._flashmla_extension_C", optional=True))
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
CMakeExtension(name="vllm._flashmla_extension_C", optional=True)
)
if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
......@@ -614,6 +668,7 @@ if envs.VLLM_USE_PRECOMPILED:
wheel_url = wheel_location
else:
import platform
arch = platform.machine()
if arch == "x86_64":
wheel_tag = "manylinux1_x86_64"
......@@ -623,8 +678,11 @@ if envs.VLLM_USE_PRECOMPILED:
raise ValueError(f"Unsupported architecture: {arch}")
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
nightly_wheel_url = f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
nightly_wheel_url = (
f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
)
from urllib.request import urlopen
try:
with urlopen(wheel_url) as resp:
if resp.status != 200:
......@@ -633,8 +691,7 @@ if envs.VLLM_USE_PRECOMPILED:
print(f"[warn] Falling back to nightly wheel: {e}")
wheel_url = nightly_wheel_url
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
wheel_url)
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
for pkg, files in patch.items():
package_data.setdefault(pkg, []).extend(files)
......@@ -645,8 +702,9 @@ if not ext_modules:
cmdclass = {}
else:
cmdclass = {
"build_ext":
precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
"build_ext": precompiled_build_ext
if envs.VLLM_USE_PRECOMPILED
else cmake_build_ext
}
setup(
......@@ -655,18 +713,17 @@ setup(
ext_modules=ext_modules,
install_requires=get_requirements(),
extras_require={
"bench": ["pandas", "datasets"],
"bench": ["pandas", "matplotlib", "seaborn", "datasets"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": [
"runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs",
"google-cloud-storage", "runai-model-streamer-s3", "boto3"
],
"audio": ["librosa", "soundfile",
"mistral_common[audio]"], # Required for audio processing
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
"audio": [
"librosa",
"soundfile",
"mistral_common[audio]",
], # Required for audio processing
"video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile
"flashinfer": ["flashinfer-python==0.3.1"],
"flashinfer": [], # Kept for backwards compatibility
# Optional deps for AMD FP4 quantization support
"petit-kernel": ["petit-kernel"],
},
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment