Commit 006693ed authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.11.2' into v0.11.2-ori

parents 4b51e6f1 275de341
{#- Chat template for embedding models: renders the single permitted message as one string. -#}
{#- More than one message is rejected up front via raise_exception. -#}
{%- if (messages | length) > 1 -%}
    {{ raise_exception('Embedding models should only embed one message at a time') }}
{%- endif -%}
{#- A namespace is used so the accumulated list survives the scoping of the nested loops. -#}
{%- set acc = namespace(parts=[]) -%}
{%- for msg in messages -%}
    {%- for item in msg['content'] -%}
        {#- Text items contribute their text; image items contribute a fixed placeholder token. -#}
        {#- Items of any other type are skipped. -#}
        {%- if item['type'] == 'text' -%}
            {%- set acc.parts = acc.parts + [item['text']] -%}
        {%- elif item['type'] == 'image' -%}
            {%- set acc.parts = acc.parts + ['<|image_pad|>'] -%}
        {%- endif -%}
    {%- endfor -%}
{%- endfor -%}
{{ acc.parts | join(' ') }}
#!/bin/bash
# Notice-only script: it runs no linters and just prints where linting now lives.
# Each argument below is printed on its own line, in order.
printf '%s\n' \
    "vLLM linting system has been moved from format.sh to pre-commit hooks." \
    "Please run 'pip install -r requirements/lint.txt', followed by" \
    "'pre-commit install' to install the pre-commit hooks." \
    "Then linters will run automatically before each commit."
\ No newline at end of file
...@@ -102,7 +102,6 @@ plugins: ...@@ -102,7 +102,6 @@ plugins:
- https://numpy.org/doc/stable/objects.inv - https://numpy.org/doc/stable/objects.inv
- https://pytorch.org/docs/stable/objects.inv - https://pytorch.org/docs/stable/objects.inv
- https://psutil.readthedocs.io/en/stable/objects.inv - https://psutil.readthedocs.io/en/stable/objects.inv
- https://huggingface.co/docs/transformers/main/en/objects.inv
markdown_extensions: markdown_extensions:
- attr_list - attr_list
...@@ -143,8 +142,3 @@ extra_javascript: ...@@ -143,8 +142,3 @@ extra_javascript:
- https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js - https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
- mkdocs/javascript/edit_and_feedback.js - mkdocs/javascript/edit_and_feedback.js
- mkdocs/javascript/slack_and_forum.js - mkdocs/javascript/slack_and_forum.js
# Makes the url format end in .html rather than act as a dir
# So index.md generates as index.html and is available under URL /index.html
# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
use_directory_urls: false
...@@ -4,9 +4,9 @@ requires = [ ...@@ -4,9 +4,9 @@ requires = [
"cmake>=3.26.1", "cmake>=3.26.1",
"ninja", "ninja",
"packaging>=24.2", "packaging>=24.2",
"setuptools>=77.0.3,<80.0.0", "setuptools>=77.0.3,<81.0.0",
"setuptools-scm>=8.0", "setuptools-scm>=8.0",
"torch == 2.8.0", "torch == 2.9.0",
"wheel", "wheel",
"jinja2", "jinja2",
] ]
...@@ -20,7 +20,6 @@ license-files = ["LICENSE"] ...@@ -20,7 +20,6 @@ license-files = ["LICENSE"]
readme = "README.md" readme = "README.md"
description = "A high-throughput and memory-efficient inference and serving engine for LLMs" description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
classifiers = [ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.12",
...@@ -31,7 +30,7 @@ classifiers = [ ...@@ -31,7 +30,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Scientific/Engineering :: Information Analysis",
] ]
requires-python = ">=3.9,<3.14" requires-python = ">=3.10,<3.14"
dynamic = [ "version", "dependencies", "optional-dependencies"] dynamic = [ "version", "dependencies", "optional-dependencies"]
[project.urls] [project.urls]
...@@ -52,27 +51,10 @@ lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:regi ...@@ -52,27 +51,10 @@ lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:regi
where = ["."] where = ["."]
include = ["vllm*"] include = ["vllm*"]
[tool.yapfignore]
ignore_patterns = [
".buildkite/**",
"benchmarks/**",
"build/**",
"examples/**",
]
[tool.ruff]
# Allow lines to be as long as 80.
line-length = 80
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"] "vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"] "vllm/_version.py" = ["ALL"]
# Python 3.8 typing - skip V0 code
"vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
[tool.ruff.lint] [tool.ruff.lint]
select = [ select = [
...@@ -87,7 +69,7 @@ select = [ ...@@ -87,7 +69,7 @@ select = [
# flake8-simplify # flake8-simplify
"SIM", "SIM",
# isort # isort
# "I", "I",
# flake8-logging-format # flake8-logging-format
"G", "G",
] ]
...@@ -96,29 +78,23 @@ ignore = [ ...@@ -96,29 +78,23 @@ ignore = [
"F405", "F403", "F405", "F403",
# lambda expression assignment # lambda expression assignment
"E731", "E731",
# zip without `strict=`
"B905",
# Loop control variable not used within loop body # Loop control variable not used within loop body
"B007", "B007",
# f-string format # f-string format
"UP032", "UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
] ]
[tool.ruff.format]
docstring-code-format = true
[tool.mypy] [tool.mypy]
plugins = ['pydantic.mypy'] plugins = ['pydantic.mypy']
ignore_missing_imports = true ignore_missing_imports = true
check_untyped_defs = true check_untyped_defs = true
follow_imports = "silent" follow_imports = "silent"
[tool.isort]
skip_glob = [
".buildkite/*",
"benchmarks/*",
"examples/*",
]
use_parentheses = true
skip_gitignore = true
[tool.pytest.ini_options] [tool.pytest.ini_options]
markers = [ markers = [
"slow_test", "slow_test",
...@@ -126,6 +102,7 @@ markers = [ ...@@ -126,6 +102,7 @@ markers = [
"core_model: enable this model test in each PR instead of only nightly", "core_model: enable this model test in each PR instead of only nightly",
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)", "hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
"cpu_model: enable this model test in CPU tests", "cpu_model: enable this model test in CPU tests",
"cpu_test: mark test as CPU-only test",
"split: run this test as part of a split", "split: run this test as part of a split",
"distributed: run this test only in distributed GPU tests", "distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1", "skip_v1: do not run this test with v1",
...@@ -206,6 +183,7 @@ ba = "ba" ...@@ -206,6 +183,7 @@ ba = "ba"
[tool.typos.type.py.extend-words] [tool.typos.type.py.extend-words]
ba = "ba" ba = "ba"
nd = "nd"
[tool.typos.type.cpp] [tool.typos.type.cpp]
extend-glob = ["*.cu"] extend-glob = ["*.cu"]
......
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
cmake>=3.26.1 cmake>=3.26.1
ninja ninja
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<80.0.0 setuptools>=77.0.3,<81.0.0
setuptools-scm>=8 setuptools-scm>=8
torch==2.8.0 torch==2.9.0
wheel wheel
jinja2>=3.1.6 jinja2>=3.1.6
regex regex
......
...@@ -7,39 +7,38 @@ requests >= 2.26.0 ...@@ -7,39 +7,38 @@ requests >= 2.26.0
tqdm tqdm
blake3 blake3
py-cpuinfo py-cpuinfo
transformers >= 4.55.2 transformers >= 4.56.0, < 5
tokenizers >= 0.21.1 # Required for fast incremental detokenization. tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer. protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp aiohttp
openai >= 1.99.1 # For Responses API with reasoning content openai >= 1.99.1 # For Responses API with reasoning content
pydantic >= 2.11.7 pydantic >= 2.12.0
prometheus_client >= 0.18.0 prometheus_client >= 0.18.0
pillow # Required for image processing pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0 prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.11.3 lm-format-enforcer == 0.11.3
llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" llguidance >= 1.3.0, < 1.4.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x"
outlines_core == 0.2.11 outlines_core == 0.2.11
# required for outlines backend disk cache # required for outlines backend disk cache
diskcache == 5.6.3 diskcache == 5.6.3
lark == 1.2.2 lark == 1.2.2
xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0 pyzmq >= 25.0.0
msgspec msgspec
gguf >= 0.13.0 gguf >= 0.13.0
importlib_metadata; python_version < '3.10' mistral_common[image] >= 1.8.5
mistral_common[image,audio] >= 1.5.4 # requires numpy>=1.25 #1.8.2
opencv-python-headless >= 4.11.0 # required for video IO opencv-python-headless >= 4.11.0 # required for video IO
pyyaml pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.11.0 # required for compressed-tensors compressed-tensors == 0.12.2 # required for compressed-tensors
depyf==0.19.0 # required for profiling and debugging with compilation config depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/others/logging_configuration.md python-json-logger # Used by logging as per examples/others/logging_configuration.md
...@@ -49,3 +48,5 @@ pybase64 # fast base64 implementation ...@@ -49,3 +48,5 @@ pybase64 # fast base64 implementation
cbor2 # Required for cross-language serialization of hashable objects cbor2 # Required for cross-language serialization of hashable objects
setproctitle # Used to set process names for better debugging and monitoring setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0
model-hosting-container-standards < 1.0.0
\ No newline at end of file
cmake>=3.26.1 cmake>=3.26.1
ninja ninja
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<80.0.0 setuptools>=77.0.3,<81.0.0
setuptools-scm>=8 setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64" torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin" torch==2.9.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)
wheel wheel
jinja2>=3.1.6 jinja2>=3.1.6
regex regex
# Common dependencies # Common dependencies
-r common.txt -r common.txt
numba == 0.60.0; python_version == '3.9' and platform_machine != "s390x" # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
# Dependencies for CPUs # Dependencies for CPUs
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<80.0.0 setuptools>=77.0.3,<81.0.0
--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64" torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.8.0; platform_system == "Darwin" torch==2.9.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
......
# Common dependencies # Common dependencies
-r common.txt -r common.txt
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.61.2 # Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.8.0 torch==2.9.0
torchaudio==2.8.0 torchaudio==2.9.0
# These must be updated alongside torch # These must be updated alongside torch
torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8 # FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.2
...@@ -9,10 +9,8 @@ mkdocs-git-revision-date-localized-plugin ...@@ -9,10 +9,8 @@ mkdocs-git-revision-date-localized-plugin
mkdocs-minify-plugin mkdocs-minify-plugin
regex regex
ruff ruff
# Required for argparse hook only
-f https://download.pytorch.org/whl/cpu
cachetools
msgspec
pydantic pydantic
torch
# For generating argparse docs.
# Adding requirements here should only be used as a last resort.
msgspec # Needed for multiple inheritance involving msgspec.Struct
\ No newline at end of file
lmcache lmcache
nixl >= 0.5.1 # Required for disaggregated prefill nixl >= 0.6.0 # Required for disaggregated prefill
...@@ -23,14 +23,14 @@ jiwer # required for audio tests ...@@ -23,14 +23,14 @@ jiwer # required for audio tests
timm # required for internvl test timm # required for internvl test
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.2 # required for voxtral test mistral_common[image,audio] >= 1.8.5 # required for voxtral test
num2words # required for smolvlm test num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test mteb>=1.38.11, <2 # required for mteb test
transformers==4.52.4 transformers==4.57.1
tokenizers==0.21.1 tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test. schemathesis>=3.39.15 # Required for openai schema test.
# quantization # quantization
bitsandbytes>=0.46.1 bitsandbytes>=0.46.1
...@@ -40,10 +40,8 @@ buildkite-test-collector==0.1.9 ...@@ -40,10 +40,8 @@ buildkite-test-collector==0.1.9
genai_perf==0.0.8 genai_perf==0.0.8
tritonclient==2.51.0 tritonclient==2.51.0
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.61.2 # Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'
numpy numpy
runai-model-streamer==0.11.0 runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10 fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10 pydantic>=2.12 # 2.11 leads to error on python 3.13
# Common dependencies # Common dependencies
-r common.txt -r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm6.3 --extra-index-url https://download.pytorch.org/whl/rocm6.4
torch==2.8.0 torch==2.9.0
torchvision==0.23.0 torchvision==0.24.0
torchaudio==2.8.0 torchaudio==2.9.0
triton==3.3.0 triton==3.5.0
cmake>=3.26.1,<4 cmake>=3.26.1,<4
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<80.0.0 setuptools>=77.0.3,<80.0.0
setuptools-scm>=8 setuptools-scm>=8
wheel wheel
jinja2>=3.1.6 jinja2>=3.1.6
amdsmi==6.2.4 amdsmi==6.4.3
timm>=1.0.17 timm>=1.0.17
# Common dependencies # Common dependencies
-r common.txt -r common.txt
tblib==3.1.0 tblib==3.1.0
bm25s==0.2.13
pystemmer==3.0.0
# entrypoints test # Entrypoints test
# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
audioread==3.0.1 audioread==3.0.1
cffi==1.17.1 cffi==1.17.1
...@@ -15,11 +17,11 @@ soundfile==0.13.1 ...@@ -15,11 +17,11 @@ soundfile==0.13.1
soxr==0.5.0.post1 soxr==0.5.0.post1
librosa==0.10.2.post1 librosa==0.10.2.post1
# entrypoints test # Entrypoints test
#vllm[video] # required by entrypoints/openai/test_video.py #vllm[video] # required by entrypoints/openai/test_video.py
decord==0.6.0 decord==0.6.0
# entrypoints test # Entrypoints test
#sentence-transformers # required by entrypoints/openai/test_score.py #sentence-transformers # required by entrypoints/openai/test_score.py
sentence-transformers==3.4.1 sentence-transformers==3.4.1
...@@ -29,4 +31,11 @@ matplotlib==3.10.3 ...@@ -29,4 +31,11 @@ matplotlib==3.10.3
# Multi-Modal Models Test (Extended) 3 # Multi-Modal Models Test (Extended) 3
blobfile==3.0.0 blobfile==3.0.0
# Required for openai schema test.
schemathesis==3.39.15
# Required for mteb test
mteb[bm25s]>=1.38.11, <2
# Required for eval tests
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# Common dependencies # Common dependencies
-r common.txt -r common.txt
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.61.2 # Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'
# Dependencies for AMD GPUs # Dependencies for AMD GPUs
boto3
botocore
datasets datasets
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
peft peft
...@@ -15,7 +12,6 @@ tensorizer==2.10.1 ...@@ -15,7 +12,6 @@ tensorizer==2.10.1
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<80.0.0 setuptools>=77.0.3,<80.0.0
setuptools-scm>=8 setuptools-scm>=8
runai-model-streamer==0.11.0 runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer-s3==0.11.0
# conch-triton-kernels==1.2.1 # conch-triton-kernels==1.2.1
timm>=1.0.17 timm>=1.0.17
...@@ -24,12 +24,12 @@ soundfile # required for audio tests ...@@ -24,12 +24,12 @@ soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
tblib # for pickling test exceptions tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.8.0 torch==2.9.0
torchaudio==2.8.0 torchaudio==2.9.0
torchvision==0.23.0 torchvision==0.24.0
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.2 # required for voxtral test mistral_common[image,audio] >= 1.8.5 # required for voxtral test
num2words # required for smolvlm test num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test open_clip_torch==2.32.0 # Required for nemotron_vl test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
...@@ -37,8 +37,8 @@ datamodel_code_generator # required for minicpm3 test ...@@ -37,8 +37,8 @@ datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released # TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test mteb[bm25s]>=1.38.11, <2 # required for mteb test
transformers==4.55.2 transformers==4.57.1
tokenizers==0.21.1 tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test. schemathesis>=3.39.15 # Required for openai schema test.
# quantization # quantization
bitsandbytes==0.46.1 bitsandbytes==0.46.1
...@@ -48,12 +48,12 @@ buildkite-test-collector==0.1.9 ...@@ -48,12 +48,12 @@ buildkite-test-collector==0.1.9
genai_perf==0.0.8 genai_perf==0.0.8
tritonclient==2.51.0 tritonclient==2.51.0
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2; python_version > '3.9' numba == 0.61.2 # Required for N-gram speculative decoding
numpy numpy
runai-model-streamer==0.11.0 runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10 fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10 pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0 decord==0.6.0
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
gpt-oss >= 0.0.7; python_version > '3.11'
# This file was autogenerated by uv via the following command: # This file was autogenerated by uv via the following command:
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28 # uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
absl-py==2.1.0 absl-py==2.1.0
# via rouge-score # via rouge-score
accelerate==1.0.1 accelerate==1.0.1
...@@ -10,18 +10,19 @@ aenum==3.1.16 ...@@ -10,18 +10,19 @@ aenum==3.1.16
# via lightly # via lightly
affine==2.4.0 affine==2.4.0
# via rasterio # via rasterio
aiohappyeyeballs==2.4.3 aiohappyeyeballs==2.6.1
# via aiohttp # via aiohttp
aiohttp==3.10.11 aiohttp==3.13.0
# via # via
# aiohttp-cors # aiohttp-cors
# datasets # datasets
# fsspec # fsspec
# gpt-oss
# lm-eval # lm-eval
# ray # ray
aiohttp-cors==0.8.1 aiohttp-cors==0.8.1
# via ray # via ray
aiosignal==1.3.1 aiosignal==1.4.0
# via aiohttp # via aiohttp
albucore==0.0.16 albucore==0.0.16
# via terratorch # via terratorch
...@@ -39,6 +40,8 @@ anyio==4.6.2.post1 ...@@ -39,6 +40,8 @@ anyio==4.6.2.post1
# via # via
# httpx # httpx
# starlette # starlette
arctic-inference==0.1.1
# via -r requirements/test.in
argcomplete==3.5.1 argcomplete==3.5.1
# via datamodel-code-generator # via datamodel-code-generator
arrow==1.3.0 arrow==1.3.0
...@@ -72,7 +75,9 @@ blobfile==3.0.0 ...@@ -72,7 +75,9 @@ blobfile==3.0.0
bm25s==0.2.13 bm25s==0.2.13
# via mteb # via mteb
boto3==1.35.57 boto3==1.35.57
# via tensorizer # via
# runai-model-streamer-s3
# tensorizer
botocore==1.35.57 botocore==1.35.57
# via # via
# boto3 # boto3
...@@ -101,6 +106,8 @@ chardet==5.2.0 ...@@ -101,6 +106,8 @@ chardet==5.2.0
# via mbstrdecoder # via mbstrdecoder
charset-normalizer==3.4.0 charset-normalizer==3.4.0
# via requests # via requests
chz==0.3.0
# via gpt-oss
click==8.1.7 click==8.1.7
# via # via
# black # black
...@@ -171,7 +178,9 @@ distlib==0.3.9 ...@@ -171,7 +178,9 @@ distlib==0.3.9
dnspython==2.7.0 dnspython==2.7.0
# via email-validator # via email-validator
docker==7.1.0 docker==7.1.0
# via mlflow # via
# gpt-oss
# mlflow
docopt==0.6.2 docopt==0.6.2
# via num2words # via num2words
docstring-parser==0.17.0 docstring-parser==0.17.0
...@@ -197,7 +206,9 @@ eval-type-backport==0.2.2 ...@@ -197,7 +206,9 @@ eval-type-backport==0.2.2
evaluate==0.4.3 evaluate==0.4.3
# via lm-eval # via lm-eval
fastapi==0.116.1 fastapi==0.116.1
# via mlflow-skinny # via
# gpt-oss
# mlflow-skinny
fastparquet==2024.11.0 fastparquet==2024.11.0
# via genai-perf # via genai-perf
fastrlock==0.8.2 fastrlock==0.8.2
...@@ -249,13 +260,31 @@ gitdb==4.0.12 ...@@ -249,13 +260,31 @@ gitdb==4.0.12
gitpython==3.1.44 gitpython==3.1.44
# via mlflow-skinny # via mlflow-skinny
google-api-core==2.24.2 google-api-core==2.24.2
# via opencensus # via
# google-cloud-core
# google-cloud-storage
# opencensus
google-auth==2.40.2 google-auth==2.40.2
# via # via
# databricks-sdk # databricks-sdk
# google-api-core # google-api-core
# google-cloud-core
# google-cloud-storage
# runai-model-streamer-gcs
google-cloud-core==2.4.3
# via google-cloud-storage
google-cloud-storage==3.4.0
# via runai-model-streamer-gcs
google-crc32c==1.7.1
# via
# google-cloud-storage
# google-resumable-media
google-resumable-media==2.7.2
# via google-cloud-storage
googleapis-common-protos==1.70.0 googleapis-common-protos==1.70.0
# via google-api-core # via google-api-core
gpt-oss==0.0.8
# via -r requirements/test.in
graphene==3.4.3 graphene==3.4.3
# via mlflow # via mlflow
graphql-core==3.2.6 graphql-core==3.2.6
...@@ -283,6 +312,8 @@ hf-xet==1.1.7 ...@@ -283,6 +312,8 @@ hf-xet==1.1.7
# via huggingface-hub # via huggingface-hub
hiredis==3.0.0 hiredis==3.0.0
# via tensorizer # via tensorizer
html2text==2025.4.15
# via gpt-oss
httpcore==1.0.6 httpcore==1.0.6
# via httpx # via httpx
httpx==0.27.2 httpx==0.27.2
...@@ -417,6 +448,7 @@ lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b772215 ...@@ -417,6 +448,7 @@ lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b772215
lxml==5.3.0 lxml==5.3.0
# via # via
# blobfile # blobfile
# gpt-oss
# sacrebleu # sacrebleu
mako==1.3.10 mako==1.3.10
# via alembic # via alembic
...@@ -444,7 +476,7 @@ mbstrdecoder==1.1.3 ...@@ -444,7 +476,7 @@ mbstrdecoder==1.1.3
# typepy # typepy
mdurl==0.1.2 mdurl==0.1.2
# via markdown-it-py # via markdown-it-py
mistral-common==1.8.2 mistral-common==1.8.5
# via -r requirements/test.in # via -r requirements/test.in
mlflow==2.22.0 mlflow==2.22.0
# via terratorch # via terratorch
...@@ -543,42 +575,44 @@ numpy==1.26.4 ...@@ -543,42 +575,44 @@ numpy==1.26.4
# tritonclient # tritonclient
# vocos # vocos
# xarray # xarray
nvidia-cublas-cu12==12.8.4.1 nvidia-cublas-cu12==12.9.1.4
# via # via
# nvidia-cudnn-cu12 # nvidia-cudnn-cu12
# nvidia-cusolver-cu12 # nvidia-cusolver-cu12
# torch # torch
nvidia-cuda-cupti-cu12==12.8.90 nvidia-cuda-cupti-cu12==12.9.79
# via torch # via torch
nvidia-cuda-nvrtc-cu12==12.8.93 nvidia-cuda-nvrtc-cu12==12.9.86
# via torch # via torch
nvidia-cuda-runtime-cu12==12.8.90 nvidia-cuda-runtime-cu12==12.9.79
# via torch # via torch
nvidia-cudnn-cu12==9.10.2.21 nvidia-cudnn-cu12==9.10.2.21
# via torch # via torch
nvidia-cufft-cu12==11.3.3.83 nvidia-cufft-cu12==11.4.1.4
# via torch # via torch
nvidia-cufile-cu12==1.13.1.3 nvidia-cufile-cu12==1.14.1.1
# via torch # via torch
nvidia-curand-cu12==10.3.9.90 nvidia-curand-cu12==10.3.10.19
# via torch # via torch
nvidia-cusolver-cu12==11.7.3.90 nvidia-cusolver-cu12==11.7.5.82
# via torch # via torch
nvidia-cusparse-cu12==12.5.8.93 nvidia-cusparse-cu12==12.5.10.65
# via # via
# nvidia-cusolver-cu12 # nvidia-cusolver-cu12
# torch # torch
nvidia-cusparselt-cu12==0.7.1 nvidia-cusparselt-cu12==0.7.1
# via torch # via torch
nvidia-nccl-cu12==2.27.3 nvidia-nccl-cu12==2.27.5
# via torch # via torch
nvidia-nvjitlink-cu12==12.8.93 nvidia-nvjitlink-cu12==12.9.86
# via # via
# nvidia-cufft-cu12 # nvidia-cufft-cu12
# nvidia-cusolver-cu12 # nvidia-cusolver-cu12
# nvidia-cusparse-cu12 # nvidia-cusparse-cu12
# torch # torch
nvidia-nvtx-cu12==12.8.90 nvidia-nvshmem-cu12==3.3.20
# via torch
nvidia-nvtx-cu12==12.9.79
# via torch # via torch
omegaconf==2.3.0 omegaconf==2.3.0
# via # via
...@@ -586,6 +620,8 @@ omegaconf==2.3.0 ...@@ -586,6 +620,8 @@ omegaconf==2.3.0
# lightning # lightning
open-clip-torch==2.32.0 open-clip-torch==2.32.0
# via -r requirements/test.in # via -r requirements/test.in
openai-harmony==0.0.4
# via gpt-oss
opencensus==0.11.4 opencensus==0.11.4
# via ray # via ray
opencensus-context==0.1.3 opencensus-context==0.1.3
...@@ -706,7 +742,9 @@ prometheus-client==0.22.0 ...@@ -706,7 +742,9 @@ prometheus-client==0.22.0
# opentelemetry-exporter-prometheus # opentelemetry-exporter-prometheus
# ray # ray
propcache==0.2.0 propcache==0.2.0
# via yarl # via
# aiohttp
# yarl
proto-plus==1.26.1 proto-plus==1.26.1
# via google-api-core # via google-api-core
protobuf==5.28.3 protobuf==5.28.3
...@@ -749,19 +787,21 @@ pycparser==2.22 ...@@ -749,19 +787,21 @@ pycparser==2.22
# via cffi # via cffi
pycryptodomex==3.22.0 pycryptodomex==3.22.0
# via blobfile # via blobfile
pydantic==2.11.7 pydantic==2.12.0
# via # via
# -r requirements/test.in # -r requirements/test.in
# albumentations # albumentations
# datamodel-code-generator # datamodel-code-generator
# fastapi # fastapi
# gpt-oss
# lightly # lightly
# mistral-common # mistral-common
# mlflow-skinny # mlflow-skinny
# mteb # mteb
# openai-harmony
# pydantic-extra-types # pydantic-extra-types
# ray # ray
pydantic-core==2.33.2 pydantic-core==2.41.1
# via pydantic # via pydantic
pydantic-extra-types==2.10.5 pydantic-extra-types==2.10.5
# via mistral-common # via mistral-common
...@@ -888,6 +928,8 @@ requests==2.32.3 ...@@ -888,6 +928,8 @@ requests==2.32.3
# docker # docker
# evaluate # evaluate
# google-api-core # google-api-core
# google-cloud-storage
# gpt-oss
# huggingface-hub # huggingface-hub
# lightly # lightly
# lm-eval # lm-eval
...@@ -925,10 +967,12 @@ rsa==4.9.1 ...@@ -925,10 +967,12 @@ rsa==4.9.1
# via google-auth # via google-auth
rtree==1.4.0 rtree==1.4.0
# via torchgeo # via torchgeo
runai-model-streamer==0.11.0 runai-model-streamer==0.15.0
# via -r requirements/test.in
runai-model-streamer-s3==0.11.0
# via -r requirements/test.in # via -r requirements/test.in
runai-model-streamer-gcs==0.15.0
# via runai-model-streamer
runai-model-streamer-s3==0.15.0
# via runai-model-streamer
s3transfer==0.10.3 s3transfer==0.10.3
# via boto3 # via boto3
sacrebleu==2.4.3 sacrebleu==2.4.3
...@@ -972,14 +1016,11 @@ sentence-transformers==3.2.1 ...@@ -972,14 +1016,11 @@ sentence-transformers==3.2.1
# via # via
# -r requirements/test.in # -r requirements/test.in
# mteb # mteb
sentencepiece==0.2.0
# via mistral-common
setuptools==77.0.3 setuptools==77.0.3
# via # via
# lightning-utilities # lightning-utilities
# pytablewriter # pytablewriter
# torch # torch
# triton
shapely==2.1.1 shapely==2.1.1
# via # via
# geopandas # geopandas
...@@ -1031,6 +1072,8 @@ starlette-testclient==0.4.1 ...@@ -1031,6 +1072,8 @@ starlette-testclient==0.4.1
# via schemathesis # via schemathesis
statsmodels==0.14.4 statsmodels==0.14.4
# via genai-perf # via genai-perf
structlog==25.4.0
# via gpt-oss
sympy==1.13.3 sympy==1.13.3
# via # via
# einx # einx
...@@ -1043,14 +1086,17 @@ tblib==3.1.0 ...@@ -1043,14 +1086,17 @@ tblib==3.1.0
# via -r requirements/test.in # via -r requirements/test.in
tcolorpy==0.1.6 tcolorpy==0.1.6
# via pytablewriter # via pytablewriter
tenacity==9.0.0 tenacity==9.1.2
# via # via
# gpt-oss
# lm-eval # lm-eval
# plotly # plotly
tensorboardx==2.6.4 tensorboardx==2.6.4
# via lightning # via lightning
tensorizer==2.10.1 tensorizer==2.10.1
# via -r requirements/test.in # via -r requirements/test.in
termcolor==3.1.0
# via gpt-oss
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
# via -r requirements/test.in # via -r requirements/test.in
threadpoolctl==3.5.0 threadpoolctl==3.5.0
...@@ -1059,8 +1105,9 @@ tifffile==2025.3.30 ...@@ -1059,8 +1105,9 @@ tifffile==2025.3.30
# via # via
# scikit-image # scikit-image
# terratorch # terratorch
tiktoken==0.7.0 tiktoken==0.12.0
# via # via
# gpt-oss
# lm-eval # lm-eval
# mistral-common # mistral-common
timm==1.0.17 timm==1.0.17
...@@ -1070,7 +1117,7 @@ timm==1.0.17 ...@@ -1070,7 +1117,7 @@ timm==1.0.17
# segmentation-models-pytorch # segmentation-models-pytorch
# terratorch # terratorch
# torchgeo # torchgeo
tokenizers==0.21.1 tokenizers==0.22.0
# via # via
# -r requirements/test.in # -r requirements/test.in
# transformers # transformers
...@@ -1078,7 +1125,7 @@ tomli==2.2.1 ...@@ -1078,7 +1125,7 @@ tomli==2.2.1
# via schemathesis # via schemathesis
tomli-w==1.2.0 tomli-w==1.2.0
# via schemathesis # via schemathesis
torch==2.8.0+cu128 torch==2.9.0+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# accelerate # accelerate
...@@ -1107,7 +1154,7 @@ torch==2.8.0+cu128 ...@@ -1107,7 +1154,7 @@ torch==2.8.0+cu128
# torchvision # torchvision
# vector-quantize-pytorch # vector-quantize-pytorch
# vocos # vocos
torchaudio==2.8.0+cu128 torchaudio==2.9.0+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# encodec # encodec
...@@ -1120,7 +1167,7 @@ torchmetrics==1.7.4 ...@@ -1120,7 +1167,7 @@ torchmetrics==1.7.4
# pytorch-lightning # pytorch-lightning
# terratorch # terratorch
# torchgeo # torchgeo
torchvision==0.23.0+cu128 torchvision==0.24.0+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# lightly # lightly
...@@ -1151,7 +1198,7 @@ tqdm==4.66.6 ...@@ -1151,7 +1198,7 @@ tqdm==4.66.6
# transformers # transformers
tqdm-multiprocess==0.0.11 tqdm-multiprocess==0.0.11
# via lm-eval # via lm-eval
transformers==4.55.2 transformers==4.57.1
# via # via
# -r requirements/test.in # -r requirements/test.in
# genai-perf # genai-perf
...@@ -1161,7 +1208,7 @@ transformers==4.55.2 ...@@ -1161,7 +1208,7 @@ transformers==4.55.2
# transformers-stream-generator # transformers-stream-generator
transformers-stream-generator==0.0.5 transformers-stream-generator==0.0.5
# via -r requirements/test.in # via -r requirements/test.in
triton==3.4.0 triton==3.5.0
# via torch # via torch
tritonclient==2.51.0 tritonclient==2.51.0
# via # via
...@@ -1178,10 +1225,12 @@ types-python-dateutil==2.9.0.20241206 ...@@ -1178,10 +1225,12 @@ types-python-dateutil==2.9.0.20241206
# via arrow # via arrow
typeshed-client==2.8.2 typeshed-client==2.8.2
# via jsonargparse # via jsonargparse
typing-extensions==4.12.2 typing-extensions==4.15.0
# via # via
# aiosignal
# albumentations # albumentations
# alembic # alembic
# chz
# fastapi # fastapi
# graphene # graphene
# huggingface-hub # huggingface-hub
...@@ -1205,7 +1254,7 @@ typing-extensions==4.12.2 ...@@ -1205,7 +1254,7 @@ typing-extensions==4.12.2
# typer # typer
# typeshed-client # typeshed-client
# typing-inspection # typing-inspection
typing-inspection==0.4.1 typing-inspection==0.4.2
# via pydantic # via pydantic
tzdata==2024.2 tzdata==2024.2
# via pandas # via pandas
...@@ -1221,7 +1270,9 @@ urllib3==2.2.3 ...@@ -1221,7 +1270,9 @@ urllib3==2.2.3
# responses # responses
# tritonclient # tritonclient
uvicorn==0.35.0 uvicorn==0.35.0
# via mlflow-skinny # via
# gpt-oss
# mlflow-skinny
vector-quantize-pytorch==1.21.2 vector-quantize-pytorch==1.21.2
# via -r requirements/test.in # via -r requirements/test.in
virtualenv==20.31.2 virtualenv==20.31.2
......
...@@ -5,15 +5,14 @@ ray>=2.9 ...@@ -5,15 +5,14 @@ ray>=2.9
cmake>=3.26.1 cmake>=3.26.1
packaging>=24.2 packaging>=24.2
setuptools-scm>=8 setuptools-scm>=8
setuptools>=77.0.3,<80.0.0 setuptools>=77.0.3,<81.0.0
wheel wheel
jinja2>=3.1.6 jinja2>=3.1.6
datasets # for benchmark scripts datasets # for benchmark scripts
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.61.2 # Required for N-gram speculative decoding
nixl==0.3.0 # for PD disaggregation
torch==2.8.0+xpu torch==2.8.0+xpu
torchaudio torchaudio
torchvision torchvision
--extra-index-url=https://download.pytorch.org/whl/xpu --extra-index-url=https://download.pytorch.org/whl/xpu
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
...@@ -34,32 +34,36 @@ logger = logging.getLogger(__name__) ...@@ -34,32 +34,36 @@ logger = logging.getLogger(__name__)
# cannot import envs directly because it depends on vllm, # cannot import envs directly because it depends on vllm,
# which is not installed yet # which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py')) envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py"))
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu": if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
logger.warning( logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
"VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
VLLM_TARGET_DEVICE = "cpu" VLLM_TARGET_DEVICE = "cpu"
elif not (sys.platform.startswith("linux") elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")):
or sys.platform.startswith("darwin")):
logger.warning( logger.warning(
"vLLM only supports Linux platform (including WSL) and MacOS." "vLLM only supports Linux platform (including WSL) and MacOS."
"Building on %s, " "Building on %s, "
"so vLLM may not be able to run correctly", sys.platform) "so vLLM may not be able to run correctly",
sys.platform,
)
VLLM_TARGET_DEVICE = "empty" VLLM_TARGET_DEVICE = "empty"
elif (sys.platform.startswith("linux") and torch.version.cuda is None elif (
and os.getenv("VLLM_TARGET_DEVICE") is None sys.platform.startswith("linux")
and torch.version.hip is None): and torch.version.cuda is None
and os.getenv("VLLM_TARGET_DEVICE") is None
and torch.version.hip is None
):
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set, # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
# fallback to cpu # fallback to cpu
VLLM_TARGET_DEVICE = "cpu" VLLM_TARGET_DEVICE = "cpu"
def is_sccache_available() -> bool: def is_sccache_available() -> bool:
return which("sccache") is not None and \ return which("sccache") is not None and not bool(
not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))) int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))
)
def is_ccache_available() -> bool: def is_ccache_available() -> bool:
...@@ -83,8 +87,7 @@ def is_url_available(url: str) -> bool: ...@@ -83,8 +87,7 @@ def is_url_available(url: str) -> bool:
class CMakeExtension(Extension): class CMakeExtension(Extension):
def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
super().__init__(name, sources=[], py_limited_api=True, **kwa) super().__init__(name, sources=[], py_limited_api=True, **kwa)
self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)
...@@ -121,8 +124,8 @@ class cmake_build_ext(build_ext): ...@@ -121,8 +124,8 @@ class cmake_build_ext(build_ext):
if nvcc_threads is not None: if nvcc_threads is not None:
nvcc_threads = int(nvcc_threads) nvcc_threads = int(nvcc_threads)
logger.info( logger.info(
"Using NVCC_THREADS=%d as the number of nvcc threads.", "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
nvcc_threads) )
else: else:
nvcc_threads = 1 nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads) num_jobs = max(1, num_jobs // nvcc_threads)
...@@ -146,36 +149,36 @@ class cmake_build_ext(build_ext): ...@@ -146,36 +149,36 @@ class cmake_build_ext(build_ext):
cfg = envs.CMAKE_BUILD_TYPE or default_cfg cfg = envs.CMAKE_BUILD_TYPE or default_cfg
cmake_args = [ cmake_args = [
'-DCMAKE_BUILD_TYPE={}'.format(cfg), "-DCMAKE_BUILD_TYPE={}".format(cfg),
'-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE), "-DVLLM_TARGET_DEVICE={}".format(VLLM_TARGET_DEVICE),
] ]
verbose = envs.VERBOSE verbose = envs.VERBOSE
if verbose: if verbose:
cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] cmake_args += ["-DCMAKE_VERBOSE_MAKEFILE=ON"]
if is_sccache_available(): if is_sccache_available():
cmake_args += [ cmake_args += [
'-DCMAKE_C_COMPILER_LAUNCHER=sccache', "-DCMAKE_C_COMPILER_LAUNCHER=sccache",
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', "-DCMAKE_CXX_COMPILER_LAUNCHER=sccache",
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', "-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache",
'-DCMAKE_HIP_COMPILER_LAUNCHER=sccache', "-DCMAKE_HIP_COMPILER_LAUNCHER=sccache",
] ]
elif is_ccache_available(): elif is_ccache_available():
cmake_args += [ cmake_args += [
'-DCMAKE_C_COMPILER_LAUNCHER=ccache', "-DCMAKE_C_COMPILER_LAUNCHER=ccache",
'-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
'-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache",
'-DCMAKE_HIP_COMPILER_LAUNCHER=ccache', "-DCMAKE_HIP_COMPILER_LAUNCHER=ccache",
] ]
# Pass the python executable to cmake so it can find an exact # Pass the python executable to cmake so it can find an exact
# match. # match.
cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)] cmake_args += ["-DVLLM_PYTHON_EXECUTABLE={}".format(sys.executable)]
# Pass the python path to cmake so it can reuse the build dependencies # Pass the python path to cmake so it can reuse the build dependencies
# on subsequent calls to python. # on subsequent calls to python.
cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))] cmake_args += ["-DVLLM_PYTHON_PATH={}".format(":".join(sys.path))]
# Override the base directory for FetchContent downloads to $ROOT/.deps # Override the base directory for FetchContent downloads to $ROOT/.deps
# This allows sharing dependencies between profiles, # This allows sharing dependencies between profiles,
...@@ -183,7 +186,7 @@ class cmake_build_ext(build_ext): ...@@ -183,7 +186,7 @@ class cmake_build_ext(build_ext):
# To override this, set the FETCHCONTENT_BASE_DIR environment variable. # To override this, set the FETCHCONTENT_BASE_DIR environment variable.
fc_base_dir = os.path.join(ROOT_DIR, ".deps") fc_base_dir = os.path.join(ROOT_DIR, ".deps")
fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir) fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)] cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]
# #
# Setup parallelism and build tool # Setup parallelism and build tool
...@@ -191,30 +194,38 @@ class cmake_build_ext(build_ext): ...@@ -191,30 +194,38 @@ class cmake_build_ext(build_ext):
num_jobs, nvcc_threads = self.compute_num_jobs() num_jobs, nvcc_threads = self.compute_num_jobs()
if nvcc_threads: if nvcc_threads:
cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)] cmake_args += ["-DNVCC_THREADS={}".format(nvcc_threads)]
if is_ninja_available(): if is_ninja_available():
build_tool = ['-G', 'Ninja'] build_tool = ["-G", "Ninja"]
cmake_args += [ cmake_args += [
'-DCMAKE_JOB_POOL_COMPILE:STRING=compile', "-DCMAKE_JOB_POOL_COMPILE:STRING=compile",
'-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs), "-DCMAKE_JOB_POOLS:STRING=compile={}".format(num_jobs),
] ]
else: else:
# Default build tool to whatever cmake picks. # Default build tool to whatever cmake picks.
build_tool = [] build_tool = []
# Make sure we use the nvcc from CUDA_HOME # Make sure we use the nvcc from CUDA_HOME
if _is_cuda(): if _is_cuda():
cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc'] cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
elif _is_hip():
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
other_cmake_args = os.environ.get("CMAKE_ARGS")
if other_cmake_args:
cmake_args += other_cmake_args.split()
subprocess.check_call( subprocess.check_call(
['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], ["cmake", ext.cmake_lists_dir, *build_tool, *cmake_args],
cwd=self.build_temp) cwd=self.build_temp,
)
def build_extensions(self) -> None: def build_extensions(self) -> None:
# Ensure that CMake is present and working # Ensure that CMake is present and working
try: try:
subprocess.check_output(['cmake', '--version']) subprocess.check_output(["cmake", "--version"])
except OSError as e: except OSError as e:
raise RuntimeError('Cannot find CMake executable') from e raise RuntimeError("Cannot find CMake executable") from e
# Create build directory if it does not exist. # Create build directory if it does not exist.
if not os.path.exists(self.build_temp): if not os.path.exists(self.build_temp):
...@@ -253,13 +264,18 @@ class cmake_build_ext(build_ext): ...@@ -253,13 +264,18 @@ class cmake_build_ext(build_ext):
# CMake appends the extension prefix to the install path, # CMake appends the extension prefix to the install path,
# and outdir already contains that prefix, so we need to remove it. # and outdir already contains that prefix, so we need to remove it.
prefix = outdir prefix = outdir
for _ in range(ext.name.count('.')): for _ in range(ext.name.count(".")):
prefix = prefix.parent prefix = prefix.parent
# prefix here should actually be the same for all components # prefix here should actually be the same for all components
install_args = [ install_args = [
"cmake", "--install", ".", "--prefix", prefix, "--component", "cmake",
target_name(ext.name) "--install",
".",
"--prefix",
prefix,
"--component",
target_name(ext.name),
] ]
subprocess.check_call(install_args, cwd=self.build_temp) subprocess.check_call(install_args, cwd=self.build_temp)
...@@ -270,12 +286,15 @@ class cmake_build_ext(build_ext): ...@@ -270,12 +286,15 @@ class cmake_build_ext(build_ext):
# copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
# directory so that they can be included in the editable build # directory so that they can be included in the editable build
import glob import glob
files = glob.glob(os.path.join(self.build_lib, "vllm",
"vllm_flash_attn", "**", "*.py"), files = glob.glob(
recursive=True) os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "**", "*.py"),
recursive=True,
)
for file in files: for file in files:
dst_file = os.path.join("vllm/vllm_flash_attn", dst_file = os.path.join(
file.split("vllm/vllm_flash_attn/")[-1]) "vllm/vllm_flash_attn", file.split("vllm/vllm_flash_attn/")[-1]
)
print(f"Copying {file} to {dst_file}") print(f"Copying {file} to {dst_file}")
os.makedirs(os.path.dirname(dst_file), exist_ok=True) os.makedirs(os.path.dirname(dst_file), exist_ok=True)
self.copy_file(file, dst_file) self.copy_file(file, dst_file)
...@@ -285,8 +304,7 @@ class precompiled_build_ext(build_ext): ...@@ -285,8 +304,7 @@ class precompiled_build_ext(build_ext):
"""Disables extension building when using precompiled binaries.""" """Disables extension building when using precompiled binaries."""
def run(self) -> None: def run(self) -> None:
assert _is_cuda( assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
def build_extensions(self) -> None: def build_extensions(self) -> None:
print("Skipping build_ext: using precompiled extensions.") print("Skipping build_ext: using precompiled extensions.")
...@@ -307,9 +325,9 @@ class precompiled_wheel_utils: ...@@ -307,9 +325,9 @@ class precompiled_wheel_utils:
wheel_filename = wheel_url_or_path.split("/")[-1] wheel_filename = wheel_url_or_path.split("/")[-1]
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename) wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_url_or_path} " print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}")
f"to {wheel_path}")
from urllib.request import urlretrieve from urllib.request import urlretrieve
urlretrieve(wheel_url_or_path, filename=wheel_path) urlretrieve(wheel_url_or_path, filename=wheel_path)
else: else:
wheel_path = wheel_url_or_path wheel_path = wheel_url_or_path
...@@ -330,25 +348,29 @@ class precompiled_wheel_utils: ...@@ -330,25 +348,29 @@ class precompiled_wheel_utils:
] ]
compiled_regex = re.compile( compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members = list( file_members = list(
filter(lambda x: x.filename in files_to_copy, filter(lambda x: x.filename in files_to_copy, wheel.filelist)
wheel.filelist)) )
file_members += list( file_members += list(
filter(lambda x: compiled_regex.match(x.filename), filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
wheel.filelist)) )
for file in file_members: for file in file_members:
print(f"[extract] {file.filename}") print(f"[extract] {file.filename}")
target_path = os.path.join(".", file.filename) target_path = os.path.join(".", file.filename)
os.makedirs(os.path.dirname(target_path), exist_ok=True) os.makedirs(os.path.dirname(target_path), exist_ok=True)
with wheel.open(file.filename) as src, open( with (
target_path, "wb") as dst: wheel.open(file.filename) as src,
open(target_path, "wb") as dst,
):
shutil.copyfileobj(src, dst) shutil.copyfileobj(src, dst)
pkg = os.path.dirname(file.filename).replace("/", ".") pkg = os.path.dirname(file.filename).replace("/", ".")
package_data_patch.setdefault(pkg, []).append( package_data_patch.setdefault(pkg, []).append(
os.path.basename(file.filename)) os.path.basename(file.filename)
)
return package_data_patch return package_data_patch
finally: finally:
...@@ -364,10 +386,13 @@ class precompiled_wheel_utils: ...@@ -364,10 +386,13 @@ class precompiled_wheel_utils:
try: try:
# Get the latest commit hash of the upstream main branch. # Get the latest commit hash of the upstream main branch.
resp_json = subprocess.check_output([ resp_json = subprocess.check_output(
"curl", "-s", [
"https://api.github.com/repos/vllm-project/vllm/commits/main" "curl",
]).decode("utf-8") "-s",
"https://api.github.com/repos/vllm-project/vllm/commits/main",
]
).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"] upstream_main_commit = json.loads(resp_json)["sha"]
# In Docker build context, .git may be immutable or missing. # In Docker build context, .git may be immutable or missing.
...@@ -377,25 +402,32 @@ class precompiled_wheel_utils: ...@@ -377,25 +402,32 @@ class precompiled_wheel_utils:
# Check if the upstream_main_commit exists in the local repo # Check if the upstream_main_commit exists in the local repo
try: try:
subprocess.check_output( subprocess.check_output(
["git", "cat-file", "-e", f"{upstream_main_commit}"]) ["git", "cat-file", "-e", f"{upstream_main_commit}"]
)
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
# If not present, fetch it from the remote repository. # If not present, fetch it from the remote repository.
# Note that this does not update any local branches, # Note that this does not update any local branches,
# but ensures that this commit ref and its history are # but ensures that this commit ref and its history are
# available in our local repo. # available in our local repo.
subprocess.check_call([ subprocess.check_call(
"git", "fetch", "https://github.com/vllm-project/vllm", ["git", "fetch", "https://github.com/vllm-project/vllm", "main"]
"main" )
])
# Then get the commit hash of the current branch that is the same as # Then get the commit hash of the current branch that is the same as
# the upstream main commit. # the upstream main commit.
current_branch = subprocess.check_output( current_branch = (
["git", "branch", "--show-current"]).decode("utf-8").strip() subprocess.check_output(["git", "branch", "--show-current"])
.decode("utf-8")
.strip()
)
base_commit = subprocess.check_output([ base_commit = (
"git", "merge-base", f"{upstream_main_commit}", current_branch subprocess.check_output(
]).decode("utf-8").strip() ["git", "merge-base", f"{upstream_main_commit}", current_branch]
)
.decode("utf-8")
.strip()
)
return base_commit return base_commit
except ValueError as err: except ValueError as err:
raise ValueError(err) from None raise ValueError(err) from None
...@@ -403,7 +435,9 @@ class precompiled_wheel_utils: ...@@ -403,7 +435,9 @@ class precompiled_wheel_utils:
logger.warning( logger.warning(
"Failed to get the base commit in the main branch. " "Failed to get the base commit in the main branch. "
"Using the nightly wheel. The libraries in this " "Using the nightly wheel. The libraries in this "
"wheel may not be compatible with your dev branch: %s", err) "wheel may not be compatible with your dev branch: %s",
err,
)
return "nightly" return "nightly"
...@@ -413,12 +447,13 @@ def _no_device() -> bool: ...@@ -413,12 +447,13 @@ def _no_device() -> bool:
def _is_cuda() -> bool: def _is_cuda() -> bool:
has_cuda = torch.version.cuda is not None has_cuda = torch.version.cuda is not None
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu()) return VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu()
def _is_hip() -> bool: def _is_hip() -> bool:
return (VLLM_TARGET_DEVICE == "cuda" return (
or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None VLLM_TARGET_DEVICE == "cuda" or VLLM_TARGET_DEVICE == "rocm"
) and torch.version.hip is not None
def _is_tpu() -> bool: def _is_tpu() -> bool:
...@@ -457,8 +492,12 @@ def get_rocm_version(): ...@@ -457,8 +492,12 @@ def get_rocm_version():
minor = ctypes.c_uint32() minor = ctypes.c_uint32()
patch = ctypes.c_uint32() patch = ctypes.c_uint32()
if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), if (
ctypes.byref(patch)) == 0): get_rocm_core_version(
ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)
)
== 0
):
return f"{major.value}.{minor.value}.{patch.value}" return f"{major.value}.{minor.value}.{patch.value}"
return None return None
except Exception: except Exception:
...@@ -471,8 +510,9 @@ def get_nvcc_cuda_version() -> Version: ...@@ -471,8 +510,9 @@ def get_nvcc_cuda_version() -> Version:
Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
""" """
assert CUDA_HOME is not None, "CUDA_HOME is not set" assert CUDA_HOME is not None, "CUDA_HOME is not set"
nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], nvcc_output = subprocess.check_output(
universal_newlines=True) [CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True
)
output = nvcc_output.split() output = nvcc_output.split()
release_idx = output.index("release") + 1 release_idx = output.index("release") + 1
nvcc_cuda_version = parse(output[release_idx].split(",")[0]) nvcc_cuda_version = parse(output[release_idx].split(",")[0])
...@@ -484,18 +524,31 @@ def get_gaudi_sw_version(): ...@@ -484,18 +524,31 @@ def get_gaudi_sw_version():
Returns the driver version. Returns the driver version.
""" """
# Enable console printing for `hl-smi` check # Enable console printing for `hl-smi` check
output = subprocess.run("hl-smi", output = subprocess.run(
shell=True, "hl-smi",
text=True, shell=True,
capture_output=True, text=True,
env={"ENABLE_CONSOLE": "true"}) capture_output=True,
env={"ENABLE_CONSOLE": "true"},
)
if output.returncode == 0 and output.stdout: if output.returncode == 0 and output.stdout:
return output.stdout.split("\n")[2].replace( return (
" ", "").split(":")[1][:-1].split("-")[0] output.stdout.split("\n")[2]
.replace(" ", "")
.split(":")[1][:-1]
.split("-")[0]
)
return "0.0.0" # when hl-smi is not available return "0.0.0" # when hl-smi is not available
def get_vllm_version() -> str: def get_vllm_version() -> str:
# Allow overriding the version. This is useful to build platform-specific
# wheels (e.g. CPU, TPU) without modifying the source.
if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
print(f"Overriding VLLM version with {env_version} from VLLM_VERSION_OVERRIDE")
os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = env_version
return get_version(write_to="vllm/_version.py")
version = get_version(write_to="vllm/_version.py") version = get_version(write_to="vllm/_version.py")
sep = "+" if "+" not in version else "." # dev versions might contain + sep = "+" if "+" not in version else "." # dev versions might contain +
...@@ -541,8 +594,11 @@ def get_requirements() -> list[str]: ...@@ -541,8 +594,11 @@ def get_requirements() -> list[str]:
for line in requirements: for line in requirements:
if line.startswith("-r "): if line.startswith("-r "):
resolved_requirements += _read_requirements(line.split()[1]) resolved_requirements += _read_requirements(line.split()[1])
elif not line.startswith("--") and not line.startswith( elif (
"#") and line.strip() != "": not line.startswith("--")
and not line.startswith("#")
and line.strip() != ""
):
resolved_requirements.append(line) resolved_requirements.append(line)
return resolved_requirements return resolved_requirements
...@@ -553,7 +609,7 @@ def get_requirements() -> list[str]: ...@@ -553,7 +609,7 @@ def get_requirements() -> list[str]:
cuda_major, cuda_minor = torch.version.cuda.split(".") cuda_major, cuda_minor = torch.version.cuda.split(".")
modified_requirements = [] modified_requirements = []
for req in requirements: for req in requirements:
if ("vllm-flash-attn" in req and cuda_major != "12"): if "vllm-flash-attn" in req and cuda_major != "12":
# vllm-flash-attn is built only for CUDA 12.x. # vllm-flash-attn is built only for CUDA 12.x.
# Skip for other versions. # Skip for other versions.
continue continue
...@@ -568,8 +624,7 @@ def get_requirements() -> list[str]: ...@@ -568,8 +624,7 @@ def get_requirements() -> list[str]:
elif _is_xpu(): elif _is_xpu():
requirements = _read_requirements("xpu.txt") requirements = _read_requirements("xpu.txt")
else: else:
raise ValueError( raise ValueError("Unsupported platform, please use CUDA, ROCm, or CPU.")
"Unsupported platform, please use CUDA, ROCm, or CPU.")
return requirements return requirements
...@@ -577,6 +632,7 @@ ext_modules = [] ...@@ -577,6 +632,7 @@ ext_modules = []
if _is_cuda() or _is_hip(): if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C")) ext_modules.append(CMakeExtension(name="vllm._moe_C"))
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
# if _is_hip(): # if _is_hip():
# ext_modules.append(CMakeExtension(name="vllm._rocm_C")) # ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
...@@ -585,15 +641,13 @@ if _is_cuda(): ...@@ -585,15 +641,13 @@ if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
# FA3 requires CUDA 12.3 or later # FA3 requires CUDA 12.3 or later
ext_modules.append( ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
# Optional since this doesn't get built (produce an .so file) when # Optional since this doesn't get built (produce an .so file) when
# not targeting a hopper system # not targeting a hopper system
ext_modules.append(CMakeExtension(name="vllm._flashmla_C", optional=True))
ext_modules.append( ext_modules.append(
CMakeExtension(name="vllm._flashmla_C", optional=True)) CMakeExtension(name="vllm._flashmla_extension_C", optional=True)
ext_modules.append( )
CMakeExtension(name="vllm._flashmla_extension_C", optional=True))
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
if _build_custom_ops(): if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C")) ext_modules.append(CMakeExtension(name="vllm._C"))
...@@ -614,6 +668,7 @@ if envs.VLLM_USE_PRECOMPILED: ...@@ -614,6 +668,7 @@ if envs.VLLM_USE_PRECOMPILED:
wheel_url = wheel_location wheel_url = wheel_location
else: else:
import platform import platform
arch = platform.machine() arch = platform.machine()
if arch == "x86_64": if arch == "x86_64":
wheel_tag = "manylinux1_x86_64" wheel_tag = "manylinux1_x86_64"
...@@ -623,8 +678,11 @@ if envs.VLLM_USE_PRECOMPILED: ...@@ -623,8 +678,11 @@ if envs.VLLM_USE_PRECOMPILED:
raise ValueError(f"Unsupported architecture: {arch}") raise ValueError(f"Unsupported architecture: {arch}")
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
nightly_wheel_url = f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" nightly_wheel_url = (
f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
)
from urllib.request import urlopen from urllib.request import urlopen
try: try:
with urlopen(wheel_url) as resp: with urlopen(wheel_url) as resp:
if resp.status != 200: if resp.status != 200:
...@@ -633,8 +691,7 @@ if envs.VLLM_USE_PRECOMPILED: ...@@ -633,8 +691,7 @@ if envs.VLLM_USE_PRECOMPILED:
print(f"[warn] Falling back to nightly wheel: {e}") print(f"[warn] Falling back to nightly wheel: {e}")
wheel_url = nightly_wheel_url wheel_url = nightly_wheel_url
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
wheel_url)
for pkg, files in patch.items(): for pkg, files in patch.items():
package_data.setdefault(pkg, []).extend(files) package_data.setdefault(pkg, []).extend(files)
...@@ -645,8 +702,9 @@ if not ext_modules: ...@@ -645,8 +702,9 @@ if not ext_modules:
cmdclass = {} cmdclass = {}
else: else:
cmdclass = { cmdclass = {
"build_ext": "build_ext": precompiled_build_ext
precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext if envs.VLLM_USE_PRECOMPILED
else cmake_build_ext
} }
setup( setup(
...@@ -655,18 +713,17 @@ setup( ...@@ -655,18 +713,17 @@ setup(
ext_modules=ext_modules, ext_modules=ext_modules,
install_requires=get_requirements(), install_requires=get_requirements(),
extras_require={ extras_require={
"bench": ["pandas", "datasets"], "bench": ["pandas", "matplotlib", "seaborn", "datasets"],
"tensorizer": ["tensorizer==2.10.1"], "tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"], "fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": [ "runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
"runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs", "audio": [
"google-cloud-storage", "runai-model-streamer-s3", "boto3" "librosa",
], "soundfile",
"audio": ["librosa", "soundfile", "mistral_common[audio]",
"mistral_common[audio]"], # Required for audio processing ], # Required for audio processing
"video": [], # Kept for backwards compatibility "video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile "flashinfer": [], # Kept for backwards compatibility
"flashinfer": ["flashinfer-python==0.3.1"],
# Optional deps for AMD FP4 quantization support # Optional deps for AMD FP4 quantization support
"petit-kernel": ["petit-kernel"], "petit-kernel": ["petit-kernel"],
}, },
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment