Merge tag 'v0.11.2' into v0.11.2-ori

006693ed · zhuwenwen · 4b51e6f1 · 275de341 · 006693ed · 006693ed
Commit 006693ed authored Dec 01, 2025 by zhuwenwen
20 changed files
--- a/examples/template_vlm2vec.jinja
+++ b/examples/template_vlm2vec.jinja
--- a/examples/template_vlm2vec_qwen2vl.jinja
+++ b/examples/template_vlm2vec_qwen2vl.jinja
+{%- if messages | length > 1 -%}
+    {{ raise_exception('Embedding models should only embed one message at a time') }}
+{%- endif -%}
+{% set vars = namespace(parts=[]) %}
+{%- for message in messages -%}
+    {%- for content in message['content'] -%}
+        {%- if content['type'] == 'text' -%}
+            {%- set vars.parts = vars.parts + [content['text']] %}
+        {%- elif content['type'] == 'image' -%}
+            {%- set vars.parts = vars.parts + ['<|image_pad|>'] %}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endfor -%}
+{{ vars.parts | join(' ') }}
--- a/format.sh
+++ b/format.sh
-#!/bin/bash
-echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
-echo "Please run 'pip install -r requirements/lint.txt', followed by"
-echo "'pre-commit install' to install the pre-commit hooks."
-echo "Then linters will run automatically before each commit."
\ No newline at end of file
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -102,7 +102,6 @@ plugins:
          - https://numpy.org/doc/stable/objects.inv
          - https://pytorch.org/docs/stable/objects.inv
          - https://psutil.readthedocs.io/en/stable/objects.inv
-          - https://huggingface.co/docs/transformers/main/en/objects.inv
 markdown_extensions:
  - attr_list
@@ -143,8 +142,3 @@ extra_javascript:
  - https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
  - mkdocs/javascript/edit_and_feedback.js
  - mkdocs/javascript/slack_and_forum.js
-# Makes the url format end in .html rather than act as a dir
-# So index.md generates as index.html and is available under URL /index.html
-# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
-use_directory_urls: false
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,9 +4,9 @@ requires = [
    "cmake>=3.26.1",
    "ninja",
    "packaging>=24.2",
-    "setuptools>=77.0.3,<80.0.0",
+    "setuptools>=77.0.3,<81.0.0",
    "setuptools-scm>=8.0",
-    "torch == 2.8.0",
+    "torch == 2.9.0",
    "wheel",
    "jinja2",
 ]
@@ -20,7 +20,6 @@ license-files = ["LICENSE"]
 readme = "README.md"
 description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
 classifiers = [
-    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
@@ -31,7 +30,7 @@ classifiers = [
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Information Analysis",
 ]
-requires-python = ">=3.9,<3.14"
+requires-python = ">=3.10,<3.14"
 dynamic = [ "version", "dependencies", "optional-dependencies"]
 [project.urls]
@@ -52,27 +51,10 @@ lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:regi
 where = ["."]
 include = ["vllm*"]
-[tool.yapfignore]
-ignore_patterns = [
-    ".buildkite/**",
-    "benchmarks/**",
-    "build/**",
-    "examples/**",
-]
-[tool.ruff]
-# Allow lines to be as long as 80.
-line-length = 80
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
-# Python 3.8 typing - skip V0 code
-"vllm/attention/**/*.py" = ["UP006", "UP035"]
-"vllm/engine/**/*.py" = ["UP006", "UP035"]
-"vllm/executor/**/*.py" = ["UP006", "UP035"]
-"vllm/worker/**/*.py" = ["UP006", "UP035"]
 [tool.ruff.lint]
 select = [
@@ -87,7 +69,7 @@ select = [
    # flake8-simplify
    "SIM",
    # isort
-    # "I",
+    "I",
    # flake8-logging-format
    "G",
 ]
@@ -96,29 +78,23 @@ ignore = [
    "F405", "F403",
    # lambda expression assignment
    "E731",
+    # zip without `strict=`
+    "B905",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
 ]
+[tool.ruff.format]
+docstring-code-format = true
 [tool.mypy]
 plugins = ['pydantic.mypy']
 ignore_missing_imports = true
 check_untyped_defs = true
 follow_imports = "silent"
-[tool.isort]
-skip_glob = [
-    ".buildkite/*",
-    "benchmarks/*",
-    "examples/*",
-]
-use_parentheses = true
-skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
    "slow_test",
@@ -126,6 +102,7 @@ markers = [
    "core_model: enable this model test in each PR instead of only nightly",
    "hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
    "cpu_model: enable this model test in CPU tests",
+    "cpu_test: mark test as CPU-only test",
    "split: run this test as part of a split",
    "distributed: run this test only in distributed GPU tests",
    "skip_v1: do not run this test with v1",
@@ -206,6 +183,7 @@ ba = "ba"
 [tool.typos.type.py.extend-words]
 ba = "ba"
+nd = "nd"
 [tool.typos.type.cpp]
 extend-glob = ["*.cu"]

--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -2,9 +2,9 @@
 cmake>=3.26.1
 ninja
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
-torch==2.8.0
+torch==2.9.0
 wheel
 jinja2>=3.1.6
 regex

--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -7,39 +7,38 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.55.2
+transformers >= 4.56.0, < 5
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp
 openai >= 1.99.1  # For Responses API with reasoning content
-pydantic >= 2.11.7
+pydantic >= 2.12.0
 prometheus_client >= 0.18.0
 pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.11.3
-llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
+llguidance >= 1.3.0, < 1.4.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x"
 outlines_core == 0.2.11
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
+xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
-importlib_metadata; python_version < '3.10'
+mistral_common[image] >= 1.8.5
-mistral_common[image,audio] >= 1.5.4 # requires numpy>=1.25 #1.8.2
 opencv-python-headless >= 4.11.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
-setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
+setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.11.0 # required for compressed-tensors
+compressed-tensors == 0.12.2 # required for compressed-tensors
-depyf==0.19.0 # required for profiling and debugging with compilation config
+depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/others/logging_configuration.md
@@ -49,3 +48,5 @@ pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
+anthropic == 0.71.0
+model-hosting-container-standards < 1.0.0
\ No newline at end of file
--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
 cmake>=3.26.1
 ninja
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.8.0+cpu; platform_machine == "x86_64"
+torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
-torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
+torch==2.9.0; platform_system == "Darwin"
+torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
+scons; platform_machine == "aarch64"    # needed to build Arm Compute Library (ACL)
 wheel
 jinja2>=3.1.6
 regex
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
 # Common dependencies
 -r common.txt
-numba == 0.60.0; python_version == '3.9' and platform_machine != "s390x" # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
 # Dependencies for CPUs
 packaging>=24.2
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.8.0+cpu; platform_machine == "x86_64"
+torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
-torch==2.8.0; platform_system == "Darwin"
+torch==2.9.0; platform_system == "Darwin"
 torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch

--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
 # Common dependencies
 -r common.txt
-numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2 # Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9'
 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.8.0
+torch==2.9.0
-torchaudio==2.8.0
+torchaudio==2.9.0
 # These must be updated alongside torch
-torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
+xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.9
-xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.8
+# FlashInfer should be updated together with the Dockerfile
+flashinfer-python==0.5.2
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -9,10 +9,8 @@ mkdocs-git-revision-date-localized-plugin
 mkdocs-minify-plugin
 regex
 ruff
-# Required for argparse hook only
-f https://download.pytorch.org/whl/cpu
-cachetools
-msgspec
 pydantic
-torch
+# For generating argparse docs.
+# Adding requirements here should only be used as a last resort.
+msgspec  # Need for multiple inheritance involving msgspec.Struct
\ No newline at end of file
--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
 lmcache
-nixl >= 0.5.1 # Required for disaggregated prefill
+nixl >= 0.6.0 # Required for disaggregated prefill
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -23,14 +23,14 @@ jiwer # required for audio tests
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.2 # required for voxtral test
+mistral_common[image,audio] >= 1.8.5 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
-transformers==4.52.4
+transformers==4.57.1
-tokenizers==0.21.1
+tokenizers==0.22.0
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes>=0.46.1
@@ -40,10 +40,8 @@ buildkite-test-collector==0.1.9
 genai_perf==0.0.8
 tritonclient==2.51.0
-numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2 # Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9'
 numpy
-runai-model-streamer==0.11.0
+runai-model-streamer[s3,gcs]==0.15.0
-runai-model-streamer-s3==0.11.0
 fastsafetensors>=0.1.10
-pydantic>=2.10 # 2.9 leads to error on python 3.10
+pydantic>=2.12 # 2.11 leads to error on python 3.13
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
 # Common dependencies
 -r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm6.3
+--extra-index-url https://download.pytorch.org/whl/rocm6.4
-torch==2.8.0
+torch==2.9.0
-torchvision==0.23.0
+torchvision==0.24.0
-torchaudio==2.8.0
+torchaudio==2.9.0
-triton==3.3.0
+triton==3.5.0
 cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
 wheel
 jinja2>=3.1.6
-amdsmi==6.2.4
+amdsmi==6.4.3
 timm>=1.0.17
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
 # Common dependencies
 -r common.txt
 tblib==3.1.0
+bm25s==0.2.13
+pystemmer==3.0.0
-# entrypoints test
+# Entrypoints test
 # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
 audioread==3.0.1
 cffi==1.17.1
@@ -15,11 +17,11 @@ soundfile==0.13.1
 soxr==0.5.0.post1
 librosa==0.10.2.post1
-# entrypoints test
+# Entrypoints test
 #vllm[video] # required by entrypoints/openai/test_video.py
 decord==0.6.0
-# entrypoints test
+# Entrypoints test
 #sentence-transformers # required by entrypoints/openai/test_score.py
 sentence-transformers==3.4.1
@@ -29,4 +31,11 @@ matplotlib==3.10.3
 # Multi-Modal Models Test (Extended) 3
 blobfile==3.0.0
+# Required for openai schema test.
+schemathesis==3.39.15
+# Required for mteb test
+mteb[bm25s]>=1.38.11, <2
+# Required for eval tests
+lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
 # Common dependencies
 -r common.txt
-numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2 # Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9'
 # Dependencies for AMD GPUs
-boto3
-botocore
 datasets
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
 peft
@@ -15,7 +12,6 @@ tensorizer==2.10.1
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
-runai-model-streamer==0.11.0
+runai-model-streamer[s3,gcs]==0.15.0
-runai-model-streamer-s3==0.11.0
 # conch-triton-kernels==1.2.1
 timm>=1.0.17
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -24,12 +24,12 @@ soundfile # required for audio tests
 jiwer # required for audio tests
 tblib # for pickling test exceptions
 timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.8.0
+torch==2.9.0
-torchaudio==2.8.0
+torchaudio==2.9.0
-torchvision==0.23.0
+torchvision==0.24.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.2 # required for voxtral test
+mistral_common[image,audio] >= 1.8.5 # required for voxtral test
 num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
@@ -37,8 +37,8 @@ datamodel_code_generator # required for minicpm3 test
 # TODO: Use lm-eval[api]==0.4.10 once released
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
 mteb[bm25s]>=1.38.11, <2 # required for mteb test
-transformers==4.55.2
+transformers==4.57.1
-tokenizers==0.21.1
+tokenizers==0.22.0
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes==0.46.1
@@ -48,12 +48,12 @@ buildkite-test-collector==0.1.9
 genai_perf==0.0.8
 tritonclient==2.51.0
-numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+arctic-inference == 0.1.1 # Required for suffix decoding test
-numba == 0.61.2; python_version > '3.9'
+numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
-runai-model-streamer==0.11.0
+runai-model-streamer[s3,gcs]==0.15.0
-runai-model-streamer-s3==0.11.0
 fastsafetensors>=0.1.10
-pydantic>=2.10 # 2.9 leads to error on python 3.10
+pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0
 terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
+gpt-oss >= 0.0.7; python_version > '3.11'
--- a/requirements/test.txt
+++ b/requirements/test.txt
 # This file was autogenerated by uv via the following command:
-#    uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28
+#    uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
 absl-py==2.1.0
    # via rouge-score
 accelerate==1.0.1
@@ -10,18 +10,19 @@ aenum==3.1.16
    # via lightly
 affine==2.4.0
    # via rasterio
-aiohappyeyeballs==2.4.3
+aiohappyeyeballs==2.6.1
    # via aiohttp
-aiohttp==3.10.11
+aiohttp==3.13.0
    # via
    #   aiohttp-cors
    #   datasets
    #   fsspec
+    #   gpt-oss
    #   lm-eval
    #   ray
 aiohttp-cors==0.8.1
    # via ray
-aiosignal==1.3.1
+aiosignal==1.4.0
    # via aiohttp
 albucore==0.0.16
    # via terratorch
@@ -39,6 +40,8 @@ anyio==4.6.2.post1
    # via
    #   httpx
    #   starlette
+arctic-inference==0.1.1
+    # via -r requirements/test.in
 argcomplete==3.5.1
    # via datamodel-code-generator
 arrow==1.3.0
@@ -72,7 +75,9 @@ blobfile==3.0.0
 bm25s==0.2.13
    # via mteb
 boto3==1.35.57
-    # via tensorizer
+    # via
+    #   runai-model-streamer-s3
+    #   tensorizer
 botocore==1.35.57
    # via
    #   boto3
@@ -101,6 +106,8 @@ chardet==5.2.0
    # via mbstrdecoder
 charset-normalizer==3.4.0
    # via requests
+chz==0.3.0
+    # via gpt-oss
 click==8.1.7
    # via
    #   black
@@ -171,7 +178,9 @@ distlib==0.3.9
 dnspython==2.7.0
    # via email-validator
 docker==7.1.0
-    # via mlflow
+    # via
+    #   gpt-oss
+    #   mlflow
 docopt==0.6.2
    # via num2words
 docstring-parser==0.17.0
@@ -197,7 +206,9 @@ eval-type-backport==0.2.2
 evaluate==0.4.3
    # via lm-eval
 fastapi==0.116.1
-    # via mlflow-skinny
+    # via
+    #   gpt-oss
+    #   mlflow-skinny
 fastparquet==2024.11.0
    # via genai-perf
 fastrlock==0.8.2
@@ -249,13 +260,31 @@ gitdb==4.0.12
 gitpython==3.1.44
    # via mlflow-skinny
 google-api-core==2.24.2
-    # via opencensus
+    # via
+    #   google-cloud-core
+    #   google-cloud-storage
+    #   opencensus
 google-auth==2.40.2
    # via
    #   databricks-sdk
    #   google-api-core
+    #   google-cloud-core
+    #   google-cloud-storage
+    #   runai-model-streamer-gcs
+google-cloud-core==2.4.3
+    # via google-cloud-storage
+google-cloud-storage==3.4.0
+    # via runai-model-streamer-gcs
+google-crc32c==1.7.1
+    # via
+    #   google-cloud-storage
+    #   google-resumable-media
+google-resumable-media==2.7.2
+    # via google-cloud-storage
 googleapis-common-protos==1.70.0
    # via google-api-core
+gpt-oss==0.0.8
+    # via -r requirements/test.in
 graphene==3.4.3
    # via mlflow
 graphql-core==3.2.6
@@ -283,6 +312,8 @@ hf-xet==1.1.7
    # via huggingface-hub
 hiredis==3.0.0
    # via tensorizer
+html2text==2025.4.15
+    # via gpt-oss
 httpcore==1.0.6
    # via httpx
 httpx==0.27.2
@@ -417,6 +448,7 @@ lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b772215
 lxml==5.3.0
    # via
    #   blobfile
+    #   gpt-oss
    #   sacrebleu
 mako==1.3.10
    # via alembic
@@ -444,7 +476,7 @@ mbstrdecoder==1.1.3
    #   typepy
 mdurl==0.1.2
    # via markdown-it-py
-mistral-common==1.8.2
+mistral-common==1.8.5
    # via -r requirements/test.in
 mlflow==2.22.0
    # via terratorch
@@ -543,42 +575,44 @@ numpy==1.26.4
    #   tritonclient
    #   vocos
    #   xarray
-nvidia-cublas-cu12==12.8.4.1
+nvidia-cublas-cu12==12.9.1.4
    # via
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
-nvidia-cuda-cupti-cu12==12.8.90
+nvidia-cuda-cupti-cu12==12.9.79
    # via torch
-nvidia-cuda-nvrtc-cu12==12.8.93
+nvidia-cuda-nvrtc-cu12==12.9.86
    # via torch
-nvidia-cuda-runtime-cu12==12.8.90
+nvidia-cuda-runtime-cu12==12.9.79
    # via torch
 nvidia-cudnn-cu12==9.10.2.21
    # via torch
-nvidia-cufft-cu12==11.3.3.83
+nvidia-cufft-cu12==11.4.1.4
    # via torch
-nvidia-cufile-cu12==1.13.1.3
+nvidia-cufile-cu12==1.14.1.1
    # via torch
-nvidia-curand-cu12==10.3.9.90
+nvidia-curand-cu12==10.3.10.19
    # via torch
-nvidia-cusolver-cu12==11.7.3.90
+nvidia-cusolver-cu12==11.7.5.82
    # via torch
-nvidia-cusparse-cu12==12.5.8.93
+nvidia-cusparse-cu12==12.5.10.65
    # via
    #   nvidia-cusolver-cu12
    #   torch
 nvidia-cusparselt-cu12==0.7.1
    # via torch
-nvidia-nccl-cu12==2.27.3
+nvidia-nccl-cu12==2.27.5
    # via torch
-nvidia-nvjitlink-cu12==12.8.93
+nvidia-nvjitlink-cu12==12.9.86
    # via
    #   nvidia-cufft-cu12
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
-nvidia-nvtx-cu12==12.8.90
+nvidia-nvshmem-cu12==3.3.20
+    # via torch
+nvidia-nvtx-cu12==12.9.79
    # via torch
 omegaconf==2.3.0
    # via
@@ -586,6 +620,8 @@ omegaconf==2.3.0
    #   lightning
 open-clip-torch==2.32.0
    # via -r requirements/test.in
+openai-harmony==0.0.4
+    # via gpt-oss
 opencensus==0.11.4
    # via ray
 opencensus-context==0.1.3
@@ -706,7 +742,9 @@ prometheus-client==0.22.0
    #   opentelemetry-exporter-prometheus
    #   ray
 propcache==0.2.0
-    # via yarl
+    # via
+    #   aiohttp
+    #   yarl
 proto-plus==1.26.1
    # via google-api-core
 protobuf==5.28.3
@@ -749,19 +787,21 @@ pycparser==2.22
    # via cffi
 pycryptodomex==3.22.0
    # via blobfile
-pydantic==2.11.7
+pydantic==2.12.0
    # via
    #   -r requirements/test.in
    #   albumentations
    #   datamodel-code-generator
    #   fastapi
+    #   gpt-oss
    #   lightly
    #   mistral-common
    #   mlflow-skinny
    #   mteb
+    #   openai-harmony
    #   pydantic-extra-types
    #   ray
-pydantic-core==2.33.2
+pydantic-core==2.41.1
    # via pydantic
 pydantic-extra-types==2.10.5
    # via mistral-common
@@ -888,6 +928,8 @@ requests==2.32.3
    #   docker
    #   evaluate
    #   google-api-core
+    #   google-cloud-storage
+    #   gpt-oss
    #   huggingface-hub
    #   lightly
    #   lm-eval
@@ -925,10 +967,12 @@ rsa==4.9.1
    # via google-auth
 rtree==1.4.0
    # via torchgeo
-runai-model-streamer==0.11.0
+runai-model-streamer==0.15.0
-    # via -r requirements/test.in
-runai-model-streamer-s3==0.11.0
    # via -r requirements/test.in
+runai-model-streamer-gcs==0.15.0
+    # via runai-model-streamer
+runai-model-streamer-s3==0.15.0
+    # via runai-model-streamer
 s3transfer==0.10.3
    # via boto3
 sacrebleu==2.4.3
@@ -972,14 +1016,11 @@ sentence-transformers==3.2.1
    # via
    #   -r requirements/test.in
    #   mteb
-sentencepiece==0.2.0
-    # via mistral-common
 setuptools==77.0.3
    # via
    #   lightning-utilities
    #   pytablewriter
    #   torch
-    #   triton
 shapely==2.1.1
    # via
    #   geopandas
@@ -1031,6 +1072,8 @@ starlette-testclient==0.4.1
    # via schemathesis
 statsmodels==0.14.4
    # via genai-perf
+structlog==25.4.0
+    # via gpt-oss
 sympy==1.13.3
    # via
    #   einx
@@ -1043,14 +1086,17 @@ tblib==3.1.0
    # via -r requirements/test.in
 tcolorpy==0.1.6
    # via pytablewriter
-tenacity==9.0.0
+tenacity==9.1.2
    # via
+    #   gpt-oss
    #   lm-eval
    #   plotly
 tensorboardx==2.6.4
    # via lightning
 tensorizer==2.10.1
    # via -r requirements/test.in
+termcolor==3.1.0
+    # via gpt-oss
 terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
    # via -r requirements/test.in
 threadpoolctl==3.5.0
@@ -1059,8 +1105,9 @@ tifffile==2025.3.30
    # via
    #   scikit-image
    #   terratorch
-tiktoken==0.7.0
+tiktoken==0.12.0
    # via
+    #   gpt-oss
    #   lm-eval
    #   mistral-common
 timm==1.0.17
@@ -1070,7 +1117,7 @@ timm==1.0.17
    #   segmentation-models-pytorch
    #   terratorch
    #   torchgeo
-tokenizers==0.21.1
+tokenizers==0.22.0
    # via
    #   -r requirements/test.in
    #   transformers
@@ -1078,7 +1125,7 @@ tomli==2.2.1
    # via schemathesis
 tomli-w==1.2.0
    # via schemathesis
-torch==2.8.0+cu128
+torch==2.9.0+cu129
    # via
    #   -r requirements/test.in
    #   accelerate
@@ -1107,7 +1154,7 @@ torch==2.8.0+cu128
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.8.0+cu128
+torchaudio==2.9.0+cu129
    # via
    #   -r requirements/test.in
    #   encodec
@@ -1120,7 +1167,7 @@ torchmetrics==1.7.4
    #   pytorch-lightning
    #   terratorch
    #   torchgeo
-torchvision==0.23.0+cu128
+torchvision==0.24.0+cu129
    # via
    #   -r requirements/test.in
    #   lightly
@@ -1151,7 +1198,7 @@ tqdm==4.66.6
    #   transformers
 tqdm-multiprocess==0.0.11
    # via lm-eval
-transformers==4.55.2
+transformers==4.57.1
    # via
    #   -r requirements/test.in
    #   genai-perf
@@ -1161,7 +1208,7 @@ transformers==4.55.2
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.4.0
+triton==3.5.0
    # via torch
 tritonclient==2.51.0
    # via
@@ -1178,10 +1225,12 @@ types-python-dateutil==2.9.0.20241206
    # via arrow
 typeshed-client==2.8.2
    # via jsonargparse
-typing-extensions==4.12.2
+typing-extensions==4.15.0
    # via
+    #   aiosignal
    #   albumentations
    #   alembic
+    #   chz
    #   fastapi
    #   graphene
    #   huggingface-hub
@@ -1205,7 +1254,7 @@ typing-extensions==4.12.2
    #   typer
    #   typeshed-client
    #   typing-inspection
-typing-inspection==0.4.1
+typing-inspection==0.4.2
    # via pydantic
 tzdata==2024.2
    # via pandas
@@ -1221,7 +1270,9 @@ urllib3==2.2.3
    #   responses
    #   tritonclient
 uvicorn==0.35.0
-    # via mlflow-skinny
+    # via
+    #   gpt-oss
+    #   mlflow-skinny
 vector-quantize-pytorch==1.21.2
    # via -r requirements/test.in
 virtualenv==20.31.2

--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -5,15 +5,14 @@ ray>=2.9
 cmake>=3.26.1
 packaging>=24.2
 setuptools-scm>=8
-setuptools>=77.0.3,<80.0.0
+setuptools>=77.0.3,<81.0.0
 wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
-numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2 # Required for N-gram speculative decoding
-nixl==0.3.0 # for PD disaggregation
 torch==2.8.0+xpu
 torchaudio
 torchvision
 --extra-index-url=https://download.pytorch.org/whl/xpu
-intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
+intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
--- a/setup.py
+++ b/setup.py
@@ -34,32 +34,36 @@ logger = logging.getLogger(__name__)
 # cannot import envs directly because it depends on vllm,
 #  which is not installed yet
-envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
+envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py"))
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
-    logger.warning(
+    logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
-        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
    VLLM_TARGET_DEVICE = "cpu"
-elif not (sys.platform.startswith("linux")
+elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")):
-          or sys.platform.startswith("darwin")):
    logger.warning(
        "vLLM only supports Linux platform (including WSL) and MacOS."
        "Building on %s, "
-        "so vLLM may not be able to run correctly", sys.platform)
+        "so vLLM may not be able to run correctly",
+        sys.platform,
+    )
    VLLM_TARGET_DEVICE = "empty"
-elif (sys.platform.startswith("linux") and torch.version.cuda is None
+elif (
-      and os.getenv("VLLM_TARGET_DEVICE") is None
+    sys.platform.startswith("linux")
-      and torch.version.hip is None):
+    and torch.version.cuda is None
+    and os.getenv("VLLM_TARGET_DEVICE") is None
+    and torch.version.hip is None
+):
    # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
    # fallback to cpu
    VLLM_TARGET_DEVICE = "cpu"
 def is_sccache_available() -> bool:
-    return which("sccache") is not None and \
+    return which("sccache") is not None and not bool(
-        not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0")))
+        int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))
+    )
 def is_ccache_available() -> bool:
@@ -83,8 +87,7 @@ def is_url_available(url: str) -> bool:
 class CMakeExtension(Extension):
+    def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
-    def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
        super().__init__(name, sources=[], py_limited_api=True, **kwa)
        self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)
@@ -121,8 +124,8 @@ class cmake_build_ext(build_ext):
            if nvcc_threads is not None:
                nvcc_threads = int(nvcc_threads)
                logger.info(
-                    "Using NVCC_THREADS=%d as the number of nvcc threads.",
+                    "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
-                    nvcc_threads)
+                )
            else:
                nvcc_threads = 1
            num_jobs = max(1, num_jobs // nvcc_threads)
@@ -146,36 +149,36 @@ class cmake_build_ext(build_ext):
        cfg = envs.CMAKE_BUILD_TYPE or default_cfg
        cmake_args = [
-            '-DCMAKE_BUILD_TYPE={}'.format(cfg),
+            "-DCMAKE_BUILD_TYPE={}".format(cfg),
-            '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
+            "-DVLLM_TARGET_DEVICE={}".format(VLLM_TARGET_DEVICE),
        ]
        verbose = envs.VERBOSE
        if verbose:
-            cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
+            cmake_args += ["-DCMAKE_VERBOSE_MAKEFILE=ON"]
        if is_sccache_available():
            cmake_args += [
-                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
+                "-DCMAKE_C_COMPILER_LAUNCHER=sccache",
-                '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
+                "-DCMAKE_CXX_COMPILER_LAUNCHER=sccache",
-                '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
+                "-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache",
-                '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
+                "-DCMAKE_HIP_COMPILER_LAUNCHER=sccache",
            ]
        elif is_ccache_available():
            cmake_args += [
-                '-DCMAKE_C_COMPILER_LAUNCHER=ccache',
+                "-DCMAKE_C_COMPILER_LAUNCHER=ccache",
-                '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
+                "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
-                '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
+                "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache",
-                '-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
+                "-DCMAKE_HIP_COMPILER_LAUNCHER=ccache",
            ]
        # Pass the python executable to cmake so it can find an exact
        # match.
-        cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
+        cmake_args += ["-DVLLM_PYTHON_EXECUTABLE={}".format(sys.executable)]
        # Pass the python path to cmake so it can reuse the build dependencies
        # on subsequent calls to python.
-        cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
+        cmake_args += ["-DVLLM_PYTHON_PATH={}".format(":".join(sys.path))]
        # Override the base directory for FetchContent downloads to $ROOT/.deps
        # This allows sharing dependencies between profiles,
@@ -183,7 +186,7 @@ class cmake_build_ext(build_ext):
        # To override this, set the FETCHCONTENT_BASE_DIR environment variable.
        fc_base_dir = os.path.join(ROOT_DIR, ".deps")
        fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
-        cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]
+        cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]
        #
        # Setup parallelism and build tool
@@ -191,30 +194,38 @@ class cmake_build_ext(build_ext):
        num_jobs, nvcc_threads = self.compute_num_jobs()
        if nvcc_threads:
-            cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)]
+            cmake_args += ["-DNVCC_THREADS={}".format(nvcc_threads)]
        if is_ninja_available():
-            build_tool = ['-G', 'Ninja']
+            build_tool = ["-G", "Ninja"]
            cmake_args += [
-                '-DCMAKE_JOB_POOL_COMPILE:STRING=compile',
+                "-DCMAKE_JOB_POOL_COMPILE:STRING=compile",
-                '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs),
+                "-DCMAKE_JOB_POOLS:STRING=compile={}".format(num_jobs),
            ]
        else:
            # Default build tool to whatever cmake picks.
            build_tool = []
        # Make sure we use the nvcc from CUDA_HOME
        if _is_cuda():
-            cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc']
+            cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
+        elif _is_hip():
+            cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
+        other_cmake_args = os.environ.get("CMAKE_ARGS")
+        if other_cmake_args:
+            cmake_args += other_cmake_args.split()
        subprocess.check_call(
-            ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
+            ["cmake", ext.cmake_lists_dir, *build_tool, *cmake_args],
-            cwd=self.build_temp)
+            cwd=self.build_temp,
+        )
    def build_extensions(self) -> None:
        # Ensure that CMake is present and working
        try:
-            subprocess.check_output(['cmake', '--version'])
+            subprocess.check_output(["cmake", "--version"])
        except OSError as e:
-            raise RuntimeError('Cannot find CMake executable') from e
+            raise RuntimeError("Cannot find CMake executable") from e
        # Create build directory if it does not exist.
        if not os.path.exists(self.build_temp):
@@ -253,13 +264,18 @@ class cmake_build_ext(build_ext):
            # CMake appends the extension prefix to the install path,
            # and outdir already contains that prefix, so we need to remove it.
            prefix = outdir
-            for _ in range(ext.name.count('.')):
+            for _ in range(ext.name.count(".")):
                prefix = prefix.parent
            # prefix here should actually be the same for all components
            install_args = [
-                "cmake", "--install", ".", "--prefix", prefix, "--component",
+                "cmake",
-                target_name(ext.name)
+                "--install",
+                ".",
+                "--prefix",
+                prefix,
+                "--component",
+                target_name(ext.name),
            ]
            subprocess.check_call(install_args, cwd=self.build_temp)
@@ -270,12 +286,15 @@ class cmake_build_ext(build_ext):
        # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
        # directory so that they can be included in the editable build
        import glob
-        files = glob.glob(os.path.join(self.build_lib, "vllm",
-                                       "vllm_flash_attn", "**", "*.py"),
+        files = glob.glob(
-                          recursive=True)
+            os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "**", "*.py"),
+            recursive=True,
+        )
        for file in files:
-            dst_file = os.path.join("vllm/vllm_flash_attn",
+            dst_file = os.path.join(
-                                    file.split("vllm/vllm_flash_attn/")[-1])
+                "vllm/vllm_flash_attn", file.split("vllm/vllm_flash_attn/")[-1]
+            )
            print(f"Copying {file} to {dst_file}")
            os.makedirs(os.path.dirname(dst_file), exist_ok=True)
            self.copy_file(file, dst_file)
@@ -285,8 +304,7 @@ class precompiled_build_ext(build_ext):
    """Disables extension building when using precompiled binaries."""
    def run(self) -> None:
-        assert _is_cuda(
+        assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
-        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
    def build_extensions(self) -> None:
        print("Skipping build_ext: using precompiled extensions.")
@@ -307,9 +325,9 @@ class precompiled_wheel_utils:
                wheel_filename = wheel_url_or_path.split("/")[-1]
                temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
                wheel_path = os.path.join(temp_dir, wheel_filename)
-                print(f"Downloading wheel from {wheel_url_or_path} "
+                print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}")
-                      f"to {wheel_path}")
                from urllib.request import urlretrieve
                urlretrieve(wheel_url_or_path, filename=wheel_path)
            else:
                wheel_path = wheel_url_or_path
@@ -330,25 +348,29 @@ class precompiled_wheel_utils:
                ]
                compiled_regex = re.compile(
-                    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
+                    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
+                )
                file_members = list(
-                    filter(lambda x: x.filename in files_to_copy,
+                    filter(lambda x: x.filename in files_to_copy, wheel.filelist)
-                           wheel.filelist))
+                )
                file_members += list(
-                    filter(lambda x: compiled_regex.match(x.filename),
+                    filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
-                           wheel.filelist))
+                )
                for file in file_members:
                    print(f"[extract] {file.filename}")
                    target_path = os.path.join(".", file.filename)
                    os.makedirs(os.path.dirname(target_path), exist_ok=True)
-                    with wheel.open(file.filename) as src, open(
+                    with (
-                            target_path, "wb") as dst:
+                        wheel.open(file.filename) as src,
+                        open(target_path, "wb") as dst,
+                    ):
                        shutil.copyfileobj(src, dst)
                    pkg = os.path.dirname(file.filename).replace("/", ".")
                    package_data_patch.setdefault(pkg, []).append(
-                        os.path.basename(file.filename))
+                        os.path.basename(file.filename)
+                    )
            return package_data_patch
        finally:
@@ -364,10 +386,13 @@ class precompiled_wheel_utils:
        try:
            # Get the latest commit hash of the upstream main branch.
-            resp_json = subprocess.check_output([
+            resp_json = subprocess.check_output(
-                "curl", "-s",
+                [
-                "https://api.github.com/repos/vllm-project/vllm/commits/main"
+                    "curl",
-            ]).decode("utf-8")
+                    "-s",
+                    "https://api.github.com/repos/vllm-project/vllm/commits/main",
+                ]
+            ).decode("utf-8")
            upstream_main_commit = json.loads(resp_json)["sha"]
            # In Docker build context, .git may be immutable or missing.
@@ -377,25 +402,32 @@ class precompiled_wheel_utils:
            # Check if the upstream_main_commit exists in the local repo
            try:
                subprocess.check_output(
-                    ["git", "cat-file", "-e", f"{upstream_main_commit}"])
+                    ["git", "cat-file", "-e", f"{upstream_main_commit}"]
+                )
            except subprocess.CalledProcessError:
                # If not present, fetch it from the remote repository.
                # Note that this does not update any local branches,
                # but ensures that this commit ref and its history are
                # available in our local repo.
-                subprocess.check_call([
+                subprocess.check_call(
-                    "git", "fetch", "https://github.com/vllm-project/vllm",
+                    ["git", "fetch", "https://github.com/vllm-project/vllm", "main"]
-                    "main"
+                )
-                ])
            # Then get the commit hash of the current branch that is the same as
            # the upstream main commit.
-            current_branch = subprocess.check_output(
+            current_branch = (
-                ["git", "branch", "--show-current"]).decode("utf-8").strip()
+                subprocess.check_output(["git", "branch", "--show-current"])
+                .decode("utf-8")
+                .strip()
+            )
-            base_commit = subprocess.check_output([
+            base_commit = (
-                "git", "merge-base", f"{upstream_main_commit}", current_branch
+                subprocess.check_output(
-            ]).decode("utf-8").strip()
+                    ["git", "merge-base", f"{upstream_main_commit}", current_branch]
+                )
+                .decode("utf-8")
+                .strip()
+            )
            return base_commit
        except ValueError as err:
            raise ValueError(err) from None
@@ -403,7 +435,9 @@ class precompiled_wheel_utils:
            logger.warning(
                "Failed to get the base commit in the main branch. "
                "Using the nightly wheel. The libraries in this "
-                "wheel may not be compatible with your dev branch: %s", err)
+                "wheel may not be compatible with your dev branch: %s",
+                err,
+            )
            return "nightly"
@@ -413,12 +447,13 @@ def _no_device() -> bool:
 def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
-    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu())
+    return VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu()
 def _is_hip() -> bool:
-    return (VLLM_TARGET_DEVICE == "cuda"
+    return (
-            or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None
+        VLLM_TARGET_DEVICE == "cuda" or VLLM_TARGET_DEVICE == "rocm"
+    ) and torch.version.hip is not None
 def _is_tpu() -> bool:
@@ -457,8 +492,12 @@ def get_rocm_version():
        minor = ctypes.c_uint32()
        patch = ctypes.c_uint32()
-        if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
+        if (
-                                  ctypes.byref(patch)) == 0):
+            get_rocm_core_version(
+                ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)
+            )
+            == 0
+        ):
            return f"{major.value}.{minor.value}.{patch.value}"
        return None
    except Exception:
@@ -471,8 +510,9 @@ def get_nvcc_cuda_version() -> Version:
    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    assert CUDA_HOME is not None, "CUDA_HOME is not set"
-    nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"],
+    nvcc_output = subprocess.check_output(
-                                          universal_newlines=True)
+        [CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True
+    )
    output = nvcc_output.split()
    release_idx = output.index("release") + 1
    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
@@ -484,18 +524,31 @@ def get_gaudi_sw_version():
    Returns the driver version.
    """
    # Enable console printing for `hl-smi` check
-    output = subprocess.run("hl-smi",
+    output = subprocess.run(
-                            shell=True,
+        "hl-smi",
-                            text=True,
+        shell=True,
-                            capture_output=True,
+        text=True,
-                            env={"ENABLE_CONSOLE": "true"})
+        capture_output=True,
+        env={"ENABLE_CONSOLE": "true"},
+    )
    if output.returncode == 0 and output.stdout:
-        return output.stdout.split("\n")[2].replace(
+        return (
-            " ", "").split(":")[1][:-1].split("-")[0]
+            output.stdout.split("\n")[2]
+            .replace(" ", "")
+            .split(":")[1][:-1]
+            .split("-")[0]
+        )
    return "0.0.0"  # when hl-smi is not available
 def get_vllm_version() -> str:
+    # Allow overriding the version. This is useful to build platform-specific
+    # wheels (e.g. CPU, TPU) without modifying the source.
+    if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
+        print(f"Overriding VLLM version with {env_version} from VLLM_VERSION_OVERRIDE")
+        os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = env_version
+        return get_version(write_to="vllm/_version.py")
    version = get_version(write_to="vllm/_version.py")
    sep = "+" if "+" not in version else "."  # dev versions might contain +
@@ -541,8 +594,11 @@ def get_requirements() -> list[str]:
        for line in requirements:
            if line.startswith("-r "):
                resolved_requirements += _read_requirements(line.split()[1])
-            elif not line.startswith("--") and not line.startswith(
+            elif (
-                    "#") and line.strip() != "":
+                not line.startswith("--")
+                and not line.startswith("#")
+                and line.strip() != ""
+            ):
                resolved_requirements.append(line)
        return resolved_requirements
@@ -553,7 +609,7 @@ def get_requirements() -> list[str]:
        cuda_major, cuda_minor = torch.version.cuda.split(".")
        modified_requirements = []
        for req in requirements:
-            if ("vllm-flash-attn" in req and cuda_major != "12"):
+            if "vllm-flash-attn" in req and cuda_major != "12":
                # vllm-flash-attn is built only for CUDA 12.x.
                # Skip for other versions.
                continue
@@ -568,8 +624,7 @@ def get_requirements() -> list[str]:
    elif _is_xpu():
        requirements = _read_requirements("xpu.txt")
    else:
-        raise ValueError(
+        raise ValueError("Unsupported platform, please use CUDA, ROCm, or CPU.")
-            "Unsupported platform, please use CUDA, ROCm, or CPU.")
    return requirements
@@ -577,6 +632,7 @@ ext_modules = []
 if _is_cuda() or _is_hip():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))
+    ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
 # if _is_hip():
 #     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
@@ -585,15 +641,13 @@ if _is_cuda():
    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
        # FA3 requires CUDA 12.3 or later
-        ext_modules.append(
+        ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
-            CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
        # Optional since this doesn't get built (produce an .so file) when
        # not targeting a hopper system
+        ext_modules.append(CMakeExtension(name="vllm._flashmla_C", optional=True))
        ext_modules.append(
-            CMakeExtension(name="vllm._flashmla_C", optional=True))
+            CMakeExtension(name="vllm._flashmla_extension_C", optional=True)
-        ext_modules.append(
+        )
-            CMakeExtension(name="vllm._flashmla_extension_C", optional=True))
-    ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))
@@ -614,6 +668,7 @@ if envs.VLLM_USE_PRECOMPILED:
        wheel_url = wheel_location
    else:
        import platform
        arch = platform.machine()
        if arch == "x86_64":
            wheel_tag = "manylinux1_x86_64"
@@ -623,8 +678,11 @@ if envs.VLLM_USE_PRECOMPILED:
            raise ValueError(f"Unsupported architecture: {arch}")
        base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
        wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
-        nightly_wheel_url = f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
+        nightly_wheel_url = (
+            f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
+        )
        from urllib.request import urlopen
        try:
            with urlopen(wheel_url) as resp:
                if resp.status != 200:
@@ -633,8 +691,7 @@ if envs.VLLM_USE_PRECOMPILED:
            print(f"[warn] Falling back to nightly wheel: {e}")
            wheel_url = nightly_wheel_url
-    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
+    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
-        wheel_url)
    for pkg, files in patch.items():
        package_data.setdefault(pkg, []).extend(files)
@@ -645,8 +702,9 @@ if not ext_modules:
    cmdclass = {}
 else:
    cmdclass = {
-        "build_ext":
+        "build_ext": precompiled_build_ext
-        precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
+        if envs.VLLM_USE_PRECOMPILED
+        else cmake_build_ext
    }
 setup(
@@ -655,18 +713,17 @@ setup(
    ext_modules=ext_modules,
    install_requires=get_requirements(),
    extras_require={
-        "bench": ["pandas", "datasets"],
+        "bench": ["pandas", "matplotlib", "seaborn", "datasets"],
        "tensorizer": ["tensorizer==2.10.1"],
        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
-        "runai": [
+        "runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
-            "runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs",
+        "audio": [
-            "google-cloud-storage", "runai-model-streamer-s3", "boto3"
+            "librosa",
-        ],
+            "soundfile",
-        "audio": ["librosa", "soundfile",
+            "mistral_common[audio]",
-                  "mistral_common[audio]"],  # Required for audio processing
+        ],  # Required for audio processing
        "video": [],  # Kept for backwards compatibility
-        # FlashInfer should be updated together with the Dockerfile
+        "flashinfer": [],  # Kept for backwards compatibility
-        "flashinfer": ["flashinfer-python==0.3.1"],
        # Optional deps for AMD FP4 quantization support
        "petit-kernel": ["petit-kernel"],
    },