Merge tag 'v0.9.2' into v0.9.2-ori

99324e25 · zhuwenwen · cc7f22a8 · a5dd03c1 · 99324e25 · 99324e25
Commit 99324e25 authored Jul 12, 2025 by zhuwenwen
20 changed files
--- a/examples/others/tensorize_vllm_model.py
+++ b/examples/others/tensorize_vllm_model.py
@@ -202,7 +202,7 @@ def parse_args():



-def deserialize():
+def deserialize(args, tensorizer_config):
    if args.lora_path:
        tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
        llm = LLM(model=args.model,
@@ -242,7 +242,7 @@ def deserialize():
    return llm


-if __name__ == '__main__':
+def main():
    args = parse_args()

    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
@@ -260,8 +260,6 @@ if __name__ == '__main__':

    model_ref = args.model

-    model_name = model_ref.split("/")[1]
-
    if args.command == "serialize" or args.command == "deserialize":
        keyfile = args.keyfile
    else:
@@ -309,6 +307,10 @@ if __name__ == '__main__':
                encryption_keyfile = keyfile,
                **credentials
            )
-        deserialize()
+        deserialize(args, tensorizer_config)
    else:
        raise ValueError("Either serialize or deserialize must be specified.")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/tool_chat_template_minimax_m1.jinja
+++ b/examples/tool_chat_template_minimax_m1.jinja
+{{ '<begin_of_document>' -}}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- Extract system message #}
+{% set ns = namespace(system_prompt='') -%}
+{%- if messages[0]['role'] == 'system' %}
+    {%- if messages[0]['content'] is string %}
+        {%- set ns.system_prompt = messages[0]['content']|trim %}
+    {%- else %}
+        {%- set ns.system_prompt = messages[0]['content'][0]['text']|trim %}
+    {%- endif %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- if tools is not none %}
+        {%- set ns.system_prompt = "You are a helpful assistant created by Minimax based on MiniMax-M1 model." %}
+    {%- else %}
+        {%- set ns.system_prompt = "You are a helpful assistant created by Minimax based on MiniMax-M1 model." %}
+    {%- endif %}
+{%- endif %}
+
+{#- System message #}
+{%- if ns.system_prompt != '' %}
+{{ '<beginning_of_sentence>system ai_setting=assistant\n' + ns.system_prompt + '<end_of_sentence>\n' -}}
+{%- endif %}
+
+{#- Tools configuration #}
+{%- if tools is not none %}
+{{ '<beginning_of_sentence>system tool_setting=tools\nYou are provided with these tools:\n<tools>\n' -}}
+{%- for tool in tools %}
+{{ tool | tojson ~ '\n' -}}
+{%- endfor %}
+{{ '</tools>\n\nIf you need to call tools, please respond with <tool_calls></tool_calls> XML tags, and provide tool-name and json-object of arguments, following the format below:\n<tool_calls>\n{"name": <tool-name>, "arguments": <args-json-object>}\n...\n</tool_calls><end_of_sentence>\n' -}}
+{%- endif %}
+
+{#- Process messages #}
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {%- if message['role'] == 'user' %}
+{{ '<beginning_of_sentence>user name=user\n' -}}
+{%- if message['content'] is string %}
+{{ message['content']|trim -}}
+{%- else %}
+{%- for content in message['content'] %}
+{%- if content['type'] == 'text' %}
+{{ content['text']|trim -}}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+{{ '<end_of_sentence>\n' -}}
+        {%- elif message['role'] == 'assistant' %}
+{{ '<beginning_of_sentence>ai name=assistant\n' -}}
+{%- if message['content'] is string %}
+{{ message['content']|trim -}}
+{%- else %}
+{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}
+{{ content['text']|trim -}}
+{%- endfor %}
+{%- endif %}
+{{ '<end_of_sentence>\n' -}}
+        {%- endif %}
+    {%- elif 'tool_calls' in message %}
+{{ '<beginning_of_sentence>ai name=assistant\n<tool_calls>\n' -}}
+{%- for tool_call in message.tool_calls %}
+{{ '{"name": "' + tool_call.function.name + '", "arguments": ' + tool_call.function.arguments | tojson + '}\n' -}}
+{%- endfor %}
+{{ '</tool_calls><end_of_sentence>\n' -}}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+{{ '<beginning_of_sentence>tool name=tools\n' -}}
+{%- if message.content is string %}
+{{ 'tool result: ' + message.content + '\n\n' -}}
+{%- else %}
+{%- for content in message['content'] %}
+{%- if content['type'] == 'text' %}
+{{ 'tool result: ' + content['text'] + '\n\n' -}}
+{%- elif content.get('name') %}
+{{ 'tool name: ' + content['name'] + '\ntool result: ' + content['text'] + '\n\n' -}}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+{{ '<end_of_sentence>\n' -}}
+    {%- endif %}
+{%- endfor %}
+
+{%- if add_generation_prompt %}
+{{ '<beginning_of_sentence>ai name=assistant\n' -}}
+{%- endif %}
\ No newline at end of file
--- a/examples/tool_chat_template_xlam_llama.jinja
+++ b/examples/tool_chat_template_xlam_llama.jinja
+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- Extract system message #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content'] | trim %}
+    {%- set messages = messages[1:] %}
+    {{- system_message + "\n" }}
+{%- else %}
+    {%- set system_message = "You are a helpful assistant. You are developed by Salesforce xLAM team." %}
+    {% set format_instruction %}You have access to a set of tools. When using tools, make calls in a single JSON array: 
+
+[{"name": "tool_call_name", "arguments": {"arg1": "value1", "arg2": "value2"}}, ... (additional parallel tool calls as needed)]
+
+If no tool is suitable, state that explicitly. If the user's input lacks required parameters, ask for clarification. Do not interpret or respond until tool results are returned. Once they are available, process them or make additional calls if needed. For tasks that don't require tools, such as casual conversation or general advice, respond directly in plain text. The available tools are:{% endset %}
+    {{- system_message + "\n" }}
+    {%- if tools is not none %}
+        {{- format_instruction + "\n\n" }}
+    {%- endif %}
+{%- endif %}
+
+
+{%- if tools is not none %}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- "<|eot_id|>" }}
+
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+        {%- if message['tool_calls'] %}
+            {{- "[" }}
+            {%- for tool_call_function in message.tool_calls %}
+                {%- set tool_call = tool_call_function.function %}
+                {{- '{"name": "' + tool_call.name + '", ' }}
+                {{- '"arguments": ' }}
+                {{- tool_call.arguments | tojson }}
+                {{- "}" }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+            {%- endfor %}
+            {{- "]" }}
+            {{- "<|eot_id|>" }}
+        {%- elif message['content'] %}
+            {{- message['content'] | trim + '<|eot_id|>' }}
+        {%- else %}
+            {{- "[]\n" + '<|eot_id|>' }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>" + "ipython" + "<|end_header_id|>\n\n" }}
+        {%- set content = message["content"] %}
+        {%- if content is mapping or (content is iterable and content is not string) %}
+            {{- content | tojson }}
+        {%- else %}
+            {{- content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
\ No newline at end of file
--- a/examples/tool_chat_template_xlam_qwen.jinja
+++ b/examples/tool_chat_template_xlam_qwen.jinja
+{# System message #}
+{{- "<|im_start|>system\n" }}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content'] | trim %}
+    {%- set messages = messages[1:] %}
+    {{- system_message + "\n" }}
+{%- else %}
+    {%- set system_message = "You are a helpful assistant. You are developed by Salesforce xLAM team." %}
+    {% set format_instruction %}You have access to a set of tools. When using tools, make calls in a single JSON array: 
+
+[{"name": "tool_call_name", "arguments": {"arg1": "value1", "arg2": "value2"}}, ... (additional parallel tool calls as needed)]
+
+If no tool is suitable, state that explicitly. If the user's input lacks required parameters, ask for clarification. Do not interpret or respond until tool results are returned. Once they are available, process them or make additional calls if needed. For tasks that don't require tools, such as casual conversation or general advice, respond directly in plain text. The available tools are:{% endset %}
+    {{- system_message + "\n" }}
+    {%- if tools is not none %}
+        {{- format_instruction + "\n\n" }}
+    {%- endif %}
+{%- endif %}
+
+{%- if tools is not none %}
+    {%- for func in tools %}
+        {{- func | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- "<|im_end|>\n" }}
+{%- for message in messages %}
+    {%- if message['role'] == 'tool' %}
+        {{- "<|im_start|>tool\n" }}
+        {%- if message.content is defined and message.content.content is defined %}
+            {%- set content = message.content.content %}
+        {%- else %}
+            {%- set content = message.content %}
+        {%- endif %}
+        {%- if content is mapping or content is iterable and content is not string %}
+            {{- content | tojson }}
+        {%- else %}
+            {{- content }}
+        {%- endif %}
+        {{- "<|im_end|>\n" }}
+    {%- elif 'tool_calls' in message %}
+        {{- "<|im_start|>assistant\n" }}
+        {%- if message['tool_calls'] %}
+            {{- "[" }}
+            {%- for tool_call in message.tool_calls %}
+                {%- set out = tool_call.function | tojson %}
+                {{- out }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+            {%- endfor %}
+            {{- "]"}}
+        {%- elif message['content'] %}
+            {{- message['content'] | trim }}
+        {%- else %}
+            {{- "[]\n" }}
+        {%- endif %}
+        {{- "<|im_end|>\n" }}
+    {%- else %}
+        {{- "<|im_start|>" + message['role'] + "\n" + message['content'] | trim + "<|im_end|>\n" }}
+    {%- endif %}
+{%- endfor %}
+
+{%- if add_generation_prompt %}
+    {{- "<|im_start|>assistant\n" }}
+{%- endif %}
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
 site_name: vLLM
 site_url: https://docs.vllm.ai
 repo_url: https://github.com/vllm-project/vllm
+edit_uri: edit/main/docs/
 exclude_docs: |
  *.inc.md
  *.template.md
@@ -29,10 +30,12 @@ theme:
        icon: material/brightness-2
        name: Switch to system preference
  features:
+    - content.action.edit
    - content.code.copy
    - content.tabs.link
    - navigation.tracking
    - navigation.tabs
+    - navigation.tabs.sticky
    - navigation.sections
    - navigation.prune
    - navigation.top
@@ -123,6 +126,8 @@ extra_css:
 extra_javascript:
  - mkdocs/javascript/run_llm_widget.js
  - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
+  - mkdocs/javascript/edit_and_feedback.js
+  - mkdocs/javascript/slack_and_forum.js

 # Makes the url format end in .html rather than act as a dir
 # So index.md generates as index.html and is available under URL /index.html

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,7 +76,7 @@ line-length = 80
 "vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
 "vllm/worker/**/*.py" = ["UP006", "UP035"]
 # Python 3.8 typing - skip utils for ROCm
-"vllm/utils.py" = ["UP006", "UP035"]
+"vllm/utils/__init__.py" = ["UP006", "UP035"]

 [tool.ruff.lint]
 select = [
@@ -137,10 +137,6 @@ exclude = [
    'vllm/attention/ops/.*\.py$'
 ]

-[tool.codespell]
-ignore-words-list = "dout, te, indicies, subtile, ElementE"
-skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
-
 [tool.isort]
 skip_glob = [
    ".buildkite/*",
@@ -154,6 +150,7 @@ skip_gitignore = true
 markers = [
    "skip_global_cleanup",
    "core_model: enable this model test in each PR instead of only nightly",
+    "hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
    "cpu_model: enable this model test in CPU tests",
    "split: run this test as part of a split",
    "distributed: run this test only in distributed GPU tests",

--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -8,12 +8,12 @@ tqdm
 blake3
 py-cpuinfo
 transformers >= 4.51.1
-huggingface-hub[hf_xet] >= 0.32.0  # Required for Xet downloads.
+huggingface-hub[hf_xet] >= 0.33.0  # Required for Xet downloads.
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp
-openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
+openai >= 1.52.0, <= 1.90.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
 pydantic >= 2.10
 prometheus_client >= 0.18.0
 pillow  # Required for image processing
@@ -23,7 +23,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
 llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
 outlines == 0.1.11
 lark == 1.2.2
-xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64"
+xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
@@ -31,20 +31,17 @@ pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
 importlib_metadata; python_version < '3.10'
-mistral_common[opencv] >= 1.5.4
+mistral_common[opencv] >= 1.6.2
 opencv-python-headless >= 4.11.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.10.1 # required for compressed-tensors
+compressed-tensors == 0.10.2 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/others/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
-opentelemetry-sdk>=1.26.0  # vllm.tracing
-opentelemetry-api>=1.26.0  # vllm.tracing
-opentelemetry-exporter-otlp>=1.26.0  # vllm.tracing
-opentelemetry-semantic-conventions-ai>=0.4.1  # vllm.tracing
+pybase64 # fast base64 implementation
--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
+# Temporarily used for the x86 CPU backend to avoid the performance regression in torch>2.6.0+cpu;
+# see https://github.com/pytorch/pytorch/pull/151218
+cmake>=3.26.1
+ninja
+packaging>=24.2
+setuptools>=77.0.3,<80.0.0
+setuptools-scm>=8
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.6.0+cpu
+wheel
+jinja2>=3.1.6
+regex
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.7.0+cpu; platform_machine == "x86_64"
+torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has a performance regression on x86 platforms, see https://github.com/pytorch/pytorch/pull/151218
 torch==2.7.0; platform_system == "Darwin"
 torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

@@ -21,11 +21,9 @@ torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
 torchvision==0.22.0; platform_machine == "ppc64le"
 datasets # for benchmark scripts

-# cpu cannot use triton 3.3.0
-triton==3.2.0; platform_machine == "x86_64"
-
 # Intel Extension for PyTorch, only for x86_64 CPUs
 intel-openmp==2024.2.1; platform_machine == "x86_64"
-intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
+intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has a performance regression on x86 platforms, see https://github.com/pytorch/pytorch/pull/151218
 py-libnuma; platform_system != "Darwin"
 psutil; platform_system != "Darwin"
+triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
+lmcache
\ No newline at end of file
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
-# Dependency that able to run entrypoints test
-# pytest and its extensions
+# testing
 pytest
-pytest-asyncio
+tensorizer>=2.9.0
 pytest-forked
-pytest-mock
+pytest-asyncio
 pytest-rerunfailures
 pytest-shard
 pytest-timeout

-librosa # required by audio tests in entrypoints/openai
-sentence-transformers # required for embedding tests
-transformers==4.51.3
-transformers_stream_generator # required for qwen-vl test
-numba == 0.61.2; python_version > '3.9'
 # testing utils
-boto3
-botocore
-datasets
-ray >= 2.10.0
+backoff # required for phi4mm test
+blobfile # required for kimi-vl test
+einops # required for MPT, qwen-vl and Mamba
+httpx
+librosa # required for audio tests
+vocos # required for minicpmo_26 test
 peft
-runai-model-streamer==0.11.0
-runai-model-streamer-s3==0.11.0
-tensorizer>=2.9.0
-lm-eval==0.4.8
-buildkite-test-collector==0.1.9
+pqdm
+ray[cgraph,default]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
+sentence-transformers # required for embedding tests
+soundfile # required for audio tests
+jiwer # required for audio tests
+timm # required for internvl test
+transformers_stream_generator # required for qwen-vl test
+matplotlib # required for qwen-vl test
+mistral_common[opencv] >= 1.6.2 # required for pixtral test
+num2words # required for smolvlm test
+opencv-python-headless >= 4.11.0 # required for video test
+datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
+mteb>=1.38.11, <2 # required for mteb test
+transformers==4.52.4
+tokenizers==0.21.1
+huggingface-hub[hf_xet]>=0.30.0  # Required for Xet downloads.
+schemathesis>=3.39.15 # Required for openai schema test.
+# quantization
+bitsandbytes>=0.46.1
+buildkite-test-collector==0.1.9

-# required for quantization test
-bitsandbytes>=0.45.3
-
-# required for minicpmo_26 test
-vector_quantize_pytorch
-vocos

-# required for Basic Models Test
-blobfile # required for kimi-vl test
-matplotlib # required for qwen-vl test
+genai_perf==0.0.8
+tritonclient==2.51.0

-# required for  Multi-Modal Models Test (Standard)
-num2words # required for smolvlm test
-pqdm
-timm # required for internvl test
-
-schemathesis>=3.39.15  # Required for openai schema test.
-mteb>=1.38.11, <2 # required for mteb test
+numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2; python_version > '3.9'
+numpy
+runai-model-streamer==0.11.0
+runai-model-streamer-s3==0.11.0
+fastsafetensors>=0.1.10
+pydantic>=2.10 # 2.9 leads to error on python 3.10
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -28,20 +28,21 @@ torchvision==0.22.0
 transformers_stream_generator # required for qwen-vl test
 mamba_ssm # required for plamo2 test
 matplotlib # required for qwen-vl test
-mistral_common[opencv] >= 1.5.4 # required for pixtral test
+mistral_common[opencv] >= 1.6.2 # required for pixtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
-mteb>=1.38.11, <2 # required for mteb test
+mteb[bm25s]>=1.38.11, <2 # required for mteb test
 transformers==4.52.4
 tokenizers==0.21.1
-huggingface-hub[hf_xet]>=0.30.0  # Required for Xet downloads.
+huggingface-hub[hf_xet]>=0.33.0  # Required for Xet downloads.
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
-bitsandbytes>=0.45.3
+bitsandbytes==0.46.1
 buildkite-test-collector==0.1.9

+
 genai_perf==0.0.8
 tritonclient==2.51.0

@@ -51,4 +52,4 @@ numpy
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
 fastsafetensors>=0.1.10
-pydantic>=2.10 # 2.9 leads to error on python 3.10
\ No newline at end of file
+pydantic>=2.10 # 2.9 leads to error on python 3.10
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -45,12 +45,14 @@ backoff==2.2.1
    # via
    #   -r requirements/test.in
    #   schemathesis
-bitsandbytes==0.45.3
+bitsandbytes==0.46.1
    # via -r requirements/test.in
 black==24.10.0
    # via datamodel-code-generator
 blobfile==3.0.0
    # via -r requirements/test.in
+bm25s==0.2.13
+    # via mteb
 boto3==1.35.57
    # via tensorizer
 botocore==1.35.57
@@ -190,7 +192,7 @@ h11==0.14.0
    # via httpcore
 harfile==0.3.0
    # via schemathesis
-hf-xet==0.1.4
+hf-xet==1.1.3
    # via huggingface-hub
 hiredis==3.0.0
    # via tensorizer
@@ -200,7 +202,7 @@ httpx==0.27.2
    # via
    #   -r requirements/test.in
    #   schemathesis
-huggingface-hub==0.30.1
+huggingface-hub==0.33.0
    # via
    #   -r requirements/test.in
    #   accelerate
@@ -303,7 +305,7 @@ mbstrdecoder==1.1.3
    #   typepy
 mdurl==0.1.2
    # via markdown-it-py
-mistral-common==1.5.4
+mistral-common==1.6.2
    # via -r requirements/test.in
 more-itertools==10.5.0
    # via lm-eval
@@ -344,6 +346,7 @@ numpy==1.26.4
    #   -r requirements/test.in
    #   accelerate
    #   bitsandbytes
+    #   bm25s
    #   contourpy
    #   cupy-cuda12x
    #   datasets
@@ -534,6 +537,8 @@ pyparsing==3.2.0
    # via matplotlib
 pyrate-limiter==3.7.0
    # via schemathesis
+pystemmer==3.0.0
+    # via mteb
 pytablewriter==1.2.0
    # via lm-eval
 pytest==8.3.3
@@ -668,6 +673,7 @@ scikit-learn==1.5.2
    #   sentence-transformers
 scipy==1.13.1
    # via
+    #   bm25s
    #   librosa
    #   mteb
    #   scikit-learn

--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -18,9 +18,9 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.8.0.dev20250605
-torchvision==0.23.0.dev20250605
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.8.0.dev20250618
+torchvision==0.23.0.dev20250618
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -9,6 +9,7 @@ setuptools>=77.0.3,<80.0.0
 wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
+numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding

 torch==2.7.0+xpu
 torchaudio

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -6,8 +6,8 @@ import os
 import uuid
 from asyncio import CancelledError
 from copy import copy
-from dataclasses import dataclass
-from typing import Optional
+from dataclasses import dataclass, field
+from typing import Any, Optional

 import pytest
 import pytest_asyncio
@@ -32,6 +32,7 @@ class RequestOutput:
 @dataclass
 class MockModelConfig:
    use_async_output_proc = True
+    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)


 class MockEngine:

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -49,7 +49,13 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
-@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("attention_backend", [
+    pytest.param("FLASHINFER",
+                 marks=pytest.mark.skipif(
+                     current_platform.is_rocm(),
+                     reason="FLASHINFER isn't supported on ROCm")),
+    "FLASH_ATTN"
+])
 def test_models(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
@@ -99,7 +105,13 @@ def test_models(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("attention_backend", [
+    pytest.param("FLASHINFER",
+                 marks=pytest.mark.skipif(
+                     current_platform.is_rocm(),
+                     reason="FLASHINFER isn't supported on ROCm")),
+    "FLASH_ATTN"
+])
 def test_models_distributed(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
@@ -172,6 +184,8 @@ def test_models_distributed(
 # Due to low-precision numerical divergence, this test is too sensitive to
 # the async postprocessor
 @pytest.mark.parametrize("disable_async_output_proc", [True])
+@pytest.mark.skipif(current_platform.is_rocm(),
+                    reason="machete_prepack_B isn't supported on ROCm")
 def test_models_with_fp8_kv_cache(
    vllm_runner: VllmRunner,
    example_prompts,

--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -31,6 +31,8 @@ def test_bench_serve(server):
        server.host,
        "--port",
        str(server.port),
+        "--dataset-name",
+        "random",
        "--random-input-len",
        "32",
        "--random-output-len",

--- a/tests/build_cython.py
+++ b/tests/build_cython.py
@@ -25,7 +25,7 @@ infiles += [
 infiles += [
    "vllm/model_executor/layers/sampler.py",
    "vllm/sampling_params.py",
-    "vllm/utils.py",
+    "vllm/utils/__init__.py",
 ]

 setup(ext_modules=cythonize(infiles,

--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from collections.abc import Sequence
 from copy import deepcopy
 from typing import Callable, Union

 from torch import fx
+from torch._ops import OpOverload

-from vllm.compilation.fx_utils import (find_specified_fn,
-                                       find_specified_fn_maybe)
+from vllm.compilation.fx_utils import find_op_nodes
 from vllm.compilation.inductor_pass import InductorPass
 from vllm.config import get_current_vllm_config

@@ -48,18 +49,19 @@ class TestBackend:
        # assign by reference, will reflect the final state of the graph
        self.final_graph = graph

-    def check_before_ops(self, ops,
-                         find_fn=find_specified_fn, \
-                         find_fn_maybe=find_specified_fn_maybe, \
-                        ops_fully_replaced=True):
+    def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced=True):
        for op in ops:
-            find_fn(self.graph_pre_pass.nodes, op)
-            if ops_fully_replaced:
-                assert find_fn_maybe(self.graph_post_pass.nodes, op) is None
+            num_pre = len(list(find_op_nodes(op, self.graph_pre_pass)))
+            num_post = len(list(find_op_nodes(op, self.graph_post_pass)))
+            assert num_pre > 0, f"Op {op.name()} not found in pre-pass graph"
+            assert num_pre > num_post, f"All nodes remain for op {op.name()}"
+            if fully_replaced:
+                assert num_post == 0, \
+                    f"Unexpected op {op.name()} in post-pass graph"

-    def check_after_ops(self, ops,
-                        find_fn=find_specified_fn, \
-                        find_fn_maybe=find_specified_fn_maybe):
+    def check_after_ops(self, ops: Sequence[OpOverload]):
        for op in ops:
-            find_fn(self.graph_post_pass.nodes, op)
-            assert find_fn_maybe(self.graph_pre_pass.nodes, op) is None
+            num_pre = len(list(find_op_nodes(op, self.graph_pre_pass)))
+            num_post = len(list(find_op_nodes(op, self.graph_post_pass)))
+            assert num_pre == 0, f"Unexpected op {op.name()} in pre-pass graph"
+            assert num_post > 0, f"Op {op.name()} not found in post-pass graph"
\ No newline at end of file