Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

0da93439 · zhuwenwen · 25f2f756 · 298e5108 · 0da93439 · 0da93439
Commit 0da93439 authored Mar 26, 2026 by zhuwenwen
20 changed files
--- a/examples/pooling/embed/embedding_requests_base64_online.py
+++ b/examples/pooling/embed/embedding_requests_base64_online.py
@@ -7,8 +7,8 @@ NOTE:
 """

 import argparse
-import base64

+import pybase64 as base64
 import requests
 import torch


--- a/examples/pooling/embed/vision_embedding_online.py
+++ b/examples/pooling/embed/vision_embedding_online.py
@@ -7,10 +7,10 @@ Refer to each `run_*` function for the command to run the server for that model.
 """

 import argparse
-import base64
 import io
 from typing import Literal

+import pybase64 as base64
 from openai import OpenAI
 from openai._types import NOT_GIVEN, NotGiven
 from openai.types.chat import ChatCompletionMessageParam

--- a/examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py
+++ b/examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import base64
 import os

+import pybase64 as base64
 import torch

 from vllm import LLM

--- a/examples/pooling/plugin/prithvi_geospatial_mae_online.py
+++ b/examples/pooling/plugin/prithvi_geospatial_mae_online.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import base64
 import os

+import pybase64 as base64
 import requests

 # This example shows how to perform an online inference that generates

--- a/examples/pooling/score/colqwen3_5_rerank_online.py
+++ b/examples/pooling/score/colqwen3_5_rerank_online.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Example of using ColQwen3.5 late interaction model for reranking.
+
+ColQwen3.5 is a multi-modal ColBERT-style model based on Qwen3.5.
+It produces per-token embeddings and uses MaxSim scoring for retrieval
+and reranking. Supports both text and image inputs.
+
+Start the server with:
+    vllm serve athrael-soju/colqwen3.5-4.5B --max-model-len 4096
+
+Then run this script:
+    python colqwen3_5_rerank_online.py
+"""
+
+import requests
+
+MODEL = "athrael-soju/colqwen3.5-4.5B"
+BASE_URL = "http://127.0.0.1:8000"
+
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+
+def rerank_text():
+    """Rerank a fixed set of text documents against one query via /rerank.
+
+    Posts the query and documents to ``{BASE_URL}/rerank`` and prints every
+    returned result as ``[score] document``, in the order the server returns
+    them. On a non-200 response, prints the status code and the first 300
+    characters of the response body instead.
+
+    Raises:
+        requests.exceptions.RequestException: If the server cannot be
+            reached or does not answer within the timeout.
+    """
+    print("=" * 60)
+    print("1. Text reranking (/rerank)")
+    print("=" * 60)
+
+    data = {
+        "model": MODEL,
+        "query": "What is machine learning?",
+        "documents": [
+            "Machine learning is a subset of artificial intelligence.",
+            "Python is a programming language.",
+            "Deep learning uses neural networks for complex tasks.",
+            "The weather today is sunny.",
+        ],
+    }
+
+    # requests has no default timeout; bound the wait so the example fails
+    # fast instead of hanging forever when the server is down or stalled.
+    response = requests.post(
+        f"{BASE_URL}/rerank", headers=headers, json=data, timeout=60
+    )
+
+    if response.status_code == 200:
+        result = response.json()
+        print("\n  Ranked documents (most relevant first):")
+        for item in result["results"]:
+            # "index" refers back to the position in the submitted documents.
+            doc_idx = item["index"]
+            score = item["relevance_score"]
+            print(f"    [{score:.4f}] {data['documents'][doc_idx]}")
+    else:
+        print(f"  Request failed: {response.status_code}")
+        print(f"  {response.text[:300]}")
+
+
+def score_text():
+    """Score one query against several text documents via /score.
+
+    Posts ``text_1`` (the query) and ``text_2`` (the documents) to
+    ``{BASE_URL}/score`` and prints one line per returned item with its
+    index, score, and the matching document. On a non-200 response, prints
+    the status code and the first 300 characters of the response body.
+
+    Raises:
+        requests.exceptions.RequestException: If the server cannot be
+            reached or does not answer within the timeout.
+    """
+    print()
+    print("=" * 60)
+    print("2. Text scoring (/score)")
+    print("=" * 60)
+
+    query = "What is the capital of France?"
+    documents = [
+        "The capital of France is Paris.",
+        "Berlin is the capital of Germany.",
+        "Python is a programming language.",
+    ]
+
+    data = {
+        "model": MODEL,
+        "text_1": query,
+        "text_2": documents,
+    }
+
+    # requests has no default timeout; bound the wait so the example fails
+    # fast instead of hanging forever when the server is down or stalled.
+    response = requests.post(
+        f"{BASE_URL}/score", headers=headers, json=data, timeout=60
+    )
+
+    if response.status_code == 200:
+        result = response.json()
+        print(f"\n  Query: {query}\n")
+        for item in result["data"]:
+            # "index" refers back to the position in the submitted documents.
+            idx = item["index"]
+            score = item["score"]
+            print(f"    Doc {idx} (score={score:.4f}): {documents[idx]}")
+    else:
+        print(f"  Request failed: {response.status_code}")
+        print(f"  {response.text[:300]}")
+
+
+def score_text_top_n():
+    """Rerank text documents via /rerank, requesting only the top 2 results.
+
+    Same request shape as a plain rerank call, plus ``"top_n": 2`` in the
+    payload. Prints every returned result as ``[score] document``. On a
+    non-200 response, prints the status code and the first 300 characters
+    of the response body.
+
+    Raises:
+        requests.exceptions.RequestException: If the server cannot be
+            reached or does not answer within the timeout.
+    """
+    print()
+    print("=" * 60)
+    print("3. Text reranking with top_n=2 (/rerank)")
+    print("=" * 60)
+
+    data = {
+        "model": MODEL,
+        "query": "What is the capital of France?",
+        "documents": [
+            "The capital of France is Paris.",
+            "Berlin is the capital of Germany.",
+            "Python is a programming language.",
+            "The Eiffel Tower is in Paris.",
+        ],
+        "top_n": 2,
+    }
+
+    # requests has no default timeout; bound the wait so the example fails
+    # fast instead of hanging forever when the server is down or stalled.
+    response = requests.post(
+        f"{BASE_URL}/rerank", headers=headers, json=data, timeout=60
+    )
+
+    if response.status_code == 200:
+        result = response.json()
+        print(f"\n  Top {data['top_n']} results:")
+        for item in result["results"]:
+            # "index" refers back to the position in the submitted documents.
+            doc_idx = item["index"]
+            score = item["relevance_score"]
+            print(f"    [{score:.4f}] {data['documents'][doc_idx]}")
+    else:
+        print(f"  Request failed: {response.status_code}")
+        print(f"  {response.text[:300]}")
+
+
+def main():
+    """Run the three demos in order: rerank, score, then top-n rerank."""
+    for demo in (rerank_text, score_text, score_text_top_n):
+        demo()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/pooling/score/colqwen3_rerank_online.py
+++ b/examples/pooling/score/colqwen3_rerank_online.py
@@ -15,9 +15,9 @@ Then run this script:
    python colqwen3_rerank_online.py
 """

-import base64
 from io import BytesIO

+import pybase64 as base64
 import requests
 from PIL import Image


--- a/examples/pooling/token_embed/colqwen3_token_embed_online.py
+++ b/examples/pooling/token_embed/colqwen3_token_embed_online.py
@@ -21,10 +21,10 @@ Then run this script:
 """

 import argparse
-import base64
 from io import BytesIO

 import numpy as np
+import pybase64 as base64
 import requests
 from PIL import Image


--- a/examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
+++ b/examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
@@ -2,25 +2,38 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Demonstrates async reinforcement learning using vLLM and Ray,
-with native weight syncing APIs at engine instance.
+with native weight syncing APIs and batch-invariant generation.

 The script separates training and inference workloads onto distinct GPUs
 so that Ray can manage process placement and inter-process communication.
-A Hugging Face Transformer model occupies one GPU for training, whereas a
-2x tensor-parallel vLLM inference engine occupies two GPUs.
+A Hugging Face Transformer model occupies one GPU for training, and a
+vLLM AsyncLLMEngine occupies another GPU for inference.
+
+Batch invariance is enabled so that generation output is deterministic
+regardless of how many requests are batched together. This is required
+for the validation phase to succeed. Batch invariance currently requires
+NVIDIA GPUs with compute capability 9.0 or higher:
+  - H-series: H100, H200
+  - B-series: B100, B200

 The example performs the following steps:
-* Load the training model on one gpu (scheduled via ray)
-* Initialize the inference model with dummy weights across
-  two gpus using vLLM's tensor parallelism and Ray placement groups.
-* Generate gibberish from a list of prompts using the randomly initialized
-  inference engine.
-* Pause generation once generation completes for one sequence
-* Update the weights of the training model and broadcast the updated weights
-  to the inference engine by using a Ray collective RPC group.
-* Resume generation and print out the results
-
-This example assumes a single-node cluster with three GPUs, but Ray
+* Load the training model (Qwen3-1.7B) on one GPU via a Ray actor.
+* Initialize the inference engine with a base model (Qwen3-1.7B-Base)
+  on a separate GPU using vLLM's AsyncLLMEngine with Ray as the
+  distributed executor backend.
+* Set up an NCCL-based weight transfer channel between the trainer
+  and the inference engine.
+* Submit generation requests for a batch of prompts.
+* Pause generation once any request reaches a token threshold.
+* Broadcast the training model's weights to the inference engine
+  via the NCCL weight transfer engine, replacing the base weights.
+* Resume generation and collect results, noting which tokens were
+  generated before vs. after the weight swap.
+* Validate correctness by launching a fresh vLLM instance loaded
+  directly with the training model and comparing its output to the
+  post-swap tokens from the weight-synced engine.
+
+This example assumes a single-node cluster with two GPUs, but Ray
 supports multi-node clusters. vLLM expects the GPUs are only used for vLLM
 workloads. Residual GPU activity interferes with vLLM memory profiling and
 causes unexpected behavior.

--- a/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py
+++ b/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py
--- a/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py
+++ b/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py
--- a/examples/offline_inference/new_weight_syncing/rlhf_ipc.py
+++ b/examples/offline_inference/new_weight_syncing/rlhf_ipc.py
--- a/examples/offline_inference/new_weight_syncing/rlhf_nccl.py
+++ b/examples/offline_inference/new_weight_syncing/rlhf_nccl.py
--- a/examples/rl/rlhf_nccl_fsdp_ep.py
+++ b/examples/rl/rlhf_nccl_fsdp_ep.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+RLHF with FSDP2 training (4 GPUs) and vLLM expert-parallel inference (4 GPUs).
+
+8-GPU layout:
+  Training  — 4 GPUs, PyTorch FSDP2 (fully_shard)
+  Inference — 4 GPUs, vLLM AsyncLLMEngine with expert parallelism +
+              data parallelism (TP=1, DP=4, enable_expert_parallel
+              → EP_SIZE = TP×DP = 4)
+
+FSDP workers are Ray actors that form a single FSDP2 process group.
+Rank 0 gathers full parameters via DTensor.full_tensor() and broadcasts
+them to the vLLM inference engine through the NCCL weight-transfer API.
+
+The inference engine uses AsyncLLMEngine which automatically spawns
+DP worker processes (no manual placement group needed).  Weight sync
+uses pause_generation / resume_generation.
+
+Steps:
+  1. Launch 4 FSDP training workers.
+  2. Launch AsyncLLMEngine with EP+DP (dummy weights).
+  3. Generate from prompts → gibberish (random weights).
+  4. Pause generation, transfer weights from FSDP, resume.
+  5. Generate from prompts → sensible output (synced weights).
+
+Assumes a single-node cluster with 8 GPUs.
+"""
+
+import asyncio
+import os
+import uuid
+from dataclasses import asdict
+
+import ray
+import torch
+import torch.distributed as dist
+from huggingface_hub import snapshot_download
+from torch.distributed.fsdp import fully_shard
+from transformers import AutoModelForCausalLM
+
+import vllm
+from vllm import SamplingParams
+from vllm.config import WeightTransferConfig
+from vllm.distributed.weight_transfer.base import (
+    WeightTransferInitRequest,
+    WeightTransferUpdateRequest,
+)
+from vllm.distributed.weight_transfer.nccl_engine import (
+    NCCLTrainerSendWeightsArgs,
+    NCCLWeightTransferEngine,
+    NCCLWeightTransferInitInfo,
+    NCCLWeightTransferUpdateInfo,
+)
+from vllm.utils.network_utils import get_ip, get_open_port
+from vllm.v1.executor import Executor
+
+MODEL_NAME = "Qwen/Qwen3-30B-A3B"
+
+FSDP_WORLD_SIZE = 4
+INFERENCE_TP_SIZE = 1
+INFERENCE_DP_SIZE = 4
+
+
+@ray.remote(num_gpus=1)
+class FSDPTrainWorker:
+    """
+    One FSDP2 training worker per GPU.  Four of these form the FSDP group.
+    Rank 0 additionally handles weight transfer to the vLLM engine.
+
+    Each instance is a Ray actor that requests one GPU (``num_gpus=1``).
+    The constructor joins the FSDP process group, loads the model in
+    bfloat16, records per-parameter metadata, and then shards the model
+    with ``fully_shard``.
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        rank: int,
+        fsdp_world_size: int,
+        fsdp_master_addr: str,
+        fsdp_master_port: int,
+    ):
+        """
+        Join the FSDP process group and build the sharded model.
+
+        Args:
+            model_name: Model identifier or local path passed to
+                ``AutoModelForCausalLM.from_pretrained``.
+            rank: This worker's rank in the FSDP process group.
+            fsdp_world_size: Total number of FSDP workers.
+            fsdp_master_addr: Rendezvous address exported as ``MASTER_ADDR``.
+            fsdp_master_port: Rendezvous port exported as ``MASTER_PORT``.
+        """
+        self.rank = rank
+
+        # Rendezvous settings are passed to torch.distributed through the
+        # environment; they must be set before init_process_group runs.
+        os.environ["MASTER_ADDR"] = fsdp_master_addr
+        os.environ["MASTER_PORT"] = str(fsdp_master_port)
+
+        dist.init_process_group(backend="nccl", rank=rank, world_size=fsdp_world_size)
+        # NOTE(review): index 0 is used on every rank — presumably each Ray
+        # actor only sees its own assigned GPU because of num_gpus=1; confirm.
+        torch.accelerator.set_device_index(0)
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name, torch_dtype=torch.bfloat16
+        )
+
+        # Capture names, dtypes, and shapes BEFORE fully_shard is applied, so
+        # the metadata describes the original (unsharded) parameters.
+        self.weight_names = [n for n, _ in model.named_parameters()]
+        # str(dtype) looks like "torch.bfloat16"; keep only the last part.
+        self.weight_dtype_names = [
+            str(p.dtype).split(".")[-1] for _, p in model.named_parameters()
+        ]
+        self.weight_shapes = [list(p.shape) for _, p in model.named_parameters()]
+
+        # Shard each decoder layer individually, then the root module.
+        for layer in model.model.layers:
+            fully_shard(layer)
+        fully_shard(model)
+
+        self.model = model
+
+        # Weight-transfer state; filled in later, and only on rank 0.
+        self.transfer_port = None
+        self.transfer_master_address = None
+        self.model_update_group = None
+
+    def get_rank(self):
+        """Return this worker's FSDP rank."""
+        return self.rank
+
+    # ---- weight-transfer setup (rank 0 only) ----
+
+    def setup_transfer_endpoint(self):
+        """
+        Create the NCCL rendezvous endpoint for weight transfer.
+
+        Picks an open port and this host's IP, stores both on the worker,
+        and returns them as ``(address, port)``. Asserts that the caller is
+        rank 0.
+        """
+        assert self.rank == 0
+        self.transfer_port = get_open_port()
+        self.transfer_master_address = get_ip()
+        return self.transfer_master_address, self.transfer_port
+
+    def init_weight_transfer_group(self, transfer_world_size: int):
+        """
+        Join the weight-transfer NCCL group as rank 0 (the source).
+
+        Uses the address and port stored by ``setup_transfer_endpoint`` and
+        keeps the resulting group in ``self.model_update_group``. Asserts
+        that the caller is rank 0.
+
+        Args:
+            transfer_world_size: Total number of ranks in the transfer group.
+        """
+        assert self.rank == 0
+        self.model_update_group = NCCLWeightTransferEngine.trainer_init(
+            dict(
+                master_address=self.transfer_master_address,
+                master_port=self.transfer_port,
+                world_size=transfer_world_size,
+            ),
+        )
+
+    def get_weight_metadata(self):
+        """Return weight names, dtypes, and shapes captured before FSDP wrapping."""
+        return self.weight_names, self.weight_dtype_names, self.weight_shapes
+
+    # ---- collective ops (ALL FSDP ranks must call concurrently) ----
+
+    def gather_and_broadcast_weights(self, packed: bool = True):
+        """
+        All-gather full parameters and broadcast them to vLLM.
+        Only rank 0 performs the actual NCCL broadcast; others just
+        participate in the FSDP all-gather.
+
+        full_tensor() is a collective — all FSDP ranks must call it
+        for each parameter in the same order.  Rank 0 additionally
+        feeds each gathered tensor to the weight-transfer engine.
+
+        Args:
+            packed: Forwarded to ``NCCLTrainerSendWeightsArgs`` on rank 0;
+                unused on the other ranks.
+        """
+        if self.rank == 0:
+
+            # Lazy generator: each parameter is gathered only when the
+            # transfer engine pulls it from the iterator.
+            def _full_param_iter():
+                for name, param in self.model.named_parameters():
+                    yield name, param.full_tensor()
+
+            trainer_args = NCCLTrainerSendWeightsArgs(
+                group=self.model_update_group,
+                packed=packed,
+            )
+            NCCLWeightTransferEngine.trainer_send_weights(
+                iterator=_full_param_iter(),
+                trainer_args=trainer_args,
+            )
+        else:
+            # Non-source ranks discard the result; the call is made only to
+            # take part in the collective, in the same parameter order.
+            for _, param in self.model.named_parameters():
+                param.full_tensor()
+
+
+def create_async_engine(**kwargs):
+    """Build a ``vllm.AsyncLLMEngine`` from ``AsyncEngineArgs`` keyword arguments.
+
+    The keyword arguments are forwarded unchanged to ``vllm.AsyncEngineArgs``;
+    the engine config and executor class are derived from them.
+    """
+    args = vllm.AsyncEngineArgs(**kwargs)
+    config = args.create_engine_config()
+    return vllm.AsyncLLMEngine(
+        vllm_config=config,
+        executor_class=Executor.get_class(config),
+        log_requests=args.enable_log_requests,
+        log_stats=not args.disable_log_stats,
+    )
+
+
+async def generate_batch(engine, prompts, sampling_params):
+    """Run every prompt through the engine concurrently.
+
+    Returns the last item streamed by ``engine.generate`` for each prompt,
+    in the same order as ``prompts`` (``None`` for a prompt whose stream
+    produced nothing).
+    """
+
+    async def _last_output(text):
+        # Drain the stream and keep only its final item.
+        final = None
+        stream = engine.generate(
+            {"prompt": text},
+            sampling_params,
+            request_id=str(uuid.uuid4()),
+        )
+        async for chunk in stream:
+            final = chunk
+        return final
+
+    pending = [_last_output(text) for text in prompts]
+    return await asyncio.gather(*pending)
+
+
+async def main():
+    """
+    Run the end-to-end demo.
+
+    Starts the FSDP training workers, creates the inference engine with
+    dummy weights, generates once, sets up the NCCL weight-transfer group,
+    pauses generation, broadcasts the trained weights into the engine,
+    resumes, and generates again so both outputs can be compared.
+    """
+    ray.init()
+
+    # Download model weights to local/shared disk once.
+    local_model_path = snapshot_download(MODEL_NAME)
+    print(f"[init] Model downloaded to {local_model_path}")
+
+    # FSDP rendezvous address (single-node)
+    fsdp_master_addr = get_ip()
+    fsdp_master_port = get_open_port()
+
+    # Launch 4 FSDP training workers.
+    # Ray allocates 1 GPU per worker; AsyncLLMEngine's internal DP
+    # placement groups will land on the remaining 4 GPUs.
+    fsdp_workers = [
+        FSDPTrainWorker.remote(
+            local_model_path,
+            rank,
+            FSDP_WORLD_SIZE,
+            fsdp_master_addr,
+            fsdp_master_port,
+        )
+        for rank in range(FSDP_WORLD_SIZE)
+    ]
+    # get_rank is a trivial call used here as a readiness barrier: wait for
+    # a reply from every worker before continuing.
+    ray.get([w.get_rank.remote() for w in fsdp_workers])
+    print(f"[init] {FSDP_WORLD_SIZE} FSDP training workers ready.")
+
+    # Launch vLLM with expert parallelism + data parallelism.
+    # AsyncLLMEngine with data_parallel_backend="ray" creates its own
+    # placement groups internally — no manual placement group needed.
+    print("[engine] Creating AsyncLLMEngine...")
+    engine = create_async_engine(
+        model=local_model_path,
+        enforce_eager=True,
+        tensor_parallel_size=INFERENCE_TP_SIZE,
+        data_parallel_size=INFERENCE_DP_SIZE,
+        enable_expert_parallel=True,
+        distributed_executor_backend="ray",
+        data_parallel_backend="ray",
+        weight_transfer_config=WeightTransferConfig(backend="nccl"),
+        load_format="dummy",
+        gpu_memory_utilization=0.7,
+    )
+    print("[engine] AsyncLLMEngine created.")
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0)
+
+    # Generate with dummy weights — expect gibberish.
+    print("[generate] Starting generation with dummy weights...")
+    outputs = await generate_batch(engine, prompts, sampling_params)
+    print("[generate] Generation complete.")
+
+    print("-" * 60)
+    print("BEFORE weight sync (dummy weights):")
+    print("-" * 60)
+    for output in outputs:
+        print(f"Prompt: {output.prompt!r}")
+        print(f"Generated: {output.outputs[0].text!r}")
+        print("-" * 60)
+
+    # --- Weight-transfer setup ---
+    print("[transfer] Setting up weight-transfer endpoint...")
+    transfer_addr, transfer_port = ray.get(
+        fsdp_workers[0].setup_transfer_endpoint.remote()
+    )
+    print(f"[transfer] Endpoint ready at {transfer_addr}:{transfer_port}")
+
+    # One rank for the trainer plus one per vLLM worker (TP x DP).
+    transfer_world_size = INFERENCE_TP_SIZE * INFERENCE_DP_SIZE + 1
+    print(
+        f"[transfer] World size: {transfer_world_size} "
+        f"(1 trainer + {INFERENCE_TP_SIZE * INFERENCE_DP_SIZE} vLLM workers)"
+    )
+
+    print("[transfer] Initializing NCCL groups...")
+    # Launch the trainer side without waiting for it, then start the engine
+    # side; the trainer handle is only collected after the engine call.
+    # NOTE(review): presumably both sides must join the rendezvous
+    # concurrently, which is why the trainer call is not awaited first.
+    train_handle = fsdp_workers[0].init_weight_transfer_group.remote(
+        transfer_world_size
+    )
+    await engine.init_weight_transfer_engine(
+        WeightTransferInitRequest(
+            init_info=asdict(
+                NCCLWeightTransferInitInfo(
+                    master_address=transfer_addr,
+                    master_port=transfer_port,
+                    # vLLM ranks start at 1; the trainer joins as rank 0.
+                    rank_offset=1,
+                    world_size=transfer_world_size,
+                )
+            )
+        )
+    )
+    ray.get(train_handle)
+    print("[transfer] NCCL groups initialized.")
+
+    # --- Pause, transfer weights, resume ---
+    print("[sync] Pausing generation...")
+    await engine.pause_generation(mode="abort")
+    print("[sync] Generation paused.")
+
+    names, dtype_names, shapes = ray.get(fsdp_workers[0].get_weight_metadata.remote())
+    print(f"[sync] Got metadata for {len(names)} parameters.")
+
+    print("[sync] Broadcasting weights from FSDP → vLLM...")
+    # Every FSDP worker is invoked, not just rank 0: the gather step is a
+    # collective that all FSDP ranks must take part in.
+    broadcast_handles = [
+        w.gather_and_broadcast_weights.remote(packed=True) for w in fsdp_workers
+    ]
+    # The receiving side is given the same packed=True setting as the
+    # sender call above.
+    await engine.update_weights(
+        WeightTransferUpdateRequest(
+            update_info=asdict(
+                NCCLWeightTransferUpdateInfo(
+                    names=names,
+                    dtype_names=dtype_names,
+                    shapes=shapes,
+                    packed=True,
+                )
+            )
+        )
+    )
+    ray.get(broadcast_handles)
+    print("[sync] Weight broadcast complete.")
+
+    print("[sync] Resuming generation...")
+    await engine.resume_generation()
+    print("[sync] Generation resumed.")
+
+    # Generate with synced weights — expect sensible output.
+    print("[generate] Starting generation with synced weights...")
+    outputs_updated = await generate_batch(engine, prompts, sampling_params)
+    print("[generate] Generation complete.")
+
+    print("-" * 60)
+    print("AFTER weight sync (real weights):")
+    print("-" * 60)
+    for output in outputs_updated:
+        print(f"Prompt: {output.prompt!r}")
+        print(f"Generated: {output.outputs[0].text!r}")
+        print("-" * 60)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -121,7 +121,7 @@ python = "./.venv"
 # these files may be written in non english words
 extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", "tests/tokenizers_/*",
    "benchmarks/sonnet.txt", "tests/lora/data/*", "examples/pooling/token_embed/*", "build/*",
-    "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "tests/entrypoints/openai/test_transcription_validation.py",
+    "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "tests/entrypoints/openai/speech_to_text/test_transcription_validation.py",
    "docs/governance/process.md", "tests/v1/engine/test_fast_incdec_prefix_err.py", ".git/*"]
 ignore-hidden = false


--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -12,7 +12,7 @@ tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp >= 3.13.3
-openai >= 1.99.1, < 2.25.0  # For Responses API with reasoning content
+openai >= 2.0.0  # For Responses API with reasoning content
 pydantic >= 2.12.0
 prometheus_client >= 0.18.0
 pillow  # Required for image processing
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.13.0 # required for compressed-tensors
+compressed-tensors == 0.14.0.1 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files

--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -50,7 +50,7 @@ av==16.1.0
 blobfile==3.0.0
    # Multi-Modal Models Test
 decord==0.6.0
-    # video processing, required by entrypoints/openai/test_video.py
+    # video processing, required by entrypoints/openai/chat_completion/test_video.py
 rapidfuzz==3.12.1

 # OpenAI compatibility and testing

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
 peft>=0.15.0 # required for phi-4-mm test
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
+resampy # required for audio tests
 sentence-transformers>=5.2.0 # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -544,6 +544,7 @@ numba==0.61.2
    # via
    #   -r requirements/test.in
    #   librosa
+    #   resampy
 numpy==2.2.6
    # via
    #   -r requirements/test.in
@@ -584,6 +585,7 @@ numpy==2.2.6
    #   pyogrio
    #   pywavelets
    #   rasterio
+    #   resampy
    #   rioxarray
    #   rouge-score
    #   runai-model-streamer
@@ -995,6 +997,8 @@ requests==2.32.3
    #   tiktoken
    #   transformers
    #   wandb
+resampy==0.4.3
+    # via -r requirements/test.in
 responses==0.25.3
    # via genai-perf
 rfc3339-validator==0.1.4

--- a/requirements/xpu-test.in
+++ b/requirements/xpu-test.in
+# --- Test Infrastructure ---
+tblib
+pytest-timeout
+pytest-cov
+pytest-forked
+pytest-rerunfailures
+pytest-shard
+
+# --- Core Tools & Bindings ---
+absl-py
+arctic-inference
+
+# --- Audio Processing ---
+librosa
+audioread
+soxr
+pooch
+soundfile
+
+# --- Tool Parsing & Evaluation ---
+blobfile
+rapidfuzz
+gpt-oss
+schemathesis
+jiwer
+bm25s
+pystemmer
+mteb[bm25s]
+num2words
+pqdm
+
+# --- Vision & Multimodal ---
+timm
+albumentations
+mistral-common[image,audio]
\ No newline at end of file
--- a/requirements/xpu-test.txt
+++ b/requirements/xpu-test.txt
+# XPU Test Dependencies
+# NOTE: Base image already has common.txt + xpu.txt installed,
+#       and vllm-openai stage has pytest, pytest-asyncio, lm-eval[api].
+#       This file only adds incremental test-specific packages.
+
+# Additional test infrastructure (pytest/pytest-asyncio already in base)
+# This file was autogenerated by uv via the following command:
+#    uv pip compile /workspace/vllm/requirements/xpu-test.in -o /workspace/vllm/requirements/xpu-test.txt -c /workspace/vllm/requirements/xpu.txt --index-strategy unsafe-best-match --extra-index-url ${PIP_EXTRA_INDEX_URL} --python-version ${PYTHON_VERSION} 
+tblib==3.1.0
+pytest-timeout==2.3.1
+pytest-cov==6.3.0
+pytest-forked==1.6.0
+pytest-rerunfailures==14.0
+pytest-shard==0.1.2
+
+arctic-inference==0.1.1
+
+# Required for audio processing tests
+librosa==0.10.2.post1
+audioread==3.0.1
+soxr==0.5.0.post1
+pooch==1.8.2
+soundfile==0.13.1
+
+# Required for Mistral's streaming tool parser
+blobfile==3.0.0
+rapidfuzz==3.12.1
+
+# Required for Mistral's streaming tool parser and some evaluation scripts
+gpt-oss==0.0.8
+schemathesis==3.39.15
+jiwer==4.0.0
+bm25s==0.2.13
+pystemmer==3.0.0
+mteb[bm25s]>=2, <3
+num2words==0.5.14
+pqdm==0.2.0
+
+# Required for vision and multimodal tests
+timm==1.0.17
+albumentations==1.4.6
+mistral-common[image,audio]==1.9.1
\ No newline at end of file