"DCO" did not exist on "04a3ae0acae3d522299ec90b5730f876daa845e6"
Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
......@@ -7,8 +7,8 @@ NOTE:
"""
import argparse
import base64
import pybase64 as base64
import requests
import torch
......
......@@ -7,10 +7,10 @@ Refer to each `run_*` function for the command to run the server for that model.
"""
import argparse
import base64
import io
from typing import Literal
import pybase64 as base64
from openai import OpenAI
from openai._types import NOT_GIVEN, NotGiven
from openai.types.chat import ChatCompletionMessageParam
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import os
import pybase64 as base64
import torch
from vllm import LLM
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import os
import pybase64 as base64
import requests
# This example shows how to perform an online inference that generates
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using ColQwen3.5 late interaction model for reranking.
ColQwen3.5 is a multi-modal ColBERT-style model based on Qwen3.5.
It produces per-token embeddings and uses MaxSim scoring for retrieval
and reranking. Supports both text and image inputs.
Start the server with:
vllm serve athrael-soju/colqwen3.5-4.5B --max-model-len 4096
Then run this script:
python colqwen3_5_rerank_online.py
"""
import requests
# Model name sent in the "model" field of every request below.
MODEL = "athrael-soju/colqwen3.5-4.5B"
# Base URL that the /rerank and /score endpoint paths are appended to.
BASE_URL = "http://127.0.0.1:8000"
# HTTP headers passed with every request.
headers = {"accept": "application/json", "Content-Type": "application/json"}
def rerank_text():
    """Rank four sample documents against a query via the /rerank endpoint.

    Prints every returned document with its relevance score, in the order
    the server returns them. If the response status is not 200, prints the
    status code and the first 300 characters of the response body instead.
    """
    rule = "=" * 60
    print(rule)
    print("1. Text reranking (/rerank)")
    print(rule)

    candidates = [
        "Machine learning is a subset of artificial intelligence.",
        "Python is a programming language.",
        "Deep learning uses neural networks for complex tasks.",
        "The weather today is sunny.",
    ]
    data = {
        "model": MODEL,
        "query": "What is machine learning?",
        "documents": candidates,
    }
    response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data)

    # Failure path first so the success path reads without nesting.
    if response.status_code != 200:
        print(f" Request failed: {response.status_code}")
        print(f" {response.text[:300]}")
        return

    ranked = response.json()
    print("\n Ranked documents (most relevant first):")
    for entry in ranked["results"]:
        # "index" refers back to the position in the submitted documents.
        position = entry["index"]
        relevance = entry["relevance_score"]
        print(f" [{relevance:.4f}] {candidates[position]}")
def score_text():
    """Score three sample documents against a query via the /score endpoint.

    Prints the query followed by one line per returned item (index, score
    and document text). If the response status is not 200, prints the
    status code and the first 300 characters of the response body instead.
    """
    rule = "=" * 60
    print()
    print(rule)
    print("2. Text scoring (/score)")
    print(rule)

    query = "What is the capital of France?"
    documents = [
        "The capital of France is Paris.",
        "Berlin is the capital of Germany.",
        "Python is a programming language.",
    ]
    data = {"model": MODEL, "text_1": query, "text_2": documents}
    response = requests.post(f"{BASE_URL}/score", headers=headers, json=data)

    # Failure path first so the success path reads without nesting.
    if response.status_code != 200:
        print(f" Request failed: {response.status_code}")
        print(f" {response.text[:300]}")
        return

    scored = response.json()
    print(f"\n Query: {query}\n")
    for entry in scored["data"]:
        # "index" refers back to the position in the submitted documents.
        position = entry["index"]
        value = entry["score"]
        print(f" Doc {position} (score={value:.4f}): {documents[position]}")
def score_text_top_n():
    """Rerank four sample documents via /rerank while requesting top_n=2.

    Prints every returned document with its relevance score, in the order
    the server returns them. If the response status is not 200, prints the
    status code and the first 300 characters of the response body instead.
    """
    rule = "=" * 60
    print()
    print(rule)
    print("3. Text reranking with top_n=2 (/rerank)")
    print(rule)

    top_n = 2
    candidates = [
        "The capital of France is Paris.",
        "Berlin is the capital of Germany.",
        "Python is a programming language.",
        "The Eiffel Tower is in Paris.",
    ]
    data = {
        "model": MODEL,
        "query": "What is the capital of France?",
        "documents": candidates,
        "top_n": top_n,
    }
    response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data)

    # Failure path first so the success path reads without nesting.
    if response.status_code != 200:
        print(f" Request failed: {response.status_code}")
        print(f" {response.text[:300]}")
        return

    ranked = response.json()
    print(f"\n Top {top_n} results:")
    for entry in ranked["results"]:
        # "index" refers back to the position in the submitted documents.
        position = entry["index"]
        relevance = entry["relevance_score"]
        print(f" [{relevance:.4f}] {candidates[position]}")
def main():
    """Run the three demos in order: rerank, score, then rerank with top_n."""
    for demo in (rerank_text, score_text, score_text_top_n):
        demo()
# Run the demos only when this file is executed as a script.
if __name__ == "__main__":
    main()
......@@ -15,9 +15,9 @@ Then run this script:
python colqwen3_rerank_online.py
"""
import base64
from io import BytesIO
import pybase64 as base64
import requests
from PIL import Image
......
......@@ -21,10 +21,10 @@ Then run this script:
"""
import argparse
import base64
from io import BytesIO
import numpy as np
import pybase64 as base64
import requests
from PIL import Image
......
......@@ -2,25 +2,38 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates async reinforcement learning using vLLM and Ray,
with native weight syncing APIs at engine instance.
with native weight syncing APIs and batch-invariant generation.
The script separates training and inference workloads onto distinct GPUs
so that Ray can manage process placement and inter-process communication.
A Hugging Face Transformer model occupies one GPU for training, whereas a
2x tensor-parallel vLLM inference engine occupies two GPUs.
A Hugging Face Transformer model occupies one GPU for training, and a
vLLM AsyncLLMEngine occupies another GPU for inference.
Batch invariance is enabled so that generation output is deterministic
regardless of how many requests are batched together. This is required
for the validation phase to succeed. Batch invariance currently requires
NVIDIA GPUs with compute capability 9.0 or higher:
- H-series: H100, H200
- B-series: B100, B200
The example performs the following steps:
* Load the training model on one gpu (scheduled via ray)
* Initialize the inference model with dummy weights across
two gpus using vLLM's tensor parallelism and Ray placement groups.
* Generate gibberish from a list of prompts using the randomly initialized
inference engine.
* Pause generation once generation completes for one sequence
* Update the weights of the training model and broadcast the updated weights
to the inference engine by using a Ray collective RPC group.
* Resume generation and print out the results
This example assumes a single-node cluster with three GPUs, but Ray
* Load the training model (Qwen3-1.7B) on one GPU via a Ray actor.
* Initialize the inference engine with a base model (Qwen3-1.7B-Base)
on a separate GPU using vLLM's AsyncLLMEngine with Ray as the
distributed executor backend.
* Set up an NCCL-based weight transfer channel between the trainer
and the inference engine.
* Submit generation requests for a batch of prompts.
* Pause generation once any request reaches a token threshold.
* Broadcast the training model's weights to the inference engine
via the NCCL weight transfer engine, replacing the base weights.
* Resume generation and collect results, noting which tokens were
generated before vs. after the weight swap.
* Validate correctness by launching a fresh vLLM instance loaded
directly with the training model and comparing its output to the
post-swap tokens from the weight-synced engine.
This example assumes a single-node cluster with two GPUs, but Ray
supports multi-node clusters. vLLM expects the GPUs are only used for vLLM
workloads. Residual GPU activity interferes with vLLM memory profiling and
causes unexpected behavior.
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
RLHF with FSDP2 training (4 GPUs) and vLLM expert-parallel inference (4 GPUs).
8-GPU layout:
Training — 4 GPUs, PyTorch FSDP2 (fully_shard)
Inference — 4 GPUs, vLLM AsyncLLMEngine with expert parallelism +
data parallelism (TP=1, DP=4, enable_expert_parallel
→ EP_SIZE = TP×DP = 4)
FSDP workers are Ray actors that form a single FSDP2 process group.
Rank 0 gathers full parameters via DTensor.full_tensor() and broadcasts
them to the vLLM inference engine through the NCCL weight-transfer API.
The inference engine uses AsyncLLMEngine which automatically spawns
DP worker processes (no manual placement group needed). Weight sync
uses pause_generation / resume_generation.
Steps:
1. Launch 4 FSDP training workers.
2. Launch AsyncLLMEngine with EP+DP (dummy weights).
3. Generate from prompts → gibberish (random weights).
4. Pause generation, transfer weights from FSDP, resume.
5. Generate from prompts → sensible output (synced weights).
Assumes a single-node cluster with 8 GPUs.
"""
import asyncio
import os
import uuid
from dataclasses import asdict
import ray
import torch
import torch.distributed as dist
from huggingface_hub import snapshot_download
from torch.distributed.fsdp import fully_shard
from transformers import AutoModelForCausalLM
import vllm
from vllm import SamplingParams
from vllm.config import WeightTransferConfig
from vllm.distributed.weight_transfer.base import (
WeightTransferInitRequest,
WeightTransferUpdateRequest,
)
from vllm.distributed.weight_transfer.nccl_engine import (
NCCLTrainerSendWeightsArgs,
NCCLWeightTransferEngine,
NCCLWeightTransferInitInfo,
NCCLWeightTransferUpdateInfo,
)
from vllm.utils.network_utils import get_ip, get_open_port
from vllm.v1.executor import Executor
# Hugging Face model ID; it is downloaded once in main() and the local path
# is handed to both the training workers and the inference engine.
MODEL_NAME = "Qwen/Qwen3-30B-A3B"
# Number of FSDP2 training workers launched (each requests one GPU).
FSDP_WORLD_SIZE = 4
# Tensor-parallel and data-parallel sizes passed to the vLLM engine.
INFERENCE_TP_SIZE = 1
INFERENCE_DP_SIZE = 4
@ray.remote(num_gpus=1)
class FSDPTrainWorker:
    """
    Ray actor holding one shard of the FSDP2-wrapped training model.

    Each actor requests a single GPU. The actor with rank 0 is also the
    sender for the weight-transfer group shared with the vLLM engine.
    """

    def __init__(
        self,
        model_name: str,
        rank: int,
        fsdp_world_size: int,
        fsdp_master_addr: str,
        fsdp_master_port: int,
    ):
        """
        Join the FSDP process group, load the model and shard it.

        Args:
            model_name: Name or path passed to ``from_pretrained``.
            rank: This worker's rank in the FSDP process group.
            fsdp_world_size: Total number of FSDP workers.
            fsdp_master_addr: Rendezvous address exported as MASTER_ADDR.
            fsdp_master_port: Rendezvous port exported as MASTER_PORT.
        """
        self.rank = rank

        # Rendezvous info is read from the environment by the process group.
        os.environ["MASTER_ADDR"] = fsdp_master_addr
        os.environ["MASTER_PORT"] = str(fsdp_master_port)
        dist.init_process_group(backend="nccl", rank=rank, world_size=fsdp_world_size)
        torch.accelerator.set_device_index(0)

        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.bfloat16
        )

        # Record names, dtypes and shapes before any fully_shard() call.
        names, dtype_names, shapes = [], [], []
        for param_name, param in model.named_parameters():
            names.append(param_name)
            # "torch.bfloat16" -> "bfloat16"
            dtype_names.append(str(param.dtype).rpartition(".")[2])
            shapes.append(list(param.shape))
        self.weight_names = names
        self.weight_dtype_names = dtype_names
        self.weight_shapes = shapes

        # Shard every decoder layer first, then the root module.
        for block in model.model.layers:
            fully_shard(block)
        fully_shard(model)
        self.model = model

        # Populated later, and only on rank 0, by the transfer-setup methods.
        self.transfer_port = None
        self.transfer_master_address = None
        self.model_update_group = None

    def get_rank(self):
        """Return the rank this worker was constructed with."""
        return self.rank

    # ---- weight-transfer setup (rank 0 only) ----

    def setup_transfer_endpoint(self):
        """Pick and return the (address, port) used for the transfer group."""
        assert self.rank == 0
        self.transfer_port = get_open_port()
        self.transfer_master_address = get_ip()
        return self.transfer_master_address, self.transfer_port

    def init_weight_transfer_group(self, transfer_world_size: int):
        """Initialize the trainer side of the NCCL weight-transfer group."""
        assert self.rank == 0
        init_info = {
            "master_address": self.transfer_master_address,
            "master_port": self.transfer_port,
            "world_size": transfer_world_size,
        }
        self.model_update_group = NCCLWeightTransferEngine.trainer_init(init_info)

    def get_weight_metadata(self):
        """Return the (names, dtype names, shapes) recorded in ``__init__``."""
        return self.weight_names, self.weight_dtype_names, self.weight_shapes

    # ---- collective ops (ALL FSDP ranks must call concurrently) ----

    def gather_and_broadcast_weights(self, packed: bool = True):
        """
        Materialize every full parameter and, on rank 0, send it to vLLM.

        ``full_tensor()`` is a collective, so every FSDP rank has to call
        it for each parameter in the same order. Ranks other than 0 call
        it only to take part in the gather; rank 0 feeds each gathered
        tensor to the weight-transfer engine.

        Args:
            packed: Forwarded to ``NCCLTrainerSendWeightsArgs``.
        """
        if self.rank != 0:
            # Participate in the gather; the result is not used here.
            for _, shard in self.model.named_parameters():
                shard.full_tensor()
            return

        # Lazy: each tensor is gathered only when the sender consumes it.
        gathered = (
            (param_name, shard.full_tensor())
            for param_name, shard in self.model.named_parameters()
        )
        send_args = NCCLTrainerSendWeightsArgs(
            group=self.model_update_group,
            packed=packed,
        )
        NCCLWeightTransferEngine.trainer_send_weights(
            iterator=gathered,
            trainer_args=send_args,
        )
def create_async_engine(**kwargs):
    """
    Build a ``vllm.AsyncLLMEngine`` from ``AsyncEngineArgs`` keyword arguments.

    Args:
        **kwargs: Forwarded unchanged to ``vllm.AsyncEngineArgs``.

    Returns:
        The constructed ``vllm.AsyncLLMEngine``.
    """
    args = vllm.AsyncEngineArgs(**kwargs)
    config = args.create_engine_config()
    return vllm.AsyncLLMEngine(
        vllm_config=config,
        executor_class=Executor.get_class(config),
        log_requests=args.enable_log_requests,
        log_stats=not args.disable_log_stats,
    )
async def generate_batch(engine, prompts, sampling_params):
    """
    Run all prompts through the engine concurrently.

    Args:
        engine: Object whose ``generate(inputs, sampling_params, request_id=...)``
            returns an async iterator of outputs.
        prompts: Iterable of prompt strings.
        sampling_params: Passed through to ``engine.generate``.

    Returns:
        A list with the last output yielded for each prompt, in prompt
        order (``None`` for a prompt whose stream yielded nothing).
    """

    async def _last_output(prompt):
        final = None
        stream = engine.generate(
            {"prompt": prompt},
            sampling_params,
            request_id=str(uuid.uuid4()),
        )
        # Drain the stream; only the last item yielded is kept.
        async for final in stream:
            pass
        return final

    return await asyncio.gather(*map(_last_output, prompts))
async def main():
    """
    Drive the end-to-end example.

    Starts the FSDP training workers, creates the vLLM engine with dummy
    weights, generates once, transfers the trainer's weights into the
    engine over the NCCL weight-transfer API, and generates again so the
    two sets of outputs can be compared in the printed log.
    """
    ray.init()
    # Download model weights to local/shared disk once.
    local_model_path = snapshot_download(MODEL_NAME)
    print(f"[init] Model downloaded to {local_model_path}")
    # FSDP rendezvous address (single-node)
    fsdp_master_addr = get_ip()
    fsdp_master_port = get_open_port()
    # Launch 4 FSDP training workers.
    # Ray allocates 1 GPU per worker; AsyncLLMEngine's internal DP
    # placement groups will land on the remaining 4 GPUs.
    fsdp_workers = [
        FSDPTrainWorker.remote(
            local_model_path,
            rank,
            FSDP_WORLD_SIZE,
            fsdp_master_addr,
            fsdp_master_port,
        )
        for rank in range(FSDP_WORLD_SIZE)
    ]
    # Wait for a trivial call on every worker to return; the log line
    # below treats this as "workers ready".
    ray.get([w.get_rank.remote() for w in fsdp_workers])
    print(f"[init] {FSDP_WORLD_SIZE} FSDP training workers ready.")
    # Launch vLLM with expert parallelism + data parallelism.
    # AsyncLLMEngine with data_parallel_backend="ray" creates its own
    # placement groups internally — no manual placement group needed.
    print("[engine] Creating AsyncLLMEngine...")
    engine = create_async_engine(
        model=local_model_path,
        enforce_eager=True,
        tensor_parallel_size=INFERENCE_TP_SIZE,
        data_parallel_size=INFERENCE_DP_SIZE,
        enable_expert_parallel=True,
        distributed_executor_backend="ray",
        data_parallel_backend="ray",
        weight_transfer_config=WeightTransferConfig(backend="nccl"),
        load_format="dummy",
        gpu_memory_utilization=0.7,
    )
    print("[engine] AsyncLLMEngine created.")
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)
    # Generate with dummy weights — expect gibberish.
    print("[generate] Starting generation with dummy weights...")
    outputs = await generate_batch(engine, prompts, sampling_params)
    print("[generate] Generation complete.")
    print("-" * 60)
    print("BEFORE weight sync (dummy weights):")
    print("-" * 60)
    for output in outputs:
        print(f"Prompt: {output.prompt!r}")
        print(f"Generated: {output.outputs[0].text!r}")
        print("-" * 60)
    # --- Weight-transfer setup ---
    print("[transfer] Setting up weight-transfer endpoint...")
    transfer_addr, transfer_port = ray.get(
        fsdp_workers[0].setup_transfer_endpoint.remote()
    )
    print(f"[transfer] Endpoint ready at {transfer_addr}:{transfer_port}")
    # One rank for the trainer plus one per vLLM worker (TP x DP).
    transfer_world_size = INFERENCE_TP_SIZE * INFERENCE_DP_SIZE + 1
    print(
        f"[transfer] World size: {transfer_world_size} "
        f"(1 trainer + {INFERENCE_TP_SIZE * INFERENCE_DP_SIZE} vLLM workers)"
    )
    print("[transfer] Initializing NCCL groups...")
    # Start the trainer side without blocking, initialize the engine side,
    # and only then wait on the trainer handle.
    # NOTE(review): presumably both sides must join the group concurrently,
    # so blocking on the trainer first would hang — confirm.
    train_handle = fsdp_workers[0].init_weight_transfer_group.remote(
        transfer_world_size
    )
    await engine.init_weight_transfer_engine(
        WeightTransferInitRequest(
            init_info=asdict(
                NCCLWeightTransferInitInfo(
                    master_address=transfer_addr,
                    master_port=transfer_port,
                    rank_offset=1,
                    world_size=transfer_world_size,
                )
            )
        )
    )
    ray.get(train_handle)
    print("[transfer] NCCL groups initialized.")
    # --- Pause, transfer weights, resume ---
    print("[sync] Pausing generation...")
    await engine.pause_generation(mode="abort")
    print("[sync] Generation paused.")
    names, dtype_names, shapes = ray.get(fsdp_workers[0].get_weight_metadata.remote())
    print(f"[sync] Got metadata for {len(names)} parameters.")
    print("[sync] Broadcasting weights from FSDP → vLLM...")
    # Every FSDP worker is invoked, not just rank 0: the worker method's
    # docstring requires all FSDP ranks to call it concurrently.
    broadcast_handles = [
        w.gather_and_broadcast_weights.remote(packed=True) for w in fsdp_workers
    ]
    # Same launch-then-await ordering as the group initialization above;
    # `packed` here matches the value passed to the workers.
    await engine.update_weights(
        WeightTransferUpdateRequest(
            update_info=asdict(
                NCCLWeightTransferUpdateInfo(
                    names=names,
                    dtype_names=dtype_names,
                    shapes=shapes,
                    packed=True,
                )
            )
        )
    )
    ray.get(broadcast_handles)
    print("[sync] Weight broadcast complete.")
    print("[sync] Resuming generation...")
    await engine.resume_generation()
    print("[sync] Generation resumed.")
    # Generate with synced weights — expect sensible output.
    print("[generate] Starting generation with synced weights...")
    outputs_updated = await generate_batch(engine, prompts, sampling_params)
    print("[generate] Generation complete.")
    print("-" * 60)
    print("AFTER weight sync (real weights):")
    print("-" * 60)
    for output in outputs_updated:
        print(f"Prompt: {output.prompt!r}")
        print(f"Generated: {output.outputs[0].text!r}")
        print("-" * 60)
# Run the async driver only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
......@@ -121,7 +121,7 @@ python = "./.venv"
# these files may contain non-English words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", "tests/tokenizers_/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "examples/pooling/token_embed/*", "build/*",
"vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "tests/entrypoints/openai/test_transcription_validation.py",
"vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "tests/entrypoints/openai/speech_to_text/test_transcription_validation.py",
"docs/governance/process.md", "tests/v1/engine/test_fast_incdec_prefix_err.py", ".git/*"]
ignore-hidden = false
......
......@@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp >= 3.13.3
openai >= 1.99.1, < 2.25.0 # For Responses API with reasoning content
openai >= 2.0.0 # For Responses API with reasoning content
pydantic >= 2.12.0
prometheus_client >= 0.18.0
pillow # Required for image processing
......@@ -37,7 +37,7 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.13.0 # required for compressed-tensors
compressed-tensors == 0.14.0.1 # required for compressed-tensors
depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
......
......@@ -50,7 +50,7 @@ av==16.1.0
blobfile==3.0.0
# Multi-Modal Models Test
decord==0.6.0
# video processing, required by entrypoints/openai/test_video.py
# video processing, required by entrypoints/openai/chat_completion/test_video.py
rapidfuzz==3.12.1
# OpenAI compatibility and testing
......
......@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test
pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests
sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
......
......@@ -544,6 +544,7 @@ numba==0.61.2
# via
# -r requirements/test.in
# librosa
# resampy
numpy==2.2.6
# via
# -r requirements/test.in
......@@ -584,6 +585,7 @@ numpy==2.2.6
# pyogrio
# pywavelets
# rasterio
# resampy
# rioxarray
# rouge-score
# runai-model-streamer
......@@ -995,6 +997,8 @@ requests==2.32.3
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test.in
responses==0.25.3
# via genai-perf
rfc3339-validator==0.1.4
......
# --- Test Infrastructure ---
tblib
pytest-timeout
pytest-cov
pytest-forked
pytest-rerunfailures
pytest-shard
# --- Core Tools & Bindings ---
absl-py
arctic-inference
# --- Audio Processing ---
librosa
audioread
soxr
pooch
soundfile
# --- Tool Parsing & Evaluation ---
blobfile
rapidfuzz
gpt-oss
schemathesis
jiwer
bm25s
pystemmer
mteb[bm25s]
num2words
pqdm
# --- Vision & Multimodal ---
timm
albumentations
mistral-common[image,audio]
\ No newline at end of file
# XPU Test Dependencies
# NOTE: Base image already has common.txt + xpu.txt installed,
# and vllm-openai stage has pytest, pytest-asyncio, lm-eval[api].
# This file only adds incremental test-specific packages.
# Additional test infrastructure (pytest/pytest-asyncio already in base)
# This file was autogenerated by uv via the following command:
# uv pip compile /workspace/vllm/requirements/xpu-test.in -o /workspace/vllm/requirements/xpu-test.txt -c /workspace/vllm/requirements/xpu.txt --index-strategy unsafe-best-match --extra-index-url ${PIP_EXTRA_INDEX_URL} --python-version ${PYTHON_VERSION}
tblib==3.1.0
pytest-timeout==2.3.1
pytest-cov==6.3.0
pytest-forked==1.6.0
pytest-rerunfailures==14.0
pytest-shard==0.1.2
arctic-inference==0.1.1
# Required for audio processing tests
librosa==0.10.2.post1
audioread==3.0.1
soxr==0.5.0.post1
pooch==1.8.2
soundfile==0.13.1
# Required for Mistral's streaming tool parser
blobfile==3.0.0
rapidfuzz==3.12.1
# Required for Mistral's streaming tool parser and some evaluation scripts
gpt-oss==0.0.8
schemathesis==3.39.15
jiwer==4.0.0
bm25s==0.2.13
pystemmer==3.0.0
mteb[bm25s]>=2, <3
num2words==0.5.14
pqdm==0.2.0
# Required for some evaluation scripts
timm==1.0.17
albumentations==1.4.6
mistral-common[image,audio]==1.9.1
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment