Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
+++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
@@ -55,7 +55,6 @@ done
 echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"

 export RAY_DEDUP_LOGS=0
-export VLLM_ALL2ALL_BACKEND="pplx"
 export VLLM_USE_DEEP_GEMM=1

 vllm serve $MODEL_NAME \
@@ -65,6 +64,7 @@ vllm serve $MODEL_NAME \
    --enforce-eager \
    --enable-expert-parallel \
    --enable-eplb \
+    --all2all-backend pplx \
    --num-redundant-experts $REDUNDANT_EXPERTS \
    --trust-remote-code \
    --host $HOST \

--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -28,8 +28,14 @@ class BlockStored(KVCacheEvent):
    parent_block_hash: ExternalBlockHash | None
    token_ids: list[int]
    block_size: int
+
    lora_id: int | None
+    """Deprecated: use `lora_name` for KV block key hash.
+    Retained for backward compatibility.
+    """
+
    medium: str | None
+    lora_name: str | None


 class BlockRemoved(KVCacheEvent):

--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -21,6 +21,7 @@ python openai_chat_completion_client_for_multimodal.py --chat-type audio
 """

 import base64
+import os

 import requests
 from openai import OpenAI
@@ -51,6 +52,16 @@ def encode_base64_content_from_url(content_url: str) -> str:
    return result


+def encode_base64_content_from_file(file_path: str) -> str:
+    """Read a local file as bytes and return its base64 encoding as text."""
+
+    with open(file_path, "rb") as handle:
+        raw_bytes = handle.read()
+
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode("utf-8")
+
+
 # Text-only inference
 def run_text_only(model: str, max_completion_tokens: int) -> None:
    chat_completion = client.chat.completions.create(
@@ -67,6 +78,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None:
 def run_single_image(model: str, max_completion_tokens: int) -> None:
    ## Use image url in the payload
    image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    image_file = "/path/to/image.jpg"  # local file
    chat_completion_from_url = client.chat.completions.create(
        messages=[
            {
@@ -87,6 +99,30 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from image url:\n", result)

+    ## Use local image url in the payload
+    # Launch the API server/engine with the --allowed-local-media-path argument.
+    if os.path.exists(image_file):
+        chat_completion_from_local_image_url = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"file://{image_file}"},
+                        },
+                    ],
+                }
+            ],
+            model=model,
+            max_completion_tokens=max_completion_tokens,
+        )
+        result = chat_completion_from_local_image_url.choices[0].message.content
+        print("Chat completion output from local image file:\n", result)
+    else:
+        print(f"Local image file not found at {image_file}, skipping local file test.")
+
    ## Use base64 encoded image in the payload
    image_base64 = encode_base64_content_from_url(image_url)
    chat_completion_from_base64 = client.chat.completions.create(
@@ -109,6 +145,33 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded image:", result)

+    ## Use base64 encoded local image in the payload
+    if os.path.exists(image_file):
+        local_image_base64 = encode_base64_content_from_file(image_file)
+        chat_completion_from_local_image_base64 = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{local_image_base64}"
+                            },
+                        },
+                    ],
+                }
+            ],
+            model=model,
+            max_completion_tokens=max_completion_tokens,
+        )
+
+        result = chat_completion_from_local_image_base64.choices[0].message.content
+        print("Chat completion output from base64 encoded local image:", result)
+    else:
+        print(f"Local image file not found at {image_file}, skipping local file test.")
+

 # Multi-image input inference
 def run_multi_image(model: str, max_completion_tokens: int) -> None:

--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -18,6 +18,7 @@ The script performs:
 2. Streaming transcription using raw HTTP request to the vLLM server.
 """

+import argparse
 import asyncio

 from openai import AsyncOpenAI, OpenAI
@@ -25,14 +26,14 @@ from openai import AsyncOpenAI, OpenAI
 from vllm.assets.audio import AudioAsset


-def sync_openai(audio_path: str, client: OpenAI):
+def sync_openai(audio_path: str, client: OpenAI, model: str):
    """
    Perform synchronous transcription using OpenAI-compatible API.
    """
    with open(audio_path, "rb") as f:
        transcription = client.audio.transcriptions.create(
            file=f,
-            model="openai/whisper-large-v3",
+            model=model,
            language="en",
            response_format="json",
            temperature=0.0,
@@ -42,18 +43,18 @@ def sync_openai(audio_path: str, client: OpenAI):
                repetition_penalty=1.3,
            ),
        )
-        print("transcription result:", transcription.text)
+        print("transcription result [sync]:", transcription.text)


-async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
+async def stream_openai_response(audio_path: str, client: AsyncOpenAI, model: str):
    """
    Perform asynchronous transcription using OpenAI-compatible API.
    """
-    print("\ntranscription result:", end=" ")
+    print("\ntranscription result [stream]:", end=" ")
    with open(audio_path, "rb") as f:
        transcription = await client.audio.transcriptions.create(
            file=f,
-            model="openai/whisper-large-v3",
+            model=model,
            language="en",
            response_format="json",
            temperature=0.0,
@@ -72,7 +73,47 @@ async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
    print()  # Final newline after stream ends


-def main():
+def stream_api_response(audio_path: str, model: str, openai_api_base: str):
+    """
+    Stream a transcription via raw HTTP from the vLLM API server, printing each
+    text delta as it arrives. Raises `requests.HTTPError` on an error status.
+    """
+    import json
+    import os
+
+    import requests
+
+    api_url = f"{openai_api_base}/audio/transcriptions"
+    headers = {"User-Agent": "Transcription-Client"}
+    form = {
+        "stream": "true",
+        "model": model,
+        "language": "en",
+        "response_format": "json",
+    }
+    with open(audio_path, "rb") as f:
+        files = {"file": (os.path.basename(audio_path), f)}
+        print("\ntranscription result [stream]:", end=" ")
+        response = requests.post(
+            api_url, headers=headers, files=files, data=form, stream=True
+        )
+        # Surface HTTP errors here instead of failing later in json.loads().
+        response.raise_for_status()
+        for line in response.iter_lines(chunk_size=8192, delimiter=b"\n"):
+            # Only SSE "data:" lines carry JSON; skip blanks and a "[DONE]" sentinel.
+            payload = line[len(b"data:") :].strip()
+            if not line.startswith(b"data:") or payload in (b"", b"[DONE]"):
+                continue
+            choice = json.loads(payload.decode("utf-8"))["choices"][0]
+            # A chunk may carry no text, so tolerate a missing delta/content.
+            print(choice.get("delta", {}).get("content") or "", end="", flush=True)
+            finish_reason = choice.get("finish_reason")
+            if finish_reason is not None:
+                print(f"\n[Stream finished reason: {finish_reason}]")
+                break
+
+
+def main(args):
    mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
    winning_call = str(AudioAsset("winning_call").get_local_path())

@@ -84,14 +125,41 @@ def main():
        base_url=openai_api_base,
    )

-    sync_openai(mary_had_lamb, client)
+    model = client.models.list().data[0].id
+    print(f"Using model: {model}")
+
+    # Run the synchronous function
+    sync_openai(args.audio_path if args.audio_path else mary_had_lamb, client, model)
+
    # Run the asynchronous function
-    client = AsyncOpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-    asyncio.run(stream_openai_response(winning_call, client))
+    if "openai" in model:
+        client = AsyncOpenAI(
+            api_key=openai_api_key,
+            base_url=openai_api_base,
+        )
+        asyncio.run(
+            stream_openai_response(
+                args.audio_path if args.audio_path else winning_call, client, model
+            )
+        )
+    else:
+        stream_api_response(
+            args.audio_path if args.audio_path else winning_call,
+            model,
+            openai_api_base,
+        )


 if __name__ == "__main__":
-    main()
+    # setup argparser
+    parser = argparse.ArgumentParser(
+        description="OpenAI Transcription Client using vLLM API Server"
+    )
+    parser.add_argument(
+        "--audio_path",
+        type=str,
+        default=None,
+        help="The path to the audio file to transcribe.",
+    )
+    args = parser.parse_args()
+    main(args)
--- a/examples/online_serving/openai_translation_client.py
+++ b/examples/online_serving/openai_translation_client.py
@@ -9,11 +9,11 @@ from openai import OpenAI
 from vllm.assets.audio import AudioAsset


-def sync_openai(audio_path: str, client: OpenAI):
+def sync_openai(audio_path: str, client: OpenAI, model: str):
    with open(audio_path, "rb") as f:
        translation = client.audio.translations.create(
            file=f,
-            model="openai/whisper-large-v3",
+            model=model,
            response_format="json",
            temperature=0.0,
            # Additional params not provided by OpenAI API.
@@ -26,11 +26,13 @@ def sync_openai(audio_path: str, client: OpenAI):
        print("translation result:", translation.text)


-async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
+async def stream_openai_response(
+    audio_path: str, base_url: str, api_key: str, model: str
+):
    data = {
        "language": "it",
        "stream": True,
-        "model": "openai/whisper-large-v3",
+        "model": model,
    }
    url = base_url + "/audio/translations"
    headers = {"Authorization": f"Bearer {api_key}"}
@@ -66,9 +68,13 @@ def main():
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
-    sync_openai(foscolo, client)
+
+    model = client.models.list().data[0].id
+    print(f"Using model: {model}")
+
+    sync_openai(foscolo, client, model)
    # Run the asynchronous function
-    asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key))
+    asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key, model))


 if __name__ == "__main__":

--- a/examples/pooling/embed/openai_embedding_long_text/README.md
+++ b/examples/pooling/embed/openai_embedding_long_text/README.md
@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`:
 ```json
 {
  "pooling_type": "auto",
-  "normalize": true,
+  "use_activation": true,
  "enable_chunked_processing": true,
  "max_embed_len": 3072000
 }

--- a/examples/pooling/embed/openai_embedding_long_text/client.py
+++ b/examples/pooling/embed/openai_embedding_long_text/client.py
@@ -14,7 +14,7 @@ Prerequisites:
   # MEAN pooling (processes all chunks, recommended for complete coverage)
   vllm serve intfloat/multilingual-e5-large \
     --pooler-config \
-      '{"pooling_type": "MEAN", "normalize": true, ' \
+      '{"pooling_type": "MEAN", "use_activation": true, ' \
      '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
     --served-model-name multilingual-e5-large \
     --trust-remote-code \
@@ -24,7 +24,7 @@ Prerequisites:
   # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
   vllm serve BAAI/bge-large-en-v1.5 \
     --pooler-config \
-      '{"pooling_type": "CLS", "normalize": true, ' \
+      '{"pooling_type": "CLS", "use_activation": true, ' \
      '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
     --served-model-name bge-large-en-v1.5 \
     --trust-remote-code \

--- a/examples/pooling/embed/openai_embedding_long_text/service.sh
+++ b/examples/pooling/embed/openai_embedding_long_text/service.sh
@@ -96,7 +96,7 @@ echo ""
 echo "🔧 Starting server with enhanced chunked processing configuration..."

 # Build pooler config JSON
-POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
+POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"use_activation\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"

 # Start vLLM server with enhanced chunked processing
 vllm serve "$MODEL_NAME" \

--- a/examples/pooling/embed/vision_embedding_offline.py
+++ b/examples/pooling/embed/vision_embedding_offline.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+"""
+This example shows how to use vLLM for running offline inference with
+the correct prompt format on vision language models for multimodal embedding.
+
+For most models, the prompt format should follow corresponding examples
+on HuggingFace model repository.
+"""
+
+import argparse
+from dataclasses import asdict
+
+from vllm import LLM, EngineArgs
+from vllm.multimodal.utils import fetch_image
+
+image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
+text = "A cat standing in the snow."
+multi_modal_data = {"image": fetch_image(image_url)}
+
+
+def print_embeddings(embeds):
+    """Print a short preview of an embedding vector together with its length.
+
+    Vectors longer than four values are cut to their first four entries
+    followed by ``...``; shorter vectors are printed in full.
+    """
+    total = len(embeds)
+    if total > 4:
+        preview = str(embeds[:4])[:-1] + ", ...]"
+    else:
+        preview = embeds
+    print(f"Embeddings: {preview} (size={total})")
+
+
+def run_qwen3_vl():
+    """Embed one text, one image and one image+text prompt with Qwen3-VL-Embedding.
+
+    The three prompts share the same chat layout (system instruction, user turn,
+    opened assistant turn); each result is reported through `print_embeddings`.
+    """
+    engine_args = EngineArgs(
+        model="Qwen/Qwen3-VL-Embedding-2B",
+        runner="pooling",
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": 1},
+    )
+    default_instruction = "Represent the user's input."
+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
+    image_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
+    image_text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
+
+    llm = LLM(**asdict(engine_args))
+
+    # The image-bearing prompts are sent as dicts carrying the module-level image.
+    jobs = [
+        ("Text embedding output:", text_prompt),
+        (
+            "Image embedding output:",
+            {"prompt": image_prompt, "multi_modal_data": multi_modal_data},
+        ),
+        (
+            "Image+Text embedding output:",
+            {"prompt": image_text_prompt, "multi_modal_data": multi_modal_data},
+        ),
+    ]
+    for heading, request in jobs:
+        print(heading)
+        result = llm.embed(request, use_tqdm=False)
+        print_embeddings(result[0].outputs.embedding)
+
+
+model_example_map = {
+    "qwen3_vl": run_qwen3_vl,
+}
+
+
+def parse_args():
+    """Parse the command line and return the namespace holding ``model``.
+
+    ``--model`` is required and must be one of the keys of ``model_example_map``.
+    """
+    # Pass the text as `description`: the first positional parameter of
+    # ArgumentParser is `prog`, which would put it into the usage line instead.
+    parser = argparse.ArgumentParser(
+        description="Script to run a specified VLM through vLLM offline api."
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        choices=model_example_map.keys(),
+        required=True,
+        help="The name of the embedding model.",
+    )
+    return parser.parse_args()
+
+
+def main(args):
+    """Run the example registered under ``args.model`` in ``model_example_map``."""
+    example = model_example_map[args.model]
+    example()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
@@ -21,7 +21,8 @@ from PIL import Image
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"

-image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
+text = "A cat standing in the snow."


 def create_chat_embeddings(
@@ -30,6 +31,8 @@ def create_chat_embeddings(
    messages: list[ChatCompletionMessageParam],
    model: str,
    encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
+    continue_final_message: bool = False,
+    add_special_tokens: bool = False,
 ) -> CreateEmbeddingResponse:
    """
    Convenience function for accessing vLLM's Chat Embeddings API,
@@ -38,10 +41,21 @@ def create_chat_embeddings(
    return client.post(
        "/embeddings",
        cast_to=CreateEmbeddingResponse,
-        body={"messages": messages, "model": model, "encoding_format": encoding_format},
+        body={
+            "messages": messages,
+            "model": model,
+            "encoding_format": encoding_format,
+            "continue_final_message": continue_final_message,
+            "add_special_tokens": add_special_tokens,
+        },
    )


+def print_embeddings(embeds):
+    embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
+    print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
+
+
 def run_clip(client: OpenAI, model: str):
    """
    Start the server using:
@@ -145,6 +159,113 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
    print("Text embedding output:", response.data[0].embedding)


+def run_qwen3_vl(client: OpenAI, model: str):
+    """
+    Start the server using:
+
+    vllm serve Qwen/Qwen3-VL-Embedding-2B \
+        --runner pooling \
+        --max-model-len 8192
+    """
+
+    default_instruction = "Represent the user's input."
+
+    print("Text embedding output:")
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "system",
+                "content": [
+                    {"type": "text", "text": default_instruction},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": text},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": ""},
+                ],
+            },
+        ],
+        model=model,
+        encoding_format="float",
+        continue_final_message=True,
+        add_special_tokens=True,
+    )
+    print_embeddings(response.data[0].embedding)
+
+    print("Image embedding output:")
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "system",
+                "content": [
+                    {"type": "text", "text": default_instruction},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": ""},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": ""},
+                ],
+            },
+        ],
+        model=model,
+        encoding_format="float",
+        continue_final_message=True,
+        add_special_tokens=True,
+    )
+    print_embeddings(response.data[0].embedding)
+
+    print("Image+Text embedding output:")
+    response = create_chat_embeddings(
+        client,
+        messages=[
+            {
+                "role": "system",
+                "content": [
+                    {"type": "text", "text": default_instruction},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {
+                        "type": "text",
+                        "text": f"{text}",
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": ""},
+                ],
+            },
+        ],
+        model=model,
+        encoding_format="float",
+        continue_final_message=True,
+        add_special_tokens=True,
+    )
+    print_embeddings(response.data[0].embedding)
+
+
 def run_siglip(client: OpenAI, model: str):
    """
    Start the server using:
@@ -213,7 +334,8 @@ def run_vlm2vec(client: OpenAI, model: str):
        encoding_format="float",
    )

-    print("Image embedding output:", response.data[0].embedding)
+    print("Image embedding output:")
+    print_embeddings(response.data[0].embedding)

    response = create_chat_embeddings(
        client,
@@ -233,7 +355,8 @@ def run_vlm2vec(client: OpenAI, model: str):
        encoding_format="float",
    )

-    print("Image+Text embedding output:", response.data[0].embedding)
+    print("Image+Text embedding output:")
+    print_embeddings(response.data[0].embedding)

    response = create_chat_embeddings(
        client,
@@ -249,11 +372,13 @@ def run_vlm2vec(client: OpenAI, model: str):
        encoding_format="float",
    )

-    print("Text embedding output:", response.data[0].embedding)
+    print("Text embedding output:")
+    print_embeddings(response.data[0].embedding)


 model_example_map = {
    "clip": run_clip,
+    "qwen3_vl": run_qwen3_vl,
    "dse_qwen2_vl": run_dse_qwen2_vl,
    "siglip": run_siglip,
    "vlm2vec": run_vlm2vec,

--- a/examples/pooling/pooling/vision_language_pooling.py
+++ b/examples/pooling/pooling/vision_language_pooling.py
@@ -133,6 +133,36 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:
    )


+def run_qwen3_vl(query: Query) -> ModelRequestData:
+    """Build the Qwen3-VL-Embedding request for a text, image or text+image query."""
+    # Qwen's vision markers are the pipe-delimited special tokens
+    # `<|vision_start|>` / `<|vision_end|>`, not `<vision_start>` / `<vision_end>`.
+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+
+    modality = query["modality"]
+    if modality == "text":
+        prompt = query["text"]
+        image = None
+    elif modality == "image":
+        prompt = image_placeholder
+        image = query["image"]
+    elif modality == "text+image":
+        text = query["text"]
+        prompt = f"{image_placeholder}\n{text}"
+        image = query["image"]
+    else:
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+
+    engine_args = EngineArgs(
+        model="Qwen/Qwen3-VL-Embedding-2B",
+        runner="pooling",
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(engine_args=engine_args, prompt=prompt, image=image)
+
+
 def run_siglip(query: Query) -> ModelRequestData:
    if query["modality"] == "text":
        prompt = query["text"]
@@ -353,6 +383,7 @@ model_example_map = {
    "clip": run_clip,
    "e5_v": run_e5_v,
    "jinavl_reranker": run_jinavl_reranker,
+    "qwen3_vl": run_qwen3_vl,
    "siglip": run_siglip,
    "vlm2vec_phi3v": run_vlm2vec_phi3v,
    "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,

--- a/examples/pooling/score/cohere_rerank_client.py
+++ b/examples/pooling/score/cohere_rerank_client.py
--- a/examples/pooling/score/convert_model_to_seq_cls.py
+++ b/examples/pooling/score/convert_model_to_seq_cls.py
@@ -2,35 +2,70 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501

+"""
+Script to convert Large Language Models (LLMs) to Sequence Classification models.
+This is particularly useful for converting reranker models that use next-token
+prediction to a sequence classification format for compatibility with standard
+classification and rerank pipelines.
+
+Usage examples:
+- For BAAI/bge-reranker-v2-gemma:
+  python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma \
+    --classifier_from_tokens '["Yes"]' --method no_post_processing \
+    --path ./bge-reranker-v2-gemma-seq-cls
+
+- For mxbai-rerank-v2:
+  python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 \
+    --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax \
+    --path ./mxbai-rerank-base-v2-seq-cls
+
+- For Qwen3-Reranker:
+  python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B \
+    --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax \
+    --path ./Qwen3-Reranker-0.6B-seq-cls
+
+Note: For BAAI/bge-reranker-v2-gemma, "Yes" and "yes" are different tokens.
+"""
+
 import argparse
 import json

 import torch
 import transformers

-# Usage:
-# for BAAI/bge-reranker-v2-gemma
-# Caution: "Yes" and "yes" are two different tokens
-# python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
-# for mxbai-rerank-v2
-# python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
-# for Qwen3-Reranker
-# python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
-

 def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device):
-    # refer to https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
-    assert len(tokens) == 2
+    """
+    This method extracts the difference between weights for 'true' and 'false' tokens
+    from the language model head to create a single classification weight vector.
+
+    Args:
+        causal_lm: The original causal language model
+        seq_cls_model: The target sequence classification model
+        tokenizer: Model tokenizer
+        tokens: List of two tokens representing [false_token, true_token]
+        device: Target device (cpu/cuda)
+
+    Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
+    """
+    assert len(tokens) == 2, (
+        "Method requires exactly two tokens for binary classification"
+    )

+    # Get the language model head weights (vocabulary_size x hidden_size)
    lm_head_weights = causal_lm.lm_head.weight

+    # Convert token strings to their corresponding token IDs
    false_id = tokenizer.convert_tokens_to_ids(tokens[0])
    true_id = tokenizer.convert_tokens_to_ids(tokens[1])

+    # Compute the classification weight as the difference between true and false token weights
+    # This follows the approach in: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
    score_weight = lm_head_weights[true_id].to(device).to(
        torch.float32
    ) - lm_head_weights[false_id].to(device).to(torch.float32)

+    # Copy the computed weights to the sequence classification model
    with torch.no_grad():
        seq_cls_model.score.weight.copy_(score_weight.unsqueeze(0))
        if seq_cls_model.score.bias is not None:
@@ -38,12 +73,29 @@ def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device):


 def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device):
+    """
+    Directly use token weights from the language model head for classification.
+
+    This method maps each classification label directly to a corresponding token
+    in the vocabulary without additional transformation.
+
+    Args:
+        causal_lm: The original causal language model
+        seq_cls_model: The target sequence classification model
+        tokenizer: Model tokenizer
+        tokens: List of tokens representing class labels
+        device: Target device (cpu/cuda)
+    """
+    # Get the language model head weights (vocabulary_size x hidden_size)
    lm_head_weights = causal_lm.lm_head.weight

+    # Convert all tokens to their corresponding token IDs
    token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]

+    # Extract weights for the specific tokens (num_tokens x hidden_size)
    score_weight = lm_head_weights[token_ids].to(device)

+    # Copy the weights to the sequence classification model
    with torch.no_grad():
        seq_cls_model.score.weight.copy_(score_weight)
        if seq_cls_model.score.bias is not None:
@@ -56,21 +108,35 @@ method_map = {


 def converting(
-    model_name, classifier_from_tokens, path, method, use_pad_token=False, device="cpu"
+    model_name, classifier_from_tokens, path, method, use_sep_token=False, device="cpu"
 ):
-    assert method in method_map
-
+    """
+    Main conversion function to transform a CausalLM model to SequenceClassification.
+
+    Args:
+        model_name: Name or path of the pretrained model
+        classifier_from_tokens: List of tokens used for classification
+        path: Output path to save the converted model
+        method: Conversion method ('from_2_way_softmax' or 'no_post_processing')
+        use_sep_token: Whether to use separating token in the sequence classification model
+        device: Device to load the model on ('cpu' or 'cuda')
+    """
+    assert method in method_map, f"Unknown method: {method}"
+
+    # Determine number of labels based on conversion method
    if method == "from_2_way_softmax":
        assert len(classifier_from_tokens) == 2
        num_labels = 1
    else:
        num_labels = len(classifier_from_tokens)

+    # Load tokenizer and original causal language model
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    causal_lm = transformers.AutoModelForCausalLM.from_pretrained(
        model_name, device_map=device
    )

+    # Load an empty sequence classification model with the same architecture
    seq_cls_model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
@@ -78,14 +144,17 @@ def converting(
        device_map=device,
    )

+    # Apply the selected conversion method to transfer weights
    method_map[method](
        causal_lm, seq_cls_model, tokenizer, classifier_from_tokens, device
    )

-    # `llm as reranker` defaults to not using pad_token
-    seq_cls_model.config.use_pad_token = use_pad_token
-    seq_cls_model.config.pad_token_id = tokenizer.pad_token_id
+    # Configure separating token settings
+    # Note: `llm as reranker` defaults to not using separating token.
+    seq_cls_model.config.use_sep_token = use_sep_token
+    seq_cls_model.config.sep_token_id = tokenizer.sep_token_id

+    # Save the converted model and tokenizer
    seq_cls_model.save_pretrained(path)
    tokenizer.save_pretrained(path)

@@ -99,25 +168,30 @@ def parse_args():
        "--model_name",
        type=str,
        default="BAAI/bge-reranker-v2-gemma",
-        help="Model name",
+        help="HuggingFace model name or local path",
    )
    parser.add_argument(
        "--classifier_from_tokens",
        type=str,
        default='["Yes"]',
-        help="classifier from tokens",
+        help="JSON string of tokens used for classification labels",
    )
    parser.add_argument(
-        "--method", type=str, default="no_post_processing", help="Converting converting"
+        "--method",
+        type=str,
+        default="no_post_processing",
+        help="Conversion method to use",
    )
    parser.add_argument(
-        "--use-pad-token", action="store_true", help="Whether to use pad_token"
+        "--use-sep-token",
+        action="store_true",
+        help="Enable separating token in the sequence classification model",
    )
    parser.add_argument(
        "--path",
        type=str,
        default="./bge-reranker-v2-gemma-seq-cls",
-        help="Path to save converted model",
+        help="Output directory to save the converted model",
    )
    return parser.parse_args()

@@ -129,6 +203,6 @@ if __name__ == "__main__":
        model_name=args.model_name,
        classifier_from_tokens=json.loads(args.classifier_from_tokens),
        method=args.method,
-        use_pad_token=args.use_pad_token,
+        use_sep_token=args.use_sep_token,
        path=args.path,
    )
--- a/examples/pooling/score/offline_reranker.py
+++ b/examples/pooling/score/offline_reranker.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa: E501
-
-from vllm import LLM
-
-model_name = "Qwen/Qwen3-Reranker-0.6B"
-
-# What is the difference between the official original version and one
-# that has been converted into a sequence classification model?
-# Qwen3-Reranker is a language model that doing reranker by using the
-# logits of "no" and "yes" tokens.
-# It needs to computing 151669 tokens logits, making this method extremely
-# inefficient, not to mention incompatible with the vllm score API.
-# A method for converting the original model into a sequence classification
-# model was proposed. See：https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
-# Models converted offline using this method can not only be more efficient
-# and support the vllm score API, but also make the init parameters more
-# concise, for example.
-# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling")
-
-# If you want to load the official original version, the init parameters are
-# as follows.
-
-
-def get_llm() -> LLM:
-    """Initializes and returns the LLM model for Qwen3-Reranker."""
-    return LLM(
-        model=model_name,
-        runner="pooling",
-        hf_overrides={
-            "architectures": ["Qwen3ForSequenceClassification"],
-            "classifier_from_token": ["no", "yes"],
-            "is_original_qwen3_reranker": True,
-        },
-    )
-
-
-# Why do we need hf_overrides for the official original version:
-# vllm converts it to Qwen3ForSequenceClassification when loaded for
-# better performance.
-# - Firstly, we need using `"architectures": ["Qwen3ForSequenceClassification"],`
-# to manually route to Qwen3ForSequenceClassification.
-# - Then, we will extract the vector corresponding to classifier_from_token
-# from lm_head using `"classifier_from_token": ["no", "yes"]`.
-# - Third, we will convert these two vectors into one vector.  The use of
-# conversion logic is controlled by `using "is_original_qwen3_reranker": True`.
-
-# Please use the query_template and document_template to format the query and
-# document for better reranker results.
-
-prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
-suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
-
-query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
-document_template = "<Document>: {doc}{suffix}"
-
-
-def main() -> None:
-    instruction = (
-        "Given a web search query, retrieve relevant passages that answer the query"
-    )
-
-    queries = [
-        "What is the capital of China?",
-        "Explain gravity",
-    ]
-
-    documents = [
-        "The capital of China is Beijing.",
-        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
-    ]
-
-    queries = [
-        query_template.format(prefix=prefix, instruction=instruction, query=query)
-        for query in queries
-    ]
-    documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
-
-    llm = get_llm()
-    outputs = llm.score(queries, documents)
-
-    print("-" * 30)
-    print([output.outputs.score for output in outputs])
-    print("-" * 30)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/pooling/score/qwen3_reranker_offline.py
+++ b/examples/pooling/score/qwen3_reranker_offline.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
+"""
+What is the difference between the official original version and one
+that has been converted into a sequence classification model?
+
+Qwen3-Reranker is a language model that performs reranking by using the
+logits of the "no" and "yes" tokens.
+This requires computing logits for all 151,669 tokens in the vocabulary,
+making it inefficient and incompatible with vLLM's score() API.
+
+A conversion method has been proposed to transform the original model into a
+sequence classification model. This converted model:
+1. Is significantly more efficient
+2. Fully supports vLLM's score() API
+3. Simplifies initialization parameters
+Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
+Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
+
+For the converted model, initialization would simply be:
+llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling")
+
+This example demonstrates loading the ORIGINAL model with special overrides
+to make it compatible with vLLM's score API.
+"""
+
+from pathlib import Path
+
+from vllm import LLM
+
+model_name = "Qwen/Qwen3-Reranker-0.6B"
+
+
+def get_llm() -> LLM:
+    """
+    Initializes and returns the LLM model for Qwen3-Reranker.
+
+    Returns:
+        LLM: Configured vLLM instance for reranking tasks.
+
+    Note:
+        This function loads the ORIGINAL Qwen3-Reranker model with specific
+        overrides to make it compatible with vLLM's score API.
+    """
+    return LLM(
+        # Specify the original model from HuggingFace
+        model=model_name,
+        # Use pooling runner for score task
+        runner="pooling",
+        # HuggingFace model configuration overrides required for compatibility
+        hf_overrides={
+            # Manually route to sequence classification architecture
+            # This tells vLLM to use Qwen3ForSequenceClassification instead of
+            # the default Qwen3ForCausalLM
+            "architectures": ["Qwen3ForSequenceClassification"],
+            # Specify which token logits to extract from the language model head
+            # The original reranker uses "no" and "yes" token logits for scoring
+            "classifier_from_token": ["no", "yes"],
+            # Enable special handling for original Qwen3-Reranker models
+            # This flag triggers conversion logic that transforms the two token
+            # vectors into a single classification vector
+            "is_original_qwen3_reranker": True,
+        },
+    )
+
+
+def main() -> None:
+    # Load the Jinja template for formatting query-document pairs
+    # The template ensures proper formatting for the reranker model
+    template_home = Path(__file__).parent / "template"
+    template_path = "qwen3_reranker.jinja"
+    chat_template = (template_home / template_path).read_text()
+
+    # Sample queries for testing the reranker
+    queries = [
+        "What is the capital of China?",
+        "Explain gravity",
+    ]
+
+    # Corresponding documents to be scored against each query
+    documents = [
+        "The capital of China is Beijing.",
+        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
+    ]
+
+    # Initialize the LLM model with the original Qwen3-Reranker configuration
+    llm = get_llm()
+
+    # Compute relevance scores for each query-document pair
+    # The score() method returns a relevance score for each pair
+    # Higher scores indicate better relevance
+    outputs = llm.score(queries, documents, chat_template=chat_template)
+
+    # Extract and print the relevance scores from the outputs
+    # Each output contains a score representing query-document relevance
+    print("-" * 30)
+    print("Relevance scores:", [output.outputs.score for output in outputs])
+    print("-" * 30)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/pooling/score/qwen3_reranker_online.py
+++ b/examples/pooling/score/qwen3_reranker_online.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+"""
+What is the difference between the official original version and one
+that has been converted into a sequence classification model?
+
+Qwen3-Reranker is a language model that performs reranking by using the
+logits of the "no" and "yes" tokens.
+This requires computing logits for all 151,669 tokens in the vocabulary,
+making it inefficient and incompatible with vLLM's score() API.
+
+A conversion method has been proposed to transform the original model into a
+sequence classification model. This converted model:
+1. Is significantly more efficient
+2. Fully supports vLLM's score() API
+3. Simplifies initialization parameters
+Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
+Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
+
+For the converted model, initialization would simply be:
+    vllm serve tomaarsen/Qwen3-Reranker-0.6B-seq-cls --runner pooling --chat-template examples/pooling/score/template/qwen3_reranker.jinja
+
+This example demonstrates loading the ORIGINAL model with special overrides
+to make it compatible with vLLM's score API.
+    vllm serve Qwen/Qwen3-Reranker-0.6B --runner pooling --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' --chat-template examples/pooling/score/template/qwen3_reranker.jinja
+"""
+
+import json
+
+import requests
+
+# URL of the vLLM server's score endpoint
+# Default vLLM server runs on localhost port 8000
+url = "http://127.0.0.1:8000/score"
+
+# HTTP headers for the request
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+# Example queries & documents
+queries = [
+    "What is the capital of China?",
+    "Explain gravity",
+]
+documents = [
+    "The capital of China is Beijing.",
+    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
+]
+
+# Request payload for the score API
+data = {
+    "model": "Qwen/Qwen3-Reranker-0.6B",
+    "text_1": queries,
+    "text_2": documents,
+}
+
+
+def main():
+    """Main function to send a score request to the vLLM server.
+
+    This function sends a POST request to the /score endpoint with
+    the query and documents, then prints the relevance scores.
+    """
+    # Send POST request to the vLLM server's score endpoint
+    response = requests.post(url, headers=headers, json=data)
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        print("Request successful!")
+        # Pretty print the JSON response containing relevance scores
+        # The response includes scores for each document's relevance to the query
+        print(json.dumps(response.json(), indent=2))
+    else:
+        # Handle request failure
+        print(f"Request failed with status code: {response.status_code}")
+        print(response.text)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/pooling/score/openai_reranker.py
+++ b/examples/pooling/score/openai_reranker.py
--- a/examples/pooling/score/openai_cross_encoder_score.py
+++ b/examples/pooling/score/openai_cross_encoder_score.py
--- a/examples/pooling/score/template/bge-reranker-v2-gemma.jinja
+++ b/examples/pooling/score/template/bge-reranker-v2-gemma.jinja
+A: {{ (messages | selectattr("role", "eq", "query") | first).content }}
+B: {{ (messages | selectattr("role", "eq", "document") | first).content }}
+Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'.
\ No newline at end of file
--- a/examples/pooling/score/template/mxbai_rerank_v2.jinja
+++ b/examples/pooling/score/template/mxbai_rerank_v2.jinja
+<|im_start|>system
+You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
+<|im_start|>user
+query: {{ (messages | selectattr("role", "eq", "query") | first).content }}
+document: {{ (messages | selectattr("role", "eq", "document") | first).content }}
+You are a search relevance expert who evaluates how well documents match search queries. For each query-document pair, carefully analyze the semantic relationship between them, then provide your binary relevance judgment (0 for not relevant, 1 for relevant).
+Relevance:<|im_end|>
+<|im_start|>assistant