Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
...@@ -55,7 +55,6 @@ done ...@@ -55,7 +55,6 @@ done
echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS" echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
export RAY_DEDUP_LOGS=0 export RAY_DEDUP_LOGS=0
export VLLM_ALL2ALL_BACKEND="pplx"
export VLLM_USE_DEEP_GEMM=1 export VLLM_USE_DEEP_GEMM=1
vllm serve $MODEL_NAME \ vllm serve $MODEL_NAME \
...@@ -65,6 +64,7 @@ vllm serve $MODEL_NAME \ ...@@ -65,6 +64,7 @@ vllm serve $MODEL_NAME \
--enforce-eager \ --enforce-eager \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \ --enable-eplb \
--all2all-backend pplx \
--num-redundant-experts $REDUNDANT_EXPERTS \ --num-redundant-experts $REDUNDANT_EXPERTS \
--trust-remote-code \ --trust-remote-code \
--host $HOST \ --host $HOST \
......
...@@ -28,8 +28,14 @@ class BlockStored(KVCacheEvent): ...@@ -28,8 +28,14 @@ class BlockStored(KVCacheEvent):
parent_block_hash: ExternalBlockHash | None parent_block_hash: ExternalBlockHash | None
token_ids: list[int] token_ids: list[int]
block_size: int block_size: int
lora_id: int | None lora_id: int | None
"""Deprecated: use `lora_name` for KV block key hash.
Retained for backward compatibility.
"""
medium: str | None medium: str | None
lora_name: str | None
class BlockRemoved(KVCacheEvent): class BlockRemoved(KVCacheEvent):
......
...@@ -21,6 +21,7 @@ python openai_chat_completion_client_for_multimodal.py --chat-type audio ...@@ -21,6 +21,7 @@ python openai_chat_completion_client_for_multimodal.py --chat-type audio
""" """
import base64 import base64
import os
import requests import requests
from openai import OpenAI from openai import OpenAI
...@@ -51,6 +52,16 @@ def encode_base64_content_from_url(content_url: str) -> str: ...@@ -51,6 +52,16 @@ def encode_base64_content_from_url(content_url: str) -> str:
return result return result
def encode_base64_content_from_file(file_path: str) -> str:
    """Return the contents of a local file as a base64 string.

    The file at ``file_path`` is read in binary mode, base64-encoded, and
    the encoded bytes are decoded as UTF-8 text.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()

    return base64.b64encode(raw_bytes).decode("utf-8")
# Text-only inference # Text-only inference
def run_text_only(model: str, max_completion_tokens: int) -> None: def run_text_only(model: str, max_completion_tokens: int) -> None:
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
...@@ -67,6 +78,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None: ...@@ -67,6 +78,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None:
def run_single_image(model: str, max_completion_tokens: int) -> None: def run_single_image(model: str, max_completion_tokens: int) -> None:
## Use image url in the payload ## Use image url in the payload
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
image_file = "/path/to/image.jpg" # local file
chat_completion_from_url = client.chat.completions.create( chat_completion_from_url = client.chat.completions.create(
messages=[ messages=[
{ {
...@@ -87,6 +99,30 @@ def run_single_image(model: str, max_completion_tokens: int) -> None: ...@@ -87,6 +99,30 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
result = chat_completion_from_url.choices[0].message.content result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:\n", result) print("Chat completion output from image url:\n", result)
## Use local image url in the payload
# Launch the API server/engine with the --allowed-local-media-path argument.
if os.path.exists(image_file):
chat_completion_from_local_image_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": f"file://{image_file}"},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_local_image_url.choices[0].message.content
print("Chat completion output from local image file:\n", result)
else:
print(f"Local image file not found at {image_file}, skipping local file test.")
## Use base64 encoded image in the payload ## Use base64 encoded image in the payload
image_base64 = encode_base64_content_from_url(image_url) image_base64 = encode_base64_content_from_url(image_url)
chat_completion_from_base64 = client.chat.completions.create( chat_completion_from_base64 = client.chat.completions.create(
...@@ -109,6 +145,33 @@ def run_single_image(model: str, max_completion_tokens: int) -> None: ...@@ -109,6 +145,33 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
result = chat_completion_from_base64.choices[0].message.content result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded image:", result) print("Chat completion output from base64 encoded image:", result)
## Use base64 encoded local image in the payload
if os.path.exists(image_file):
local_image_base64 = encode_base64_content_from_file(image_file)
chat_completion_from_local_image_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{local_image_base64}"
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_local_image_base64.choices[0].message.content
print("Chat completion output from base64 encoded local image:", result)
else:
print(f"Local image file not found at {image_file}, skipping local file test.")
# Multi-image input inference # Multi-image input inference
def run_multi_image(model: str, max_completion_tokens: int) -> None: def run_multi_image(model: str, max_completion_tokens: int) -> None:
......
...@@ -18,6 +18,7 @@ The script performs: ...@@ -18,6 +18,7 @@ The script performs:
2. Streaming transcription using raw HTTP request to the vLLM server. 2. Streaming transcription using raw HTTP request to the vLLM server.
""" """
import argparse
import asyncio import asyncio
from openai import AsyncOpenAI, OpenAI from openai import AsyncOpenAI, OpenAI
...@@ -25,14 +26,14 @@ from openai import AsyncOpenAI, OpenAI ...@@ -25,14 +26,14 @@ from openai import AsyncOpenAI, OpenAI
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
def sync_openai(audio_path: str, client: OpenAI): def sync_openai(audio_path: str, client: OpenAI, model: str):
""" """
Perform synchronous transcription using OpenAI-compatible API. Perform synchronous transcription using OpenAI-compatible API.
""" """
with open(audio_path, "rb") as f: with open(audio_path, "rb") as f:
transcription = client.audio.transcriptions.create( transcription = client.audio.transcriptions.create(
file=f, file=f,
model="openai/whisper-large-v3", model=model,
language="en", language="en",
response_format="json", response_format="json",
temperature=0.0, temperature=0.0,
...@@ -42,18 +43,18 @@ def sync_openai(audio_path: str, client: OpenAI): ...@@ -42,18 +43,18 @@ def sync_openai(audio_path: str, client: OpenAI):
repetition_penalty=1.3, repetition_penalty=1.3,
), ),
) )
print("transcription result:", transcription.text) print("transcription result [sync]:", transcription.text)
async def stream_openai_response(audio_path: str, client: AsyncOpenAI): async def stream_openai_response(audio_path: str, client: AsyncOpenAI, model: str):
""" """
Perform asynchronous transcription using OpenAI-compatible API. Perform asynchronous transcription using OpenAI-compatible API.
""" """
print("\ntranscription result:", end=" ") print("\ntranscription result [stream]:", end=" ")
with open(audio_path, "rb") as f: with open(audio_path, "rb") as f:
transcription = await client.audio.transcriptions.create( transcription = await client.audio.transcriptions.create(
file=f, file=f,
model="openai/whisper-large-v3", model=model,
language="en", language="en",
response_format="json", response_format="json",
temperature=0.0, temperature=0.0,
...@@ -72,7 +73,47 @@ async def stream_openai_response(audio_path: str, client: AsyncOpenAI): ...@@ -72,7 +73,47 @@ async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
print() # Final newline after stream ends print() # Final newline after stream ends
def main(): def stream_api_response(audio_path: str, model: str, openai_api_base: str):
"""
Perform streaming transcription using raw HTTP requests to the vLLM API server.
"""
import json
import os
import requests
api_url = f"{openai_api_base}/audio/transcriptions"
headers = {"User-Agent": "Transcription-Client"}
with open(audio_path, "rb") as f:
files = {"file": (os.path.basename(audio_path), f)}
data = {
"stream": "true",
"model": model,
"language": "en",
"response_format": "json",
}
print("\ntranscription result [stream]:", end=" ")
response = requests.post(
api_url, headers=headers, files=files, data=data, stream=True
)
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"\n"
):
if chunk:
data = chunk[len("data: ") :]
data = json.loads(data.decode("utf-8"))
data = data["choices"][0]
delta = data["delta"]["content"]
print(delta, end="", flush=True)
finish_reason = data.get("finish_reason")
if finish_reason is not None:
print(f"\n[Stream finished reason: {finish_reason}]")
break
def main(args):
mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path()) mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
winning_call = str(AudioAsset("winning_call").get_local_path()) winning_call = str(AudioAsset("winning_call").get_local_path())
...@@ -84,14 +125,41 @@ def main(): ...@@ -84,14 +125,41 @@ def main():
base_url=openai_api_base, base_url=openai_api_base,
) )
sync_openai(mary_had_lamb, client) model = client.models.list().data[0].id
print(f"Using model: {model}")
# Run the synchronous function
sync_openai(args.audio_path if args.audio_path else mary_had_lamb, client, model)
# Run the asynchronous function # Run the asynchronous function
client = AsyncOpenAI( if "openai" in model:
api_key=openai_api_key, client = AsyncOpenAI(
base_url=openai_api_base, api_key=openai_api_key,
) base_url=openai_api_base,
asyncio.run(stream_openai_response(winning_call, client)) )
asyncio.run(
stream_openai_response(
args.audio_path if args.audio_path else winning_call, client, model
)
)
else:
stream_api_response(
args.audio_path if args.audio_path else winning_call,
model,
openai_api_base,
)
if __name__ == "__main__": if __name__ == "__main__":
main() # setup argparser
parser = argparse.ArgumentParser(
description="OpenAI Transcription Client using vLLM API Server"
)
parser.add_argument(
"--audio_path",
type=str,
default=None,
help="The path to the audio file to transcribe.",
)
args = parser.parse_args()
main(args)
...@@ -9,11 +9,11 @@ from openai import OpenAI ...@@ -9,11 +9,11 @@ from openai import OpenAI
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
def sync_openai(audio_path: str, client: OpenAI): def sync_openai(audio_path: str, client: OpenAI, model: str):
with open(audio_path, "rb") as f: with open(audio_path, "rb") as f:
translation = client.audio.translations.create( translation = client.audio.translations.create(
file=f, file=f,
model="openai/whisper-large-v3", model=model,
response_format="json", response_format="json",
temperature=0.0, temperature=0.0,
# Additional params not provided by OpenAI API. # Additional params not provided by OpenAI API.
...@@ -26,11 +26,13 @@ def sync_openai(audio_path: str, client: OpenAI): ...@@ -26,11 +26,13 @@ def sync_openai(audio_path: str, client: OpenAI):
print("translation result:", translation.text) print("translation result:", translation.text)
async def stream_openai_response(audio_path: str, base_url: str, api_key: str): async def stream_openai_response(
audio_path: str, base_url: str, api_key: str, model: str
):
data = { data = {
"language": "it", "language": "it",
"stream": True, "stream": True,
"model": "openai/whisper-large-v3", "model": model,
} }
url = base_url + "/audio/translations" url = base_url + "/audio/translations"
headers = {"Authorization": f"Bearer {api_key}"} headers = {"Authorization": f"Bearer {api_key}"}
...@@ -66,9 +68,13 @@ def main(): ...@@ -66,9 +68,13 @@ def main():
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
sync_openai(foscolo, client)
model = client.models.list().data[0].id
print(f"Using model: {model}")
sync_openai(foscolo, client, model)
# Run the asynchronous function # Run the asynchronous function
asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key)) asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key, model))
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`: ...@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`:
```json ```json
{ {
"pooling_type": "auto", "pooling_type": "auto",
"normalize": true, "use_activation": true,
"enable_chunked_processing": true, "enable_chunked_processing": true,
"max_embed_len": 3072000 "max_embed_len": 3072000
} }
......
...@@ -14,7 +14,7 @@ Prerequisites: ...@@ -14,7 +14,7 @@ Prerequisites:
# MEAN pooling (processes all chunks, recommended for complete coverage) # MEAN pooling (processes all chunks, recommended for complete coverage)
vllm serve intfloat/multilingual-e5-large \ vllm serve intfloat/multilingual-e5-large \
--pooler-config \ --pooler-config \
'{"pooling_type": "MEAN", "normalize": true, ' \ '{"pooling_type": "MEAN", "use_activation": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \ '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
--served-model-name multilingual-e5-large \ --served-model-name multilingual-e5-large \
--trust-remote-code \ --trust-remote-code \
...@@ -24,7 +24,7 @@ Prerequisites: ...@@ -24,7 +24,7 @@ Prerequisites:
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks) # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
vllm serve BAAI/bge-large-en-v1.5 \ vllm serve BAAI/bge-large-en-v1.5 \
--pooler-config \ --pooler-config \
'{"pooling_type": "CLS", "normalize": true, ' \ '{"pooling_type": "CLS", "use_activation": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \ '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
--served-model-name bge-large-en-v1.5 \ --served-model-name bge-large-en-v1.5 \
--trust-remote-code \ --trust-remote-code \
......
...@@ -96,7 +96,7 @@ echo "" ...@@ -96,7 +96,7 @@ echo ""
echo "🔧 Starting server with enhanced chunked processing configuration..." echo "🔧 Starting server with enhanced chunked processing configuration..."
# Build pooler config JSON # Build pooler config JSON
POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}" POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"use_activation\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
# Start vLLM server with enhanced chunked processing # Start vLLM server with enhanced chunked processing
vllm serve "$MODEL_NAME" \ vllm serve "$MODEL_NAME" \
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
import argparse
from dataclasses import asdict
from vllm import LLM, EngineArgs
from vllm.multimodal.utils import fetch_image
# Sample inputs shared by the example runs in this script.
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
text = "A cat standing in the snow."
# NOTE: fetch_image() is called at module import time, so the image is obtained
# once and the same object is reused for every image request below.
multi_modal_data = {"image": fetch_image(image_url)}
def print_embeddings(embeds):
    """Print a short preview of an embedding vector together with its length.

    Vectors with more than four entries are shown as their first four values
    followed by ``...``; shorter vectors are printed unchanged.
    """
    total = len(embeds)
    if total > 4:
        # Drop the closing bracket of the 4-element preview and append an ellipsis.
        head = str(embeds[:4])
        preview = head[:-1] + ", ...]"
    else:
        preview = embeds
    print(f"Embeddings: {preview} (size={total})")
def run_qwen3_vl():
    """Embed a text, an image, and an image+text prompt with Qwen3-VL-Embedding.

    The three chat-formatted prompts are assembled by hand, the engine is
    created with the pooling runner, and a trimmed view of each resulting
    embedding is printed.
    """
    default_instruction = "Represent the user's input."
    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"

    def chat_prompt(user_content):
        # System instruction, one user turn, then an opened assistant turn.
        return (
            f"<|im_start|>system\n{default_instruction}<|im_end|>\n"
            f"<|im_start|>user\n{user_content}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

    # (heading, request) pairs, issued in this order.
    requests = [
        ("Text embedding output:", chat_prompt(text)),
        (
            "Image embedding output:",
            {
                "prompt": chat_prompt(image_placeholder),
                "multi_modal_data": multi_modal_data,
            },
        ),
        (
            "Image+Text embedding output:",
            {
                "prompt": chat_prompt(f"{image_placeholder}{text}"),
                "multi_modal_data": multi_modal_data,
            },
        ),
    ]

    engine_args = EngineArgs(
        model="Qwen/Qwen3-VL-Embedding-2B",
        runner="pooling",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )
    llm = LLM(**asdict(engine_args))

    for heading, request in requests:
        print(heading)
        outputs = llm.embed(request, use_tqdm=False)
        print_embeddings(outputs[0].outputs.embedding)
# Maps each value accepted by the --model option to the function that runs
# the corresponding example.
model_example_map = {
    "qwen3_vl": run_qwen3_vl,
}
def parse_args():
    """Parse the command line for this example script.

    Returns:
        The ``argparse.Namespace`` holding ``model``, which is required and
        restricted to the keys of ``model_example_map``.
    """
    # The first positional parameter of ArgumentParser is ``prog``; the text
    # must be passed as ``description`` so the usage line keeps the real
    # program name and the text shows up as the help description.
    parser = argparse.ArgumentParser(
        description="Script to run a specified VLM through vLLM offline api."
    )
    parser.add_argument(
        "--model",
        type=str,
        choices=model_example_map.keys(),
        required=True,
        help="The name of the embedding model.",
    )
    return parser.parse_args()
def main(args):
    """Run the example function registered under ``args.model``."""
    run_example = model_example_map[args.model]
    run_example()
if __name__ == "__main__":
args = parse_args()
main(args)
...@@ -21,7 +21,8 @@ from PIL import Image ...@@ -21,7 +21,8 @@ from PIL import Image
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
text = "A cat standing in the snow."
def create_chat_embeddings( def create_chat_embeddings(
...@@ -30,6 +31,8 @@ def create_chat_embeddings( ...@@ -30,6 +31,8 @@ def create_chat_embeddings(
messages: list[ChatCompletionMessageParam], messages: list[ChatCompletionMessageParam],
model: str, model: str,
encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN, encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
continue_final_message: bool = False,
add_special_tokens: bool = False,
) -> CreateEmbeddingResponse: ) -> CreateEmbeddingResponse:
""" """
Convenience function for accessing vLLM's Chat Embeddings API, Convenience function for accessing vLLM's Chat Embeddings API,
...@@ -38,10 +41,21 @@ def create_chat_embeddings( ...@@ -38,10 +41,21 @@ def create_chat_embeddings(
return client.post( return client.post(
"/embeddings", "/embeddings",
cast_to=CreateEmbeddingResponse, cast_to=CreateEmbeddingResponse,
body={"messages": messages, "model": model, "encoding_format": encoding_format}, body={
"messages": messages,
"model": model,
"encoding_format": encoding_format,
"continue_final_message": continue_final_message,
"add_special_tokens": add_special_tokens,
},
) )
def print_embeddings(embeds):
    """Print an embedding vector in abbreviated form along with its size.

    Only the first four values are shown (followed by ``...``) when the
    vector has more than four entries; otherwise it is printed as-is.
    """
    size = len(embeds)
    shown = embeds
    if size > 4:
        # Replace the closing bracket of the 4-element slice with an ellipsis.
        shown = str(embeds[:4])[:-1] + ", ...]"
    print(f"Embeddings: {shown} (size={size})")
def run_clip(client: OpenAI, model: str): def run_clip(client: OpenAI, model: str):
""" """
Start the server using: Start the server using:
...@@ -145,6 +159,113 @@ def run_dse_qwen2_vl(client: OpenAI, model: str): ...@@ -145,6 +159,113 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
print("Text embedding output:", response.data[0].embedding) print("Text embedding output:", response.data[0].embedding)
def run_qwen3_vl(client: OpenAI, model: str):
    """
    Request text, image and image+text embeddings from the Chat Embeddings API.

    Start the server using:

    vllm serve Qwen/Qwen3-VL-Embedding-2B \
        --runner pooling \
        --max-model-len 8192
    """
    default_instruction = "Represent the user's input."

    def text_part(value):
        return {"type": "text", "text": value}

    def image_part():
        return {"type": "image_url", "image_url": {"url": image_url}}

    # (heading, user-turn content) pairs, requested in this order.
    cases = [
        ("Text embedding output:", [text_part(text)]),
        ("Image embedding output:", [image_part(), text_part("")]),
        ("Image+Text embedding output:", [image_part(), text_part(f"{text}")]),
    ]

    for heading, user_content in cases:
        print(heading)
        response = create_chat_embeddings(
            client,
            messages=[
                {"role": "system", "content": [text_part(default_instruction)]},
                {"role": "user", "content": user_content},
                # Empty assistant turn, sent with continue_final_message=True.
                {"role": "assistant", "content": [text_part("")]},
            ],
            model=model,
            encoding_format="float",
            continue_final_message=True,
            add_special_tokens=True,
        )
        print_embeddings(response.data[0].embedding)
def run_siglip(client: OpenAI, model: str): def run_siglip(client: OpenAI, model: str):
""" """
Start the server using: Start the server using:
...@@ -213,7 +334,8 @@ def run_vlm2vec(client: OpenAI, model: str): ...@@ -213,7 +334,8 @@ def run_vlm2vec(client: OpenAI, model: str):
encoding_format="float", encoding_format="float",
) )
print("Image embedding output:", response.data[0].embedding) print("Image embedding output:")
print_embeddings(response.data[0].embedding)
response = create_chat_embeddings( response = create_chat_embeddings(
client, client,
...@@ -233,7 +355,8 @@ def run_vlm2vec(client: OpenAI, model: str): ...@@ -233,7 +355,8 @@ def run_vlm2vec(client: OpenAI, model: str):
encoding_format="float", encoding_format="float",
) )
print("Image+Text embedding output:", response.data[0].embedding) print("Image+Text embedding output:")
print_embeddings(response.data[0].embedding)
response = create_chat_embeddings( response = create_chat_embeddings(
client, client,
...@@ -249,11 +372,13 @@ def run_vlm2vec(client: OpenAI, model: str): ...@@ -249,11 +372,13 @@ def run_vlm2vec(client: OpenAI, model: str):
encoding_format="float", encoding_format="float",
) )
print("Text embedding output:", response.data[0].embedding) print("Text embedding output:")
print_embeddings(response.data[0].embedding)
model_example_map = { model_example_map = {
"clip": run_clip, "clip": run_clip,
"qwen3_vl": run_qwen3_vl,
"dse_qwen2_vl": run_dse_qwen2_vl, "dse_qwen2_vl": run_dse_qwen2_vl,
"siglip": run_siglip, "siglip": run_siglip,
"vlm2vec": run_vlm2vec, "vlm2vec": run_vlm2vec,
......
...@@ -133,6 +133,36 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData: ...@@ -133,6 +133,36 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:
) )
def run_qwen3_vl(query: Query) -> ModelRequestData:
    """Build the prompt, image and engine arguments for Qwen3-VL-Embedding.

    Args:
        query: Mapping with a ``"modality"`` key of ``"text"``, ``"image"``
            or ``"text+image"``, plus the ``"text"`` and/or ``"image"``
            entries that modality requires.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments, the prompt and
        the image (``None`` for text-only queries).

    Raises:
        ValueError: If the query modality is not one of the three above.
    """
    # Qwen's vision markers are pipe-delimited special tokens; the un-piped
    # spellings "<vision_start>" / "<vision_end>" are not the same tokens.
    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"

    modality = query["modality"]
    if modality == "text":
        prompt = query["text"]
        image = None
    elif modality == "image":
        prompt = image_placeholder
        image = query["image"]
    elif modality == "text+image":
        text = query["text"]
        prompt = f"{image_placeholder}\n{text}"
        image = query["image"]
    else:
        raise ValueError(f"Unsupported query modality: '{modality}'")

    engine_args = EngineArgs(
        model="Qwen/Qwen3-VL-Embedding-2B",
        runner="pooling",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )
def run_siglip(query: Query) -> ModelRequestData: def run_siglip(query: Query) -> ModelRequestData:
if query["modality"] == "text": if query["modality"] == "text":
prompt = query["text"] prompt = query["text"]
...@@ -353,6 +383,7 @@ model_example_map = { ...@@ -353,6 +383,7 @@ model_example_map = {
"clip": run_clip, "clip": run_clip,
"e5_v": run_e5_v, "e5_v": run_e5_v,
"jinavl_reranker": run_jinavl_reranker, "jinavl_reranker": run_jinavl_reranker,
"qwen3_vl": run_qwen3_vl,
"siglip": run_siglip, "siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v, "vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl, "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
......
...@@ -2,35 +2,70 @@ ...@@ -2,35 +2,70 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501 # ruff: noqa: E501
"""
Script to convert Large Language Models (LLMs) to Sequence Classification models.
This is particularly useful for converting reranker models that use next-token
prediction to a sequence classification format for compatibility with standard
classification and rerank pipelines.
Usage examples:
- For BAAI/bge-reranker-v2-gemma:
python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma \
--classifier_from_tokens '["Yes"]' --method no_post_processing \
--path ./bge-reranker-v2-gemma-seq-cls
- For mxbai-rerank-v2:
python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 \
--classifier_from_tokens '["0", "1"]' --method from_2_way_softmax \
--path ./mxbai-rerank-base-v2-seq-cls
- For Qwen3-Reranker:
python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B \
--classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax \
--path ./Qwen3-Reranker-0.6B-seq-cls
Note: For BAAI/bge-reranker-v2-gemma, "Yes" and "yes" are different tokens.
"""
import argparse import argparse
import json import json
import torch import torch
import transformers import transformers
# Usage:
# for BAAI/bge-reranker-v2-gemma
# Caution: "Yes" and "yes" are two different tokens
# python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
# for mxbai-rerank-v2
# python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
# for Qwen3-Reranker
# python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device): def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device):
# refer to https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3 """
assert len(tokens) == 2 This method extracts the difference between weights for 'true' and 'false' tokens
from the language model head to create a single classification weight vector.
Args:
causal_lm: The original causal language model
seq_cls_model: The target sequence classification model
tokenizer: Model tokenizer
tokens: List of two tokens representing [false_token, true_token]
device: Target device (cpu/cuda)
Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
"""
assert len(tokens) == 2, (
"Method requires exactly two tokens for binary classification"
)
# Get the language model head weights (vocabulary_size x hidden_size)
lm_head_weights = causal_lm.lm_head.weight lm_head_weights = causal_lm.lm_head.weight
# Convert token strings to their corresponding token IDs
false_id = tokenizer.convert_tokens_to_ids(tokens[0]) false_id = tokenizer.convert_tokens_to_ids(tokens[0])
true_id = tokenizer.convert_tokens_to_ids(tokens[1]) true_id = tokenizer.convert_tokens_to_ids(tokens[1])
# Compute the classification weight as the difference between true and false token weights
# This follows the approach in: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
score_weight = lm_head_weights[true_id].to(device).to( score_weight = lm_head_weights[true_id].to(device).to(
torch.float32 torch.float32
) - lm_head_weights[false_id].to(device).to(torch.float32) ) - lm_head_weights[false_id].to(device).to(torch.float32)
# Copy the computed weights to the sequence classification model
with torch.no_grad(): with torch.no_grad():
seq_cls_model.score.weight.copy_(score_weight.unsqueeze(0)) seq_cls_model.score.weight.copy_(score_weight.unsqueeze(0))
if seq_cls_model.score.bias is not None: if seq_cls_model.score.bias is not None:
...@@ -38,12 +73,29 @@ def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device): ...@@ -38,12 +73,29 @@ def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device):
def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device): def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device):
"""
Directly use token weights from the language model head for classification.
This method maps each classification label directly to a corresponding token
in the vocabulary without additional transformation.
Args:
causal_lm: The original causal language model
seq_cls_model: The target sequence classification model
tokenizer: Model tokenizer
tokens: List of tokens representing class labels
device: Target device (cpu/cuda)
"""
# Get the language model head weights (vocabulary_size x hidden_size)
lm_head_weights = causal_lm.lm_head.weight lm_head_weights = causal_lm.lm_head.weight
# Convert all tokens to their corresponding token IDs
token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens] token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
# Extract weights for the specific tokens (num_tokens x hidden_size)
score_weight = lm_head_weights[token_ids].to(device) score_weight = lm_head_weights[token_ids].to(device)
# Copy the weights to the sequence classification model
with torch.no_grad(): with torch.no_grad():
seq_cls_model.score.weight.copy_(score_weight) seq_cls_model.score.weight.copy_(score_weight)
if seq_cls_model.score.bias is not None: if seq_cls_model.score.bias is not None:
...@@ -56,21 +108,35 @@ method_map = { ...@@ -56,21 +108,35 @@ method_map = {
def converting( def converting(
model_name, classifier_from_tokens, path, method, use_pad_token=False, device="cpu" model_name, classifier_from_tokens, path, method, use_sep_token=False, device="cpu"
): ):
assert method in method_map """
Main conversion function to transform a CausalLM model to SequenceClassification.
Args:
model_name: Name or path of the pretrained model
classifier_from_tokens: List of tokens used for classification
path: Output path to save the converted model
method: Conversion method ('from_2_way_softmax' or 'no_post_processing')
use_sep_token: Whether to use separating token in the sequence classification model
device: Device to load the model on ('cpu' or 'cuda')
"""
assert method in method_map, f"Unknown method: {method}"
# Determine number of labels based on conversion method
if method == "from_2_way_softmax": if method == "from_2_way_softmax":
assert len(classifier_from_tokens) == 2 assert len(classifier_from_tokens) == 2
num_labels = 1 num_labels = 1
else: else:
num_labels = len(classifier_from_tokens) num_labels = len(classifier_from_tokens)
# Load tokenizer and original causal language model
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
causal_lm = transformers.AutoModelForCausalLM.from_pretrained( causal_lm = transformers.AutoModelForCausalLM.from_pretrained(
model_name, device_map=device model_name, device_map=device
) )
# Load an empty sequence classification model with the same architecture
seq_cls_model = transformers.AutoModelForSequenceClassification.from_pretrained( seq_cls_model = transformers.AutoModelForSequenceClassification.from_pretrained(
model_name, model_name,
num_labels=num_labels, num_labels=num_labels,
...@@ -78,14 +144,17 @@ def converting( ...@@ -78,14 +144,17 @@ def converting(
device_map=device, device_map=device,
) )
# Apply the selected conversion method to transfer weights
method_map[method]( method_map[method](
causal_lm, seq_cls_model, tokenizer, classifier_from_tokens, device causal_lm, seq_cls_model, tokenizer, classifier_from_tokens, device
) )
# `llm as reranker` defaults to not using pad_token # Configure separating token settings
seq_cls_model.config.use_pad_token = use_pad_token # Note: `llm as reranker` defaults to not using separating token.
seq_cls_model.config.pad_token_id = tokenizer.pad_token_id seq_cls_model.config.use_sep_token = use_sep_token
seq_cls_model.config.sep_token_id = tokenizer.sep_token_id
# Save the converted model and tokenizer
seq_cls_model.save_pretrained(path) seq_cls_model.save_pretrained(path)
tokenizer.save_pretrained(path) tokenizer.save_pretrained(path)
...@@ -99,25 +168,30 @@ def parse_args(): ...@@ -99,25 +168,30 @@ def parse_args():
"--model_name", "--model_name",
type=str, type=str,
default="BAAI/bge-reranker-v2-gemma", default="BAAI/bge-reranker-v2-gemma",
help="Model name", help="HuggingFace model name or local path",
) )
parser.add_argument( parser.add_argument(
"--classifier_from_tokens", "--classifier_from_tokens",
type=str, type=str,
default='["Yes"]', default='["Yes"]',
help="classifier from tokens", help="JSON string of tokens used for classification labels",
) )
parser.add_argument( parser.add_argument(
"--method", type=str, default="no_post_processing", help="Converting converting" "--method",
type=str,
default="no_post_processing",
help="Conversion method to use",
) )
parser.add_argument( parser.add_argument(
"--use-pad-token", action="store_true", help="Whether to use pad_token" "--use-pad-token",
action="store_true",
help="Enable padding token in the sequence classification model",
) )
parser.add_argument( parser.add_argument(
"--path", "--path",
type=str, type=str,
default="./bge-reranker-v2-gemma-seq-cls", default="./bge-reranker-v2-gemma-seq-cls",
help="Path to save converted model", help="Output directory to save the converted model",
) )
return parser.parse_args() return parser.parse_args()
...@@ -129,6 +203,6 @@ if __name__ == "__main__": ...@@ -129,6 +203,6 @@ if __name__ == "__main__":
model_name=args.model_name, model_name=args.model_name,
classifier_from_tokens=json.loads(args.classifier_from_tokens), classifier_from_tokens=json.loads(args.classifier_from_tokens),
method=args.method, method=args.method,
use_pad_token=args.use_pad_token, use_sep_token=args.use_sep_token,
path=args.path, path=args.path,
) )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from vllm import LLM
# Hugging Face model ID of the official (unconverted) Qwen3-Reranker checkpoint.
model_name = "Qwen/Qwen3-Reranker-0.6B"
# What is the difference between the official original version and one
# that has been converted into a sequence classification model?
# Qwen3-Reranker is a language model that performs reranking by using the
# logits of the "no" and "yes" tokens.
# It needs to compute logits for all 151669 tokens, making this method extremely
# inefficient, not to mention incompatible with the vllm score API.
# A method for converting the original model into a sequence classification
# model was proposed. See: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
# Models converted offline using this method are not only more efficient
# and compatible with the vllm score API, but also need fewer init
# parameters. For example:
# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling")
# If you want to load the official original version, the init parameters are
# as follows.
def get_llm() -> LLM:
    """Build the vLLM engine for the original Qwen3-Reranker checkpoint.

    The checkpoint named by ``model_name`` is loaded with the pooling runner.
    ``hf_overrides`` supplies the target architecture, the classifier tokens
    and the original-reranker flag.
    """
    overrides = {
        "architectures": ["Qwen3ForSequenceClassification"],
        "classifier_from_token": ["no", "yes"],
        "is_original_qwen3_reranker": True,
    }
    engine = LLM(model=model_name, runner="pooling", hf_overrides=overrides)
    return engine
# Why do we need hf_overrides for the official original version:
# vllm converts it to Qwen3ForSequenceClassification when loaded for
# better performance.
# - Firstly, we need to use `"architectures": ["Qwen3ForSequenceClassification"],`
# to manually route to Qwen3ForSequenceClassification.
# - Then, we will extract the vector corresponding to classifier_from_token
# from lm_head using `"classifier_from_token": ["no", "yes"]`.
# - Third, we will convert these two vectors into one vector. This
# conversion logic is enabled by `"is_original_qwen3_reranker": True`.
# Please use the query_template and document_template to format the query and
# document for better reranker results.
# Chat preamble placed in front of every formatted query: a system turn with
# the judging instruction, followed by the opening of the user turn.
prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
# Appended after every document: closes the user turn and opens the assistant
# turn with an empty <think> block.
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
# Query-side text: prefix, then the instruction and the query.
query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
# Document-side text: the document followed by suffix.
document_template = "<Document>: {doc}{suffix}"
def main() -> None:
    """Format two query/document pairs, score them, and print the scores."""
    task = "Given a web search query, retrieve relevant passages that answer the query"
    raw_queries = [
        "What is the capital of China?",
        "Explain gravity",
    ]
    raw_documents = [
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
    ]

    # Wrap the raw texts in the module-level prompt templates.
    formatted_queries = []
    for text in raw_queries:
        formatted_queries.append(
            query_template.format(prefix=prefix, instruction=task, query=text)
        )
    formatted_documents = []
    for text in raw_documents:
        formatted_documents.append(document_template.format(doc=text, suffix=suffix))

    engine = get_llm()
    results = engine.score(formatted_queries, formatted_documents)
    scores = [result.outputs.score for result in results]

    separator = "-" * 30
    print(separator)
    print(scores)
    print(separator)
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
What is the difference between the official original version and one
that has been converted into a sequence classification model?
Qwen3-Reranker is a language model that performs reranking by using the
logits of the "no" and "yes" tokens.
This requires computing logits for all 151,669 tokens in the vocabulary,
making it inefficient and incompatible with vLLM's score() API.
A conversion method has been proposed to transform the original model into a
sequence classification model. This converted model:
1. Is significantly more efficient
2. Fully supports vLLM's score() API
3. Simplifies initialization parameters
Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
For the converted model, initialization would simply be:
llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling")
This example demonstrates loading the ORIGINAL model with special overrides
to make it compatible with vLLM's score API.
"""
from pathlib import Path
from vllm import LLM
# Hugging Face model ID of the original (unconverted) Qwen3-Reranker checkpoint.
model_name = "Qwen/Qwen3-Reranker-0.6B"
def get_llm() -> LLM:
    """
    Create the vLLM engine for the original Qwen3-Reranker checkpoint.

    Returns:
        LLM: Engine built from ``model_name`` with the pooling runner and the
        Hugging Face config overrides listed below.
    """
    # Config overrides required to load the original checkpoint for scoring.
    reranker_overrides = {
        # Route loading to the sequence classification architecture instead
        # of the default Qwen3ForCausalLM.
        "architectures": ["Qwen3ForSequenceClassification"],
        # Tokens whose language-model-head logits the original reranker
        # uses for scoring.
        "classifier_from_token": ["no", "yes"],
        # Triggers the conversion logic that turns the two token vectors
        # into a single classification vector.
        "is_original_qwen3_reranker": True,
    }
    return LLM(
        model=model_name,
        runner="pooling",
        hf_overrides=reranker_overrides,
    )
def main() -> None:
    """Score two sample query/document pairs and print the relevance scores."""
    # The Jinja chat template lives in the "template" folder beside this file.
    template_file = Path(__file__).parent / "template" / "qwen3_reranker.jinja"
    chat_template = template_file.read_text()

    # Sample inputs: one document per query.
    queries = [
        "What is the capital of China?",
        "Explain gravity",
    ]
    documents = [
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
    ]

    # Build the engine, then score the pairs using the chat template.
    engine = get_llm()
    results = engine.score(queries, documents, chat_template=chat_template)
    scores = [result.outputs.score for result in results]

    rule = "-" * 30
    print(rule)
    print("Relevance scores:", scores)
    print(rule)
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
What is the difference between the official original version and one
that has been converted into a sequence classification model?
Qwen3-Reranker is a language model that performs reranking by using the
logits of the "no" and "yes" tokens.
This requires computing logits for all 151,669 tokens in the vocabulary,
making it inefficient and incompatible with vLLM's score() API.
A conversion method has been proposed to transform the original model into a
sequence classification model. This converted model:
1. Is significantly more efficient
2. Fully supports vLLM's score() API
3. Simplifies initialization parameters
Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
For the converted model, initialization would simply be:
vllm serve tomaarsen/Qwen3-Reranker-0.6B-seq-cls --runner pooling --chat-template examples/pooling/score/template/qwen3_reranker.jinja
This example demonstrates loading the ORIGINAL model with special overrides
to make it compatible with vLLM's score API.
vllm serve Qwen/Qwen3-Reranker-0.6B --runner pooling --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' --chat-template examples/pooling/score/template/qwen3_reranker.jinja
"""
import json
import requests
# URL of the vLLM server's score endpoint
# (this example targets a server on localhost, port 8000)
url = "http://127.0.0.1:8000/score"
# HTTP headers for the request: JSON is both sent and accepted
headers = {"accept": "application/json", "Content-Type": "application/json"}
# Example queries & documents
queries = [
    "What is the capital of China?",
    "Explain gravity",
]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]
# Request payload for the score API:
# "model" names the model, "text_1" carries the queries and "text_2" the documents
data = {
    "model": "Qwen/Qwen3-Reranker-0.6B",
    "text_1": queries,
    "text_2": documents,
}
def main():
    """Send one score request to the vLLM server and print the outcome.

    POSTs the module-level ``data`` payload to ``url`` with ``headers``. On a
    200 response the JSON body is pretty-printed; for any other status the
    status code and the raw response text are printed.

    Raises:
        requests.exceptions.Timeout: If the server does not connect or
            respond within the timeout.
    """
    # Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server and the script would hang.
    timeout_seconds = 60
    response = requests.post(
        url, headers=headers, json=data, timeout=timeout_seconds
    )

    if response.status_code == 200:
        print("Request successful!")
        # Pretty print the JSON response containing the relevance scores
        print(json.dumps(response.json(), indent=2))
    else:
        # Report the failure together with the server's raw reply
        print(f"Request failed with status code: {response.status_code}")
        print(response.text)
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()
A: {{ (messages | selectattr("role", "eq", "query") | first).content }}
B: {{ (messages | selectattr("role", "eq", "document") | first).content }}
Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'.
\ No newline at end of file
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
query: {{ (messages | selectattr("role", "eq", "query") | first).content }}
document: {{ (messages | selectattr("role", "eq", "document") | first).content }}
You are a search relevance expert who evaluates how well documents match search queries. For each query-document pair, carefully analyze the semantic relationship between them, then provide your binary relevance judgment (0 for not relevant, 1 for relevant).
Relevance:<|im_end|>
<|im_start|>assistant
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment