Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
+# SPDX-License-Identifier: Apache-2.0
 """
 experimental support for tensor-parallel inference with torchrun,
 see https://github.com/vllm-project/vllm/issues/11400 for

--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import LLM, SamplingParams

 prompts = [

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use vLLM for running offline inference with
 the correct prompt format on vision language models for text generation.
@@ -530,18 +531,33 @@ def run_qwen2_vl(question: str, modality: str):
    return llm, prompt, stop_token_ids


-# GLM-4v
-def run_glm4v(question: str, modality: str):
-    assert modality == "image"
-    model_name = "THUDM/glm-4v-9b"
+# Qwen2.5-VL
+def run_qwen2_5_vl(question: str, modality: str):

-    llm = LLM(model=model_name,
-              max_model_len=2048,
-              max_num_seqs=2,
-              trust_remote_code=True,
-              enforce_eager=True)
-    prompt = question
-    stop_token_ids = [151329, 151336, 151338]
+    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
+
+    llm = LLM(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": 1,
+        },
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+              f"{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
    return llm, prompt, stop_token_ids


@@ -571,6 +587,7 @@ model_example_map = {
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
+    "qwen2_5_vl": run_qwen2_5_vl,
 }



--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use vLLM for running offline inference with
 the correct prompt format on vision language models for multimodal embedding.

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use vLLM for running offline inference with
 multi-image input on vision language models for text generation,
@@ -391,6 +392,63 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
    )


+def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
+    try:
+        from qwen_vl_utils import process_vision_info
+    except ModuleNotFoundError:
+        print('WARNING: `qwen-vl-utils` not installed, input images will not '
+              'be automatically resized. You can enable this functionality by '
+              '`pip install qwen-vl-utils`.')
+        process_vision_info = None
+
+    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
+
+    llm = LLM(
+        model=model_name,
+        max_model_len=32768 if process_vision_info is None else 4096,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role":
+        "user",
+        "content": [
+            *placeholders,
+            {
+                "type": "text",
+                "text": question
+            },
+        ],
+    }]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    stop_token_ids = None
+
+    if process_vision_info is None:
+        image_data = [fetch_image(url) for url in image_urls]
+    else:
+        image_data, _ = process_vision_info(messages,
+                                            return_video_sample_fps=False)
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=image_data,
+        chat_template=None,
+    )
+
+
 model_example_map = {
    "aria": load_aria,
    "deepseek_vl_v2": load_deepseek_vl2,
@@ -403,6 +461,7 @@ model_example_map = {
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
+    "qwen2_5_vl": load_qwen2_5_vl,
 }



--- a/examples/offline_inference/whisper.py
+++ b/examples/offline_inference/whisper.py
+# SPDX-License-Identifier: Apache-2.0
+
 import time

 from vllm import LLM, SamplingParams

--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
+# SPDX-License-Identifier: Apache-2.0
 """Example Python client for `vllm.entrypoints.api_server`
 NOTE: The API server is used only for demonstration and simple performance
 benchmarks. It is not intended for production use.

--- a/examples/online_serving/cohere_rerank_client.py
+++ b/examples/online_serving/cohere_rerank_client.py
+# SPDX-License-Identifier: Apache-2.0
 """
 Example of using the OpenAI entrypoint's rerank API which is compatible with
 the Cohere SDK: https://github.com/cohere-ai/cohere-python

--- a/examples/online_serving/gradio_openai_chatbot_webserver.py
+++ b/examples/online_serving/gradio_openai_chatbot_webserver.py
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse

 import gradio as gr

--- a/examples/online_serving/gradio_webserver.py
+++ b/examples/online_serving/gradio_webserver.py
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import json


--- a/examples/online_serving/jinaai_rerank_client.py
+++ b/examples/online_serving/jinaai_rerank_client.py
+# SPDX-License-Identifier: Apache-2.0
 """
 Example of using the OpenAI entrypoint's rerank API which is compatible with
 Jina and Cohere https://jina.ai/reranker

--- a/examples/online_serving/openai_chat_completion_client.py
+++ b/examples/online_serving/openai_chat_completion_client.py
+# SPDX-License-Identifier: Apache-2.0
+
 from openai import OpenAI

 # Modify OpenAI's API key and API base to use vLLM's API server.

--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+# SPDX-License-Identifier: Apache-2.0
 """An example showing how to use vLLM to serve multimodal models 
 and run online serving with OpenAI client.


--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
+# SPDX-License-Identifier: Apache-2.0
 """
 Set up this example by starting a vLLM OpenAI-compatible server with tool call
 options enabled. For example:

--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
+# SPDX-License-Identifier: Apache-2.0
+
 from enum import Enum

 from openai import OpenAI

--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
+# SPDX-License-Identifier: Apache-2.0
 """
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.

--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+# SPDX-License-Identifier: Apache-2.0
 """
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.

--- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import base64
 import io

--- a/examples/online_serving/openai_completion_client.py
+++ b/examples/online_serving/openai_completion_client.py
+# SPDX-License-Identifier: Apache-2.0
+
 from openai import OpenAI

 # Modify OpenAI's API key and API base to use vLLM's API server.

--- a/examples/online_serving/openai_cross_encoder_score.py
+++ b/examples/online_serving/openai_cross_encoder_score.py
+# SPDX-License-Identifier: Apache-2.0
 """
 Example online usage of Score API.