Commit 66b809cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
"""
experimental support for tensor-parallel inference with torchrun,
see https://github.com/vllm-project/vllm/issues/11400 for
......
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
prompts = [
......
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.
......@@ -530,18 +531,33 @@ def run_qwen2_vl(question: str, modality: str):
return llm, prompt, stop_token_ids
# GLM-4v
def run_glm4v(question: str, modality: str):
assert modality == "image"
model_name = "THUDM/glm-4v-9b"
# Qwen2.5-VL
def run_qwen2_5_vl(question: str, modality: str):
llm = LLM(model=model_name,
max_model_len=2048,
max_num_seqs=2,
trust_remote_code=True,
enforce_eager=True)
prompt = question
stop_token_ids = [151329, 151336, 151338]
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
llm = LLM(
model=model_name,
max_model_len=4096,
max_num_seqs=5,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n")
stop_token_ids = None
return llm, prompt, stop_token_ids
......@@ -571,6 +587,7 @@ model_example_map = {
"pixtral_hf": run_pixtral_hf,
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl,
}
......
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding.
......
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
......@@ -391,6 +392,63 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
)
def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
try:
from qwen_vl_utils import process_vision_info
except ModuleNotFoundError:
print('WARNING: `qwen-vl-utils` not installed, input images will not '
'be automatically resized. You can enable this functionality by '
'`pip install qwen-vl-utils`.')
process_vision_info = None
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
llm = LLM(
model=model_name,
max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls]
else:
image_data, _ = process_vision_info(messages,
return_video_sample_fps=False)
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data,
chat_template=None,
)
model_example_map = {
"aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2,
......@@ -403,6 +461,7 @@ model_example_map = {
"pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl,
}
......
# SPDX-License-Identifier: Apache-2.0
import time
from vllm import LLM, SamplingParams
......
# SPDX-License-Identifier: Apache-2.0
"""Example Python client for `vllm.entrypoints.api_server`
NOTE: The API server is used only for demonstration and simple performance
benchmarks. It is not intended for production use.
......
# SPDX-License-Identifier: Apache-2.0
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python
......
# SPDX-License-Identifier: Apache-2.0
import argparse
import gradio as gr
......
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
......
# SPDX-License-Identifier: Apache-2.0
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
Jina and Cohere https://jina.ai/reranker
......
# SPDX-License-Identifier: Apache-2.0
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
......
# SPDX-License-Identifier: Apache-2.0
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.
......
# SPDX-License-Identifier: Apache-2.0
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example:
......
# SPDX-License-Identifier: Apache-2.0
from enum import Enum
from openai import OpenAI
......
# SPDX-License-Identifier: Apache-2.0
"""
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.
......
# SPDX-License-Identifier: Apache-2.0
"""
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.
......
# SPDX-License-Identifier: Apache-2.0
import argparse
import base64
import io
......
# SPDX-License-Identifier: Apache-2.0
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
......
# SPDX-License-Identifier: Apache-2.0
"""
Example online usage of Score API.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment