Commit cc7f22a8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.1' into v0.9.1-ori

parents b9ea0c09 b6553be1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Saves each worker's model state dict directly to a checkpoint, which enables a
fast load path for large tensor-parallel models where each worker only needs to
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import time
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of guided decoding
to generate structured outputs using vLLM. It shows how to apply
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
experimental support for tensor-parallel inference with torchrun,
see https://github.com/vllm-project/vllm/issues/11400 for
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os
from vllm import LLM, SamplingParams
......@@ -18,14 +22,28 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
def main():
parser = argparse.ArgumentParser(description="TPU offline inference example")
parser.add_argument("--use-spmd", action="store_true", help="Enable SPMD mode")
args = parser.parse_args()
llm_args = {
"model": "Qwen/Qwen2-1.5B-Instruct",
"max_num_batched_tokens": 64,
"max_num_seqs": 4,
"max_model_len": 128,
}
if args.use_spmd:
os.environ["VLLM_XLA_USE_SPMD"] = "1"
# Can only hardcode the number of chips for now.
# calling xr.global_runtime_device_count() beforeing init SPMD env in
# torch_xla will mess up the distributed env.
llm_args["tensor_parallel_size"] = 8
# Use Llama, for num_kv_heads = 8.
llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct"
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforace_eager` should be `False`.
llm = LLM(
model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=64,
max_num_seqs=4,
max_model_len=128,
)
llm = LLM(**llm_args)
outputs = llm.generate(prompts, sampling_params)
print("-" * 50)
for output, answer in zip(outputs, answers):
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.
......@@ -333,6 +334,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
)
# omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "omni-research/Tarsier-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={modality: 1},
)
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "OpenGVLab/InternVL3-2B"
......@@ -1091,6 +1111,7 @@ model_example_map = {
"qwen2_5_omni": run_qwen2_5_omni,
"skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
"tarsier": run_tarsier,
}
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding.
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
......@@ -592,21 +593,21 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
try:
from qwen_vl_utils import process_vision_info
from qwen_vl_utils import smart_resize
except ModuleNotFoundError:
print(
"WARNING: `qwen-vl-utils` not installed, input images will not "
"be automatically resized. You can enable this functionality by "
"`pip install qwen-vl-utils`."
)
process_vision_info = None
smart_resize = None
model_name = "Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40
engine_args = EngineArgs(
model=model_name,
max_model_len=32768 if process_vision_info is None else 4096,
max_model_len=32768 if smart_resize is None else 4096,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
......@@ -629,10 +630,18 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
messages, tokenize=False, add_generation_prompt=True
)
if process_vision_info is None:
if smart_resize is None:
image_data = [fetch_image(url) for url in image_urls]
else:
image_data, _ = process_vision_info(messages)
def post_process_image(image: Image) -> Image:
width, height = image.size
resized_height, resized_width = smart_resize(
height, width, max_pixels=1024 * 28 * 28
)
return image.resize((resized_width, resized_height))
image_data = [post_process_image(fetch_image(url)) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
......@@ -643,20 +652,20 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
try:
from qwen_vl_utils import process_vision_info
from qwen_vl_utils import smart_resize
except ModuleNotFoundError:
print(
"WARNING: `qwen-vl-utils` not installed, input images will not "
"be automatically resized. You can enable this functionality by "
"`pip install qwen-vl-utils`."
)
process_vision_info = None
smart_resize = None
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
engine_args = EngineArgs(
model=model_name,
max_model_len=32768 if process_vision_info is None else 4096,
max_model_len=32768 if smart_resize is None else 4096,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
......@@ -679,10 +688,38 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
messages, tokenize=False, add_generation_prompt=True
)
if process_vision_info is None:
if smart_resize is None:
image_data = [fetch_image(url) for url in image_urls]
else:
image_data, _ = process_vision_info(messages, return_video_kwargs=False)
def post_process_image(image: Image) -> Image:
width, height = image.size
resized_height, resized_width = smart_resize(
height, width, max_pixels=1024 * 28 * 28
)
return image.resize((resized_width, resized_height))
image_data = [post_process_image(fetch_image(url)) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "omni-research/Tarsier-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
)
prompt = f"USER: {'<image>' * len(image_urls)}\n{question}\n ASSISTANT:"
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
......@@ -712,6 +749,7 @@ model_example_map = {
"qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm,
"tarsier": load_tarsier,
}
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for `vllm.entrypoints.api_server`
Start the demo server:
python -m vllm.entrypoints.api_server --model <model_name>
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling.
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio Webserver
Start vLLM API server:
python -m vllm.entrypoints.api_server \
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
Jina and Cohere https://jina.ai/reranker
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Optional, Union
import msgspec
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
from typing import Optional
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
"""
To run this example, run the following commands simultaneously with
different CUDA_VISIBLE_DEVICES:
python examples/online_serving/multi_instance_data_parallel.py
vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
--data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
--data-parallel-size-local 1 --enforce-eager --headless
Once both instances have completed the handshake, this example will
send a request to the instance with DP rank 1.
"""
async def main():
engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
data_parallel_size=2,
dtype="auto",
max_model_len=2048,
data_parallel_address="127.0.0.1",
data_parallel_rpc_port=62300,
data_parallel_size_local=1,
enforce_eager=True,
)
engine_client = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(
temperature=0.7,
top_p=0.9,
max_tokens=100,
)
prompt = "Who won the 2004 World Series?"
final_output: Optional[RequestOutput] = None
async for output in engine_client.generate(
prompt=prompt,
sampling_params=sampling_params,
request_id="abcdef",
data_parallel_rank=1,
):
final_output = output
if final_output:
print(final_output.outputs[0].text)
if __name__ == "__main__":
asyncio.run(main())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example:
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
To run this example, you can start the vLLM server
without any specific flags:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment