Commit cc7f22a8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.1' into v0.9.1-ori

parents b9ea0c09 b6553be1
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Saves each worker's model state dict directly to a checkpoint, which enables a Saves each worker's model state dict directly to a checkpoint, which enables a
fast load path for large tensor-parallel models where each worker only needs to fast load path for large tensor-parallel models where each worker only needs to
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import time import time
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This file demonstrates the example usage of guided decoding This file demonstrates the example usage of guided decoding
to generate structured outputs using vLLM. It shows how to apply to generate structured outputs using vLLM. It shows how to apply
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
experimental support for tensor-parallel inference with torchrun, experimental support for tensor-parallel inference with torchrun,
see https://github.com/vllm-project/vllm/issues/11400 for see https://github.com/vllm-project/vllm/issues/11400 for
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
...@@ -18,14 +22,28 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) ...@@ -18,14 +22,28 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
def main(): def main():
parser = argparse.ArgumentParser(description="TPU offline inference example")
parser.add_argument("--use-spmd", action="store_true", help="Enable SPMD mode")
args = parser.parse_args()
llm_args = {
"model": "Qwen/Qwen2-1.5B-Instruct",
"max_num_batched_tokens": 64,
"max_num_seqs": 4,
"max_model_len": 128,
}
if args.use_spmd:
os.environ["VLLM_XLA_USE_SPMD"] = "1"
# Can only hardcode the number of chips for now.
# calling xr.global_runtime_device_count() beforeing init SPMD env in
# torch_xla will mess up the distributed env.
llm_args["tensor_parallel_size"] = 8
# Use Llama, for num_kv_heads = 8.
llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct"
# Set `enforce_eager=True` to avoid ahead-of-time compilation. # Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforace_eager` should be `False`. # In real workloads, `enforace_eager` should be `False`.
llm = LLM( llm = LLM(**llm_args)
model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=64,
max_num_seqs=4,
max_model_len=128,
)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
print("-" * 50) print("-" * 50)
for output, answer in zip(outputs, answers): for output, answer in zip(outputs, answers):
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This example shows how to use vLLM for running offline inference with This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation. the correct prompt format on vision language models for text generation.
...@@ -333,6 +334,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: ...@@ -333,6 +334,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
) )
# omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "omni-research/Tarsier-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={modality: 1},
)
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# InternVL # InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData: def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "OpenGVLab/InternVL3-2B" model_name = "OpenGVLab/InternVL3-2B"
...@@ -1091,6 +1111,7 @@ model_example_map = { ...@@ -1091,6 +1111,7 @@ model_example_map = {
"qwen2_5_omni": run_qwen2_5_omni, "qwen2_5_omni": run_qwen2_5_omni,
"skywork_chat": run_skyworkr1v, "skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm, "smolvlm": run_smolvlm,
"tarsier": run_tarsier,
} }
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This example shows how to use vLLM for running offline inference with This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding. the correct prompt format on vision language models for multimodal embedding.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This example shows how to use vLLM for running offline inference with This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation, multi-image input on vision language models for text generation,
...@@ -592,21 +593,21 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -592,21 +593,21 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import smart_resize
except ModuleNotFoundError: except ModuleNotFoundError:
print( print(
"WARNING: `qwen-vl-utils` not installed, input images will not " "WARNING: `qwen-vl-utils` not installed, input images will not "
"be automatically resized. You can enable this functionality by " "be automatically resized. You can enable this functionality by "
"`pip install qwen-vl-utils`." "`pip install qwen-vl-utils`."
) )
process_vision_info = None smart_resize = None
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40 # Tested on L40
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if smart_resize is None else 4096,
max_num_seqs=5, max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
...@@ -629,10 +630,18 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -629,10 +630,18 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
messages, tokenize=False, add_generation_prompt=True messages, tokenize=False, add_generation_prompt=True
) )
if process_vision_info is None: if smart_resize is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
image_data, _ = process_vision_info(messages)
def post_process_image(image: Image) -> Image:
width, height = image.size
resized_height, resized_width = smart_resize(
height, width, max_pixels=1024 * 28 * 28
)
return image.resize((resized_width, resized_height))
image_data = [post_process_image(fetch_image(url)) for url in image_urls]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
...@@ -643,20 +652,20 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -643,20 +652,20 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import smart_resize
except ModuleNotFoundError: except ModuleNotFoundError:
print( print(
"WARNING: `qwen-vl-utils` not installed, input images will not " "WARNING: `qwen-vl-utils` not installed, input images will not "
"be automatically resized. You can enable this functionality by " "be automatically resized. You can enable this functionality by "
"`pip install qwen-vl-utils`." "`pip install qwen-vl-utils`."
) )
process_vision_info = None smart_resize = None
model_name = "Qwen/Qwen2.5-VL-3B-Instruct" model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if smart_resize is None else 4096,
max_num_seqs=5, max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
...@@ -679,10 +688,38 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -679,10 +688,38 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
messages, tokenize=False, add_generation_prompt=True messages, tokenize=False, add_generation_prompt=True
) )
if process_vision_info is None: if smart_resize is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
image_data, _ = process_vision_info(messages, return_video_kwargs=False)
def post_process_image(image: Image) -> Image:
width, height = image.size
resized_height, resized_width = smart_resize(
height, width, max_pixels=1024 * 28 * 28
)
return image.resize((resized_width, resized_height))
image_data = [post_process_image(fetch_image(url)) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "omni-research/Tarsier-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
)
prompt = f"USER: {'<image>' * len(image_urls)}\n{question}\n ASSISTANT:"
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
...@@ -712,6 +749,7 @@ model_example_map = { ...@@ -712,6 +749,7 @@ model_example_map = {
"qwen2_vl": load_qwen2_vl, "qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl, "qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm, "smolvlm": load_smolvlm,
"tarsier": load_tarsier,
} }
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for `vllm.entrypoints.api_server` """Example Python client for `vllm.entrypoints.api_server`
Start the demo server: Start the demo server:
python -m vllm.entrypoints.api_server --model <model_name> python -m vllm.entrypoints.api_server --model <model_name>
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Example of using the OpenAI entrypoint's rerank API which is compatible with Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python the Cohere SDK: https://github.com/cohere-ai/cohere-python
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This file provides a disaggregated prefilling proxy demo to demonstrate an This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling. example usage of XpYd disaggregated prefilling.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio OpenAI Chatbot Webserver """Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server: Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf vllm serve meta-llama/Llama-2-7b-chat-hf
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio Webserver """Example for starting a Gradio Webserver
Start vLLM API server: Start vLLM API server:
python -m vllm.entrypoints.api_server \ python -m vllm.entrypoints.api_server \
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Example of using the OpenAI entrypoint's rerank API which is compatible with Example of using the OpenAI entrypoint's rerank API which is compatible with
Jina and Cohere https://jina.ai/reranker Jina and Cohere https://jina.ai/reranker
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Optional, Union from typing import Any, Optional, Union
import msgspec import msgspec
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
from typing import Optional
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
"""
To run this example, run the following commands simultaneously with
different CUDA_VISIBLE_DEVICES:
python examples/online_serving/multi_instance_data_parallel.py
vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
--data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
--data-parallel-size-local 1 --enforce-eager --headless
Once both instances have completed the handshake, this example will
send a request to the instance with DP rank 1.
"""
async def main():
engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
data_parallel_size=2,
dtype="auto",
max_model_len=2048,
data_parallel_address="127.0.0.1",
data_parallel_rpc_port=62300,
data_parallel_size_local=1,
enforce_eager=True,
)
engine_client = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(
temperature=0.7,
top_p=0.9,
max_tokens=100,
)
prompt = "Who won the 2004 World Series?"
final_output: Optional[RequestOutput] = None
async for output in engine_client.generate(
prompt=prompt,
sampling_params=sampling_params,
request_id="abcdef",
data_parallel_rank=1,
):
final_output = output
if final_output:
print(final_output.outputs[0].text)
if __name__ == "__main__":
asyncio.run(main())
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for OpenAI Chat Completion using vLLM API server """Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g. NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf vllm serve meta-llama/Llama-2-7b-chat-hf
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""An example showing how to use vLLM to serve multimodal models """An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client. and run online serving with OpenAI client.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Set up this example by starting a vLLM OpenAI-compatible server with tool call Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example: options enabled. For example:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
To run this example, you can start the vLLM server To run this example, you can start the vLLM server
without any specific flags: without any specific flags:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment