Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
167 additions
and
16 deletions
+167
-16
examples/offline_inference/save_sharded_state.py
examples/offline_inference/save_sharded_state.py
+1
-0
examples/offline_inference/simple_profiling.py
examples/offline_inference/simple_profiling.py
+1
-0
examples/offline_inference/structured_outputs.py
examples/offline_inference/structured_outputs.py
+1
-0
examples/offline_inference/torchrun_example.py
examples/offline_inference/torchrun_example.py
+1
-0
examples/offline_inference/tpu.py
examples/offline_inference/tpu.py
+24
-6
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+21
-0
examples/offline_inference/vision_language_embedding.py
examples/offline_inference/vision_language_embedding.py
+1
-0
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+48
-10
examples/online_serving/api_client.py
examples/online_serving/api_client.py
+1
-0
examples/online_serving/cohere_rerank_client.py
examples/online_serving/cohere_rerank_client.py
+1
-0
examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
...online_serving/disaggregated_serving/disagg_proxy_demo.py
+1
-0
examples/online_serving/gradio_openai_chatbot_webserver.py
examples/online_serving/gradio_openai_chatbot_webserver.py
+1
-0
examples/online_serving/gradio_webserver.py
examples/online_serving/gradio_webserver.py
+1
-0
examples/online_serving/jinaai_rerank_client.py
examples/online_serving/jinaai_rerank_client.py
+1
-0
examples/online_serving/kv_events_subscriber.py
examples/online_serving/kv_events_subscriber.py
+1
-0
examples/online_serving/multi_instance_data_parallel.py
examples/online_serving/multi_instance_data_parallel.py
+58
-0
examples/online_serving/openai_chat_completion_client.py
examples/online_serving/openai_chat_completion_client.py
+1
-0
examples/online_serving/openai_chat_completion_client_for_multimodal.py
...e_serving/openai_chat_completion_client_for_multimodal.py
+1
-0
examples/online_serving/openai_chat_completion_client_with_tools.py
...nline_serving/openai_chat_completion_client_with_tools.py
+1
-0
examples/online_serving/openai_chat_completion_client_with_tools_required.py
...ving/openai_chat_completion_client_with_tools_required.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
examples/offline_inference/save_sharded_state.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Saves each worker's model state dict directly to a checkpoint, which enables a
fast load path for large tensor-parallel models where each worker only needs to
...
...
examples/offline_inference/simple_profiling.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
time
...
...
examples/offline_inference/structured_outputs.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of guided decoding
to generate structured outputs using vLLM. It shows how to apply
...
...
examples/offline_inference/torchrun_example.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
experimental support for tensor-parallel inference with torchrun,
see https://github.com/vllm-project/vllm/issues/11400 for
...
...
examples/offline_inference/tpu.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
os
from
vllm
import
LLM
,
SamplingParams
...
...
@@ -18,14 +22,28 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
def
main
():
parser
=
argparse
.
ArgumentParser
(
description
=
"TPU offline inference example"
)
parser
.
add_argument
(
"--use-spmd"
,
action
=
"store_true"
,
help
=
"Enable SPMD mode"
)
args
=
parser
.
parse_args
()
llm_args
=
{
"model"
:
"Qwen/Qwen2-1.5B-Instruct"
,
"max_num_batched_tokens"
:
64
,
"max_num_seqs"
:
4
,
"max_model_len"
:
128
,
}
if
args
.
use_spmd
:
os
.
environ
[
"VLLM_XLA_USE_SPMD"
]
=
"1"
# Can only hardcode the number of chips for now.
# calling xr.global_runtime_device_count() before initializing the SPMD env in
# torch_xla will mess up the distributed env.
llm_args
[
"tensor_parallel_size"
]
=
8
# Use Llama, for num_kv_heads = 8.
llm_args
[
"model"
]
=
"meta-llama/Llama-3.1-8B-Instruct"
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
4
,
max_model_len
=
128
,
)
llm
=
LLM
(
**
llm_args
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
print
(
"-"
*
50
)
for
output
,
answer
in
zip
(
outputs
,
answers
):
...
...
examples/offline_inference/vision_language.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.
...
...
@@ -333,6 +334,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
)
# omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
    """Assemble engine arguments and prompts for omni-research/Tarsier-7b.

    Args:
        questions: Questions to ask; each one becomes its own prompt.
        modality: Must be "image" (asserted). It is also used as the key
            of ``limit_mm_per_prompt``, with a limit of 1.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and one
        USER/ASSISTANT-formatted prompt per question.
    """
    assert modality == "image"

    tarsier_args = EngineArgs(
        model="omni-research/Tarsier-7b",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    # Each prompt carries a single <image> placeholder ahead of the question.
    formatted = []
    for question in questions:
        formatted.append(f"USER: <image>\n{question} ASSISTANT:")

    return ModelRequestData(engine_args=tarsier_args, prompts=formatted)
# InternVL
def
run_internvl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"OpenGVLab/InternVL3-2B"
...
...
@@ -1091,6 +1111,7 @@ model_example_map = {
"qwen2_5_omni"
:
run_qwen2_5_omni
,
"skywork_chat"
:
run_skyworkr1v
,
"smolvlm"
:
run_smolvlm
,
"tarsier"
:
run_tarsier
,
}
...
...
examples/offline_inference/vision_language_embedding.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding.
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
...
...
@@ -592,21 +593,21 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
def
load_qwen2_vl
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
try
:
from
qwen_vl_utils
import
process_vision_info
from
qwen_vl_utils
import
smart_resize
except
ModuleNotFoundError
:
print
(
"WARNING: `qwen-vl-utils` not installed, input images will not "
"be automatically resized. You can enable this functionality by "
"`pip install qwen-vl-utils`."
)
process_vision_info
=
None
smart_resize
=
None
model_name
=
"Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
32768
if
process_vision_info
is
None
else
4096
,
max_model_len
=
32768
if
smart_resize
is
None
else
4096
,
max_num_seqs
=
5
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
...
...
@@ -629,10 +630,18 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
if
process_vision_info
is
None
:
if
smart_resize
is
None
:
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
]
else
:
image_data
,
_
=
process_vision_info
(
messages
)
def
post_process_image
(
image
:
Image
)
->
Image
:
width
,
height
=
image
.
size
resized_height
,
resized_width
=
smart_resize
(
height
,
width
,
max_pixels
=
1024
*
28
*
28
)
return
image
.
resize
((
resized_width
,
resized_height
))
image_data
=
[
post_process_image
(
fetch_image
(
url
))
for
url
in
image_urls
]
return
ModelRequestData
(
engine_args
=
engine_args
,
...
...
@@ -643,20 +652,20 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
def
load_qwen2_5_vl
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
try
:
from
qwen_vl_utils
import
process_vision_info
from
qwen_vl_utils
import
smart_resize
except
ModuleNotFoundError
:
print
(
"WARNING: `qwen-vl-utils` not installed, input images will not "
"be automatically resized. You can enable this functionality by "
"`pip install qwen-vl-utils`."
)
process_vision_info
=
None
smart_resize
=
None
model_name
=
"Qwen/Qwen2.5-VL-3B-Instruct"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
32768
if
process_vision_info
is
None
else
4096
,
max_model_len
=
32768
if
smart_resize
is
None
else
4096
,
max_num_seqs
=
5
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
...
...
@@ -679,10 +688,38 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
if
process_vision_info
is
None
:
if
smart_resize
is
None
:
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
]
else
:
image_data
,
_
=
process_vision_info
(
messages
,
return_video_kwargs
=
False
)
def
post_process_image
(
image
:
Image
)
->
Image
:
width
,
height
=
image
.
size
resized_height
,
resized_width
=
smart_resize
(
height
,
width
,
max_pixels
=
1024
*
28
*
28
)
return
image
.
resize
((
resized_width
,
resized_height
))
image_data
=
[
post_process_image
(
fetch_image
(
url
))
for
url
in
image_urls
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompt
=
prompt
,
image_data
=
image_data
,
)
def
load_tarsier
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"omni-research/Tarsier-7b"
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
prompt
=
f
"USER:
{
'<image>'
*
len
(
image_urls
)
}
\n
{
question
}
\n
ASSISTANT:"
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
]
return
ModelRequestData
(
engine_args
=
engine_args
,
...
...
@@ -712,6 +749,7 @@ model_example_map = {
"qwen2_vl"
:
load_qwen2_vl
,
"qwen2_5_vl"
:
load_qwen2_5_vl
,
"smolvlm"
:
load_smolvlm
,
"tarsier"
:
load_tarsier
,
}
...
...
examples/online_serving/api_client.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for `vllm.entrypoints.api_server`
Start the demo server:
python -m vllm.entrypoints.api_server --model <model_name>
...
...
examples/online_serving/cohere_rerank_client.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python
...
...
examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling.
...
...
examples/online_serving/gradio_openai_chatbot_webserver.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
...
...
examples/online_serving/gradio_webserver.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio Webserver
Start vLLM API server:
python -m vllm.entrypoints.api_server
\
...
...
examples/online_serving/jinaai_rerank_client.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
Jina and Cohere https://jina.ai/reranker
...
...
examples/online_serving/kv_events_subscriber.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
,
Optional
,
Union
import
msgspec
...
...
examples/online_serving/multi_instance_data_parallel.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
import
asyncio
from
typing
import
Optional
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
"""
To run this example, run the following commands simultaneously with
different CUDA_VISIBLE_DEVICES:
python examples/online_serving/multi_instance_data_parallel.py
vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1
\
--data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300
\
--data-parallel-size-local 1 --enforce-eager --headless
Once both instances have completed the handshake, this example will
send a request to the instance with DP rank 1.
"""
async def main():
    """Stream one generation request and print the final output text.

    Builds an ``AsyncLLMEngine`` for ibm-research/PowerMoE-3b with
    ``data_parallel_size=2``, submits a single prompt with
    ``data_parallel_rank=1``, keeps only the last streamed output and
    prints the text of its first completion.
    """
    engine_client = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(
            model="ibm-research/PowerMoE-3b",
            data_parallel_size=2,
            dtype="auto",
            max_model_len=2048,
            data_parallel_address="127.0.0.1",
            data_parallel_rpc_port=62300,
            data_parallel_size_local=1,
            enforce_eager=True,
        )
    )

    sampling = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=100)

    stream = engine_client.generate(
        prompt="Who won the 2004 World Series?",
        sampling_params=sampling,
        request_id="abcdef",
        data_parallel_rank=1,
    )

    # Only the last streamed item is needed; earlier ones are overwritten.
    latest: Optional[RequestOutput] = None
    async for chunk in stream:
        latest = chunk

    if not latest:
        return
    print(latest.outputs[0].text)


if __name__ == "__main__":
    asyncio.run(main())
examples/online_serving/openai_chat_completion_client.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
...
...
examples/online_serving/openai_chat_completion_client_for_multimodal.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.
...
...
examples/online_serving/openai_chat_completion_client_with_tools.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example:
...
...
examples/online_serving/openai_chat_completion_client_with_tools_required.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
To run this example, you can start the vLLM server
without any specific flags:
...
...
Prev
1
…
6
7
8
9
10
11
12
13
14
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment