Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
081057de
Commit
081057de
authored
Apr 29, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.5' into v0.8.5-ori
parents
7cf5d5c4
ba41cc90
Changes
554
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
568 additions
and
319 deletions
+568
-319
examples/offline_inference/data_parallel.py
examples/offline_inference/data_parallel.py
+36
-31
examples/offline_inference/disaggregated-prefill-v1/decode_example.py
...line_inference/disaggregated-prefill-v1/decode_example.py
+36
-0
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
...ine_inference/disaggregated-prefill-v1/prefill_example.py
+43
-0
examples/offline_inference/disaggregated-prefill-v1/run.sh
examples/offline_inference/disaggregated-prefill-v1/run.sh
+5
-0
examples/offline_inference/disaggregated_prefill.py
examples/offline_inference/disaggregated_prefill.py
+5
-1
examples/offline_inference/distributed.py
examples/offline_inference/distributed.py
+0
-109
examples/offline_inference/eagle.py
examples/offline_inference/eagle.py
+18
-5
examples/offline_inference/embed_jina_embeddings_v3.py
examples/offline_inference/embed_jina_embeddings_v3.py
+11
-7
examples/offline_inference/embed_matryoshka_fy.py
examples/offline_inference/embed_matryoshka_fy.py
+11
-7
examples/offline_inference/encoder_decoder.py
examples/offline_inference/encoder_decoder.py
+104
-86
examples/offline_inference/encoder_decoder_multimodal.py
examples/offline_inference/encoder_decoder_multimodal.py
+20
-16
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_example.py
+8
-4
examples/offline_inference/mistral-small.py
examples/offline_inference/mistral-small.py
+8
-4
examples/offline_inference/mlpspeculator.py
examples/offline_inference/mlpspeculator.py
+5
-2
examples/offline_inference/prithvi_geospatial_mae.py
examples/offline_inference/prithvi_geospatial_mae.py
+33
-29
examples/offline_inference/profiling.py
examples/offline_inference/profiling.py
+10
-2
examples/offline_inference/qwen2_5_omni/README.md
examples/offline_inference/qwen2_5_omni/README.md
+32
-0
examples/offline_inference/qwen2_5_omni/only_thinker.py
examples/offline_inference/qwen2_5_omni/only_thinker.py
+160
-0
examples/offline_inference/save_sharded_state.py
examples/offline_inference/save_sharded_state.py
+18
-15
examples/offline_inference/simple_profiling.py
examples/offline_inference/simple_profiling.py
+5
-1
No files found.
Too many changes to show.
To preserve performance only
554 of 554+
files are displayed.
Plain diff
Email patch
examples/offline_inference/data_parallel.py
View file @
081057de
...
...
@@ -34,6 +34,40 @@ from vllm import LLM, SamplingParams
from
vllm.utils
import
get_open_port
def parse_args():
    """Parse the command-line options of the data-parallel example.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv`` with the
        attributes ``model``, ``dp_size``, ``tp_size``, ``node_size``,
        ``node_rank``, ``master_addr`` and ``master_port``.
    """
    import argparse

    # One row per option: (flag, value type, default, help text).
    option_table = (
        ("--model", str, "ibm-research/PowerMoE-3b", "Model name or path"),
        ("--dp-size", int, 2, "Data parallel size"),
        ("--tp-size", int, 2, "Tensor parallel size"),
        ("--node-size", int, 1, "Total number of nodes"),
        ("--node-rank", int, 0, "Rank of the current node"),
        ("--master-addr", str, "", "Master node IP address"),
        ("--master-port", int, 0, "Master node port"),
    )

    cli = argparse.ArgumentParser(description="Data Parallel Inference")
    for flag, value_type, fallback, description in option_table:
        cli.add_argument(flag,
                         type=value_type,
                         default=fallback,
                         help=description)
    return cli.parse_args()
def
main
(
model
,
dp_size
,
local_dp_rank
,
global_dp_rank
,
dp_master_ip
,
dp_master_port
,
GPUs_per_dp_rank
):
os
.
environ
[
"VLLM_DP_RANK"
]
=
str
(
global_dp_rank
)
...
...
@@ -95,37 +129,8 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
if
__name__
==
"__main__"
:
import
argparse
parser
=
argparse
.
ArgumentParser
(
description
=
"Data Parallel Inference"
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"ibm-research/PowerMoE-3b"
,
help
=
"Model name or path"
)
parser
.
add_argument
(
"--dp-size"
,
type
=
int
,
default
=
2
,
help
=
"Data parallel size"
)
parser
.
add_argument
(
"--tp-size"
,
type
=
int
,
default
=
2
,
help
=
"Tensor parallel size"
)
parser
.
add_argument
(
"--node-size"
,
type
=
int
,
default
=
1
,
help
=
"Total number of nodes"
)
parser
.
add_argument
(
"--node-rank"
,
type
=
int
,
default
=
0
,
help
=
"Rank of the current node"
)
parser
.
add_argument
(
"--master-addr"
,
type
=
str
,
default
=
""
,
help
=
"Master node IP address"
)
parser
.
add_argument
(
"--master-port"
,
type
=
int
,
default
=
0
,
help
=
"Master node port"
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
dp_size
=
args
.
dp_size
tp_size
=
args
.
tp_size
...
...
examples/offline_inference/disaggregated-prefill-v1/decode_example.py
0 → 100644
View file @
081057de
# SPDX-License-Identifier: Apache-2.0
"""Decode step of the disaggregated-prefill example.

Reads the prompts stored in ``output.txt`` (one prompt per line), generates
up to 10 tokens for each of them with a KV-transfer-enabled engine and prints
the results.
"""
import sys

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# Read prompts from output.txt
try:
    with open("output.txt") as f:
        # Blank lines would turn into empty prompts, so they are dropped.
        prompts = [text for text in (line.strip() for line in f) if text]
    print(f"Loaded {len(prompts)} prompts from output.txt")
except FileNotFoundError:
    print("Error: output.txt file not found")
    # sys.exit() instead of the site-provided exit(), which is meant for the
    # interactive interpreter and is not guaranteed to exist in every run.
    sys.exit(-1)

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

# The connector is configured with the "local_storage" directory as its
# shared storage path.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enforce_eager=True,
    gpu_memory_utilization=0.8,
    max_num_batched_tokens=64,
    max_num_seqs=16,
    kv_transfer_config=KVTransferConfig.from_cli(
        '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
        '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
    ))

# Generation for the prompts loaded from output.txt (decode instance)
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
0 → 100644
View file @
081057de
# SPDX-License-Identifier: Apache-2.0
"""Prefill step of the disaggregated-prefill example.

Generates a single token for four long prompts with a KV-transfer-enabled
engine, then stores every prompt followed by its generated text in
``output.txt`` (one entry per line).
"""

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# Two long repeated prefixes; each one is shared by two prompts.
context = "Hi " * 1000
context2 = "Hey " * 500
prompts = [
    prefix + suffix for prefix, suffix in (
        (context, "Hello, my name is"),
        (context, "The capital of France is"),
        (context2, "Your name is"),
        (context2, "The capital of China is"),
    )
]

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

# The connector is configured with the "local_storage" directory as its
# shared storage path.
kv_transfer_config = KVTransferConfig.from_cli(
    '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
    '"kv_connector_extra_config": '
    '{"shared_storage_path": "local_storage"}}')
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enforce_eager=True,
    gpu_memory_utilization=0.8,
    kv_transfer_config=kv_transfer_config,
)  #, max_model_len=2048, max_num_batched_tokens=2048)

# 1ST generation (prefill instance)
outputs = llm.generate(prompts, sampling_params)

new_prompts = []
for request_output in outputs:
    prompt = request_output.prompt
    generated_text = request_output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    new_prompts.append(prompt + generated_text)

# Write new_prompts to output.txt, one per line
with open("output.txt", "w") as f:
    f.writelines(entry + "\n" for entry in new_prompts)
    print(f"Saved {len(new_prompts)} prompts to output.txt")
examples/offline_inference/disaggregated-prefill-v1/run.sh
0 → 100644
View file @
081057de
# Start from a clean state: drop the shared KV storage directory and the
# prompt hand-off file left over from a previous run.
rm -rf local_storage/
# -f: do not report an error when output.txt does not exist yet.
rm -f output.txt

# Run the prefill example first, then the decode example, on GPU 0.
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py
examples/offline_inference/disaggregated_prefill.py
View file @
081057de
...
...
@@ -95,7 +95,7 @@ def run_decode(prefill_done):
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
if
__name__
==
"__
main
__"
:
def
main
()
:
prefill_done
=
Event
()
prefill_process
=
Process
(
target
=
run_prefill
,
args
=
(
prefill_done
,
))
decode_process
=
Process
(
target
=
run_decode
,
args
=
(
prefill_done
,
))
...
...
@@ -109,3 +109,7 @@ if __name__ == "__main__":
# Terminate the prefill node when decode is finished
decode_process
.
join
()
prefill_process
.
terminate
()
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/distributed.py
deleted
100644 → 0
View file @
7cf5d5c4
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use Ray Data for running offline batch inference
distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""
from
typing
import
Any
import
numpy
as
np
import
ray
from
packaging.version
import
Version
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
vllm
import
LLM
,
SamplingParams
assert
Version
(
ray
.
__version__
)
>=
Version
(
"2.22.0"
),
"Ray version must be at least 2.22.0"
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Set tensor parallelism per instance.
tensor_parallel_size
=
1
# Set number of instances. Each instance will use tensor_parallel_size GPUs.
num_instances
=
1
# Create a class to do batch inference.
class LLMPredictor:
    """Callable that owns one vLLM engine and runs generation on a batch."""

    def __init__(self):
        # Build the engine once per instance.
        self.llm = LLM(
            model="meta-llama/Llama-2-7b-chat-hf",
            tensor_parallel_size=tensor_parallel_size,
        )

    def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]:
        """Generate text for every prompt stored under ``batch["text"]``.

        Returns:
            A dict with two parallel lists: ``"prompt"`` holds the prompt of
            each request output and ``"generated_text"`` holds the texts of
            all its completions joined with a single space.
        """
        request_outputs = self.llm.generate(batch["text"], sampling_params)

        # Collect (prompt, joined completion text) pairs in a single pass.
        rows = [(result.prompt,
                 ' '.join([completion.text for completion in result.outputs]))
                for result in request_outputs]

        prompt: list[str] = [source for source, _ in rows]
        generated_text: list[str] = [text for _, text in rows]
        return {
            "prompt": prompt,
            "generated_text": generated_text,
        }
# Read one text file from S3. Ray Data supports reading multiple files
# from cloud storage (such as JSONL, Parquet, CSV, binary format).
ds
=
ray
.
data
.
read_text
(
"s3://anonymous@air-example-data/prompts.txt"
)
# For tensor_parallel_size > 1, we need to create placement groups for vLLM
# to use. Every actor has to have its own placement group.
def scheduling_strategy_fn():
    """Create a placement group and wrap it in Ray remote arguments.

    Returns:
        A dict whose ``"scheduling_strategy"`` entry is a
        ``PlacementGroupSchedulingStrategy`` built on a new ``STRICT_PACK``
        placement group with one ``{"GPU": 1, "CPU": 1}`` bundle per
        tensor-parallel worker.
    """
    # One bundle per tensor parallel worker
    bundles = [{"GPU": 1, "CPU": 1}] * tensor_parallel_size
    placement_group = ray.util.placement_group(bundles, strategy="STRICT_PACK")
    strategy = PlacementGroupSchedulingStrategy(
        placement_group,
        placement_group_capture_child_tasks=True,
    )
    return {"scheduling_strategy": strategy}
resources_kwarg
:
dict
[
str
,
Any
]
=
{}
if
tensor_parallel_size
==
1
:
# For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg
[
"num_gpus"
]
=
1
else
:
# Otherwise, we have to set num_gpus=0 and provide
# a function that will create a placement group for
# each instance.
resources_kwarg
[
"num_gpus"
]
=
0
resources_kwarg
[
"ray_remote_args_fn"
]
=
scheduling_strategy_fn
# Apply batch inference for all input data.
ds
=
ds
.
map_batches
(
LLMPredictor
,
# Set the concurrency to the number of LLM instances.
concurrency
=
num_instances
,
# Specify the batch size for inference.
batch_size
=
32
,
**
resources_kwarg
,
)
# Peek first 10 results.
# NOTE: This is for local testing and debugging. For production use case,
# one should write full result out as shown below.
outputs
=
ds
.
take
(
limit
=
10
)
for
output
in
outputs
:
prompt
=
output
[
"prompt"
]
generated_text
=
output
[
"generated_text"
]
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
# Write inference output data out as Parquet files to S3.
# Multiple files would be written to the output destination,
# and each task would write one or more files separately.
#
# ds.write_parquet("s3://<your-output-bucket>")
examples/offline_inference/eagle.py
View file @
081057de
...
...
@@ -27,7 +27,7 @@ def load_prompts(dataset_path, num_prompts):
return
prompts
[:
num_prompts
]
def
main
():
def
parse_args
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--dataset"
,
...
...
@@ -45,10 +45,15 @@ def main():
parser
.
add_argument
(
"--enable_chunked_prefill"
,
action
=
'store_true'
)
parser
.
add_argument
(
"--max_num_batched_tokens"
,
type
=
int
,
default
=
2048
)
parser
.
add_argument
(
"--temp"
,
type
=
float
,
default
=
0
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
():
model_dir
=
"meta-llama/Meta-Llama-3-8B-Instruct"
eagle_dir
=
"abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
args
=
parse_args
()
model_dir
=
"meta-llama/Llama-3.1-8B-Instruct"
eagle_dir
=
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
max_model_len
=
2048
...
...
@@ -76,7 +81,7 @@ def main():
max_num_seqs
=
args
.
max_num_seqs
,
gpu_memory_utilization
=
0.8
,
speculative_config
=
{
"method"
:
"eagle"
,
"method"
:
"eagle3"
if
"eagle3"
in
eagle_dir
.
lower
()
else
"eagle"
,
"model"
:
eagle_dir
,
"num_speculative_tokens"
:
args
.
num_spec_tokens
,
"draft_tensor_parallel_size"
:
args
.
draft_tp
,
...
...
@@ -90,6 +95,9 @@ def main():
outputs
=
llm
.
generate
(
prompt_token_ids
=
prompt_ids
,
sampling_params
=
sampling_params
)
if
not
hasattr
(
outputs
,
"metrics"
)
or
outputs
.
metrics
is
None
:
return
# calculate the average number of accepted tokens per forward pass, +1 is
# to account for the token from the target model that's always going to be
# accepted
...
...
@@ -104,6 +112,11 @@ def main():
{
sum
(
acceptance_counts
)
/
acceptance_counts
[
0
]:.
2
f
}
"
)
print
(
"-"
*
50
)
# print acceptance at each token position
for
i
in
range
(
len
(
acceptance_counts
)):
print
(
f
"acceptance at token
{
i
}
:"
f
"
{
acceptance_counts
[
i
]
/
(
acceptance_counts
[
0
]):.
2
f
}
"
)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/embed_jina_embeddings_v3.py
View file @
081057de
...
...
@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
def parse_args():
    """Parse the engine CLI options using this example's defaults.

    Returns:
        The namespace parsed from the command line. ``model``, ``task`` and
        ``trust_remote_code`` default to the values set below.
    """
    # Set example specific arguments
    example_defaults = {
        "model": "jinaai/jina-embeddings-v3",
        "task": "embed",
        "trust_remote_code": True,
    }
    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())
    cli.set_defaults(**example_defaults)
    return cli.parse_args()
def
main
(
args
:
Namespace
):
# Sample prompts.
prompts
=
[
...
...
@@ -40,11 +50,5 @@ def main(args: Namespace):
if
__name__
==
"__main__"
:
parser
=
FlexibleArgumentParser
()
parser
=
EngineArgs
.
add_cli_args
(
parser
)
# Set example specific arguments
parser
.
set_defaults
(
model
=
"jinaai/jina-embeddings-v3"
,
task
=
"embed"
,
trust_remote_code
=
True
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
args
)
examples/offline_inference/embed_matryoshka_fy.py
View file @
081057de
...
...
@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs, PoolingParams
from
vllm.utils
import
FlexibleArgumentParser
def parse_args():
    """Parse the engine CLI options using this example's defaults.

    Returns:
        The namespace parsed from the command line. ``model``, ``task`` and
        ``trust_remote_code`` default to the values set below.
    """
    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())
    # Set example specific arguments
    cli.set_defaults(
        model="jinaai/jina-embeddings-v3",
        task="embed",
        trust_remote_code=True,
    )
    parsed = cli.parse_args()
    return parsed
def
main
(
args
:
Namespace
):
# Sample prompts.
prompts
=
[
...
...
@@ -38,11 +48,5 @@ def main(args: Namespace):
if
__name__
==
"__main__"
:
parser
=
FlexibleArgumentParser
()
parser
=
EngineArgs
.
add_cli_args
(
parser
)
# Set example specific arguments
parser
.
set_defaults
(
model
=
"jinaai/jina-embeddings-v3"
,
task
=
"embed"
,
trust_remote_code
=
True
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
args
)
examples/offline_inference/encoder_decoder.py
View file @
081057de
...
...
@@ -8,94 +8,112 @@ from vllm import LLM, SamplingParams
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
TokensPrompt
,
zip_enc_dec_prompts
)
dtype
=
"float"
# Create a BART encoder/decoder model instance
llm
=
LLM
(
model
=
"facebook/bart-large-cnn"
,
dtype
=
dtype
,
)
# Get BART tokenizer
tokenizer
=
llm
.
llm_engine
.
get_tokenizer_group
()
# Test prompts
#
# This section shows all of the valid ways to prompt an
# encoder/decoder model.
#
# - Helpers for building prompts
text_prompt_raw
=
"Hello, my name is"
text_prompt
=
TextPrompt
(
prompt
=
"The president of the United States is"
)
tokens_prompt
=
TokensPrompt
(
prompt_token_ids
=
tokenizer
.
encode
(
prompt
=
"The capital of France is"
))
# - Pass a single prompt to encoder/decoder model
# (implicitly encoder input prompt);
# decoder input prompt is assumed to be None
single_text_prompt_raw
=
text_prompt_raw
# Pass a string directly
single_text_prompt
=
text_prompt
# Pass a TextPrompt
single_tokens_prompt
=
tokens_prompt
# Pass a TokensPrompt
# - Pass explicit encoder and decoder input prompts within one data structure.
# Encoder and decoder prompts can both independently be text or tokens, with
# no requirement that they be the same prompt type. Some example prompt-type
# combinations are shown below, note that these are not exhaustive.
enc_dec_prompt1
=
ExplicitEncoderDecoderPrompt
(
# Pass encoder prompt string directly, &
# pass decoder prompt tokens
encoder_prompt
=
single_text_prompt_raw
,
decoder_prompt
=
single_tokens_prompt
,
)
enc_dec_prompt2
=
ExplicitEncoderDecoderPrompt
(
# Pass TextPrompt to encoder, and
# pass decoder prompt string directly
encoder_prompt
=
single_text_prompt
,
decoder_prompt
=
single_text_prompt_raw
,
)
enc_dec_prompt3
=
ExplicitEncoderDecoderPrompt
(
# Pass encoder prompt tokens directly, and
# pass TextPrompt to decoder
encoder_prompt
=
single_tokens_prompt
,
decoder_prompt
=
single_text_prompt
,
)
# - Finally, here's a useful helper function for zipping encoder and
# decoder prompts together into a list of ExplicitEncoderDecoderPrompt
# instances
zipped_prompt_list
=
zip_enc_dec_prompts
(
[
'An encoder prompt'
,
'Another encoder prompt'
],
[
'A decoder prompt'
,
'Another decoder prompt'
])
# - Let's put all of the above example prompts together into one list
# which we will pass to the encoder/decoder LLM.
prompts
=
[
single_text_prompt_raw
,
single_text_prompt
,
single_tokens_prompt
,
enc_dec_prompt1
,
enc_dec_prompt2
,
enc_dec_prompt3
]
+
zipped_prompt_list
def
create_prompts
(
tokenizer
):
# Test prompts
#
# This section shows all of the valid ways to prompt an
# encoder/decoder model.
#
# - Helpers for building prompts
text_prompt_raw
=
"Hello, my name is"
text_prompt
=
TextPrompt
(
prompt
=
"The president of the United States is"
)
tokens_prompt
=
TokensPrompt
(
prompt_token_ids
=
tokenizer
.
encode
(
prompt
=
"The capital of France is"
))
# - Pass a single prompt to encoder/decoder model
# (implicitly encoder input prompt);
# decoder input prompt is assumed to be None
single_text_prompt_raw
=
text_prompt_raw
# Pass a string directly
single_text_prompt
=
text_prompt
# Pass a TextPrompt
single_tokens_prompt
=
tokens_prompt
# Pass a TokensPrompt
# ruff: noqa: E501
# - Pass explicit encoder and decoder input prompts within one data structure.
# Encoder and decoder prompts can both independently be text or tokens, with
# no requirement that they be the same prompt type. Some example prompt-type
# combinations are shown below, note that these are not exhaustive.
enc_dec_prompt1
=
ExplicitEncoderDecoderPrompt
(
# Pass encoder prompt string directly, &
# pass decoder prompt tokens
encoder_prompt
=
single_text_prompt_raw
,
decoder_prompt
=
single_tokens_prompt
,
)
enc_dec_prompt2
=
ExplicitEncoderDecoderPrompt
(
# Pass TextPrompt to encoder, and
# pass decoder prompt string directly
encoder_prompt
=
single_text_prompt
,
decoder_prompt
=
single_text_prompt_raw
,
)
enc_dec_prompt3
=
ExplicitEncoderDecoderPrompt
(
# Pass encoder prompt tokens directly, and
# pass TextPrompt to decoder
encoder_prompt
=
single_tokens_prompt
,
decoder_prompt
=
single_text_prompt
,
)
# - Finally, here's a useful helper function for zipping encoder and
# decoder prompts together into a list of ExplicitEncoderDecoderPrompt
# instances
zipped_prompt_list
=
zip_enc_dec_prompts
(
[
'An encoder prompt'
,
'Another encoder prompt'
],
[
'A decoder prompt'
,
'Another decoder prompt'
])
# - Let's put all of the above example prompts together into one list
# which we will pass to the encoder/decoder LLM.
return
[
single_text_prompt_raw
,
single_text_prompt
,
single_tokens_prompt
,
enc_dec_prompt1
,
enc_dec_prompt2
,
enc_dec_prompt3
]
+
zipped_prompt_list
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
1.0
,
min_tokens
=
0
,
max_tokens
=
20
,
)
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
def
create_sampling_params
():
return
SamplingParams
(
temperature
=
0
,
top_p
=
1.0
,
min_tokens
=
0
,
max_tokens
=
20
,
)
# Print the outputs.
print
(
"-"
*
50
)
for
i
,
output
in
enumerate
(
outputs
):
prompt
=
output
.
prompt
encoder_prompt
=
output
.
encoder_prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Output
{
i
+
1
}
:"
)
print
(
f
"Encoder prompt:
{
encoder_prompt
!
r
}
\n
"
f
"Decoder prompt:
{
prompt
!
r
}
\n
"
f
"Generated text:
{
generated_text
!
r
}
"
)
def
print_outputs
(
outputs
):
print
(
"-"
*
50
)
for
i
,
output
in
enumerate
(
outputs
):
prompt
=
output
.
prompt
encoder_prompt
=
output
.
encoder_prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Output
{
i
+
1
}
:"
)
print
(
f
"Encoder prompt:
{
encoder_prompt
!
r
}
\n
"
f
"Decoder prompt:
{
prompt
!
r
}
\n
"
f
"Generated text:
{
generated_text
!
r
}
"
)
print
(
"-"
*
50
)
def
main
():
dtype
=
"float"
# Create a BART encoder/decoder model instance
llm
=
LLM
(
model
=
"facebook/bart-large-cnn"
,
dtype
=
dtype
,
)
# Get BART tokenizer
tokenizer
=
llm
.
llm_engine
.
get_tokenizer_group
()
prompts
=
create_prompts
(
tokenizer
)
sampling_params
=
create_sampling_params
()
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
print_outputs
(
outputs
)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/encoder_decoder_multimodal.py
View file @
081057de
...
...
@@ -22,7 +22,7 @@ class ModelRequestData(NamedTuple):
def
run_florence2
():
engine_args
=
EngineArgs
(
model
=
"microsoft/Florence-2-large"
,
tokenizer
=
"
facebook/bart-large
"
,
tokenizer
=
"
Isotr0py/Florence-2-tokenizer
"
,
max_num_seqs
=
8
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
1
},
...
...
@@ -126,6 +126,23 @@ model_example_map = {
}
def parse_args():
    """Parse the command-line options of this example.

    Returns:
        The parsed namespace with ``model_type`` (restricted to the keys of
        ``model_example_map``, default ``"mllama"``) and ``seed`` (an int,
        default ``None``).
    """
    description = ('Demo on using vLLM for offline inference with '
                   'vision language models for text generation')
    cli = FlexibleArgumentParser(description=description)

    cli.add_argument(
        '--model-type',
        '-m',
        type=str,
        default="mllama",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    cli.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    return cli.parse_args()
def
main
(
args
):
model
=
args
.
model_type
if
model
not
in
model_example_map
:
...
...
@@ -148,6 +165,7 @@ def main(args):
temperature
=
0
,
top_p
=
1.0
,
max_tokens
=
64
,
skip_special_tokens
=
False
,
)
start
=
time
.
time
()
...
...
@@ -171,19 +189,5 @@ def main(args):
if
__name__
==
"__main__"
:
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models for text generation'
)
parser
.
add_argument
(
'--model-type'
,
'-m'
,
type
=
str
,
default
=
"mllama"
,
choices
=
model_example_map
.
keys
(),
help
=
'Huggingface "model_type".'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
args
)
examples/offline_inference/llm_engine_example.py
View file @
081057de
...
...
@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
return
LLMEngine
.
from_engine_args
(
engine_args
)
def parse_args():
    """Parse the engine CLI options for the LLMEngine demo.

    Returns:
        The namespace parsed from the command line by a parser that carries
        every option registered by ``EngineArgs.add_cli_args``.
    """
    base_parser = FlexibleArgumentParser(
        description='Demo on using the LLMEngine class directly')
    cli = EngineArgs.add_cli_args(base_parser)
    return cli.parse_args()
def
main
(
args
:
argparse
.
Namespace
):
"""Main function that sets up and runs the prompt processing."""
engine
=
initialize_engine
(
args
)
...
...
@@ -58,8 +65,5 @@ def main(args: argparse.Namespace):
if
__name__
==
'__main__'
:
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using the LLMEngine class directly'
)
parser
=
EngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
args
)
examples/offline_inference/mistral-small.py
View file @
081057de
...
...
@@ -16,11 +16,11 @@ from vllm.sampling_params import SamplingParams
# # Mistral format
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --tokenizer-mode mistral --config-format mistral --load-format mistral \
# --limit-mm-per-prompt 'image
=4
' --max-model-len 16384
# --limit-mm-per-prompt '
{"
image
":4}
' --max-model-len 16384
#
# # HF format
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --limit-mm-per-prompt 'image
=4
' --max-model-len 16384
# --limit-mm-per-prompt '
{"
image
":4}
' --max-model-len 16384
# ```
#
# - Client:
...
...
@@ -62,6 +62,7 @@ def run_simple_demo(args: argparse.Namespace):
tokenizer_mode
=
"mistral"
if
args
.
format
==
"mistral"
else
"auto"
,
config_format
=
"mistral"
if
args
.
format
==
"mistral"
else
"auto"
,
load_format
=
"mistral"
if
args
.
format
==
"mistral"
else
"auto"
,
limit_mm_per_prompt
=
{
"image"
:
1
},
max_model_len
=
4096
,
max_num_seqs
=
2
,
tensor_parallel_size
=
2
,
...
...
@@ -168,7 +169,7 @@ def run_advanced_demo(args: argparse.Namespace):
print
(
"-"
*
50
)
def
main
():
def
parse_args
():
parser
=
argparse
.
ArgumentParser
(
description
=
"Run a demo in simple or advanced mode."
)
...
...
@@ -187,8 +188,11 @@ def main():
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
help
=
'If True, disables caching of multi-modal preprocessor/mapper.'
)
return
parser
.
parse_args
()
args
=
parser
.
parse_args
()
def
main
():
args
=
parse_args
()
if
args
.
mode
==
"simple"
:
print
(
"Running simple demo..."
)
...
...
examples/offline_inference/mlpspeculator.py
View file @
081057de
...
...
@@ -34,8 +34,7 @@ def time_generation(llm: LLM, prompts: list[str],
print
(
"-"
*
50
)
if
__name__
==
"__main__"
:
def
main
():
template
=
(
"Below is an instruction that describes a task. Write a response "
"that appropriately completes the request.
\n\n
### Instruction:
\n
{}"
...
...
@@ -66,3 +65,7 @@ if __name__ == "__main__":
)
time_generation
(
llm
,
prompts
,
sampling_params
,
"With speculation"
)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/prithvi_geospatial_mae.py
View file @
081057de
...
...
@@ -417,6 +417,38 @@ def run_model(input_data,
return
pred_imgs
def parse_args():
    """Parse the command-line options of the Prithvi MAE inference script.

    Returns:
        The parsed ``argparse.Namespace`` with the attributes ``data_file``,
        ``output_dir``, ``input_indices`` and ``rgb_outputs``. The script
        entry point expands it with ``main(**vars(args))``.
    """
    parser = argparse.ArgumentParser("MAE run inference", add_help=False)

    parser.add_argument(
        "--data_file",
        type=str,
        default="./India_900498_S2Hand.tif",
        help="Path to the file.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Path to the directory where to save outputs.",
    )
    parser.add_argument(
        "--input_indices",
        default=[1, 2, 3, 8, 11, 12],
        type=int,
        nargs="+",
        help="0-based indices of the six Prithvi channels to be selected from the "
        "input. By default selects [1,2,3,8,11,12] for S2L1C data.",
    )
    parser.add_argument(
        "--rgb_outputs",
        action="store_true",
        help="If present, output files will only contain RGB channels. "
        "Otherwise, all bands will be saved.",
    )
    # Without this return the caller would receive None and fail on
    # vars(args).
    return parser.parse_args()
def
main
(
data_file
:
str
,
output_dir
:
str
,
...
...
@@ -496,35 +528,7 @@ def main(
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
"MAE run inference"
,
add_help
=
False
)
parser
.
add_argument
(
"--data_file"
,
type
=
str
,
default
=
"./India_900498_S2Hand.tif"
,
help
=
"Path to the file."
,
)
parser
.
add_argument
(
"--output_dir"
,
type
=
str
,
default
=
"output"
,
help
=
"Path to the directory where to save outputs."
,
)
parser
.
add_argument
(
"--input_indices"
,
default
=
[
1
,
2
,
3
,
8
,
11
,
12
],
type
=
int
,
nargs
=
"+"
,
help
=
"0-based indices of the six Prithvi channels to be selected from the "
"input. By default selects [1,2,3,8,11,12] for S2L1C data."
,
)
parser
.
add_argument
(
"--rgb_outputs"
,
action
=
"store_true"
,
help
=
"If present, output files will only contain RGB channels. "
"Otherwise, all bands will be saved."
,
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
**
vars
(
args
))
examples/offline_inference/profiling.py
View file @
081057de
...
...
@@ -359,7 +359,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
f
" in folder
{
context
.
save_chrome_traces_folder
}
"
)
if
__name__
==
"__main__"
:
def
parse_args
()
:
parser
=
FlexibleArgumentParser
(
description
=
"""
Profile a model
...
...
@@ -449,7 +449,10 @@ Profile a model
EngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
(
args
):
context
=
ProfileContext
(
engine_args
=
EngineArgs
.
from_cli_args
(
args
),
**
{
...
...
@@ -458,3 +461,8 @@ Profile a model
if
k
in
inspect
.
signature
(
ProfileContext
).
parameters
})
run_profile
(
context
,
csv_output
=
args
.
csv
,
json_output
=
args
.
json
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/offline_inference/qwen2_5_omni/README.md
0 → 100644
View file @
081057de
# Qwen2.5-Omni Offline Inference Examples
This folder provides several example scripts showing how to run offline inference with Qwen2.5-Omni.
## Thinker Only
```bash
# Audio + image + video
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
    -q mixed_modalities

# Read vision and audio inputs from a single video file
# NOTE: V1 engine does not support interleaved modalities yet.
VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
    -q use_audio_in_video

# Multiple audios
VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
    -q multi_audios
```
This script runs the thinker part of Qwen2.5-Omni and generates a text response.
You can also test Qwen2.5-Omni on a single modality:
```bash
# Process audio inputs
python examples/offline_inference/audio_language.py \
    --model-type qwen2_5_omni

# Process image inputs
python examples/offline_inference/vision_language.py \
    --modality image \
    --model-type qwen2_5_omni

# Process video inputs
python examples/offline_inference/vision_language.py \
    --modality video \
    --model-type qwen2_5_omni
```
examples/offline_inference/qwen2_5_omni/only_thinker.py
0 → 100644
View file @
081057de
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on Qwen2.5-Omni (thinker only).
"""
from
typing
import
NamedTuple
import
vllm.envs
as
envs
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.utils
import
FlexibleArgumentParser
class QueryResult(NamedTuple):
    """Bundle describing one example query.

    Pairs the request payload that ``main`` hands to ``LLM.generate`` with
    the per-prompt multimodal limits that ``main`` passes to the ``LLM``
    constructor.
    """

    # Request payload: the prompt text plus its multimodal data.
    inputs: dict
    # Maximum number of items allowed per modality, keyed by modality name.
    limit_mm_per_prompt: dict[str, int]
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# System prompt shared by every query builder in this example.
default_system = (
    "You are Qwen, a virtual human developed by the Qwen Team, "
    "Alibaba Group, capable of perceiving auditory and visual inputs, "
    "as well as generating text and speech.")
def get_mixed_modalities_query() -> QueryResult:
    """Build a query that mixes one audio clip, one image and one video.

    Returns:
        A ``QueryResult`` whose prompt carries an audio, an image and a video
        placeholder (in that order) ahead of the question, and whose limits
        allow one item of each of those modalities.
    """
    question = ("What is recited in the audio? "
                "What is the content of this image? Why is this video funny?")

    # Assemble the chat-formatted prompt turn by turn.
    system_turn = f"<|im_start|>system\n{default_system}<|im_end|>\n"
    user_turn = ("<|im_start|>user\n"
                 "<|audio_bos|><|AUDIO|><|audio_eos|>"
                 "<|vision_bos|><|IMAGE|><|vision_eos|>"
                 "<|vision_bos|><|VIDEO|><|vision_eos|>"
                 f"{question}<|im_end|>\n")
    prompt = system_turn + user_turn + "<|im_start|>assistant\n"

    # One bundled asset per modality referenced by the prompt.
    mm_data = {
        "audio":
        AudioAsset("mary_had_lamb").audio_and_sample_rate,
        "image":
        ImageAsset("cherry_blossom").pil_image.convert("RGB"),
        "video":
        VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays,
    }

    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": mm_data,
        },
        limit_mm_per_prompt={
            "audio": 1,
            "image": 1,
            "video": 1
        },
    )
def get_use_audio_in_video_query() -> QueryResult:
    """Build a query that feeds a video together with its own audio track.

    The request sets ``use_audio_in_video=True`` in ``mm_processor_kwargs``
    and supplies both the video frames and the audio obtained from the same
    video asset.

    Returns:
        A ``QueryResult`` with one video and one audio item.

    Raises:
        AssertionError: If ``envs.VLLM_USE_V1`` is truthy.
    """
    # Check the engine precondition first so that an unsupported setup fails
    # immediately, before the video asset is loaded and its audio extracted.
    assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
                                  "Please launch this example with "
                                  "`VLLM_USE_V1=0`.")

    question = ("Describe the content of the video, "
                "then convert what the baby say into text.")
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
              f"{question}<|im_end|>\n"
              "<|im_start|>assistant\n")

    # Both modalities come from the same video asset.
    asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
    audio = asset.get_audio(sampling_rate=16000)

    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "video": asset.np_ndarrays,
                "audio": audio,
            },
            "mm_processor_kwargs": {
                "use_audio_in_video": True,
            },
        },
        limit_mm_per_prompt={
            "audio": 1,
            "video": 1
        },
    )
def get_multi_audios_query() -> QueryResult:
    """Build a query that asks the model to compare two audio clips.

    Returns:
        A ``QueryResult`` whose prompt contains two audio placeholders and
        whose limits allow two audio items per prompt.
    """
    question = "Are these two audio clips the same?"

    # Assemble the chat-formatted prompt turn by turn.
    system_turn = f"<|im_start|>system\n{default_system}<|im_end|>\n"
    user_turn = ("<|im_start|>user\n"
                 "<|audio_bos|><|AUDIO|><|audio_eos|>"
                 "<|audio_bos|><|AUDIO|><|audio_eos|>"
                 f"{question}<|im_end|>\n")
    prompt = system_turn + user_turn + "<|im_start|>assistant\n"

    # Clips are listed in the same order as the placeholders above.
    clips = [
        AudioAsset("winning_call").audio_and_sample_rate,
        AudioAsset("mary_had_lamb").audio_and_sample_rate,
    ]

    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "audio": clips,
            },
        },
        limit_mm_per_prompt={
            "audio": 2,
        },
    )
# Dispatch table: value accepted by `--query-type` -> zero-argument builder
# returning the matching QueryResult.
query_map = {
    "mixed_modalities": get_mixed_modalities_query,
    "use_audio_in_video": get_use_audio_in_video_query,
    "multi_audios": get_multi_audios_query,
}
def main(args):
    """Run the selected query against Qwen2.5-Omni and print the generations.

    Args:
        args: Parsed CLI namespace. ``args.query_type`` selects the builder in
            ``query_map`` and ``args.seed`` is forwarded to ``LLM``.
    """
    build_query = query_map[args.query_type]
    query = build_query()

    engine = LLM(
        model="Qwen/Qwen2.5-Omni-7B",
        max_model_len=5632,
        max_num_seqs=5,
        limit_mm_per_prompt=query.limit_mm_per_prompt,
        seed=args.seed,
    )

    # Temperature is set to 0.2 so that outputs can differ even when every
    # prompt in a batch is identical.
    sampling = SamplingParams(temperature=0.2, max_tokens=64)

    results = engine.generate(query.inputs, sampling_params=sampling)
    for request_output in results:
        # Print the text of the first candidate of each request.
        print(request_output.outputs[0].text)
if __name__ == "__main__":
    # Script entry point: build the CLI, parse it, and run the example.
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "audio language models")

    # Which query builder from `query_map` to run.
    parser.add_argument(
        "--query-type",
        "-q",
        type=str,
        default="mixed_modalities",
        choices=query_map.keys(),
        help="Query type.",
    )
    # Optional seed forwarded to the LLM constructor.
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    args = parser.parse_args()
    main(args)
examples/offline_inference/save_sharded_state.py
View file @
081057de
...
...
@@ -29,20 +29,23 @@ from pathlib import Path
from
vllm
import
LLM
,
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
parser
=
FlexibleArgumentParser
()
EngineArgs
.
add_cli_args
(
parser
)
parser
.
add_argument
(
"--output"
,
"-o"
,
required
=
True
,
type
=
str
,
help
=
"path to output checkpoint"
)
parser
.
add_argument
(
"--file-pattern"
,
type
=
str
,
help
=
"string pattern of saved filenames"
)
parser
.
add_argument
(
"--max-file-size"
,
type
=
str
,
default
=
5
*
1024
**
3
,
help
=
"max size (in bytes) of each safetensors file"
)
def parse_args():
    """Parse the command-line arguments of the save-sharded-state example.

    Registers every ``EngineArgs`` option on a ``FlexibleArgumentParser`` and
    adds the script-specific ``--output``, ``--file-pattern`` and
    ``--max-file-size`` options.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    parser = FlexibleArgumentParser()
    EngineArgs.add_cli_args(parser)
    parser.add_argument("--output",
                        "-o",
                        required=True,
                        type=str,
                        help="path to output checkpoint")
    parser.add_argument("--file-pattern",
                        type=str,
                        help="string pattern of saved filenames")
    # The size is a byte count with an integer default, so parse it as int.
    # With type=str a user-supplied value would stay a string while the
    # default is an int, giving the option two different runtime types.
    parser.add_argument("--max-file-size",
                        type=int,
                        default=5 * 1024**3,
                        help="max size (in bytes) of each safetensors file")
    return parser.parse_args()
def
main
(
args
):
...
...
@@ -87,5 +90,5 @@ def main(args):
if
__name__
==
"__main__"
:
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
args
)
examples/offline_inference/simple_profiling.py
View file @
081057de
...
...
@@ -18,8 +18,8 @@ prompts = [
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
if
__name__
==
"__main__"
:
def
main
():
# Create an LLM.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
tensor_parallel_size
=
1
)
...
...
@@ -42,3 +42,7 @@ if __name__ == "__main__":
# Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output.
time
.
sleep
(
10
)
if
__name__
==
"__main__"
:
main
()
Prev
1
…
4
5
6
7
8
9
10
11
12
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment