Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
469e903b
Commit
469e903b
authored
Mar 28, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-dev
parents
389ebcf7
25f560a6
Changes
535
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1107 additions
and
402 deletions
+1107
-402
examples/offline_inference/disaggregated_prefill_lmcache.py
examples/offline_inference/disaggregated_prefill_lmcache.py
+130
-0
examples/offline_inference/distributed.py
examples/offline_inference/distributed.py
+5
-5
examples/offline_inference/eagle.py
examples/offline_inference/eagle.py
+93
-0
examples/offline_inference/encoder_decoder_multimodal.py
examples/offline_inference/encoder_decoder_multimodal.py
+184
-0
examples/offline_inference/florence2_inference.py
examples/offline_inference/florence2_inference.py
+0
-46
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_example.py
+3
-5
examples/offline_inference/lora_with_quantization_inference.py
...les/offline_inference/lora_with_quantization_inference.py
+4
-5
examples/offline_inference/mistral-small.py
examples/offline_inference/mistral-small.py
+29
-11
examples/offline_inference/mlpspeculator.py
examples/offline_inference/mlpspeculator.py
+4
-3
examples/offline_inference/multilora_inference.py
examples/offline_inference/multilora_inference.py
+4
-4
examples/offline_inference/prithvi_geospatial_mae.py
examples/offline_inference/prithvi_geospatial_mae.py
+4
-4
examples/offline_inference/profiling.py
examples/offline_inference/profiling.py
+8
-7
examples/offline_inference/profiling_tpu/profiling.py
examples/offline_inference/profiling_tpu/profiling.py
+1
-2
examples/offline_inference/reproduciblity.py
examples/offline_inference/reproduciblity.py
+36
-0
examples/offline_inference/rlhf.py
examples/offline_inference/rlhf.py
+3
-63
examples/offline_inference/rlhf_colocate.py
examples/offline_inference/rlhf_colocate.py
+1
-35
examples/offline_inference/rlhf_utils.py
examples/offline_inference/rlhf_utils.py
+105
-0
examples/offline_inference/tpu.py
examples/offline_inference/tpu.py
+3
-1
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+470
-200
examples/offline_inference/vision_language_embedding.py
examples/offline_inference/vision_language_embedding.py
+20
-11
No files found.
Too many changes to show.
To preserve performance only
535 of 535+
files are displayed.
Plain diff
Email patch
examples/offline_inference/disaggregated_prefill_lmcache.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of disaggregated prefilling
with LMCache.
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and launch an additional LMCache server.
KV cache is transferred in the following manner:
vLLM prefill node -> LMCache server -> vLLM decode node.
Note that `pip install lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
"""
import
os
import
subprocess
import
time
from
multiprocessing
import
Event
,
Process
from
lmcache.experimental.cache_engine
import
LMCacheEngineBuilder
from
lmcache.integration.vllm.utils
import
ENGINE_NAME
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
# Port on which the LMCache server is started.
port = 8100

# LMCache is configured through environment variables, applied in one batch.
os.environ.update({
    # Use experimental features in LMCache.
    "LMCACHE_USE_EXPERIMENTAL": "True",
    # LMCache is set to use 256 tokens per chunk.
    "LMCACHE_CHUNK_SIZE": "256",
    # Disable the local CPU backend in LMCache.
    "LMCACHE_LOCAL_CPU": "False",
    # Local CPU memory buffer limit, in GB.
    "LMCACHE_MAX_LOCAL_CPU_SIZE": "5.0",
    # Remote URL of the LMCache server.
    "LMCACHE_REMOTE_URL": f"lm://localhost:{port}",
    # Serializer/deserializer between vLLM and the LMCache server;
    # `naive` means the raw tensor bytes are sent without compression.
    "LMCACHE_REMOTE_SERDE": "naive",
})
def run_prefill(prefill_done, prompts):
    """Run the prefill (KV producer) vLLM instance.

    Args:
        prefill_done: Event that is set once generation on this node has
            finished and its outputs have been printed.
        prompts: Prompts passed to ``LLM.generate``.
    """
    # GPU 0 is dedicated to the prefill node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # This node generates a single token per prompt.
    prefill_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

    producer_config = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
    )

    # gpu_memory_utilization=0.8 was chosen for a GPU with about 40GB of
    # memory; reduce the value if your GPU has less memory.
    engine = LLM(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=producer_config,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
    )

    for request_output in engine.generate(prompts, prefill_params):
        print(f"Generated text: {request_output.outputs[0].text!r}")

    print("Prefill node is finished.")
    prefill_done.set()

    # Clean up the lmcache backend.
    LMCacheEngineBuilder.destroy(ENGINE_NAME)
def run_decode(prefill_done, prompts, timeout=1):
    """Run the decode (KV consumer) vLLM instance.

    Args:
        prefill_done: Event the function blocks on before generating.
        prompts: Prompts passed to ``LLM.generate``.
        timeout: Seconds to sleep after ``prefill_done`` is set and before
            generation starts.
    """
    # GPU 1 is dedicated to the decode node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

    decode_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

    consumer_config = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
    )

    # gpu_memory_utilization=0.8 was chosen for a GPU with about 40GB of
    # memory; reduce the value if your GPU has less memory.
    engine = LLM(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=consumer_config,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
    )

    # The engine is built first, then we block until the prefill node signals.
    print("Waiting for prefill node to finish...")
    prefill_done.wait()
    # NOTE(review): presumably this pause lets the KV transfer settle before
    # decoding starts -- confirm.
    time.sleep(timeout)

    for request_output in engine.generate(prompts, decode_params):
        print(f"Generated text: {request_output.outputs[0].text!r}")

    # Clean up the lmcache backend.
    LMCacheEngineBuilder.destroy(ENGINE_NAME)
def run_lmcache_server(port):
    """Start the LMCache server as a child process.

    Args:
        port: Port the server listens on (bound to ``localhost``).

    Returns:
        The ``subprocess.Popen`` handle of the server process; the caller is
        responsible for terminating and waiting on it.
    """
    # Function-scope import so this block does not depend on file-level edits.
    import sys

    # Launch the server with the interpreter running this script, so the
    # `lmcache` package is resolved from the same environment. Fall back to
    # "python" if the interpreter path cannot be determined.
    interpreter = sys.executable or "python"
    server_proc = subprocess.Popen([
        interpreter, "-m", "lmcache.experimental.server", "localhost",
        str(port)
    ])
    return server_proc
if __name__ == "__main__":
    # A single, deliberately long prompt.
    prompts = [
        "Hello, how are you?" * 1000,
    ]

    prefill_done = Event()
    prefill_process = Process(target=run_prefill,
                              args=(prefill_done, prompts))
    decode_process = Process(target=run_decode, args=(prefill_done, prompts))
    lmcache_server_process = run_lmcache_server(port)

    try:
        # Start prefill node
        prefill_process.start()

        # Start decode node
        decode_process.start()

        # The example is done once the decode node has produced its output.
        decode_process.join()
    finally:
        # Clean up even if starting or joining failed or was interrupted, so
        # no worker and no LMCache server is left running.
        for worker in (decode_process, prefill_process):
            # is_alive() is False for a process that was never started, which
            # avoids calling terminate() on it.
            if worker.is_alive():
                worker.terminate()
                worker.join()
        lmcache_server_process.terminate()
        lmcache_server_process.wait()
examples/offline_inference/distributed.py
View file @
469e903b
...
...
@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""
from
typing
import
Any
,
Dict
,
List
from
typing
import
Any
import
numpy
as
np
import
ray
...
...
@@ -36,13 +36,13 @@ class LLMPredictor:
self
.
llm
=
LLM
(
model
=
"meta-llama/Llama-2-7b-chat-hf"
,
tensor_parallel_size
=
tensor_parallel_size
)
def
__call__
(
self
,
batch
:
D
ict
[
str
,
np
.
ndarray
])
->
D
ict
[
str
,
list
]:
def
__call__
(
self
,
batch
:
d
ict
[
str
,
np
.
ndarray
])
->
d
ict
[
str
,
list
]:
# Generate texts from the prompts.
# The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information.
outputs
=
self
.
llm
.
generate
(
batch
[
"text"
],
sampling_params
)
prompt
:
L
ist
[
str
]
=
[]
generated_text
:
L
ist
[
str
]
=
[]
prompt
:
l
ist
[
str
]
=
[]
generated_text
:
l
ist
[
str
]
=
[]
for
output
in
outputs
:
prompt
.
append
(
output
.
prompt
)
generated_text
.
append
(
' '
.
join
([
o
.
text
for
o
in
output
.
outputs
]))
...
...
@@ -72,7 +72,7 @@ def scheduling_strategy_fn():
pg
,
placement_group_capture_child_tasks
=
True
))
resources_kwarg
:
D
ict
[
str
,
Any
]
=
{}
resources_kwarg
:
d
ict
[
str
,
Any
]
=
{}
if
tensor_parallel_size
==
1
:
# For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg
[
"num_gpus"
]
=
1
...
...
examples/offline_inference/eagle.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
json
import
os
from
transformers
import
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
# Command-line options for the EAGLE speculative decoding example.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--dataset",
    type=str,
    default="./examples/data/gsm8k.jsonl",
    help="downloaded from the eagle repo "
    "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/",
)
parser.add_argument("--max_num_seqs", type=int, default=8)
parser.add_argument("--num_prompts", type=int, default=80)
parser.add_argument("--num_spec_tokens", type=int, default=2)
parser.add_argument("--tp", type=int, default=1)
parser.add_argument("--draft_tp", type=int, default=1)
parser.add_argument("--enforce_eager", action='store_true')
parser.add_argument("--enable_chunked_prefill", action='store_true')
parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
parser.add_argument("--temp", type=float, default=0)

args = parser.parse_args()

print(args)

# Target model and its EAGLE draft model.
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"

max_model_len = 2048

tokenizer = AutoTokenizer.from_pretrained(model_dir)

if os.path.exists(args.dataset):
    # The dataset holds one JSON object per line; the prompt is the first
    # entry of its "turns" list. Blank lines (e.g. a trailing newline at the
    # end of the file) are skipped instead of crashing json.loads.
    with open(args.dataset, encoding="utf-8") as f:
        prompts = [json.loads(line)["turns"][0] for line in f if line.strip()]
else:
    # Fallback prompts when the dataset file is not available.
    prompts = ["The future of AI is", "The president of the United States is"]

prompts = prompts[:args.num_prompts]

# Tokenize each prompt as a single-turn user message.
prompt_ids = [
    tokenizer.apply_chat_template([{
        "role": "user",
        "content": prompt
    }],
                                  add_generation_prompt=True)
    for prompt in prompts
]

llm = LLM(
    model=model_dir,
    trust_remote_code=True,
    tensor_parallel_size=args.tp,
    enable_chunked_prefill=args.enable_chunked_prefill,
    max_num_batched_tokens=args.max_num_batched_tokens,
    enforce_eager=args.enforce_eager,
    max_model_len=max_model_len,
    max_num_seqs=args.max_num_seqs,
    gpu_memory_utilization=0.8,
    speculative_model=eagle_dir,
    num_speculative_tokens=args.num_spec_tokens,
    speculative_draft_tensor_parallel_size=args.draft_tp,
    speculative_max_model_len=max_model_len,
    disable_log_stats=False,
)

sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)

outputs = llm.generate(prompt_token_ids=prompt_ids,
                       sampling_params=sampling_params)

# calculate the average number of accepted tokens per forward pass, +1 is
# to account for the token from the target model that's always going to be
# accepted
acceptance_counts = [0] * (args.num_spec_tokens + 1)
for output in outputs:
    for step, count in enumerate(
            output.metrics.spec_token_acceptance_counts):
        acceptance_counts[step] += count

# Slot 0 is the denominator; guard against it being zero (for example when
# no outputs were produced) instead of raising ZeroDivisionError.
num_forward_passes = acceptance_counts[0]
if num_forward_passes:
    mean_acceptance_length = sum(acceptance_counts) / num_forward_passes
    print(f"mean acceptance length: {mean_acceptance_length:.2f}")
else:
    print("mean acceptance length: n/a (no accepted tokens were recorded)")
examples/offline_inference/encoder_decoder_multimodal.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import
time
from
collections.abc
import
Sequence
from
dataclasses
import
asdict
from
typing
import
NamedTuple
from
vllm
import
LLM
,
EngineArgs
,
PromptType
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.utils
import
FlexibleArgumentParser
class ModelRequestData(NamedTuple):
    """Engine arguments and prompts returned by each ``run_*`` builder."""

    # Arguments used to construct the LLM for the selected model.
    engine_args: EngineArgs
    # Prompts that are passed to ``LLM.generate``.
    prompts: Sequence[PromptType]
def run_florence2():
    """Build the engine arguments and example prompts for Florence-2.

    Returns:
        A ``ModelRequestData`` holding one implicit prompt (task token) and
        one explicit encoder/decoder prompt, each with a single image.
    """
    florence_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    # Implicit prompt with task token.
    implicit_prompt = {
        "prompt": "<DETAILED_CAPTION>",
        "multi_modal_data": {
            "image": ImageAsset("stop_sign").pil_image
        },
    }

    # Explicit encoder/decoder prompt.
    explicit_prompt = {
        "encoder_prompt": {
            "prompt": "Describe in detail what is shown in the image.",
            "multi_modal_data": {
                "image": ImageAsset("cherry_blossom").pil_image
            },
        },
        "decoder_prompt": "",
    }

    return ModelRequestData(
        engine_args=florence_args,
        prompts=[implicit_prompt, explicit_prompt],
    )
def run_mllama():
    """Build the engine arguments and example prompts for Llama 3.2 Vision.

    Returns:
        A ``ModelRequestData`` holding one implicit prompt and one explicit
        encoder/decoder prompt, both using the "stop_sign" image asset.
    """
    mllama_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    # Implicit prompt.
    implicit_prompt = {
        "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
        "multi_modal_data": {
            "image": ImageAsset("stop_sign").pil_image,
        },
    }

    # Explicit prompt: the image goes to the encoder, the text to the decoder.
    explicit_prompt = {
        "encoder_prompt": {
            "prompt": "<|image|>",
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image,
            },
        },
        "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
    }

    return ModelRequestData(
        engine_args=mllama_args,
        prompts=[implicit_prompt, explicit_prompt],
    )
def run_whisper():
    """Build the engine arguments and example prompts for Whisper.

    Returns:
        A ``ModelRequestData`` holding one implicit prompt and one explicit
        encoder/decoder prompt, each carrying a single audio clip.
    """
    whisper_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    # Test implicit prompt.
    implicit_prompt = {
        "prompt": "<|startoftranscript|>",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    }

    # Test explicit encoder/decoder prompt.
    explicit_prompt = {
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                "audio": AudioAsset("winning_call").audio_and_sample_rate,
            },
        },
        "decoder_prompt": "<|startoftranscript|>",
    }

    return ModelRequestData(
        engine_args=whisper_args,
        prompts=[implicit_prompt, explicit_prompt],
    )
# Maps each supported --model-type value to the builder that returns its
# engine arguments and prompts.
model_example_map = dict(
    florence2=run_florence2,
    mllama=run_mllama,
    whisper=run_whisper,
)
def main(args):
    """Run offline inference for the selected encoder/decoder model.

    Args:
        args: Parsed command-line namespace; ``model_type`` selects the entry
            of ``model_example_map`` and ``seed`` is passed to the engine.

    Raises:
        ValueError: If ``args.model_type`` is not in ``model_example_map``.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    req_data = model_example_map[model]()

    # Add the seed from the command line to the model's engine arguments.
    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    prompts = req_data.prompts

    # Create a sampling params object.
    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=64,
    )

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or made negative) by system clock adjustments as with time.time().
    start = time.perf_counter()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Decoder prompt: {prompt!r}, "
              f"Generated text: {generated_text!r}")

    # The measured interval covers generation and the printing above.
    duration = time.perf_counter() - start

    print("Duration:", duration)
    print("RPS:", len(prompts) / duration)
if __name__ == "__main__":
    # Command-line entry point: choose the model type and an optional seed.
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')

    model_type_options = dict(
        type=str,
        default="mllama",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument('--model-type', '-m', **model_type_options)

    seed_options = dict(
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    parser.add_argument("--seed", **seed_options)

    args = parser.parse_args()
    main(args)
examples/offline_inference/florence2_inference.py
deleted
100644 → 0
View file @
389ebcf7
# SPDX-License-Identifier: Apache-2.0
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2
'''
# TODO(Isotr0py):
# Move to offline_inference/vision_language.py
# after porting vision backbone
from
vllm
import
LLM
,
SamplingParams
dtype = "float"

# Create a Florence-2 encoder/decoder model instance
llm = LLM(
    model="microsoft/Florence-2-base",
    tokenizer="facebook/bart-base",
    dtype=dtype,
    trust_remote_code=True,
)

# One prompt per Florence-2 task token.
prompts = [
    "<CAPTION>",
    "<DETAILED_CAPTION>",
    "<MORE_DETAILED_CAPTION>",
    "<CAPTION_TO_PHRASE_GROUNDING>",
    "<OD>",
    "<DENSE_REGION_CAPTION>",
    "<REGION_PROPOSAL>",
    "<OCR>",
    "<OCR_WITH_REGION>",
]

# Create a sampling params object.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1.0,
    min_tokens=0,
    max_tokens=20,
)

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the encoder prompt, decoder prompt and generated text per request.
for output in outputs:
    encoder_prompt = output.encoder_prompt
    prompt = output.prompt
    generated_text = output.outputs[0].text
    report = (f"Encoder prompt: {encoder_prompt!r}, "
              f"Decoder prompt: {prompt!r}, "
              f"Generated text: {generated_text!r}")
    print(report)
examples/offline_inference/llm_engine_example.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
argparse
from
typing
import
List
,
Tuple
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.utils
import
FlexibleArgumentParser
def
create_test_prompts
()
->
L
ist
[
T
uple
[
str
,
SamplingParams
]]:
def
create_test_prompts
()
->
l
ist
[
t
uple
[
str
,
SamplingParams
]]:
"""Create a list of test prompts with their sampling parameters."""
return
[
(
"A robot may not injure a human being"
,
...
...
@@ -16,7 +15,6 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
SamplingParams
(
temperature
=
0.8
,
top_k
=
5
,
presence_penalty
=
0.2
)),
(
"What is the meaning of life?"
,
SamplingParams
(
n
=
2
,
best_of
=
5
,
temperature
=
0.8
,
top_p
=
0.95
,
frequency_penalty
=
0.1
)),
...
...
@@ -24,7 +22,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
]]):
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
...
...
@@ -34,7 +32,7 @@ def process_requests(engine: LLMEngine,
engine
.
add_request
(
str
(
request_id
),
prompt
,
sampling_params
)
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
...
...
examples/offline_inference/lora_with_quantization_inference.py
View file @
469e903b
...
...
@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access.
"""
import
gc
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
import
torch
from
huggingface_hub
import
snapshot_download
...
...
@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest
def
create_test_prompts
(
lora_path
:
str
)
->
L
ist
[
T
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
)
->
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
return
[
# this is an example of using quantization without LoRA
(
"My name is"
,
...
...
@@ -49,7 +49,7 @@ def create_test_prompts(
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
,
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
...
...
@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine,
lora_request
=
lora_request
)
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
print
(
"----------------------------------------------------"
)
...
...
@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
engine_args
=
EngineArgs
(
model
=
model
,
quantization
=
quantization
,
qlora_adapter_name_or_path
=
lora_repo
,
load_format
=
"bitsandbytes"
,
enable_lora
=
True
,
max_lora_rank
=
64
)
else
:
...
...
examples/offline_inference/
pixtra
l.py
→
examples/offline_inference/
mistral-smal
l.py
View file @
469e903b
...
...
@@ -6,14 +6,16 @@ import argparse
from
vllm
import
LLM
from
vllm.sampling_params
import
SamplingParams
# This script is an offline demo for running
Pix
tral
.
# This script is an offline demo for running
Mis
tral
-Small-3.1
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --tokenizer-mode mistral --config-format mistral --load-format mistral \
# --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
#
# - Client:
...
...
@@ -23,7 +25,7 @@ from vllm.sampling_params import SamplingParams
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
# "model": "mistralai/
Pix
tral-
12B-2409
",
# "model": "mistralai/
Mis
tral-
Small-3.1-24B-Instruct-2503
",
# "messages": [
# {
# "role": "user",
...
...
@@ -43,12 +45,20 @@ from vllm.sampling_params import SamplingParams
# python demo.py advanced
def
run_simple_demo
():
model_name
=
"mistralai/
Pix
tral-
12B-2409
"
def
run_simple_demo
(
args
:
argparse
.
Namespace
):
model_name
=
"mistralai/
Mis
tral-
Small-3.1-24B-Instruct-2503
"
sampling_params
=
SamplingParams
(
max_tokens
=
8192
)
# Lower max_num_seqs or max_model_len on low-VRAM GPUs.
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"mistral"
)
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"mistral"
,
config_format
=
"mistral"
,
load_format
=
"mistral"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
"Describe this image in one sentence."
image_url
=
"https://picsum.photos/id/237/200/300"
...
...
@@ -76,8 +86,8 @@ def run_simple_demo():
print
(
outputs
[
0
].
outputs
[
0
].
text
)
def
run_advanced_demo
():
model_name
=
"mistralai/
Pix
tral-
12B-2409
"
def
run_advanced_demo
(
args
:
argparse
.
Namespace
):
model_name
=
"mistralai/
Mis
tral-
Small-3.1-24B-Instruct-2503
"
max_img_per_msg
=
5
max_tokens_per_img
=
4096
...
...
@@ -85,8 +95,11 @@ def run_advanced_demo():
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"mistral"
,
config_format
=
"mistral"
,
load_format
=
"mistral"
,
limit_mm_per_prompt
=
{
"image"
:
max_img_per_msg
},
max_model_len
=
max_img_per_msg
*
max_tokens_per_img
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
"Describe the following image."
...
...
@@ -153,14 +166,19 @@ def main():
help
=
"Specify the demo mode: 'simple' or 'advanced'"
,
)
parser
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
help
=
'If True, disables caching of multi-modal preprocessor/mapper.'
)
args
=
parser
.
parse_args
()
if
args
.
mode
==
"simple"
:
print
(
"Running simple demo..."
)
run_simple_demo
()
run_simple_demo
(
args
)
elif
args
.
mode
==
"advanced"
:
print
(
"Running advanced demo..."
)
run_advanced_demo
()
run_advanced_demo
(
args
)
if
__name__
==
"__main__"
:
...
...
examples/offline_inference/mlpspeculator.py
View file @
469e903b
...
...
@@ -2,12 +2,11 @@
import
gc
import
time
from
typing
import
List
from
vllm
import
LLM
,
SamplingParams
def
time_generation
(
llm
:
LLM
,
prompts
:
L
ist
[
str
],
def
time_generation
(
llm
:
LLM
,
prompts
:
l
ist
[
str
],
sampling_params
:
SamplingParams
):
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
...
...
@@ -51,7 +50,9 @@ if __name__ == "__main__":
# Create an LLM with spec decoding
llm
=
LLM
(
model
=
"meta-llama/Llama-2-13b-chat-hf"
,
speculative_model
=
"ibm-ai-platform/llama-13b-accelerator"
,
speculative_config
=
{
"model"
:
"ibm-ai-platform/llama-13b-accelerator"
,
},
)
print
(
"With speculation"
)
...
...
examples/offline_inference/multilora_inference.py
View file @
469e903b
...
...
@@ -6,7 +6,7 @@ for offline inference.
Requires HuggingFace credentials for access to Llama2.
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
from
huggingface_hub
import
snapshot_download
...
...
@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest
def
create_test_prompts
(
lora_path
:
str
)
->
L
ist
[
T
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
)
->
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
"""Create a list of test prompts with their sampling parameters.
2 requests for base model, 4 requests for the LoRA. We define 2
...
...
@@ -56,7 +56,7 @@ def create_test_prompts(
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
,
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
...
...
@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine,
lora_request
=
lora_request
)
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
...
...
examples/offline_inference/prithvi_geospatial_mae.py
View file @
469e903b
...
...
@@ -21,7 +21,7 @@ import argparse
import
datetime
import
os
import
re
from
typing
import
List
,
Union
from
typing
import
Union
import
albumentations
import
numpy
as
np
...
...
@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor):
def
load_example
(
file_paths
:
L
ist
[
str
],
mean
:
L
ist
[
float
]
=
None
,
std
:
L
ist
[
float
]
=
None
,
file_paths
:
l
ist
[
str
],
mean
:
l
ist
[
float
]
=
None
,
std
:
l
ist
[
float
]
=
None
,
indices
:
Union
[
list
[
int
],
None
]
=
None
,
):
"""Build an input example by loading images in *file_paths*.
...
...
examples/offline_inference/profiling.py
View file @
469e903b
...
...
@@ -5,8 +5,9 @@ import json
import
os
import
sys
from
argparse
import
RawTextHelpFormatter
from
collections.abc
import
Generator
from
dataclasses
import
asdict
,
dataclass
from
typing
import
Any
,
Dict
,
Generator
,
List
,
Optional
,
TypeAlias
from
typing
import
Any
,
Optional
,
TypeAlias
import
torch
import
tqdm
...
...
@@ -42,8 +43,8 @@ def get_dtype(dtype: str):
return
dtype
OutputLen_NumReqs_Map
:
TypeAlias
=
D
ict
[
int
,
int
]
def
compute_request_output_lengths
(
batch_size
:
int
,
step_requests
:
L
ist
[
int
])
\
OutputLen_NumReqs_Map
:
TypeAlias
=
d
ict
[
int
,
int
]
def
compute_request_output_lengths
(
batch_size
:
int
,
step_requests
:
l
ist
[
int
])
\
->
OutputLen_NumReqs_Map
:
"""
Given the number of requests, batch_size, and the number of requests
...
...
@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
Args:
batch_size (int): Number of requests submitted for profile. This is
args.batch_size.
step_requests (
L
ist[int]): step_requests[i] is the number of requests
step_requests (
l
ist[int]): step_requests[i] is the number of requests
that the ith engine step should process.
Returns:
...
...
@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
return
ol_nr
def
determine_requests_per_step
(
context
:
ProfileContext
)
->
L
ist
[
int
]:
def
determine_requests_per_step
(
context
:
ProfileContext
)
->
l
ist
[
int
]:
"""
Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the
...
...
@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]:
context: ProfileContext object.
Returns:
L
ist[int]: Number of requests to process for all engine-steps.
l
ist[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step
should process.
"""
...
...
@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
for
key
,
value
in
asdict
(
context
).
items
():
print
(
f
"
{
key
}
=
{
value
}
"
)
requests_per_step
:
L
ist
[
int
]
=
determine_requests_per_step
(
context
)
requests_per_step
:
l
ist
[
int
]
=
determine_requests_per_step
(
context
)
ol_nr
:
OutputLen_NumReqs_Map
=
compute_request_output_lengths
(
context
.
batch_size
,
requests_per_step
)
...
...
examples/offline_inference/profiling_tpu/profiling.py
View file @
469e903b
...
...
@@ -4,7 +4,6 @@ import argparse
import
dataclasses
import
os
import
time
from
typing
import
List
import
numpy
as
np
import
torch_xla.debug.profiler
as
xp
...
...
@@ -35,7 +34,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
args
.
batch_size
,
args
.
input_len
))
dummy_prompts
:
L
ist
[
PromptType
]
=
[{
dummy_prompts
:
l
ist
[
PromptType
]
=
[{
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
...
...
examples/offline_inference/reproduciblity.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
os
from
vllm
import
LLM
,
SamplingParams
# For the sake of performance, vLLM does not guarantee reproducible results
# by default. Two settings are needed to get them:

# 1. Turn off multiprocessing to make the scheduling deterministic.
#    NOTE(woosuk): This is not needed and will be ignored for V0.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

# 2. Fix the global seed. The default seed is None, which is not
#    reproducible.
SEED = 42

# NOTE(woosuk): Even with the above two settings, vLLM only provides
# reproducibility when it runs on the same hardware and the same vLLM version.
# Also, the online serving API (`vllm serve`) does not support reproducibility
# because it is almost impossible to make the scheduling deterministic in the
# online serving setting.

llm = LLM(model="facebook/opt-125m", seed=SEED)

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

outputs = llm.generate(prompts, sampling_params)

# Print each prompt together with its first generated completion.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
examples/offline_inference/rlhf.py
View file @
469e903b
...
...
@@ -18,72 +18,11 @@ import ray
import
torch
from
ray.util.placement_group
import
placement_group
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
rlhf_utils
import
stateless_init_process_group
from
transformers
import
AutoModelForCausalLM
from
vllm
import
LLM
,
SamplingParams
from
vllm.utils
import
get_ip
,
get_open_port
from
vllm.worker.worker
import
Worker
def stateless_init_process_group(master_address, master_port, rank, world_size,
                                 device):
    """
    Create a NCCL communicator backed by a vLLM `StatelessProcessGroup`.

    vLLM provides `StatelessProcessGroup` to create a process group
    without considering the global process group in torch.distributed.
    It is recommended to create `StatelessProcessGroup`, and then initialize
    the data-plane communication (NCCL) between external (train processes)
    and vLLM workers.

    The group is created from `master_address`, `master_port`, `rank` and
    `world_size`; the returned `PyNcclCommunicator` is bound to `device`.
    """
    # Imported lazily so that importing this module does not pull in the
    # vLLM distributed stack.
    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
    from vllm.distributed.utils import StatelessProcessGroup

    group_kwargs = {
        "host": master_address,
        "port": master_port,
        "rank": rank,
        "world_size": world_size,
    }
    process_group = StatelessProcessGroup.create(**group_kwargs)
    return PyNcclCommunicator(process_group, device=device)
class MyWorker(Worker):
    """
    The `MyWorker` class inherits from `Worker` to provide custom functions.
    For simplicity, we define the `MyWorker` class in this self-contained
    script. Normally, we should define the `MyWorker` class in a separate
    file and pass the qualified name of the class to the `worker_cls`
    parameter.
    """

    def init_weight_update_group(self, master_address, master_port,
                                 rank_offset, world_size):
        """
        Join the weight-update process group.

        The rank used in the new group is this worker's rank in vLLM's world
        group plus `rank_offset`. The resulting communicator is stored on
        `self.model_update_group`.
        """
        from vllm.distributed.parallel_state import get_world_group
        rank = get_world_group().rank + rank_offset
        self.model_update_group = stateless_init_process_group(
            master_address,
            master_port,
            rank,
            world_size,
            self.device,
        )

    def update_weight(self, name, dtype, shape):
        """
        Receive one weight tensor via broadcast and load it into the model.

        A buffer of the given `shape` and `dtype` is allocated on "cuda",
        filled by a broadcast from rank 0 of `self.model_update_group`, and
        handed to the model's `load_weights` under `name`.
        """
        weight = torch.empty(shape, dtype=dtype, device="cuda")
        self.model_update_group.broadcast(weight,
                                          src=0,
                                          stream=torch.cuda.current_stream())

        self.model_runner.model.load_weights(weights=[(name, weight)])

        # Drop the temporary receive buffer once it has been loaded.
        del weight

    def check_weights_changed(self):
        """
        Check if the weights are updated to 0.

        Returns True only when every named parameter of the model is
        (approximately) all zeros. Evaluation stops at the first parameter
        that is not.
        """
        return all(
            torch.allclose(param, torch.zeros_like(param))
            for _, param in self.model_runner.model.named_parameters())
class
MyLLM
(
LLM
):
...
...
@@ -129,7 +68,7 @@ llm = ray.remote(
)(
MyLLM
).
remote
(
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
,
worker_
cls
=
MyWorker
,
worker_
extension_cls
=
"rlhf_utils.WorkerExtension"
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
"ray"
,
)
...
...
@@ -159,6 +98,7 @@ master_port = get_open_port()
handle
=
llm
.
collective_rpc
.
remote
(
"init_weight_update_group"
,
args
=
(
master_address
,
master_port
,
1
,
3
))
model_update_group
=
stateless_init_process_group
(
master_address
,
master_port
,
0
,
3
,
torch
.
device
(
"cuda:0"
))
ray
.
get
(
handle
)
...
...
examples/offline_inference/rlhf_colocate.py
View file @
469e903b
...
...
@@ -17,40 +17,6 @@ from ray.util.placement_group import placement_group
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
vllm
import
LLM
from
vllm.worker.worker
import
Worker
class MyWorker(Worker):
    """
    Worker subclass adding the custom functions used by the colocate example:
    reporting the device UUID, loading weights from IPC handles, and checking
    that the weights were zeroed.
    """

    def report_device_id(self) -> str:
        """
        Look up, cache and return the UUID of this worker's device.

        The value is stored on `self.device_uuid`, which
        `update_weights_from_ipc_handles` reads later.
        """
        from vllm.platforms import current_platform
        self.device_uuid = current_platform.get_device_uuid(self.device.index)
        return self.device_uuid

    def update_weights_from_ipc_handles(self, ipc_handles):
        """
        Rebuild tensors from IPC handles and load them into the model.

        `ipc_handles` is indexed by device UUID; the entry for this worker
        maps each weight name to a `(func, args)` pair, and `func(*args)`
        yields the tensor.
        NOTE(review): the handles are presumably produced by the training
        process sharing CUDA tensors -- confirm against the caller.
        """
        handles = ipc_handles[self.device_uuid]
        device_id = self.device.index
        weights = []
        for name, (func, args) in handles.items():
            list_args = list(args)
            # the key is to change device id to the current device id
            # in case two processes have different CUDA_VISIBLE_DEVICES
            list_args[6] = device_id
            weights.append((name, func(*list_args)))
        self.model_runner.model.load_weights(weights=weights)
        torch.cuda.synchronize()

    def check_weights_changed(self):
        """
        Check if the weights are updated to 0.

        Returns True only when every named parameter of the model is
        (approximately) all zeros. Evaluation stops at the first parameter
        that is not.
        """
        return all(
            torch.allclose(param, torch.zeros_like(param))
            for _, param in self.model_runner.model.named_parameters())
class
MyLLM
(
LLM
):
...
...
@@ -150,7 +116,7 @@ for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]):
)(
MyLLM
).
remote
(
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
,
worker_
cls
=
MyWorker
,
worker_
extension_cls
=
"rlhf_utils.ColocateWorkerExtension"
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
"ray"
,
gpu_memory_utilization
=
0.4
,
...
...
examples/offline_inference/rlhf_utils.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
torch
def stateless_init_process_group(master_address, master_port, rank, world_size,
                                 device):
    """
    Create a NCCL communicator backed by a vLLM `StatelessProcessGroup`.

    vLLM provides `StatelessProcessGroup` to create a process group
    without considering the global process group in torch.distributed.
    It is recommended to create `StatelessProcessGroup`, and then initialize
    the data-plane communication (NCCL) between external (train processes)
    and vLLM workers.

    The group is created from `master_address`, `master_port`, `rank` and
    `world_size`; the returned `PyNcclCommunicator` is bound to `device`.
    """
    # Imported lazily so that importing this module does not pull in the
    # vLLM distributed stack.
    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
    from vllm.distributed.utils import StatelessProcessGroup

    group_kwargs = {
        "host": master_address,
        "port": master_port,
        "rank": rank,
        "world_size": world_size,
    }
    process_group = StatelessProcessGroup.create(**group_kwargs)
    return PyNcclCommunicator(process_group, device=device)
class WorkerExtension:
    """
    The class for vLLM's worker to inherit from.
    By defining an extension class, the code can work no matter what is
    the underlying worker class. This way, the code can be compatible
    with both vLLM V0 and V1.
    NOTE: we define this class in a separate module, and the main module
    should pass the full qualified name as `worker_extension_cls` argument.
    """

    def init_weight_update_group(self, master_address, master_port,
                                 rank_offset, world_size):
        """
        Join the weight-update process group.

        The rank used in the new group is this worker's rank in vLLM's world
        group plus `rank_offset`. The resulting communicator is stored on
        `self.model_update_group`.
        """
        from vllm.distributed.parallel_state import get_world_group
        rank = get_world_group().rank + rank_offset
        self.model_update_group = stateless_init_process_group(
            master_address,
            master_port,
            rank,
            world_size,
            self.device,
        )

    def update_weight(self, name, dtype, shape):
        """
        Receive one weight tensor via broadcast and load it into the model.

        A buffer of the given `shape` and `dtype` is allocated on "cuda",
        filled by a broadcast from rank 0 of `self.model_update_group`, and
        handed to the model's `load_weights` under `name`.
        """
        weight = torch.empty(shape, dtype=dtype, device="cuda")
        self.model_update_group.broadcast(weight,
                                          src=0,
                                          stream=torch.cuda.current_stream())

        self.model_runner.model.load_weights(weights=[(name, weight)])

        # Drop the temporary receive buffer once it has been loaded.
        del weight

    def check_weights_changed(self):
        """
        Check if the weights are updated to 0.

        Returns True only when every named parameter of the model is
        (approximately) all zeros. Evaluation stops at the first parameter
        that is not.
        """
        return all(
            torch.allclose(param, torch.zeros_like(param))
            for _, param in self.model_runner.model.named_parameters())
class ColocateWorkerExtension:
    """
    The class for vLLM's worker to inherit from, in the colocate setting.
    By defining an extension class, the code can work no matter what is
    the underlying worker class. This way, the code can be compatible
    with both vLLM V0 and V1.
    NOTE: we define this class in a separate module, and the main module
    should pass the full qualified name as `worker_extension_cls` argument.
    """

    def report_device_id(self) -> str:
        """
        Look up, cache and return the UUID of this worker's device.

        The value is stored on `self.device_uuid`, which
        `update_weights_from_ipc_handles` reads later.
        """
        from vllm.platforms import current_platform
        self.device_uuid = current_platform.get_device_uuid(self.device.index)
        return self.device_uuid

    def update_weights_from_ipc_handles(self, ipc_handles):
        """
        Rebuild tensors from IPC handles and load them into the model.

        `ipc_handles` is indexed by device UUID; the entry for this worker
        maps each weight name to a `(func, args)` pair, and `func(*args)`
        yields the tensor.
        NOTE(review): the handles are presumably produced by the training
        process sharing CUDA tensors -- confirm against the caller.
        """
        handles = ipc_handles[self.device_uuid]
        device_id = self.device.index
        weights = []
        for name, (func, args) in handles.items():
            list_args = list(args)
            # the key is to change device id to the current device id
            # in case two processes have different CUDA_VISIBLE_DEVICES
            list_args[6] = device_id
            weights.append((name, func(*list_args)))
        self.model_runner.model.load_weights(weights=weights)
        torch.cuda.synchronize()

    def check_weights_changed(self):
        """
        Check if the weights are updated to 0.

        Returns True only when every named parameter of the model is
        (approximately) all zeros. Evaluation stops at the first parameter
        that is not.
        """
        return all(
            torch.allclose(param, torch.zeros_like(param))
            for _, param in self.model_runner.model.named_parameters())
examples/offline_inference/tpu.py
View file @
469e903b
...
...
@@ -21,7 +21,9 @@ sampling_params = SamplingParams(temperature=0.7,
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
llm
=
LLM
(
model
=
"google/gemma-2b"
,
enforce_eager
=
True
)
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
4
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
,
answer
in
zip
(
outputs
,
answers
):
prompt
=
output
.
prompt
...
...
examples/offline_inference/vision_language.py
View file @
469e903b
...
...
@@ -6,122 +6,219 @@ the correct prompt format on vision language models for text generation.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
import
os
import
random
from
dataclasses
import
asdict
from
typing
import
NamedTuple
,
Optional
from
huggingface_hub
import
snapshot_download
from
transformers
import
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
EngineArgs
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.lora.request
import
LoRARequest
from
vllm.utils
import
FlexibleArgumentParser
class ModelRequestData(NamedTuple):
    """Engine arguments plus request inputs returned by a `run_*` helper."""
    # Engine construction arguments for the selected model.
    engine_args: EngineArgs
    # Formatted prompt strings; the helpers build one per input question.
    prompts: list[str]
    # Optional stop token ids for the model; None when the helper sets none.
    stop_token_ids: Optional[list[int]] = None
    # Optional LoRA requests (e.g. the "vision" adapter in `run_phi4mm`).
    lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# Aria
def
run_aria
(
question
:
str
,
modality
:
str
):
def
run_aria
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"rhymes-ai/Aria"
# NOTE: Need L40 (or equivalent) to avoid OOM
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
dtype
=
"bfloat16"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
dtype
=
"bfloat16"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
(
f
"<|im_start|>user
\n
<fim_prefix><|img|><fim_suffix>
{
question
}
"
"<|im_end|>
\n
<|im_start|>assistant
\n
"
)
prompts
=
[(
f
"<|im_start|>user
\n
<fim_prefix><|img|><fim_suffix>
{
question
}
"
"<|im_end|>
\n
<|im_start|>assistant
\n
"
)
for
question
in
questions
]
stop_token_ids
=
[
93532
,
93653
,
944
,
93421
,
1019
,
93653
,
93519
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# BLIP-2
def
run_blip2
(
question
:
str
,
modality
:
str
):
def
run_blip2
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompt
=
f
"Question:
{
question
}
Answer:"
llm
=
LLM
(
model
=
"Salesforce/blip2-opt-2.7b"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
f
"Question:
{
question
}
Answer:"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"Salesforce/blip2-opt-2.7b"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Chameleon
def
run_chameleon
(
question
:
str
,
modality
:
str
):
def
run_chameleon
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
prompt
=
f
"
{
question
}
<image>"
llm
=
LLM
(
model
=
"facebook/chameleon-7b"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
f
"
{
question
}
<image>"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"facebook/chameleon-7b"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Deepseek-VL2
def
run_deepseek_vl2
(
question
:
str
,
modality
:
str
):
def
run_deepseek_vl2
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"deepseek-ai/deepseek-vl2-tiny"
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]})
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]},
)
prompts
=
[
f
"<|User|>: <image>
\n
{
question
}
\n\n
<|Assistant|>:"
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Florence2
def
run_florence2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
engine_args
=
EngineArgs
(
model
=
"microsoft/Florence-2-large"
,
tokenizer
=
"facebook/bart-large"
,
max_num_seqs
=
8
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompts
=
[
"<MORE_DETAILED_CAPTION>"
for
_
in
questions
]
prompt
=
f
"<|User|>: <image>
\n
{
question
}
\n\n
<|Assistant|>:"
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Fuyu
def
run_fuyu
(
question
:
str
,
modality
:
str
):
def
run_fuyu
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
prompts
=
[
f
"
{
question
}
\n
"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"adept/fuyu-8b"
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Gemma 3
def
run_gemma3
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"google/gemma-3-4b-it"
prompt
=
f
"
{
question
}
\n
"
llm
=
LLM
(
model
=
"adept/fuyu-8b"
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
mm_processor_kwargs
=
{
"do_pan_and_scan"
:
True
},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompts
=
[(
"<bos><start_of_turn>user
\n
"
f
"<start_of_image>
{
question
}
<end_of_turn>
\n
"
"<start_of_turn>model
\n
"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# GLM-4v
def
run_glm4v
(
question
:
str
,
modality
:
str
):
def
run_glm4v
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"THUDM/glm-4v-9b"
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
enforce_eager
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
enforce_eager
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
f
"<|user|>
\n
<|begin_of_image|><|endoftext|><|end_of_image|>
\
{
question
}
<|assistant|>"
prompts
=
[
f
"<|user|>
\n
<|begin_of_image|><|endoftext|><|end_of_image|>
\
{
question
}
<|assistant|>"
for
question
in
questions
]
stop_token_ids
=
[
151329
,
151336
,
151338
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# H2OVL-Mississippi
def
run_h2ovl
(
question
:
str
,
modality
:
str
):
def
run_h2ovl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"h2oai/h2ovl-mississippi-800m"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
...
...
@@ -130,23 +227,31 @@ def run_h2ovl(question: str, modality: str):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
messages
=
[[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
# Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids
=
[
tokenizer
.
eos_token_id
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# Idefics3-8B-Llama3
def
run_idefics3
(
question
:
str
,
modality
:
str
):
def
run_idefics3
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"HuggingFaceM4/Idefics3-8B-Llama3"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
...
...
@@ -160,20 +265,23 @@ def run_idefics3(question: str, modality: str):
},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
(
prompt
s
=
[
(
f
"<|begin_of_text|>User:<image>
{
question
}
<end_of_utterance>
\n
Assistant:"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
# InternVL
def
run_internvl
(
question
:
str
,
modality
:
str
):
def
run_internvl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"OpenGVLab/InternVL2-2B"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
...
...
@@ -182,10 +290,13 @@ def run_internvl(question: str, modality: str):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
messages
=
[[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
# Stop tokens for InternVL
# model variants may have different stop tokens
...
...
@@ -193,84 +304,127 @@ def run_internvl(question: str, modality: str):
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens
=
[
"<|endoftext|>"
,
"<|im_start|>"
,
"<|im_end|>"
,
"<|end|>"
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# LLaVA-1.5
def
run_llava
(
question
:
str
,
modality
:
str
):
def
run_llava
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
prompt
=
f
"USER: <image>
\n
{
question
}
\n
ASSISTANT:"
prompts
=
[
f
"USER: <image>
\n
{
question
}
\n
ASSISTANT:"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
max_model_len
=
4096
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
llm
=
LLM
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
max_model_len
=
4096
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# LLaVA-1.6/LLaVA-NeXT
def
run_llava_next
(
question
:
str
,
modality
:
str
):
def
run_llava_next
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
prompt
=
f
"[INST] <image>
\n
{
question
}
[/INST]"
llm
=
LLM
(
model
=
"llava-hf/llava-v1.6-mistral-7b-hf"
,
max_model_len
=
8192
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
f
"[INST] <image>
\n
{
question
}
[/INST]"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"llava-hf/llava-v1.6-mistral-7b-hf"
,
max_model_len
=
8192
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# LLaVA-NeXT-Video
# Currently only supports video input
def
run_llava_next_video
(
question
:
str
,
modality
:
str
):
def
run_llava_next_video
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"video"
prompt
=
f
"USER: <video>
\n
{
question
}
ASSISTANT:"
llm
=
LLM
(
model
=
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
max_model_len
=
8192
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
f
"USER: <video>
\n
{
question
}
ASSISTANT:"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
max_model_len
=
8192
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# LLaVA-OneVision
def
run_llava_onevision
(
question
:
str
,
modality
:
str
):
def
run_llava_onevision
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
if
modality
==
"video"
:
prompt
=
f
"<|im_start|>user <video>
\n
{
question
}
<|im_end|>
\
<|im_start|>assistant
\n
"
prompts
=
[
f
"<|im_start|>user <video>
\n
{
question
}
<|im_end|>
\
<|im_start|>assistant
\n
"
for
question
in
questions
]
elif
modality
==
"image"
:
prompt
=
f
"<|im_start|>user <image>
\n
{
question
}
<|im_end|>
\
<|im_start|>assistant
\n
"
prompts
=
[
f
"<|im_start|>user <image>
\n
{
question
}
<|im_end|>
\
<|im_start|>assistant
\n
"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"llava-hf/llava-onevision-qwen2-7b-ov-hf"
,
max_model_len
=
16384
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
llm
=
LLM
(
model
=
"llava-hf/llava-onevision-qwen2-7b-ov-hf"
,
max_model_len
=
16384
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Mantis
def
run_mantis
(
question
:
str
,
modality
:
str
):
def
run_mantis
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
llama3_template
=
'<|start_header_id|>user<|end_header_id|>
\n\n
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
'
# noqa: E501
prompt
=
llama3_template
.
format
(
f
"
{
question
}
\n
<image>"
)
prompts
=
[
llama3_template
.
format
(
f
"
{
question
}
\n
<image>"
)
for
question
in
questions
]
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
max_model_len
=
4096
,
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
stop_token_ids
=
[
128009
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# MiniCPM-V
def
run_minicpmv_base
(
question
:
str
,
modality
:
str
,
model_name
):
def
run_minicpmv_base
(
question
s
:
list
[
str
]
,
modality
:
str
,
model_name
):
assert
modality
in
[
"image"
,
"video"
]
# If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa
...
...
@@ -294,7 +448,7 @@ def run_minicpmv_base(question: str, modality: str, model_name):
# model_name = "openbmb/MiniCPM-o-2_6"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
...
...
@@ -317,26 +471,33 @@ def run_minicpmv_base(question: str, modality: str, model_name):
"video"
:
"(<video>./</video>)"
,
}
messages
=
[{
'role'
:
'user'
,
'content'
:
f
'
{
modality_placeholder
[
modality
]
}
\n
{
question
}
'
}]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
tokenizer
.
apply_chat_template
(
[{
'role'
:
'user'
,
'content'
:
f
"
{
modality_placeholder
[
modality
]
}
\n
{
question
}
"
}],
tokenize
=
False
,
add_generation_prompt
=
True
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
def
run_minicpmo
(
question
:
str
,
modality
:
str
):
return
run_minicpmv_base
(
question
,
modality
,
"openbmb/MiniCPM-o-2_6"
)
def
run_minicpmo
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
return
run_minicpmv_base
(
question
s
,
modality
,
"openbmb/MiniCPM-o-2_6"
)
def
run_minicpmv
(
question
:
str
,
modality
:
str
):
return
run_minicpmv_base
(
question
,
modality
,
"openbmb/MiniCPM-V-2_6"
)
def
run_minicpmv
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
return
run_minicpmv_base
(
question
s
,
modality
,
"openbmb/MiniCPM-V-2_6"
)
# Llama 3.2
def
run_mllama
(
question
:
str
,
modality
:
str
):
def
run_mllama
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"meta-llama/Llama-3.2-11B-Vision-Instruct"
...
...
@@ -346,7 +507,7 @@ def run_mllama(question: str, modality: str):
# You may lower either to run this example on lower-end GPUs.
# The configuration below has been confirmed to launch on a single L40 GPU.
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
16
,
...
...
@@ -354,49 +515,58 @@ def run_mllama(question: str, modality: str):
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
messages
=
[{
messages
=
[
[{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"image"
},
{
"type"
:
"text"
,
"text"
:
f
"
{
question
}
"
"text"
:
question
}]
}]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
add_generation_prompt
=
True
,
tokenize
=
False
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
add_generation_prompt
=
True
,
tokenize
=
False
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Molmo
def
run_molmo
(
question
,
modality
)
:
def
run_molmo
(
question
s
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"allenai/Molmo-7B-D-0924"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
question
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
f
"<|im_start|>user <image>
\n
{
question
}
<|im_end|>
\
<|im_start|>assistant
\n
"
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# NVLM-D
def
run_nvlm_d
(
question
:
str
,
modality
:
str
):
def
run_nvlm_d
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
...
...
@@ -406,43 +576,60 @@ def run_nvlm_d(question: str, modality: str):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
messages
=
[[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# PaliGemma
def
run_paligemma
(
question
:
str
,
modality
:
str
):
def
run_paligemma
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
# PaliGemma has special prompt format for VQA
prompt
=
"caption en"
llm
=
LLM
(
model
=
"google/paligemma-3b-mix-224"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
"caption en"
for
_
in
questions
]
engine_args
=
EngineArgs
(
model
=
"google/paligemma-3b-mix-224"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# PaliGemma 2
def
run_paligemma2
(
question
:
str
,
modality
:
str
):
def
run_paligemma2
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
# PaliGemma 2 has special prompt format for VQA
prompt
=
"caption en"
llm
=
LLM
(
model
=
"google/paligemma2-3b-ft-docci-448"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
"caption en"
for
_
in
questions
]
engine_args
=
EngineArgs
(
model
=
"google/paligemma2-3b-ft-docci-448"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Phi-3-Vision
def
run_phi3v
(
question
:
str
,
modality
:
str
):
def
run_phi3v
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
prompt
=
f
"<|user|>
\n
<|image_1|>
\n
{
question
}
<|end|>
\n
<|assistant|>
\n
"
prompts
=
[
f
"<|user|>
\n
<|image_1|>
\n
{
question
}
<|end|>
\n
<|assistant|>
\n
"
for
question
in
questions
]
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
...
...
@@ -456,7 +643,7 @@ def run_phi3v(question: str, modality: str):
#
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
...
...
@@ -465,34 +652,71 @@ def run_phi3v(question: str, modality: str):
mm_processor_kwargs
=
{
"num_crops"
:
16
},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Phi-4-multimodal-instruct
def
run_phi4mm
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process image inputs.
"""
assert
modality
==
"image"
model_path
=
snapshot_download
(
"microsoft/Phi-4-multimodal-instruct"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path
=
os
.
path
.
join
(
model_path
,
"vision-lora"
)
prompts
=
[
f
"<|user|><|image_1|>
{
question
}
<|end|><|assistant|>"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
model_path
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
enable_lora
=
True
,
max_lora_rank
=
320
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
lora_requests
=
[
LoRARequest
(
"vision"
,
1
,
vision_lora_path
)],
)
# Pixtral HF-format
def
run_pixtral_hf
(
question
:
str
,
modality
:
str
):
def
run_pixtral_hf
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"mistral-community/pixtral-12b"
# NOTE: Need L40 (or equivalent) to avoid OOM
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
f
"<s>[INST]
{
question
}
\n
[IMG][/INST]"
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
f
"<s>[INST]
{
question
}
\n
[IMG][/INST]"
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Qwen
def
run_qwen_vl
(
question
:
str
,
modality
:
str
):
def
run_qwen_vl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"Qwen/Qwen-VL"
,
trust_remote_code
=
True
,
max_model_len
=
1024
,
...
...
@@ -501,17 +725,20 @@ def run_qwen_vl(question: str, modality: str):
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
f
"
{
question
}
Picture 1: <img></img>
\n
"
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
f
"
{
question
}
Picture 1: <img></img>
\n
"
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Qwen2-VL
def
run_qwen2_vl
(
question
:
str
,
modality
:
str
):
def
run_qwen2_vl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Qwen/Qwen2-VL-7B-Instruct"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
...
...
@@ -528,20 +755,25 @@ def run_qwen2_vl(question: str, modality: str):
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
prompt
=
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Qwen2.5-VL
def
run_qwen2_5_vl
(
question
:
str
,
modality
:
str
):
def
run_qwen2_5_vl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Qwen/Qwen2.5-VL-3B-Instruct"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
...
...
@@ -558,12 +790,17 @@ def run_qwen2_5_vl(question: str, modality: str):
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
prompt
=
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
prompts
=
[
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
model_example_map
=
{
...
...
@@ -571,7 +808,9 @@ model_example_map = {
"blip-2"
:
run_blip2
,
"chameleon"
:
run_chameleon
,
"deepseek_vl_v2"
:
run_deepseek_vl2
,
"florence2"
:
run_florence2
,
"fuyu"
:
run_fuyu
,
"gemma3"
:
run_gemma3
,
"glm4v"
:
run_glm4v
,
"h2ovl_chat"
:
run_h2ovl
,
"idefics3"
:
run_idefics3
,
...
...
@@ -589,6 +828,7 @@ model_example_map = {
"paligemma"
:
run_paligemma
,
"paligemma2"
:
run_paligemma2
,
"phi3_v"
:
run_phi3v
,
"phi4_mm"
:
run_phi4mm
,
"pixtral_hf"
:
run_pixtral_hf
,
"qwen_vl"
:
run_qwen_vl
,
"qwen2_vl"
:
run_qwen2_vl
,
...
...
@@ -607,29 +847,35 @@ def get_multi_modal_input(args):
# Input image and question
image
=
ImageAsset
(
"cherry_blossom"
)
\
.
pil_image
.
convert
(
"RGB"
)
img_question
=
"What is the content of this image?"
img_questions
=
[
"What is the content of this image?"
,
"Describe the content of this image in detail."
,
"What's in the image?"
,
"Where is this image taken?"
,
]
return
{
"data"
:
image
,
"question"
:
img_question
,
"question
s
"
:
img_question
s
,
}
if
args
.
modality
==
"video"
:
# Input video and question
video
=
VideoAsset
(
name
=
"sample_demo_1.mp4"
,
num_frames
=
args
.
num_frames
).
np_ndarrays
vid_question
=
"Why is this video funny?"
vid_question
s
=
[
"Why is this video funny?"
]
return
{
"data"
:
video
,
"question"
:
vid_question
,
"question
s
"
:
vid_question
s
,
}
msg
=
f
"Modality
{
args
.
modality
}
is not supported."
raise
ValueError
(
msg
)
def
apply_image_repeat
(
image_repeat_prob
,
num_prompts
,
data
,
prompt
,
modality
):
def
apply_image_repeat
(
image_repeat_prob
,
num_prompts
,
data
,
prompts
:
list
[
str
],
modality
):
"""Repeats images with provided probability of "image_repeat_prob".
Used to simulate hit/miss for the MM preprocessor cache.
"""
...
...
@@ -649,7 +895,7 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, prompt, modality):
cur_image
.
putpixel
((
0
,
0
),
new_val
)
inputs
.
append
({
"prompt"
:
prompt
,
"prompt"
:
prompt
s
[
i
%
len
(
prompts
)]
,
"multi_modal_data"
:
{
modality
:
cur_image
}
...
...
@@ -666,41 +912,55 @@ def main(args):
modality
=
args
.
modality
mm_input
=
get_multi_modal_input
(
args
)
data
=
mm_input
[
"data"
]
question
=
mm_input
[
"question"
]
question
s
=
mm_input
[
"question
s
"
]
llm
,
prompt
,
stop_token_ids
=
model_example_map
[
model
](
question
,
modality
)
req_data
=
model_example_map
[
model
](
questions
,
modality
)
engine_args
=
asdict
(
req_data
.
engine_args
)
|
{
"seed"
:
args
.
seed
}
llm
=
LLM
(
**
engine_args
)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if
req_data
.
lora_requests
:
for
lora_request
in
req_data
.
lora_requests
:
llm
.
llm_engine
.
add_lora
(
lora_request
=
lora_request
)
# Don't want to check the flag multiple times, so just hijack `prompts`.
prompts
=
req_data
.
prompts
if
args
.
use_different_prompt_per_request
else
[
req_data
.
prompts
[
0
]
]
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params
=
SamplingParams
(
temperature
=
0.2
,
max_tokens
=
64
,
stop_token_ids
=
stop_token_ids
)
stop_token_ids
=
req_data
.
stop_token_ids
)
assert
args
.
num_prompts
>
0
if
args
.
num_prompts
==
1
:
# Single inference
inputs
=
{
"prompt"
:
prompt
,
"prompt"
:
prompt
s
[
0
]
,
"multi_modal_data"
:
{
modality
:
data
},
}
else
:
# Batch inference
if
args
.
image_repeat_prob
is
not
None
:
# Repeat images with specified probability of "image_repeat_prob"
inputs
=
apply_image_repeat
(
args
.
image_repeat_prob
,
args
.
num_prompts
,
data
,
prompt
,
args
.
num_prompts
,
data
,
prompt
s
,
modality
)
else
:
# Use the same image for all prompts
inputs
=
[{
"prompt"
:
prompt
,
"prompt"
:
prompt
s
[
i
%
len
(
prompts
)]
,
"multi_modal_data"
:
{
modality
:
data
},
}
for
_
in
range
(
args
.
num_prompts
)]
}
for
i
in
range
(
args
.
num_prompts
)]
if
args
.
time_generate
:
import
time
...
...
@@ -740,6 +1000,10 @@ if __name__ == "__main__":
type
=
int
,
default
=
16
,
help
=
'Number of frames to extract from the video.'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
parser
.
add_argument
(
'--image-repeat-prob'
,
...
...
@@ -758,5 +1022,11 @@ if __name__ == "__main__":
action
=
'store_true'
,
help
=
'If True, then print the total generate() call time'
)
parser
.
add_argument
(
'--use-different-prompt-per-request'
,
action
=
'store_true'
,
help
=
'If True, then use different prompt (with the same multi-modal '
'data) for each request.'
)
args
=
parser
.
parse_args
()
main
(
args
)
examples/offline_inference/vision_language_embedding.py
View file @
469e903b
...
...
@@ -7,11 +7,12 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from
argparse
import
Namespace
from
dataclasses
import
asdict
from
typing
import
Literal
,
NamedTuple
,
Optional
,
TypedDict
,
Union
,
get_args
from
PIL.Image
import
Image
from
vllm
import
LLM
from
vllm
import
LLM
,
EngineArgs
from
vllm.multimodal.utils
import
fetch_image
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -37,12 +38,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]
class
ModelRequestData
(
NamedTuple
):
llm
:
LLM
engine_args
:
EngineArgs
prompt
:
str
image
:
Optional
[
Image
]
def
run_e5_v
(
query
:
Query
):
def
run_e5_v
(
query
:
Query
)
->
ModelRequestData
:
llama3_template
=
'<|start_header_id|>user<|end_header_id|>
\n\n
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
\n
'
# noqa: E501
if
query
[
"modality"
]
==
"text"
:
...
...
@@ -58,20 +59,20 @@ def run_e5_v(query: Query):
modality
=
query
[
'modality'
]
raise
ValueError
(
f
"Unsupported query modality: '
{
modality
}
'"
)
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"royokong/e5-v"
,
task
=
"embed"
,
max_model_len
=
4096
,
)
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
image
=
image
,
)
def
run_vlm2vec
(
query
:
Query
):
def
run_vlm2vec
(
query
:
Query
)
->
ModelRequestData
:
if
query
[
"modality"
]
==
"text"
:
text
=
query
[
"text"
]
prompt
=
f
"Find me an everyday image that matches the given caption:
{
text
}
"
# noqa: E501
...
...
@@ -87,7 +88,7 @@ def run_vlm2vec(query: Query):
modality
=
query
[
'modality'
]
raise
ValueError
(
f
"Unsupported query modality: '
{
modality
}
'"
)
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"TIGER-Lab/VLM2Vec-Full"
,
task
=
"embed"
,
trust_remote_code
=
True
,
...
...
@@ -95,7 +96,7 @@ def run_vlm2vec(query: Query):
)
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
image
=
image
,
)
...
...
@@ -126,15 +127,18 @@ def get_query(modality: QueryModality):
raise
ValueError
(
msg
)
def
run_encode
(
model
:
str
,
modality
:
QueryModality
):
def
run_encode
(
model
:
str
,
modality
:
QueryModality
,
seed
:
Optional
[
int
]
):
query
=
get_query
(
modality
)
req_data
=
model_example_map
[
model
](
query
)
engine_args
=
asdict
(
req_data
.
engine_args
)
|
{
"seed"
:
seed
}
llm
=
LLM
(
**
engine_args
)
mm_data
=
{}
if
req_data
.
image
is
not
None
:
mm_data
[
"image"
]
=
req_data
.
image
outputs
=
req_data
.
llm
.
embed
({
outputs
=
llm
.
embed
({
"prompt"
:
req_data
.
prompt
,
"multi_modal_data"
:
mm_data
,
})
...
...
@@ -144,7 +148,7 @@ def run_encode(model: str, modality: QueryModality):
def
main
(
args
:
Namespace
):
run_encode
(
args
.
model_name
,
args
.
modality
)
run_encode
(
args
.
model_name
,
args
.
modality
,
args
.
seed
)
model_example_map
=
{
...
...
@@ -167,5 +171,10 @@ if __name__ == "__main__":
default
=
"image"
,
choices
=
get_args
(
QueryModality
),
help
=
'Modality of the input.'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
args
=
parser
.
parse_args
()
main
(
args
)
Prev
1
…
8
9
10
11
12
13
14
15
16
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment