Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
469e903b
Commit
469e903b
authored
Mar 28, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-dev
parents
389ebcf7
25f560a6
Changes
535
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1107 additions
and
402 deletions
+1107
-402
examples/offline_inference/disaggregated_prefill_lmcache.py
examples/offline_inference/disaggregated_prefill_lmcache.py
+130
-0
examples/offline_inference/distributed.py
examples/offline_inference/distributed.py
+5
-5
examples/offline_inference/eagle.py
examples/offline_inference/eagle.py
+93
-0
examples/offline_inference/encoder_decoder_multimodal.py
examples/offline_inference/encoder_decoder_multimodal.py
+184
-0
examples/offline_inference/florence2_inference.py
examples/offline_inference/florence2_inference.py
+0
-46
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_example.py
+3
-5
examples/offline_inference/lora_with_quantization_inference.py
...les/offline_inference/lora_with_quantization_inference.py
+4
-5
examples/offline_inference/mistral-small.py
examples/offline_inference/mistral-small.py
+29
-11
examples/offline_inference/mlpspeculator.py
examples/offline_inference/mlpspeculator.py
+4
-3
examples/offline_inference/multilora_inference.py
examples/offline_inference/multilora_inference.py
+4
-4
examples/offline_inference/prithvi_geospatial_mae.py
examples/offline_inference/prithvi_geospatial_mae.py
+4
-4
examples/offline_inference/profiling.py
examples/offline_inference/profiling.py
+8
-7
examples/offline_inference/profiling_tpu/profiling.py
examples/offline_inference/profiling_tpu/profiling.py
+1
-2
examples/offline_inference/reproduciblity.py
examples/offline_inference/reproduciblity.py
+36
-0
examples/offline_inference/rlhf.py
examples/offline_inference/rlhf.py
+3
-63
examples/offline_inference/rlhf_colocate.py
examples/offline_inference/rlhf_colocate.py
+1
-35
examples/offline_inference/rlhf_utils.py
examples/offline_inference/rlhf_utils.py
+105
-0
examples/offline_inference/tpu.py
examples/offline_inference/tpu.py
+3
-1
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+470
-200
examples/offline_inference/vision_language_embedding.py
examples/offline_inference/vision_language_embedding.py
+20
-11
No files found.
Too many changes to show.
To preserve performance only
535 of 535+
files are displayed.
Plain diff
Email patch
examples/offline_inference/disaggregated_prefill_lmcache.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of disaggregated prefilling
with LMCache.
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and launch an additional LMCache server.
KV cache is transferred in the following manner:
vLLM prefill node -> LMCache server -> vLLM decode node.
Note that `pip install lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
"""
import
os
import
subprocess
import
time
from
multiprocessing
import
Event
,
Process
from
lmcache.experimental.cache_engine
import
LMCacheEngineBuilder
from
lmcache.integration.vllm.utils
import
ENGINE_NAME
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
# LMCache-related environment variables.

# The port on which the LMCache server is started.
port = 8100

os.environ.update({
    # Use experimental features in LMCache.
    "LMCACHE_USE_EXPERIMENTAL": "True",
    # LMCache is set to use 256 tokens per chunk.
    "LMCACHE_CHUNK_SIZE": "256",
    # Disable the local CPU backend in LMCache.
    "LMCACHE_LOCAL_CPU": "False",
    # Local CPU memory buffer limit, in GB.
    "LMCACHE_MAX_LOCAL_CPU_SIZE": "5.0",
    # Remote URL of the LMCache server.
    "LMCACHE_REMOTE_URL": f"lm://localhost:{port}",
    # Serializer/deserializer between vllm and the LMCache server;
    # `naive` means the raw bytes of the tensor, without any compression.
    "LMCACHE_REMOTE_SERDE": "naive",
})
def run_prefill(prefill_done, prompts):
    """Run the prefill (KV producer) node and signal when it has finished.

    Args:
        prefill_done: Event-like object; ``set()`` is called on it once
            generation has completed successfully.
        prompts: Prompts passed to ``LLM.generate``.
    """
    # We use GPU 0 for prefill node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # A single output token is requested from the prefill node.
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
    )
    # GPU memory utilization of 0.8 was chosen for a GPU with roughly 40GB
    # of memory. Reduce the value if your GPU has less memory.
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
              kv_transfer_config=ktc,
              max_model_len=8000,
              gpu_memory_utilization=0.8,
              enforce_eager=True)

    try:
        outputs = llm.generate(prompts, sampling_params)
        for output in outputs:
            generated_text = output.outputs[0].text
            print(f"Generated text: {generated_text!r}")
        print("Prefill node is finished.")
        prefill_done.set()
    finally:
        # Clean up lmcache backend, even when generation fails.
        LMCacheEngineBuilder.destroy(ENGINE_NAME)
def run_decode(prefill_done, prompts, timeout=1):
    """Run the decode (KV consumer) node once the prefill node is done.

    Args:
        prefill_done: Event-like object; this function blocks on its
            ``wait()`` before generating.
        prompts: Prompts passed to ``LLM.generate``.
        timeout: Seconds to sleep after ``prefill_done`` is set and before
            generation starts. Despite the name it is a fixed delay, not an
            upper bound on the wait.
    """
    # We use GPU 1 for decode node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
    )
    # GPU memory utilization of 0.8 was chosen for a GPU with roughly 40GB
    # of memory. Reduce the value if your GPU has less memory.
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
              kv_transfer_config=ktc,
              max_model_len=8000,
              gpu_memory_utilization=0.8,
              enforce_eager=True)

    try:
        print("Waiting for prefill node to finish...")
        prefill_done.wait()
        time.sleep(timeout)

        outputs = llm.generate(prompts, sampling_params)
        for output in outputs:
            generated_text = output.outputs[0].text
            print(f"Generated text: {generated_text!r}")
    finally:
        # Clean up lmcache backend, even when waiting or generation fails.
        LMCacheEngineBuilder.destroy(ENGINE_NAME)
def run_lmcache_server(port):
    """Start the LMCache server as a child process on localhost.

    Args:
        port: Port the server should listen on; converted with ``str()``.

    Returns:
        The ``subprocess.Popen`` handle of the server process. The caller is
        responsible for terminating and waiting on it.
    """
    import sys

    # Launch with the interpreter running this script rather than whatever
    # "python" resolves to on PATH, so the child sees the same environment
    # (and the same installed lmcache package) as the parent.
    server_proc = subprocess.Popen([
        sys.executable, "-m", "lmcache.experimental.server", "localhost",
        str(port)
    ])
    return server_proc
if __name__ == "__main__":

    # One prompt, built by repeating a short sentence 1000 times.
    prompts = [
        "Hello, how are you?" * 1000,
    ]

    prefill_done = Event()
    prefill_process = Process(target=run_prefill, args=(prefill_done, prompts))
    decode_process = Process(target=run_decode, args=(prefill_done, prompts))
    lmcache_server_process = run_lmcache_server(port)

    try:
        # Start prefill node
        prefill_process.start()

        # Start decode node
        decode_process.start()

        # The decode node finishing marks the end of the example.
        decode_process.join()
    finally:
        # Clean up the processes, also when startup/join fails or the run is
        # interrupted, so no worker or server process is left behind.
        for worker in (prefill_process, decode_process):
            # is_alive() is False for a process that was never started, which
            # keeps terminate() from being called on it.
            if worker.is_alive():
                worker.terminate()
                worker.join()
        lmcache_server_process.terminate()
        lmcache_server_process.wait()
examples/offline_inference/distributed.py
View file @
469e903b
...
@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster.
...
@@ -6,7 +6,7 @@ distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""
"""
from
typing
import
Any
,
Dict
,
List
from
typing
import
Any
import
numpy
as
np
import
numpy
as
np
import
ray
import
ray
...
@@ -36,13 +36,13 @@ class LLMPredictor:
...
@@ -36,13 +36,13 @@ class LLMPredictor:
self
.
llm
=
LLM
(
model
=
"meta-llama/Llama-2-7b-chat-hf"
,
self
.
llm
=
LLM
(
model
=
"meta-llama/Llama-2-7b-chat-hf"
,
tensor_parallel_size
=
tensor_parallel_size
)
tensor_parallel_size
=
tensor_parallel_size
)
def
__call__
(
self
,
batch
:
D
ict
[
str
,
np
.
ndarray
])
->
D
ict
[
str
,
list
]:
def
__call__
(
self
,
batch
:
d
ict
[
str
,
np
.
ndarray
])
->
d
ict
[
str
,
list
]:
# Generate texts from the prompts.
# Generate texts from the prompts.
# The output is a list of RequestOutput objects that contain the prompt,
# The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information.
# generated text, and other information.
outputs
=
self
.
llm
.
generate
(
batch
[
"text"
],
sampling_params
)
outputs
=
self
.
llm
.
generate
(
batch
[
"text"
],
sampling_params
)
prompt
:
L
ist
[
str
]
=
[]
prompt
:
l
ist
[
str
]
=
[]
generated_text
:
L
ist
[
str
]
=
[]
generated_text
:
l
ist
[
str
]
=
[]
for
output
in
outputs
:
for
output
in
outputs
:
prompt
.
append
(
output
.
prompt
)
prompt
.
append
(
output
.
prompt
)
generated_text
.
append
(
' '
.
join
([
o
.
text
for
o
in
output
.
outputs
]))
generated_text
.
append
(
' '
.
join
([
o
.
text
for
o
in
output
.
outputs
]))
...
@@ -72,7 +72,7 @@ def scheduling_strategy_fn():
...
@@ -72,7 +72,7 @@ def scheduling_strategy_fn():
pg
,
placement_group_capture_child_tasks
=
True
))
pg
,
placement_group_capture_child_tasks
=
True
))
resources_kwarg
:
D
ict
[
str
,
Any
]
=
{}
resources_kwarg
:
d
ict
[
str
,
Any
]
=
{}
if
tensor_parallel_size
==
1
:
if
tensor_parallel_size
==
1
:
# For tensor_parallel_size == 1, we simply set num_gpus=1.
# For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg
[
"num_gpus"
]
=
1
resources_kwarg
[
"num_gpus"
]
=
1
...
...
examples/offline_inference/eagle.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
json
import
os
from
transformers
import
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
# Command-line options for the EAGLE speculative decoding example.
parser = argparse.ArgumentParser()

parser.add_argument(
    "--dataset",
    type=str,
    default="./examples/data/gsm8k.jsonl",
    help="downloaded from the eagle repo "
    "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/",
)
parser.add_argument("--max_num_seqs", type=int, default=8)
parser.add_argument("--num_prompts", type=int, default=80)
parser.add_argument("--num_spec_tokens", type=int, default=2)
parser.add_argument("--tp", type=int, default=1)
parser.add_argument("--draft_tp", type=int, default=1)
parser.add_argument("--enforce_eager", action='store_true')
parser.add_argument("--enable_chunked_prefill", action='store_true')
parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
parser.add_argument("--temp", type=float, default=0)

args = parser.parse_args()

print(args)

model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"

max_model_len = 2048

tokenizer = AutoTokenizer.from_pretrained(model_dir)

if os.path.exists(args.dataset):
    # The dataset is JSON Lines: one object per line, whose first "turns"
    # entry is used as the prompt.
    prompts = []
    with open(args.dataset, encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads fail on them.
            if not line.strip():
                continue
            data = json.loads(line)
            prompts.append(data["turns"][0])
else:
    # Fall back to two built-in prompts when the dataset file is missing.
    prompts = ["The future of AI is", "The president of the United States is"]

prompts = prompts[:args.num_prompts]
num_prompts = len(prompts)

# Render every prompt as a single-turn user message through the chat template.
prompt_ids = [
    tokenizer.apply_chat_template([{
        "role": "user",
        "content": prompt
    }],
                                  add_generation_prompt=True)
    for prompt in prompts
]

llm = LLM(
    model=model_dir,
    trust_remote_code=True,
    tensor_parallel_size=args.tp,
    enable_chunked_prefill=args.enable_chunked_prefill,
    max_num_batched_tokens=args.max_num_batched_tokens,
    enforce_eager=args.enforce_eager,
    max_model_len=max_model_len,
    max_num_seqs=args.max_num_seqs,
    gpu_memory_utilization=0.8,
    speculative_model=eagle_dir,
    num_speculative_tokens=args.num_spec_tokens,
    speculative_draft_tensor_parallel_size=args.draft_tp,
    speculative_max_model_len=max_model_len,
    disable_log_stats=False,
)

sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)

outputs = llm.generate(prompt_token_ids=prompt_ids,
                       sampling_params=sampling_params)

# calculate the average number of accepted tokens per forward pass, +1 is
# to account for the token from the target model that's always going to be
# accepted
acceptance_counts = [0] * (args.num_spec_tokens + 1)
for output in outputs:
    for step, count in enumerate(
            output.metrics.spec_token_acceptance_counts):
        acceptance_counts[step] += count

# Slot 0 is the divisor; it stays 0 when no acceptance counts were recorded
# (for example with an empty prompt list), so guard the division.
if acceptance_counts[0]:
    mean_acceptance_length = sum(acceptance_counts) / acceptance_counts[0]
    # Single-line literal: a backslash continuation inside the string would
    # carry the next line's indentation into the printed text.
    print(f"mean acceptance length: {mean_acceptance_length:.2f}")
else:
    print("mean acceptance length: n/a (no acceptance counts recorded)")
examples/offline_inference/encoder_decoder_multimodal.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import
time
from
collections.abc
import
Sequence
from
dataclasses
import
asdict
from
typing
import
NamedTuple
from
vllm
import
LLM
,
EngineArgs
,
PromptType
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.utils
import
FlexibleArgumentParser
class ModelRequestData(NamedTuple):
    """Bundle returned by each ``run_*`` example builder in this file."""

    # Arguments used to construct the engine for the selected model.
    engine_args: EngineArgs
    # Example prompts to run against that model.
    prompts: Sequence[PromptType]
def run_florence2():
    """Build the engine arguments and two example prompts for Florence-2.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and a list with
        one implicit prompt and one explicit encoder/decoder prompt.
    """
    florence_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    # implicit prompt with task token
    implicit_prompt = {
        "prompt": "<DETAILED_CAPTION>",
        "multi_modal_data": {
            "image": ImageAsset("stop_sign").pil_image
        },
    }

    # explicit encoder/decoder prompt
    explicit_prompt = {
        "encoder_prompt": {
            "prompt": "Describe in detail what is shown in the image.",
            "multi_modal_data": {
                "image": ImageAsset("cherry_blossom").pil_image
            },
        },
        "decoder_prompt": "",
    }

    return ModelRequestData(florence_args, [implicit_prompt, explicit_prompt])
def run_mllama():
    """Build the engine arguments and two example prompts for Mllama.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and a list with
        one implicit prompt and one explicit encoder/decoder prompt.
    """
    mllama_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    # Implicit prompt
    implicit_prompt = {
        "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
        "multi_modal_data": {
            "image": ImageAsset("stop_sign").pil_image,
        },
    }

    # Explicit prompt
    explicit_prompt = {
        "encoder_prompt": {
            "prompt": "<|image|>",
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image,
            },
        },
        "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
    }

    return ModelRequestData(mllama_args, [implicit_prompt, explicit_prompt])
def run_whisper():
    """Build the engine arguments and two example prompts for Whisper.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and a list with
        one implicit prompt and one explicit encoder/decoder prompt.
    """
    whisper_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    # Test implicit prompt
    implicit_prompt = {
        "prompt": "<|startoftranscript|>",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    }

    # Test explicit encoder/decoder prompt
    explicit_prompt = {
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                "audio": AudioAsset("winning_call").audio_and_sample_rate,
            },
        },
        "decoder_prompt": "<|startoftranscript|>",
    }

    return ModelRequestData(whisper_args, [implicit_prompt, explicit_prompt])
# Maps each supported --model-type value to the function that builds its
# engine arguments and example prompts.
model_example_map = dict(
    florence2=run_florence2,
    mllama=run_mllama,
    whisper=run_whisper,
)
def main(args):
    """Run the example selected by ``args.model_type`` and print the results.

    Args:
        args: Parsed arguments; ``model_type`` selects the example and
            ``seed`` is passed to ``LLM`` alongside the example's own engine
            arguments.

    Raises:
        ValueError: If ``args.model_type`` is not a key of
            ``model_example_map``.
    """
    model_type = args.model_type
    if model_type not in model_example_map:
        raise ValueError(f"Model type {model_type} is not supported.")

    request = model_example_map[model_type]()

    # Lay the seed over the example's engine arguments.
    llm_kwargs = {**asdict(request.engine_args), "seed": args.seed}
    llm = LLM(**llm_kwargs)

    prompts = request.prompts

    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0, top_p=1.0, max_tokens=64)

    started_at = time.time()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for result in outputs:
        decoder_prompt = result.prompt
        completion = result.outputs[0].text
        print(f"Decoder prompt: {decoder_prompt!r}, "
              f"Generated text: {completion!r}")

    # The measured span covers generation and the printing above.
    elapsed = time.time() - started_at

    print("Duration:", elapsed)
    print("RPS:", len(prompts) / elapsed)
if __name__ == "__main__":
    # Build the command-line interface, then hand the parsed options to main().
    parser = FlexibleArgumentParser(
        description=('Demo on using vLLM for offline inference with '
                     'vision language models for text generation'),
    )
    parser.add_argument(
        '--model-type',
        '-m',
        type=str,
        default="mllama",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    args = parser.parse_args()
    main(args)
examples/offline_inference/florence2_inference.py
deleted
100644 → 0
View file @
389ebcf7
# SPDX-License-Identifier: Apache-2.0
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2
'''
# TODO(Isotr0py):
# Move to offline_inference/vision_language.py
# after porting vision backbone
from
vllm
import
LLM
,
SamplingParams
dtype = "float"

# Create a Florence-2 encoder/decoder model instance
llm = LLM(
    model="microsoft/Florence-2-base",
    tokenizer="facebook/bart-base",
    dtype=dtype,
    trust_remote_code=True,
)

# Florence-2 task tokens used as prompts.
prompts = [
    "<CAPTION>",
    "<DETAILED_CAPTION>",
    "<MORE_DETAILED_CAPTION>",
    "<CAPTION_TO_PHRASE_GROUNDING>",
    "<OD>",
    "<DENSE_REGION_CAPTION>",
    "<REGION_PROPOSAL>",
    "<OCR>",
    "<OCR_WITH_REGION>",
]

# Create a sampling params object.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1.0,
    min_tokens=0,
    max_tokens=20,
)

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the encoder prompt, decoder prompt and generated text of each output.
for output in outputs:
    prompt = output.prompt
    encoder_prompt = output.encoder_prompt
    generated_text = output.outputs[0].text
    summary = (f"Encoder prompt: {encoder_prompt!r}, "
               f"Decoder prompt: {prompt!r}, "
               f"Generated text: {generated_text!r}")
    print(summary)
examples/offline_inference/llm_engine_example.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
argparse
from
typing
import
List
,
Tuple
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
def
create_test_prompts
()
->
L
ist
[
T
uple
[
str
,
SamplingParams
]]:
def
create_test_prompts
()
->
l
ist
[
t
uple
[
str
,
SamplingParams
]]:
"""Create a list of test prompts with their sampling parameters."""
"""Create a list of test prompts with their sampling parameters."""
return
[
return
[
(
"A robot may not injure a human being"
,
(
"A robot may not injure a human being"
,
...
@@ -16,7 +15,6 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
...
@@ -16,7 +15,6 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
SamplingParams
(
temperature
=
0.8
,
top_k
=
5
,
presence_penalty
=
0.2
)),
SamplingParams
(
temperature
=
0.8
,
top_k
=
5
,
presence_penalty
=
0.2
)),
(
"What is the meaning of life?"
,
(
"What is the meaning of life?"
,
SamplingParams
(
n
=
2
,
SamplingParams
(
n
=
2
,
best_of
=
5
,
temperature
=
0.8
,
temperature
=
0.8
,
top_p
=
0.95
,
top_p
=
0.95
,
frequency_penalty
=
0.1
)),
frequency_penalty
=
0.1
)),
...
@@ -24,7 +22,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
...
@@ -24,7 +22,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
def
process_requests
(
engine
:
LLMEngine
,
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
]]):
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
]]):
"""Continuously process a list of prompts and handle the outputs."""
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
request_id
=
0
...
@@ -34,7 +32,7 @@ def process_requests(engine: LLMEngine,
...
@@ -34,7 +32,7 @@ def process_requests(engine: LLMEngine,
engine
.
add_request
(
str
(
request_id
),
prompt
,
sampling_params
)
engine
.
add_request
(
str
(
request_id
),
prompt
,
sampling_params
)
request_id
+=
1
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
if
request_output
.
finished
:
...
...
examples/offline_inference/lora_with_quantization_inference.py
View file @
469e903b
...
@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access.
...
@@ -7,7 +7,7 @@ Requires HuggingFace credentials for access.
"""
"""
import
gc
import
gc
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
import
torch
import
torch
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
...
@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest
...
@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest
def
create_test_prompts
(
def
create_test_prompts
(
lora_path
:
str
lora_path
:
str
)
->
L
ist
[
T
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
)
->
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
return
[
return
[
# this is an example of using quantization without LoRA
# this is an example of using quantization without LoRA
(
"My name is"
,
(
"My name is"
,
...
@@ -49,7 +49,7 @@ def create_test_prompts(
...
@@ -49,7 +49,7 @@ def create_test_prompts(
def
process_requests
(
engine
:
LLMEngine
,
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
,
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]):
Optional
[
LoRARequest
]]]):
"""Continuously process a list of prompts and handle the outputs."""
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
request_id
=
0
...
@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine,
...
@@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine,
lora_request
=
lora_request
)
lora_request
=
lora_request
)
request_id
+=
1
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
if
request_output
.
finished
:
print
(
"----------------------------------------------------"
)
print
(
"----------------------------------------------------"
)
...
@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
...
@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
quantization
=
quantization
,
quantization
=
quantization
,
qlora_adapter_name_or_path
=
lora_repo
,
qlora_adapter_name_or_path
=
lora_repo
,
load_format
=
"bitsandbytes"
,
enable_lora
=
True
,
enable_lora
=
True
,
max_lora_rank
=
64
)
max_lora_rank
=
64
)
else
:
else
:
...
...
examples/offline_inference/
pixtra
l.py
→
examples/offline_inference/
mistral-smal
l.py
View file @
469e903b
...
@@ -6,14 +6,16 @@ import argparse
...
@@ -6,14 +6,16 @@ import argparse
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
# This script is an offline demo for running
Pix
tral
.
# This script is an offline demo for running
Mis
tral
-Small-3.1
#
#
# If you want to run a server/client setup, please follow this code:
# If you want to run a server/client setup, please follow this code:
#
#
# - Server:
# - Server:
#
#
# ```bash
# ```bash
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --tokenizer-mode mistral --config-format mistral --load-format mistral \
# --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
# ```
#
#
# - Client:
# - Client:
...
@@ -23,7 +25,7 @@ from vllm.sampling_params import SamplingParams
...
@@ -23,7 +25,7 @@ from vllm.sampling_params import SamplingParams
# --header 'Content-Type: application/json' \
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --header 'Authorization: Bearer token' \
# --data '{
# --data '{
# "model": "mistralai/
Pix
tral-
12B-2409
",
# "model": "mistralai/
Mis
tral-
Small-3.1-24B-Instruct-2503
",
# "messages": [
# "messages": [
# {
# {
# "role": "user",
# "role": "user",
...
@@ -43,12 +45,20 @@ from vllm.sampling_params import SamplingParams
...
@@ -43,12 +45,20 @@ from vllm.sampling_params import SamplingParams
# python demo.py advanced
# python demo.py advanced
def
run_simple_demo
():
def
run_simple_demo
(
args
:
argparse
.
Namespace
):
model_name
=
"mistralai/
Pix
tral-
12B-2409
"
model_name
=
"mistralai/
Mis
tral-
Small-3.1-24B-Instruct-2503
"
sampling_params
=
SamplingParams
(
max_tokens
=
8192
)
sampling_params
=
SamplingParams
(
max_tokens
=
8192
)
# Lower max_num_seqs or max_model_len on low-VRAM GPUs.
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"mistral"
)
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"mistral"
,
config_format
=
"mistral"
,
load_format
=
"mistral"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
"Describe this image in one sentence."
prompt
=
"Describe this image in one sentence."
image_url
=
"https://picsum.photos/id/237/200/300"
image_url
=
"https://picsum.photos/id/237/200/300"
...
@@ -76,8 +86,8 @@ def run_simple_demo():
...
@@ -76,8 +86,8 @@ def run_simple_demo():
print
(
outputs
[
0
].
outputs
[
0
].
text
)
print
(
outputs
[
0
].
outputs
[
0
].
text
)
def
run_advanced_demo
():
def
run_advanced_demo
(
args
:
argparse
.
Namespace
):
model_name
=
"mistralai/
Pix
tral-
12B-2409
"
model_name
=
"mistralai/
Mis
tral-
Small-3.1-24B-Instruct-2503
"
max_img_per_msg
=
5
max_img_per_msg
=
5
max_tokens_per_img
=
4096
max_tokens_per_img
=
4096
...
@@ -85,8 +95,11 @@ def run_advanced_demo():
...
@@ -85,8 +95,11 @@ def run_advanced_demo():
llm
=
LLM
(
llm
=
LLM
(
model
=
model_name
,
model
=
model_name
,
tokenizer_mode
=
"mistral"
,
tokenizer_mode
=
"mistral"
,
config_format
=
"mistral"
,
load_format
=
"mistral"
,
limit_mm_per_prompt
=
{
"image"
:
max_img_per_msg
},
limit_mm_per_prompt
=
{
"image"
:
max_img_per_msg
},
max_model_len
=
max_img_per_msg
*
max_tokens_per_img
,
max_model_len
=
max_img_per_msg
*
max_tokens_per_img
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
)
prompt
=
"Describe the following image."
prompt
=
"Describe the following image."
...
@@ -153,14 +166,19 @@ def main():
...
@@ -153,14 +166,19 @@ def main():
help
=
"Specify the demo mode: 'simple' or 'advanced'"
,
help
=
"Specify the demo mode: 'simple' or 'advanced'"
,
)
)
parser
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
help
=
'If True, disables caching of multi-modal preprocessor/mapper.'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
if
args
.
mode
==
"simple"
:
if
args
.
mode
==
"simple"
:
print
(
"Running simple demo..."
)
print
(
"Running simple demo..."
)
run_simple_demo
()
run_simple_demo
(
args
)
elif
args
.
mode
==
"advanced"
:
elif
args
.
mode
==
"advanced"
:
print
(
"Running advanced demo..."
)
print
(
"Running advanced demo..."
)
run_advanced_demo
()
run_advanced_demo
(
args
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
examples/offline_inference/mlpspeculator.py
View file @
469e903b
...
@@ -2,12 +2,11 @@
...
@@ -2,12 +2,11 @@
import
gc
import
gc
import
time
import
time
from
typing
import
List
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
def
time_generation
(
llm
:
LLM
,
prompts
:
L
ist
[
str
],
def
time_generation
(
llm
:
LLM
,
prompts
:
l
ist
[
str
],
sampling_params
:
SamplingParams
):
sampling_params
:
SamplingParams
):
# Generate texts from the prompts. The output is a list of RequestOutput
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
# objects that contain the prompt, generated text, and other information.
...
@@ -51,7 +50,9 @@ if __name__ == "__main__":
...
@@ -51,7 +50,9 @@ if __name__ == "__main__":
# Create an LLM with spec decoding
# Create an LLM with spec decoding
llm
=
LLM
(
llm
=
LLM
(
model
=
"meta-llama/Llama-2-13b-chat-hf"
,
model
=
"meta-llama/Llama-2-13b-chat-hf"
,
speculative_model
=
"ibm-ai-platform/llama-13b-accelerator"
,
speculative_config
=
{
"model"
:
"ibm-ai-platform/llama-13b-accelerator"
,
},
)
)
print
(
"With speculation"
)
print
(
"With speculation"
)
...
...
examples/offline_inference/multilora_inference.py
View file @
469e903b
...
@@ -6,7 +6,7 @@ for offline inference.
...
@@ -6,7 +6,7 @@ for offline inference.
Requires HuggingFace credentials for access to Llama2.
Requires HuggingFace credentials for access to Llama2.
"""
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
...
@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest
...
@@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest
def
create_test_prompts
(
def
create_test_prompts
(
lora_path
:
str
lora_path
:
str
)
->
L
ist
[
T
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
)
->
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]:
"""Create a list of test prompts with their sampling parameters.
"""Create a list of test prompts with their sampling parameters.
2 requests for base model, 4 requests for the LoRA. We define 2
2 requests for base model, 4 requests for the LoRA. We define 2
...
@@ -56,7 +56,7 @@ def create_test_prompts(
...
@@ -56,7 +56,7 @@ def create_test_prompts(
def
process_requests
(
engine
:
LLMEngine
,
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
L
ist
[
T
uple
[
str
,
SamplingParams
,
test_prompts
:
l
ist
[
t
uple
[
str
,
SamplingParams
,
Optional
[
LoRARequest
]]]):
Optional
[
LoRARequest
]]]):
"""Continuously process a list of prompts and handle the outputs."""
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
request_id
=
0
...
@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine,
...
@@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine,
lora_request
=
lora_request
)
lora_request
=
lora_request
)
request_id
+=
1
request_id
+=
1
request_outputs
:
L
ist
[
RequestOutput
]
=
engine
.
step
()
request_outputs
:
l
ist
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
if
request_output
.
finished
:
...
...
examples/offline_inference/prithvi_geospatial_mae.py
View file @
469e903b
...
@@ -21,7 +21,7 @@ import argparse
...
@@ -21,7 +21,7 @@ import argparse
import
datetime
import
datetime
import
os
import
os
import
re
import
re
from
typing
import
List
,
Union
from
typing
import
Union
import
albumentations
import
albumentations
import
numpy
as
np
import
numpy
as
np
...
@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor):
...
@@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor):
def
load_example
(
def
load_example
(
file_paths
:
L
ist
[
str
],
file_paths
:
l
ist
[
str
],
mean
:
L
ist
[
float
]
=
None
,
mean
:
l
ist
[
float
]
=
None
,
std
:
L
ist
[
float
]
=
None
,
std
:
l
ist
[
float
]
=
None
,
indices
:
Union
[
list
[
int
],
None
]
=
None
,
indices
:
Union
[
list
[
int
],
None
]
=
None
,
):
):
"""Build an input example by loading images in *file_paths*.
"""Build an input example by loading images in *file_paths*.
...
...
examples/offline_inference/profiling.py
View file @
469e903b
...
@@ -5,8 +5,9 @@ import json
...
@@ -5,8 +5,9 @@ import json
import
os
import
os
import
sys
import
sys
from
argparse
import
RawTextHelpFormatter
from
argparse
import
RawTextHelpFormatter
from
collections.abc
import
Generator
from
dataclasses
import
asdict
,
dataclass
from
dataclasses
import
asdict
,
dataclass
from
typing
import
Any
,
Dict
,
Generator
,
List
,
Optional
,
TypeAlias
from
typing
import
Any
,
Optional
,
TypeAlias
import
torch
import
torch
import
tqdm
import
tqdm
...
@@ -42,8 +43,8 @@ def get_dtype(dtype: str):
...
@@ -42,8 +43,8 @@ def get_dtype(dtype: str):
return
dtype
return
dtype
OutputLen_NumReqs_Map
:
TypeAlias
=
D
ict
[
int
,
int
]
OutputLen_NumReqs_Map
:
TypeAlias
=
d
ict
[
int
,
int
]
def
compute_request_output_lengths
(
batch_size
:
int
,
step_requests
:
L
ist
[
int
])
\
def
compute_request_output_lengths
(
batch_size
:
int
,
step_requests
:
l
ist
[
int
])
\
->
OutputLen_NumReqs_Map
:
->
OutputLen_NumReqs_Map
:
"""
"""
Given the number of requests, batch_size, and the number of requests
Given the number of requests, batch_size, and the number of requests
...
@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
...
@@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
Args:
Args:
batch_size (int): Number of requests submitted for profile. This is
batch_size (int): Number of requests submitted for profile. This is
args.batch_size.
args.batch_size.
step_requests (
L
ist[int]): step_requests[i] is the number of requests
step_requests (
l
ist[int]): step_requests[i] is the number of requests
that the ith engine step should process.
that the ith engine step should process.
Returns:
Returns:
...
@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
...
@@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
return
ol_nr
return
ol_nr
def
determine_requests_per_step
(
context
:
ProfileContext
)
->
L
ist
[
int
]:
def
determine_requests_per_step
(
context
:
ProfileContext
)
->
l
ist
[
int
]:
"""
"""
Determine number of requests each engine step should process.
Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the
If context.num_steps is set, then all engine steps process the
...
@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]:
...
@@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]:
context: ProfileContext object.
context: ProfileContext object.
Returns:
Returns:
L
ist[int]: Number of requests to process for all engine-steps.
l
ist[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step
output[i], contains the number of requests that the ith step
should process.
should process.
"""
"""
...
@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
...
@@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
for
key
,
value
in
asdict
(
context
).
items
():
for
key
,
value
in
asdict
(
context
).
items
():
print
(
f
"
{
key
}
=
{
value
}
"
)
print
(
f
"
{
key
}
=
{
value
}
"
)
requests_per_step
:
L
ist
[
int
]
=
determine_requests_per_step
(
context
)
requests_per_step
:
l
ist
[
int
]
=
determine_requests_per_step
(
context
)
ol_nr
:
OutputLen_NumReqs_Map
=
compute_request_output_lengths
(
ol_nr
:
OutputLen_NumReqs_Map
=
compute_request_output_lengths
(
context
.
batch_size
,
requests_per_step
)
context
.
batch_size
,
requests_per_step
)
...
...
examples/offline_inference/profiling_tpu/profiling.py
View file @
469e903b
...
@@ -4,7 +4,6 @@ import argparse
...
@@ -4,7 +4,6 @@ import argparse
import
dataclasses
import
dataclasses
import
os
import
os
import
time
import
time
from
typing
import
List
import
numpy
as
np
import
numpy
as
np
import
torch_xla.debug.profiler
as
xp
import
torch_xla.debug.profiler
as
xp
...
@@ -35,7 +34,7 @@ def main(args: argparse.Namespace):
...
@@ -35,7 +34,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
args
.
batch_size
,
size
=
(
args
.
batch_size
,
args
.
input_len
))
args
.
input_len
))
dummy_prompts
:
L
ist
[
PromptType
]
=
[{
dummy_prompts
:
l
ist
[
PromptType
]
=
[{
"prompt_token_ids"
:
batch
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
...
...
examples/offline_inference/reproduciblity.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
os
from
vllm
import
LLM
,
SamplingParams
# vLLM does not guarantee the reproducibility of the results by default,
# for the sake of performance. You need to do the following to achieve
# reproducible results:
# 1. Turn off multiprocessing to make the scheduling deterministic.
# NOTE(woosuk): This is not needed and will be ignored for V0.
os
.
environ
[
"VLLM_ENABLE_V1_MULTIPROCESSING"
]
=
"0"
# 2. Fix the global seed for reproducibility. The default seed is None, which is
# not reproducible.
SEED
=
42
# NOTE(woosuk): Even with the above two settings, vLLM only provides
# reproducibility when it runs on the same hardware and the same vLLM version.
# Also, the online serving API (`vllm serve`) does not support reproducibility
# because it is almost impossible to make the scheduling deterministic in the
# online serving setting.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
seed
=
SEED
)
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
examples/offline_inference/rlhf.py
View file @
469e903b
...
@@ -18,72 +18,11 @@ import ray
...
@@ -18,72 +18,11 @@ import ray
import
torch
import
torch
from
ray.util.placement_group
import
placement_group
from
ray.util.placement_group
import
placement_group
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
rlhf_utils
import
stateless_init_process_group
from
transformers
import
AutoModelForCausalLM
from
transformers
import
AutoModelForCausalLM
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.utils
import
get_ip
,
get_open_port
from
vllm.utils
import
get_ip
,
get_open_port
from
vllm.worker.worker
import
Worker
def
stateless_init_process_group
(
master_address
,
master_port
,
rank
,
world_size
,
device
):
"""
vLLM provides `StatelessProcessGroup` to create a process group
without considering the global process group in torch.distributed.
It is recommended to create `StatelessProcessGroup`, and then initialize
the data-plane communication (NCCL) between external (train processes)
and vLLM workers.
"""
from
vllm.distributed.device_communicators.pynccl
import
PyNcclCommunicator
from
vllm.distributed.utils
import
StatelessProcessGroup
pg
=
StatelessProcessGroup
.
create
(
host
=
master_address
,
port
=
master_port
,
rank
=
rank
,
world_size
=
world_size
)
pynccl
=
PyNcclCommunicator
(
pg
,
device
=
device
)
return
pynccl
class
MyWorker
(
Worker
):
"""
The `MyWorker` class inherits from `Worker` to provide custom functions.
For simplicity, we define the `MyWorker` class in this self-contained
script. Normally, we should define the `MyWorker` class in a separate
file and pass the qualified name of the class to the `worker_cls`
parameter.
"""
def
init_weight_update_group
(
self
,
master_address
,
master_port
,
rank_offset
,
world_size
):
from
vllm.distributed.parallel_state
import
get_world_group
rank
=
get_world_group
().
rank
+
rank_offset
self
.
model_update_group
=
stateless_init_process_group
(
master_address
,
master_port
,
rank
,
world_size
,
self
.
device
,
)
def
update_weight
(
self
,
name
,
dtype
,
shape
):
weight
=
torch
.
empty
(
shape
,
dtype
=
dtype
,
device
=
"cuda"
)
self
.
model_update_group
.
broadcast
(
weight
,
src
=
0
,
stream
=
torch
.
cuda
.
current_stream
())
self
.
model_runner
.
model
.
load_weights
(
weights
=
[(
name
,
weight
)])
del
weight
def
check_weights_changed
(
self
):
"""
Check if the weights are updated to 0.
"""
weights_updated
=
True
for
name
,
p
in
self
.
model_runner
.
model
.
named_parameters
():
weights_updated
=
weights_updated
and
torch
.
allclose
(
p
,
torch
.
zeros_like
(
p
))
return
weights_updated
class
MyLLM
(
LLM
):
class
MyLLM
(
LLM
):
...
@@ -129,7 +68,7 @@ llm = ray.remote(
...
@@ -129,7 +68,7 @@ llm = ray.remote(
)(
MyLLM
).
remote
(
)(
MyLLM
).
remote
(
model
=
"facebook/opt-125m"
,
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
,
enforce_eager
=
True
,
worker_
cls
=
MyWorker
,
worker_
extension_cls
=
"rlhf_utils.WorkerExtension"
,
tensor_parallel_size
=
2
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
"ray"
,
distributed_executor_backend
=
"ray"
,
)
)
...
@@ -159,6 +98,7 @@ master_port = get_open_port()
...
@@ -159,6 +98,7 @@ master_port = get_open_port()
handle
=
llm
.
collective_rpc
.
remote
(
"init_weight_update_group"
,
handle
=
llm
.
collective_rpc
.
remote
(
"init_weight_update_group"
,
args
=
(
master_address
,
master_port
,
1
,
3
))
args
=
(
master_address
,
master_port
,
1
,
3
))
model_update_group
=
stateless_init_process_group
(
master_address
,
master_port
,
model_update_group
=
stateless_init_process_group
(
master_address
,
master_port
,
0
,
3
,
torch
.
device
(
"cuda:0"
))
0
,
3
,
torch
.
device
(
"cuda:0"
))
ray
.
get
(
handle
)
ray
.
get
(
handle
)
...
...
examples/offline_inference/rlhf_colocate.py
View file @
469e903b
...
@@ -17,40 +17,6 @@ from ray.util.placement_group import placement_group
...
@@ -17,40 +17,6 @@ from ray.util.placement_group import placement_group
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.worker.worker
import
Worker
class
MyWorker
(
Worker
):
def
report_device_id
(
self
)
->
str
:
from
vllm.platforms
import
current_platform
self
.
device_uuid
=
current_platform
.
get_device_uuid
(
self
.
device
.
index
)
return
self
.
device_uuid
def
update_weights_from_ipc_handles
(
self
,
ipc_handles
):
handles
=
ipc_handles
[
self
.
device_uuid
]
device_id
=
self
.
device
.
index
weights
=
[]
for
name
,
handle
in
handles
.
items
():
func
,
args
=
handle
list_args
=
list
(
args
)
# the key is to change device id to the current device id
# in case two processes have different CUDA_VISIBLE_DEVICES
list_args
[
6
]
=
device_id
tensor
=
func
(
*
list_args
)
weights
.
append
((
name
,
tensor
))
self
.
model_runner
.
model
.
load_weights
(
weights
=
weights
)
torch
.
cuda
.
synchronize
()
def
check_weights_changed
(
self
):
"""
Check if the weights are updated to 0.
"""
weights_updated
=
True
for
name
,
p
in
self
.
model_runner
.
model
.
named_parameters
():
weights_updated
=
weights_updated
and
torch
.
allclose
(
p
,
torch
.
zeros_like
(
p
))
return
weights_updated
class
MyLLM
(
LLM
):
class
MyLLM
(
LLM
):
...
@@ -150,7 +116,7 @@ for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]):
...
@@ -150,7 +116,7 @@ for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]):
)(
MyLLM
).
remote
(
)(
MyLLM
).
remote
(
model
=
"facebook/opt-125m"
,
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
,
enforce_eager
=
True
,
worker_
cls
=
MyWorker
,
worker_
extension_cls
=
"rlhf_utils.ColocateWorkerExtension"
,
tensor_parallel_size
=
2
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
"ray"
,
distributed_executor_backend
=
"ray"
,
gpu_memory_utilization
=
0.4
,
gpu_memory_utilization
=
0.4
,
...
...
examples/offline_inference/rlhf_utils.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
torch
def
stateless_init_process_group
(
master_address
,
master_port
,
rank
,
world_size
,
device
):
"""
vLLM provides `StatelessProcessGroup` to create a process group
without considering the global process group in torch.distributed.
It is recommended to create `StatelessProcessGroup`, and then initialize
the data-plane communication (NCCL) between external (train processes)
and vLLM workers.
"""
from
vllm.distributed.device_communicators.pynccl
import
PyNcclCommunicator
from
vllm.distributed.utils
import
StatelessProcessGroup
pg
=
StatelessProcessGroup
.
create
(
host
=
master_address
,
port
=
master_port
,
rank
=
rank
,
world_size
=
world_size
)
pynccl
=
PyNcclCommunicator
(
pg
,
device
=
device
)
return
pynccl
class
WorkerExtension
:
"""
The class for vLLM's worker to inherit from.
By defining an extension class, the code can work no matter what is
the underlying worker class. This way, the code can be compatible
with both vLLM V0 and V1.
NOTE: we define this class in a separate module, and the main module
should pass the full qualified name as `worker_extension_cls` argument.
"""
def
init_weight_update_group
(
self
,
master_address
,
master_port
,
rank_offset
,
world_size
):
from
vllm.distributed.parallel_state
import
get_world_group
rank
=
get_world_group
().
rank
+
rank_offset
self
.
model_update_group
=
stateless_init_process_group
(
master_address
,
master_port
,
rank
,
world_size
,
self
.
device
,
)
def
update_weight
(
self
,
name
,
dtype
,
shape
):
weight
=
torch
.
empty
(
shape
,
dtype
=
dtype
,
device
=
"cuda"
)
self
.
model_update_group
.
broadcast
(
weight
,
src
=
0
,
stream
=
torch
.
cuda
.
current_stream
())
self
.
model_runner
.
model
.
load_weights
(
weights
=
[(
name
,
weight
)])
del
weight
def
check_weights_changed
(
self
):
"""
Check if the weights are updated to 0.
"""
weights_updated
=
True
for
name
,
p
in
self
.
model_runner
.
model
.
named_parameters
():
weights_updated
=
weights_updated
and
torch
.
allclose
(
p
,
torch
.
zeros_like
(
p
))
return
weights_updated
class
ColocateWorkerExtension
:
"""
The class for vLLM's worker to inherit from, in the colocate setting.
By defining an extension class, the code can work no matter what is
the underlying worker class. This way, the code can be compatible
with both vLLM V0 and V1.
NOTE: we define this class in a separate module, and the main module
should pass the full qualified name as `worker_extension_cls` argument.
"""
def
report_device_id
(
self
)
->
str
:
from
vllm.platforms
import
current_platform
self
.
device_uuid
=
current_platform
.
get_device_uuid
(
self
.
device
.
index
)
return
self
.
device_uuid
def
update_weights_from_ipc_handles
(
self
,
ipc_handles
):
handles
=
ipc_handles
[
self
.
device_uuid
]
device_id
=
self
.
device
.
index
weights
=
[]
for
name
,
handle
in
handles
.
items
():
func
,
args
=
handle
list_args
=
list
(
args
)
# the key is to change device id to the current device id
# in case two processes have different CUDA_VISIBLE_DEVICES
list_args
[
6
]
=
device_id
tensor
=
func
(
*
list_args
)
weights
.
append
((
name
,
tensor
))
self
.
model_runner
.
model
.
load_weights
(
weights
=
weights
)
torch
.
cuda
.
synchronize
()
def
check_weights_changed
(
self
):
"""
Check if the weights are updated to 0.
"""
weights_updated
=
True
for
name
,
p
in
self
.
model_runner
.
model
.
named_parameters
():
weights_updated
=
weights_updated
and
torch
.
allclose
(
p
,
torch
.
zeros_like
(
p
))
return
weights_updated
examples/offline_inference/tpu.py
View file @
469e903b
...
@@ -21,7 +21,9 @@ sampling_params = SamplingParams(temperature=0.7,
...
@@ -21,7 +21,9 @@ sampling_params = SamplingParams(temperature=0.7,
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforace_eager` should be `False`.
# In real workloads, `enforace_eager` should be `False`.
llm
=
LLM
(
model
=
"google/gemma-2b"
,
enforce_eager
=
True
)
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
4
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
,
answer
in
zip
(
outputs
,
answers
):
for
output
,
answer
in
zip
(
outputs
,
answers
):
prompt
=
output
.
prompt
prompt
=
output
.
prompt
...
...
examples/offline_inference/vision_language.py
View file @
469e903b
...
@@ -6,122 +6,219 @@ the correct prompt format on vision language models for text generation.
...
@@ -6,122 +6,219 @@ the correct prompt format on vision language models for text generation.
For most models, the prompt format should follow corresponding examples
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
on HuggingFace model repository.
"""
"""
import
os
import
random
import
random
from
dataclasses
import
asdict
from
typing
import
NamedTuple
,
Optional
from
huggingface_hub
import
snapshot_download
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
EngineArgs
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.lora.request
import
LoRARequest
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
class
ModelRequestData
(
NamedTuple
):
engine_args
:
EngineArgs
prompts
:
list
[
str
]
stop_token_ids
:
Optional
[
list
[
int
]]
=
None
lora_requests
:
Optional
[
list
[
LoRARequest
]]
=
None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# Unless specified, these settings have been tested to work on a single L4.
# Aria
# Aria
def
run_aria
(
question
:
str
,
modality
:
str
):
def
run_aria
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"rhymes-ai/Aria"
model_name
=
"rhymes-ai/Aria"
# NOTE: Need L40 (or equivalent) to avoid OOM
# NOTE: Need L40 (or equivalent) to avoid OOM
llm
=
LLM
(
model
=
model_name
,
engine_args
=
EngineArgs
(
max_model_len
=
4096
,
model
=
model_name
,
max_num_seqs
=
2
,
max_model_len
=
4096
,
dtype
=
"bfloat16"
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
dtype
=
"bfloat16"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
(
f
"<|im_start|>user
\n
<fim_prefix><|img|><fim_suffix>
{
question
}
"
prompts
=
[(
f
"<|im_start|>user
\n
<fim_prefix><|img|><fim_suffix>
{
question
}
"
"<|im_end|>
\n
<|im_start|>assistant
\n
"
)
"<|im_end|>
\n
<|im_start|>assistant
\n
"
)
for
question
in
questions
]
stop_token_ids
=
[
93532
,
93653
,
944
,
93421
,
1019
,
93653
,
93519
]
stop_token_ids
=
[
93532
,
93653
,
944
,
93421
,
1019
,
93653
,
93519
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# BLIP-2
# BLIP-2
def
run_blip2
(
question
:
str
,
modality
:
str
):
def
run_blip2
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompt
=
f
"Question:
{
question
}
Answer:"
prompts
=
[
f
"Question:
{
question
}
Answer:"
for
question
in
questions
]
llm
=
LLM
(
model
=
"Salesforce/blip2-opt-2.7b"
,
engine_args
=
EngineArgs
(
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
model
=
"Salesforce/blip2-opt-2.7b"
,
stop_token_ids
=
None
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
return
llm
,
prompt
,
stop_token_ids
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Chameleon
# Chameleon
def
run_chameleon
(
question
:
str
,
modality
:
str
):
def
run_chameleon
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
prompt
=
f
"
{
question
}
<image>"
prompts
=
[
f
"
{
question
}
<image>"
for
question
in
questions
]
llm
=
LLM
(
model
=
"facebook/chameleon-7b"
,
engine_args
=
EngineArgs
(
max_model_len
=
4096
,
model
=
"facebook/chameleon-7b"
,
max_num_seqs
=
2
,
max_model_len
=
4096
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
max_num_seqs
=
2
,
stop_token_ids
=
None
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
return
llm
,
prompt
,
stop_token_ids
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Deepseek-VL2
# Deepseek-VL2
def
run_deepseek_vl2
(
question
:
str
,
modality
:
str
):
def
run_deepseek_vl2
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"deepseek-ai/deepseek-vl2-tiny"
model_name
=
"deepseek-ai/deepseek-vl2-tiny"
llm
=
LLM
(
model
=
model_name
,
engine_args
=
EngineArgs
(
max_model_len
=
4096
,
model
=
model_name
,
max_num_seqs
=
2
,
max_model_len
=
4096
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
max_num_seqs
=
2
,
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]})
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]},
)
prompts
=
[
f
"<|User|>: <image>
\n
{
question
}
\n\n
<|Assistant|>:"
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Florence2
def
run_florence2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
engine_args
=
EngineArgs
(
model
=
"microsoft/Florence-2-large"
,
tokenizer
=
"facebook/bart-large"
,
max_num_seqs
=
8
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompts
=
[
"<MORE_DETAILED_CAPTION>"
for
_
in
questions
]
prompt
=
f
"<|User|>: <image>
\n
{
question
}
\n\n
<|Assistant|>:"
return
ModelRequestData
(
stop_token_ids
=
None
engine_args
=
engine_args
,
return
llm
,
prompt
,
stop_token_ids
prompts
=
prompts
,
)
# Fuyu
# Fuyu
def
run_fuyu
(
question
:
str
,
modality
:
str
):
def
run_fuyu
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
prompts
=
[
f
"
{
question
}
\n
"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"adept/fuyu-8b"
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Gemma 3
def
run_gemma3
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"google/gemma-3-4b-it"
prompt
=
f
"
{
question
}
\n
"
engine_args
=
EngineArgs
(
llm
=
LLM
(
model
=
"adept/fuyu-8b"
,
model
=
model_name
,
max_model_len
=
2048
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
mm_processor_kwargs
=
{
"do_pan_and_scan"
:
True
},
stop_token_ids
=
None
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
return
llm
,
prompt
,
stop_token_ids
)
prompts
=
[(
"<bos><start_of_turn>user
\n
"
f
"<start_of_image>
{
question
}
<end_of_turn>
\n
"
"<start_of_turn>model
\n
"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# GLM-4v
# GLM-4v
def
run_glm4v
(
question
:
str
,
modality
:
str
):
def
run_glm4v
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"THUDM/glm-4v-9b"
model_name
=
"THUDM/glm-4v-9b"
llm
=
LLM
(
model
=
model_name
,
engine_args
=
EngineArgs
(
max_model_len
=
2048
,
model
=
model_name
,
max_num_seqs
=
2
,
max_model_len
=
2048
,
trust_remote_code
=
True
,
max_num_seqs
=
2
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]},
enforce_eager
=
True
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
prompt
=
f
"<|user|>
\n
<|begin_of_image|><|endoftext|><|end_of_image|>
\
prompts
=
[
{
question
}
<|assistant|>"
f
"<|user|>
\n
<|begin_of_image|><|endoftext|><|end_of_image|>
\
{
question
}
<|assistant|>"
for
question
in
questions
]
stop_token_ids
=
[
151329
,
151336
,
151338
]
stop_token_ids
=
[
151329
,
151336
,
151338
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# H2OVL-Mississippi
# H2OVL-Mississippi
def
run_h2ovl
(
question
:
str
,
modality
:
str
):
def
run_h2ovl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"h2oai/h2ovl-mississippi-800m"
model_name
=
"h2oai/h2ovl-mississippi-800m"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
max_model_len
=
8192
,
...
@@ -130,23 +227,31 @@ def run_h2ovl(question: str, modality: str):
...
@@ -130,23 +227,31 @@ def run_h2ovl(question: str, modality: str):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
trust_remote_code
=
True
)
messages
=
[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
messages
=
[[{
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
'role'
:
'user'
,
tokenize
=
False
,
'content'
:
f
"<image>
\n
{
question
}
"
add_generation_prompt
=
True
)
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
# Stop tokens for H2OVL-Mississippi
# Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids
=
[
tokenizer
.
eos_token_id
]
stop_token_ids
=
[
tokenizer
.
eos_token_id
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# Idefics3-8B-Llama3
# Idefics3-8B-Llama3
def
run_idefics3
(
question
:
str
,
modality
:
str
):
def
run_idefics3
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"HuggingFaceM4/Idefics3-8B-Llama3"
model_name
=
"HuggingFaceM4/Idefics3-8B-Llama3"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
max_model_len
=
8192
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
...
@@ -160,20 +265,23 @@ def run_idefics3(question: str, modality: str):
...
@@ -160,20 +265,23 @@ def run_idefics3(question: str, modality: str):
},
},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
)
prompt
=
(
prompt
s
=
[
(
f
"<|begin_of_text|>User:<image>
{
question
}
<end_of_utterance>
\n
Assistant:"
f
"<|begin_of_text|>User:<image>
{
question
}
<end_of_utterance>
\n
Assistant:"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
# InternVL
# InternVL
def
run_internvl
(
question
:
str
,
modality
:
str
):
def
run_internvl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"OpenGVLab/InternVL2-2B"
model_name
=
"OpenGVLab/InternVL2-2B"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_model_len
=
4096
,
...
@@ -182,10 +290,13 @@ def run_internvl(question: str, modality: str):
...
@@ -182,10 +290,13 @@ def run_internvl(question: str, modality: str):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
trust_remote_code
=
True
)
messages
=
[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
messages
=
[[{
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
'role'
:
'user'
,
tokenize
=
False
,
'content'
:
f
"<image>
\n
{
question
}
"
add_generation_prompt
=
True
)
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
# Stop tokens for InternVL
# Stop tokens for InternVL
# models variants may have different stop tokens
# models variants may have different stop tokens
...
@@ -193,84 +304,127 @@ def run_internvl(question: str, modality: str):
...
@@ -193,84 +304,127 @@ def run_internvl(question: str, modality: str):
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens
=
[
"<|endoftext|>"
,
"<|im_start|>"
,
"<|im_end|>"
,
"<|end|>"
]
stop_tokens
=
[
"<|endoftext|>"
,
"<|im_start|>"
,
"<|im_end|>"
,
"<|end|>"
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# LLaVA-1.5
# LLaVA-1.5
def
run_llava
(
question
:
str
,
modality
:
str
):
def
run_llava
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
prompt
=
f
"USER: <image>
\n
{
question
}
\n
ASSISTANT:"
prompts
=
[
f
"USER: <image>
\n
{
question
}
\n
ASSISTANT:"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
max_model_len
=
4096
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
llm
=
LLM
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
return
ModelRequestData
(
max_model_len
=
4096
,
engine_args
=
engine_args
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
prompts
=
prompts
,
stop_token_ids
=
None
)
return
llm
,
prompt
,
stop_token_ids
# LLaVA-1.6/LLaVA-NeXT
# LLaVA-1.6/LLaVA-NeXT
def
run_llava_next
(
question
:
str
,
modality
:
str
):
def
run_llava_next
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
prompt
=
f
"[INST] <image>
\n
{
question
}
[/INST]"
prompts
=
[
f
"[INST] <image>
\n
{
question
}
[/INST]"
for
question
in
questions
]
llm
=
LLM
(
model
=
"llava-hf/llava-v1.6-mistral-7b-hf"
,
engine_args
=
EngineArgs
(
max_model_len
=
8192
,
model
=
"llava-hf/llava-v1.6-mistral-7b-hf"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
max_model_len
=
8192
,
stop_token_ids
=
None
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
return
llm
,
prompt
,
stop_token_ids
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# LlaVA-NeXT-Video
# LlaVA-NeXT-Video
# Currently only support for video input
# Currently only support for video input
def
run_llava_next_video
(
question
:
str
,
modality
:
str
):
def
run_llava_next_video
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"video"
assert
modality
==
"video"
prompt
=
f
"USER: <video>
\n
{
question
}
ASSISTANT:"
prompts
=
[
llm
=
LLM
(
model
=
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
f
"USER: <video>
\n
{
question
}
ASSISTANT:"
for
question
in
questions
max_model_len
=
8192
,
]
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
engine_args
=
EngineArgs
(
stop_token_ids
=
None
model
=
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
return
llm
,
prompt
,
stop_token_ids
max_model_len
=
8192
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# LLaVA-OneVision
# LLaVA-OneVision
def
run_llava_onevision
(
question
:
str
,
modality
:
str
):
def
run_llava_onevision
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
if
modality
==
"video"
:
if
modality
==
"video"
:
prompt
=
f
"<|im_start|>user <video>
\n
{
question
}
<|im_end|>
\
prompts
=
[
<|im_start|>assistant
\n
"
f
"<|im_start|>user <video>
\n
{
question
}
<|im_end|>
\
<|im_start|>assistant
\n
"
for
question
in
questions
]
elif
modality
==
"image"
:
elif
modality
==
"image"
:
prompt
=
f
"<|im_start|>user <image>
\n
{
question
}
<|im_end|>
\
prompts
=
[
<|im_start|>assistant
\n
"
f
"<|im_start|>user <image>
\n
{
question
}
<|im_end|>
\
<|im_start|>assistant
\n
"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"llava-hf/llava-onevision-qwen2-7b-ov-hf"
,
max_model_len
=
16384
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
llm
=
LLM
(
model
=
"llava-hf/llava-onevision-qwen2-7b-ov-hf"
,
return
ModelRequestData
(
max_model_len
=
16384
,
engine_args
=
engine_args
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
prompts
=
prompts
,
stop_token_ids
=
None
)
return
llm
,
prompt
,
stop_token_ids
# Mantis
# Mantis
def
run_mantis
(
question
:
str
,
modality
:
str
):
def
run_mantis
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
llama3_template
=
'<|start_header_id|>user<|end_header_id|>
\n\n
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
'
# noqa: E501
llama3_template
=
'<|start_header_id|>user<|end_header_id|>
\n\n
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
'
# noqa: E501
prompt
=
llama3_template
.
format
(
f
"
{
question
}
\n
<image>"
)
prompts
=
[
llama3_template
.
format
(
f
"
{
question
}
\n
<image>"
)
for
question
in
questions
]
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
model
=
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]},
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
)
stop_token_ids
=
[
128009
]
stop_token_ids
=
[
128009
]
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
# MiniCPM-V
# MiniCPM-V
def
run_minicpmv_base
(
question
:
str
,
modality
:
str
,
model_name
):
def
run_minicpmv_base
(
question
s
:
list
[
str
]
,
modality
:
str
,
model_name
):
assert
modality
in
[
"image"
,
"video"
]
assert
modality
in
[
"image"
,
"video"
]
# If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa
# If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa
...
@@ -294,7 +448,7 @@ def run_minicpmv_base(question: str, modality: str, model_name):
...
@@ -294,7 +448,7 @@ def run_minicpmv_base(question: str, modality: str, model_name):
# model_name = "openbmb/MiniCPM-o-2_6"
# model_name = "openbmb/MiniCPM-o-2_6"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
trust_remote_code
=
True
)
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
...
@@ -317,26 +471,33 @@ def run_minicpmv_base(question: str, modality: str, model_name):
...
@@ -317,26 +471,33 @@ def run_minicpmv_base(question: str, modality: str, model_name):
"video"
:
"(<video>./</video>)"
,
"video"
:
"(<video>./</video>)"
,
}
}
messages
=
[{
prompts
=
[
'role'
:
'user'
,
tokenizer
.
apply_chat_template
(
'content'
:
f
'
{
modality_placeholder
[
modality
]
}
\n
{
question
}
'
[{
}]
'role'
:
'user'
,
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
'content'
:
f
"
{
modality_placeholder
[
modality
]
}
\n
{
question
}
"
tokenize
=
False
,
}],
add_generation_prompt
=
True
)
tokenize
=
False
,
return
llm
,
prompt
,
stop_token_ids
add_generation_prompt
=
True
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
stop_token_ids
=
stop_token_ids
,
)
def
run_minicpmo
(
question
:
str
,
modality
:
str
):
def
run_minicpmo
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
return
run_minicpmv_base
(
question
,
modality
,
"openbmb/MiniCPM-o-2_6"
)
return
run_minicpmv_base
(
question
s
,
modality
,
"openbmb/MiniCPM-o-2_6"
)
def
run_minicpmv
(
question
:
str
,
modality
:
str
):
def
run_minicpmv
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
return
run_minicpmv_base
(
question
,
modality
,
"openbmb/MiniCPM-V-2_6"
)
return
run_minicpmv_base
(
question
s
,
modality
,
"openbmb/MiniCPM-V-2_6"
)
# LLama 3.2
# LLama 3.2
def
run_mllama
(
question
:
str
,
modality
:
str
):
def
run_mllama
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"meta-llama/Llama-3.2-11B-Vision-Instruct"
model_name
=
"meta-llama/Llama-3.2-11B-Vision-Instruct"
...
@@ -346,7 +507,7 @@ def run_mllama(question: str, modality: str):
...
@@ -346,7 +507,7 @@ def run_mllama(question: str, modality: str):
# You may lower either to run this example on lower-end GPUs.
# You may lower either to run this example on lower-end GPUs.
# The configuration below has been confirmed to launch on a single L40 GPU.
# The configuration below has been confirmed to launch on a single L40 GPU.
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
16
,
max_num_seqs
=
16
,
...
@@ -354,49 +515,58 @@ def run_mllama(question: str, modality: str):
...
@@ -354,49 +515,58 @@ def run_mllama(question: str, modality: str):
)
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
messages
=
[{
messages
=
[
[{
"role"
:
"role"
:
"user"
,
"user"
,
"content"
:
[{
"content"
:
[{
"type"
:
"image"
"type"
:
"image"
},
{
},
{
"type"
:
"text"
,
"type"
:
"text"
,
"text"
:
f
"
{
question
}
"
"text"
:
question
}]
}]
}]
}]
for
question
in
questions
]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
add_generation_prompt
=
True
,
add_generation_prompt
=
True
,
tokenize
=
False
)
tokenize
=
False
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Molmo
# Molmo
def
run_molmo
(
question
,
modality
)
:
def
run_molmo
(
question
s
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"allenai/Molmo-7B-D-0924"
model_name
=
"allenai/Molmo-7B-D-0924"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
dtype
=
"bfloat16"
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
)
prompt
=
question
prompts
=
[
stop_token_ids
=
None
f
"<|im_start|>user <image>
\n
{
question
}
<|im_end|>
\
return
llm
,
prompt
,
stop_token_ids
<|im_start|>assistant
\n
"
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# NVLM-D
# NVLM-D
def
run_nvlm_d
(
question
:
str
,
modality
:
str
):
def
run_nvlm_d
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"nvidia/NVLM-D-72B"
model_name
=
"nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU
# Adjust this as necessary to fit in GPU
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_model_len
=
4096
,
...
@@ -406,43 +576,60 @@ def run_nvlm_d(question: str, modality: str):
...
@@ -406,43 +576,60 @@ def run_nvlm_d(question: str, modality: str):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
trust_remote_code
=
True
)
messages
=
[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
messages
=
[[{
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
'role'
:
'user'
,
tokenize
=
False
,
'content'
:
f
"<image>
\n
{
question
}
"
add_generation_prompt
=
True
)
}]
for
question
in
questions
]
stop_token_ids
=
None
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
return
llm
,
prompt
,
stop_token_ids
tokenize
=
False
,
add_generation_prompt
=
True
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# PaliGemma
# PaliGemma
def
run_paligemma
(
question
:
str
,
modality
:
str
):
def
run_paligemma
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
# PaliGemma has special prompt format for VQA
# PaliGemma has special prompt format for VQA
prompt
=
"caption en"
prompts
=
[
"caption en"
for
_
in
questions
]
llm
=
LLM
(
model
=
"google/paligemma-3b-mix-224"
,
engine_args
=
EngineArgs
(
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
model
=
"google/paligemma-3b-mix-224"
,
stop_token_ids
=
None
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# PaliGemma 2
# PaliGemma 2
def
run_paligemma2
(
question
:
str
,
modality
:
str
):
def
run_paligemma2
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
# PaliGemma 2 has special prompt format for VQA
# PaliGemma 2 has special prompt format for VQA
prompt
=
"caption en"
prompts
=
[
"caption en"
for
_
in
questions
]
llm
=
LLM
(
model
=
"google/paligemma2-3b-ft-docci-448"
,
engine_args
=
EngineArgs
(
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
model
=
"google/paligemma2-3b-ft-docci-448"
,
stop_token_ids
=
None
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
)
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Phi-3-Vision
# Phi-3-Vision
def
run_phi3v
(
question
:
str
,
modality
:
str
):
def
run_phi3v
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
prompt
=
f
"<|user|>
\n
<|image_1|>
\n
{
question
}
<|end|>
\n
<|assistant|>
\n
"
prompts
=
[
f
"<|user|>
\n
<|image_1|>
\n
{
question
}
<|end|>
\n
<|assistant|>
\n
"
for
question
in
questions
]
# num_crops is an override kwarg to the multimodal image processor;
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
...
@@ -456,7 +643,7 @@ def run_phi3v(question: str, modality: str):
...
@@ -456,7 +643,7 @@ def run_phi3v(question: str, modality: str):
#
#
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
model
=
"microsoft/Phi-3.5-vision-instruct"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_model_len
=
4096
,
...
@@ -465,34 +652,71 @@ def run_phi3v(question: str, modality: str):
...
@@ -465,34 +652,71 @@ def run_phi3v(question: str, modality: str):
mm_processor_kwargs
=
{
"num_crops"
:
16
},
mm_processor_kwargs
=
{
"num_crops"
:
16
},
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
)
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Phi-4-multimodal-instruct
def
run_phi4mm
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process image inputs.
"""
assert
modality
==
"image"
model_path
=
snapshot_download
(
"microsoft/Phi-4-multimodal-instruct"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path
=
os
.
path
.
join
(
model_path
,
"vision-lora"
)
prompts
=
[
f
"<|user|><|image_1|>
{
question
}
<|end|><|assistant|>"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
model_path
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
enable_lora
=
True
,
max_lora_rank
=
320
,
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
lora_requests
=
[
LoRARequest
(
"vision"
,
1
,
vision_lora_path
)],
)
# Pixtral HF-format
# Pixtral HF-format
def
run_pixtral_hf
(
question
:
str
,
modality
:
str
):
def
run_pixtral_hf
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
model_name
=
"mistral-community/pixtral-12b"
model_name
=
"mistral-community/pixtral-12b"
# NOTE: Need L40 (or equivalent) to avoid OOM
# NOTE: Need L40 (or equivalent) to avoid OOM
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
max_model_len
=
8192
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
)
prompt
=
f
"<s>[INST]
{
question
}
\n
[IMG][/INST]"
prompts
=
[
f
"<s>[INST]
{
question
}
\n
[IMG][/INST]"
for
question
in
questions
]
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Qwen
# Qwen
def
run_qwen_vl
(
question
:
str
,
modality
:
str
):
def
run_qwen_vl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"Qwen/Qwen-VL"
,
model
=
"Qwen/Qwen-VL"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_model_len
=
1024
,
max_model_len
=
1024
,
...
@@ -501,17 +725,20 @@ def run_qwen_vl(question: str, modality: str):
...
@@ -501,17 +725,20 @@ def run_qwen_vl(question: str, modality: str):
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
)
prompt
=
f
"
{
question
}
Picture 1: <img></img>
\n
"
prompts
=
[
f
"
{
question
}
Picture 1: <img></img>
\n
"
for
question
in
questions
]
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Qwen2-VL
# Qwen2-VL
def
run_qwen2_vl
(
question
:
str
,
modality
:
str
):
def
run_qwen2_vl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Qwen/Qwen2-VL-7B-Instruct"
model_name
=
"Qwen/Qwen2-VL-7B-Instruct"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
max_num_seqs
=
5
,
...
@@ -528,20 +755,25 @@ def run_qwen2_vl(question: str, modality: str):
...
@@ -528,20 +755,25 @@ def run_qwen2_vl(question: str, modality: str):
elif
modality
==
"video"
:
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
"<|video_pad|>"
prompt
=
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
prompts
=
[
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"
{
question
}
<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
"<|im_start|>assistant
\n
"
)
f
"
{
question
}
<|im_end|>
\n
"
stop_token_ids
=
None
"<|im_start|>assistant
\n
"
)
for
question
in
questions
return
llm
,
prompt
,
stop_token_ids
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Qwen2.5-VL
# Qwen2.5-VL
def
run_qwen2_5_vl
(
question
:
str
,
modality
:
str
):
def
run_qwen2_5_vl
(
question
s
:
list
[
str
]
,
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Qwen/Qwen2.5-VL-3B-Instruct"
model_name
=
"Qwen/Qwen2.5-VL-3B-Instruct"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
model
=
model_name
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
max_num_seqs
=
5
,
...
@@ -558,12 +790,17 @@ def run_qwen2_5_vl(question: str, modality: str):
...
@@ -558,12 +790,17 @@ def run_qwen2_5_vl(question: str, modality: str):
elif
modality
==
"video"
:
elif
modality
==
"video"
:
placeholder
=
"<|video_pad|>"
placeholder
=
"<|video_pad|>"
prompt
=
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
prompts
=
[
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
(
"<|im_start|>system
\n
You are a helpful assistant.<|im_end|>
\n
"
f
"
{
question
}
<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_start|>
{
placeholder
}
<|vision_end|>"
"<|im_start|>assistant
\n
"
)
f
"
{
question
}
<|im_end|>
\n
"
stop_token_ids
=
None
"<|im_start|>assistant
\n
"
)
for
question
in
questions
return
llm
,
prompt
,
stop_token_ids
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
model_example_map
=
{
model_example_map
=
{
...
@@ -571,7 +808,9 @@ model_example_map = {
...
@@ -571,7 +808,9 @@ model_example_map = {
"blip-2"
:
run_blip2
,
"blip-2"
:
run_blip2
,
"chameleon"
:
run_chameleon
,
"chameleon"
:
run_chameleon
,
"deepseek_vl_v2"
:
run_deepseek_vl2
,
"deepseek_vl_v2"
:
run_deepseek_vl2
,
"florence2"
:
run_florence2
,
"fuyu"
:
run_fuyu
,
"fuyu"
:
run_fuyu
,
"gemma3"
:
run_gemma3
,
"glm4v"
:
run_glm4v
,
"glm4v"
:
run_glm4v
,
"h2ovl_chat"
:
run_h2ovl
,
"h2ovl_chat"
:
run_h2ovl
,
"idefics3"
:
run_idefics3
,
"idefics3"
:
run_idefics3
,
...
@@ -589,6 +828,7 @@ model_example_map = {
...
@@ -589,6 +828,7 @@ model_example_map = {
"paligemma"
:
run_paligemma
,
"paligemma"
:
run_paligemma
,
"paligemma2"
:
run_paligemma2
,
"paligemma2"
:
run_paligemma2
,
"phi3_v"
:
run_phi3v
,
"phi3_v"
:
run_phi3v
,
"phi4_mm"
:
run_phi4mm
,
"pixtral_hf"
:
run_pixtral_hf
,
"pixtral_hf"
:
run_pixtral_hf
,
"qwen_vl"
:
run_qwen_vl
,
"qwen_vl"
:
run_qwen_vl
,
"qwen2_vl"
:
run_qwen2_vl
,
"qwen2_vl"
:
run_qwen2_vl
,
...
@@ -607,29 +847,35 @@ def get_multi_modal_input(args):
...
@@ -607,29 +847,35 @@ def get_multi_modal_input(args):
# Input image and question
# Input image and question
image
=
ImageAsset
(
"cherry_blossom"
)
\
image
=
ImageAsset
(
"cherry_blossom"
)
\
.
pil_image
.
convert
(
"RGB"
)
.
pil_image
.
convert
(
"RGB"
)
img_question
=
"What is the content of this image?"
img_questions
=
[
"What is the content of this image?"
,
"Describe the content of this image in detail."
,
"What's in the image?"
,
"Where is this image taken?"
,
]
return
{
return
{
"data"
:
image
,
"data"
:
image
,
"question"
:
img_question
,
"question
s
"
:
img_question
s
,
}
}
if
args
.
modality
==
"video"
:
if
args
.
modality
==
"video"
:
# Input video and question
# Input video and question
video
=
VideoAsset
(
name
=
"sample_demo_1.mp4"
,
video
=
VideoAsset
(
name
=
"sample_demo_1.mp4"
,
num_frames
=
args
.
num_frames
).
np_ndarrays
num_frames
=
args
.
num_frames
).
np_ndarrays
vid_question
=
"Why is this video funny?"
vid_question
s
=
[
"Why is this video funny?"
]
return
{
return
{
"data"
:
video
,
"data"
:
video
,
"question"
:
vid_question
,
"question
s
"
:
vid_question
s
,
}
}
msg
=
f
"Modality
{
args
.
modality
}
is not supported."
msg
=
f
"Modality
{
args
.
modality
}
is not supported."
raise
ValueError
(
msg
)
raise
ValueError
(
msg
)
def
apply_image_repeat
(
image_repeat_prob
,
num_prompts
,
data
,
prompt
,
modality
):
def
apply_image_repeat
(
image_repeat_prob
,
num_prompts
,
data
,
prompts
:
list
[
str
],
modality
):
"""Repeats images with provided probability of "image_repeat_prob".
"""Repeats images with provided probability of "image_repeat_prob".
Used to simulate hit/miss for the MM preprocessor cache.
Used to simulate hit/miss for the MM preprocessor cache.
"""
"""
...
@@ -649,7 +895,7 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, prompt, modality):
...
@@ -649,7 +895,7 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, prompt, modality):
cur_image
.
putpixel
((
0
,
0
),
new_val
)
cur_image
.
putpixel
((
0
,
0
),
new_val
)
inputs
.
append
({
inputs
.
append
({
"prompt"
:
prompt
,
"prompt"
:
prompt
s
[
i
%
len
(
prompts
)]
,
"multi_modal_data"
:
{
"multi_modal_data"
:
{
modality
:
cur_image
modality
:
cur_image
}
}
...
@@ -666,41 +912,55 @@ def main(args):
...
@@ -666,41 +912,55 @@ def main(args):
modality
=
args
.
modality
modality
=
args
.
modality
mm_input
=
get_multi_modal_input
(
args
)
mm_input
=
get_multi_modal_input
(
args
)
data
=
mm_input
[
"data"
]
data
=
mm_input
[
"data"
]
question
=
mm_input
[
"question"
]
question
s
=
mm_input
[
"question
s
"
]
llm
,
prompt
,
stop_token_ids
=
model_example_map
[
model
](
question
,
modality
)
req_data
=
model_example_map
[
model
](
questions
,
modality
)
engine_args
=
asdict
(
req_data
.
engine_args
)
|
{
"seed"
:
args
.
seed
}
llm
=
LLM
(
**
engine_args
)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if
req_data
.
lora_requests
:
for
lora_request
in
req_data
.
lora_requests
:
llm
.
llm_engine
.
add_lora
(
lora_request
=
lora_request
)
# Don't want to check the flag multiple times, so just hijack `prompts`.
prompts
=
req_data
.
prompts
if
args
.
use_different_prompt_per_request
else
[
req_data
.
prompts
[
0
]
]
# We set temperature to 0.2 so that outputs can be different
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
# even when all prompts are identical when running batch inference.
sampling_params
=
SamplingParams
(
temperature
=
0.2
,
sampling_params
=
SamplingParams
(
temperature
=
0.2
,
max_tokens
=
64
,
max_tokens
=
64
,
stop_token_ids
=
stop_token_ids
)
stop_token_ids
=
req_data
.
stop_token_ids
)
assert
args
.
num_prompts
>
0
assert
args
.
num_prompts
>
0
if
args
.
num_prompts
==
1
:
if
args
.
num_prompts
==
1
:
# Single inference
# Single inference
inputs
=
{
inputs
=
{
"prompt"
:
prompt
,
"prompt"
:
prompt
s
[
0
]
,
"multi_modal_data"
:
{
"multi_modal_data"
:
{
modality
:
data
modality
:
data
},
},
}
}
else
:
else
:
# Batch inference
# Batch inference
if
args
.
image_repeat_prob
is
not
None
:
if
args
.
image_repeat_prob
is
not
None
:
# Repeat images with specified probability of "image_repeat_prob"
# Repeat images with specified probability of "image_repeat_prob"
inputs
=
apply_image_repeat
(
args
.
image_repeat_prob
,
inputs
=
apply_image_repeat
(
args
.
image_repeat_prob
,
args
.
num_prompts
,
data
,
prompt
,
args
.
num_prompts
,
data
,
prompt
s
,
modality
)
modality
)
else
:
else
:
# Use the same image for all prompts
# Use the same image for all prompts
inputs
=
[{
inputs
=
[{
"prompt"
:
prompt
,
"prompt"
:
prompt
s
[
i
%
len
(
prompts
)]
,
"multi_modal_data"
:
{
"multi_modal_data"
:
{
modality
:
data
modality
:
data
},
},
}
for
_
in
range
(
args
.
num_prompts
)]
}
for
i
in
range
(
args
.
num_prompts
)]
if
args
.
time_generate
:
if
args
.
time_generate
:
import
time
import
time
...
@@ -740,6 +1000,10 @@ if __name__ == "__main__":
...
@@ -740,6 +1000,10 @@ if __name__ == "__main__":
type
=
int
,
type
=
int
,
default
=
16
,
default
=
16
,
help
=
'Number of frames to extract from the video.'
)
help
=
'Number of frames to extract from the video.'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
parser
.
add_argument
(
parser
.
add_argument
(
'--image-repeat-prob'
,
'--image-repeat-prob'
,
...
@@ -758,5 +1022,11 @@ if __name__ == "__main__":
...
@@ -758,5 +1022,11 @@ if __name__ == "__main__":
action
=
'store_true'
,
action
=
'store_true'
,
help
=
'If True, then print the total generate() call time'
)
help
=
'If True, then print the total generate() call time'
)
parser
.
add_argument
(
'--use-different-prompt-per-request'
,
action
=
'store_true'
,
help
=
'If True, then use different prompt (with the same multi-modal '
'data) for each request.'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
examples/offline_inference/vision_language_embedding.py
View file @
469e903b
...
@@ -7,11 +7,12 @@ For most models, the prompt format should follow corresponding examples
...
@@ -7,11 +7,12 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
on HuggingFace model repository.
"""
"""
from
argparse
import
Namespace
from
argparse
import
Namespace
from
dataclasses
import
asdict
from
typing
import
Literal
,
NamedTuple
,
Optional
,
TypedDict
,
Union
,
get_args
from
typing
import
Literal
,
NamedTuple
,
Optional
,
TypedDict
,
Union
,
get_args
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
vllm
import
LLM
from
vllm
import
LLM
,
EngineArgs
from
vllm.multimodal.utils
import
fetch_image
from
vllm.multimodal.utils
import
fetch_image
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
...
@@ -37,12 +38,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]
...
@@ -37,12 +38,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]
class
ModelRequestData
(
NamedTuple
):
class
ModelRequestData
(
NamedTuple
):
llm
:
LLM
engine_args
:
EngineArgs
prompt
:
str
prompt
:
str
image
:
Optional
[
Image
]
image
:
Optional
[
Image
]
def
run_e5_v
(
query
:
Query
):
def
run_e5_v
(
query
:
Query
)
->
ModelRequestData
:
llama3_template
=
'<|start_header_id|>user<|end_header_id|>
\n\n
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
\n
'
# noqa: E501
llama3_template
=
'<|start_header_id|>user<|end_header_id|>
\n\n
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
\n
'
# noqa: E501
if
query
[
"modality"
]
==
"text"
:
if
query
[
"modality"
]
==
"text"
:
...
@@ -58,20 +59,20 @@ def run_e5_v(query: Query):
...
@@ -58,20 +59,20 @@ def run_e5_v(query: Query):
modality
=
query
[
'modality'
]
modality
=
query
[
'modality'
]
raise
ValueError
(
f
"Unsupported query modality: '
{
modality
}
'"
)
raise
ValueError
(
f
"Unsupported query modality: '
{
modality
}
'"
)
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"royokong/e5-v"
,
model
=
"royokong/e5-v"
,
task
=
"embed"
,
task
=
"embed"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
)
)
return
ModelRequestData
(
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
prompt
=
prompt
,
image
=
image
,
image
=
image
,
)
)
def
run_vlm2vec
(
query
:
Query
):
def
run_vlm2vec
(
query
:
Query
)
->
ModelRequestData
:
if
query
[
"modality"
]
==
"text"
:
if
query
[
"modality"
]
==
"text"
:
text
=
query
[
"text"
]
text
=
query
[
"text"
]
prompt
=
f
"Find me an everyday image that matches the given caption:
{
text
}
"
# noqa: E501
prompt
=
f
"Find me an everyday image that matches the given caption:
{
text
}
"
# noqa: E501
...
@@ -87,7 +88,7 @@ def run_vlm2vec(query: Query):
...
@@ -87,7 +88,7 @@ def run_vlm2vec(query: Query):
modality
=
query
[
'modality'
]
modality
=
query
[
'modality'
]
raise
ValueError
(
f
"Unsupported query modality: '
{
modality
}
'"
)
raise
ValueError
(
f
"Unsupported query modality: '
{
modality
}
'"
)
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"TIGER-Lab/VLM2Vec-Full"
,
model
=
"TIGER-Lab/VLM2Vec-Full"
,
task
=
"embed"
,
task
=
"embed"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
@@ -95,7 +96,7 @@ def run_vlm2vec(query: Query):
...
@@ -95,7 +96,7 @@ def run_vlm2vec(query: Query):
)
)
return
ModelRequestData
(
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
prompt
=
prompt
,
image
=
image
,
image
=
image
,
)
)
...
@@ -126,15 +127,18 @@ def get_query(modality: QueryModality):
...
@@ -126,15 +127,18 @@ def get_query(modality: QueryModality):
raise
ValueError
(
msg
)
raise
ValueError
(
msg
)
def
run_encode
(
model
:
str
,
modality
:
QueryModality
):
def
run_encode
(
model
:
str
,
modality
:
QueryModality
,
seed
:
Optional
[
int
]
):
query
=
get_query
(
modality
)
query
=
get_query
(
modality
)
req_data
=
model_example_map
[
model
](
query
)
req_data
=
model_example_map
[
model
](
query
)
engine_args
=
asdict
(
req_data
.
engine_args
)
|
{
"seed"
:
seed
}
llm
=
LLM
(
**
engine_args
)
mm_data
=
{}
mm_data
=
{}
if
req_data
.
image
is
not
None
:
if
req_data
.
image
is
not
None
:
mm_data
[
"image"
]
=
req_data
.
image
mm_data
[
"image"
]
=
req_data
.
image
outputs
=
req_data
.
llm
.
embed
({
outputs
=
llm
.
embed
({
"prompt"
:
req_data
.
prompt
,
"prompt"
:
req_data
.
prompt
,
"multi_modal_data"
:
mm_data
,
"multi_modal_data"
:
mm_data
,
})
})
...
@@ -144,7 +148,7 @@ def run_encode(model: str, modality: QueryModality):
...
@@ -144,7 +148,7 @@ def run_encode(model: str, modality: QueryModality):
def
main
(
args
:
Namespace
):
def
main
(
args
:
Namespace
):
run_encode
(
args
.
model_name
,
args
.
modality
)
run_encode
(
args
.
model_name
,
args
.
modality
,
args
.
seed
)
model_example_map
=
{
model_example_map
=
{
...
@@ -167,5 +171,10 @@ if __name__ == "__main__":
...
@@ -167,5 +171,10 @@ if __name__ == "__main__":
default
=
"image"
,
default
=
"image"
,
choices
=
get_args
(
QueryModality
),
choices
=
get_args
(
QueryModality
),
help
=
'Modality of the input.'
)
help
=
'Modality of the input.'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
Prev
1
…
8
9
10
11
12
13
14
15
16
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment