Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fdcc4053
Unverified
Commit
fdcc4053
authored
Mar 01, 2025
by
Isotr0py
Committed by
GitHub
Feb 28, 2025
Browse files
[Doc] Consolidate `whisper` and `florence2` examples (#14050)
parent
8994dabc
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
210 additions
and
148 deletions
+210
-148
examples/offline_inference/audio_language.py
examples/offline_inference/audio_language.py
+50
-32
examples/offline_inference/encoder_decoder_multimodal.py
examples/offline_inference/encoder_decoder_multimodal.py
+158
-0
examples/offline_inference/florence2_inference.py
examples/offline_inference/florence2_inference.py
+0
-53
examples/offline_inference/whisper.py
examples/offline_inference/whisper.py
+0
-61
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper.py
+2
-2
No files found.
examples/offline_inference/audio_language.py
View file @
fdcc4053
...
@@ -24,25 +24,30 @@ question_per_audio_count = {
...
@@ -24,25 +24,30 @@ question_per_audio_count = {
# Unless specified, these settings have been tested to work on a single L4.
# Unless specified, these settings have been tested to work on a single L4.
# Ultravox 0.5-1B
# MiniCPM-O
def
run_ultravox
(
question
:
str
,
audio_count
:
int
):
def
run_minicpmo
(
question
:
str
,
audio_count
:
int
):
model_name
=
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
model_name
=
"openbmb/MiniCPM-o-2_6"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
llm
=
LLM
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
limit_mm_per_prompt
=
{
"audio"
:
audio_count
})
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
stop_tokens
=
[
'<|im_end|>'
,
'<|endoftext|>'
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
audio_placeholder
=
"(<audio>./</audio>)"
*
audio_count
audio_chat_template
=
"{% for message in messages %}{{'<|im_start|>' + message['role'] + '
\n
' + message['content'] + '<|im_end|>' + '
\n
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
\n
<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"
# noqa: E501
messages
=
[{
messages
=
[{
'role'
:
'user'
,
'role'
:
'user'
,
'content'
:
"<|audio|>
\n
"
*
audio_count
+
question
'content'
:
f
'
{
audio_placeholder
}
\n
{
question
}
'
}]
}]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
tokenize
=
False
,
add_generation_prompt
=
True
)
add_generation_prompt
=
True
,
chat_template
=
audio_chat_template
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"audio"
:
audio_count
})
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
llm
,
prompt
,
stop_token_ids
...
@@ -68,36 +73,49 @@ def run_qwen2_audio(question: str, audio_count: int):
...
@@ -68,36 +73,49 @@ def run_qwen2_audio(question: str, audio_count: int):
return
llm
,
prompt
,
stop_token_ids
return
llm
,
prompt
,
stop_token_ids
def
run_minicpmo
(
question
:
str
,
audio_count
:
int
):
# Ultravox 0.5-1B
model_name
=
"openbmb/MiniCPM-o-2_6"
def
run_ultravox
(
question
:
str
,
audio_count
:
int
):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
model_name
=
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
trust_remote_code
=
True
)
llm
=
LLM
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
limit_mm_per_prompt
=
{
"audio"
:
audio_count
})
stop_tokens
=
[
'<|im_end|>'
,
'<|endoftext|>'
]
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
audio_placeholder
=
"(<audio>./</audio>)"
*
audio_count
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
audio_chat_template
=
"{% for message in messages %}{{'<|im_start|>' + message['role'] + '
\n
' + message['content'] + '<|im_end|>' + '
\n
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
\n
<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"
# noqa: E501
messages
=
[{
messages
=
[{
'role'
:
'user'
,
'role'
:
'user'
,
'content'
:
f
'
{
audio
_placeholder
}
\n
{
question
}
'
'content'
:
"<|
audio
|>
\n
"
*
audio_count
+
question
}]
}]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
tokenize
=
False
,
add_generation_prompt
=
True
,
add_generation_prompt
=
True
)
chat_template
=
audio_chat_template
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"audio"
:
audio_count
})
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
# Whisper
def
run_whisper
(
question
:
str
,
audio_count
:
int
):
assert
audio_count
==
1
,
(
"Whisper only support single audio input per prompt"
)
model_name
=
"openai/whisper-large-v3-turbo"
prompt
=
"<|startoftranscript|>"
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
448
,
max_num_seqs
=
5
,
limit_mm_per_prompt
=
{
"audio"
:
audio_count
})
stop_token_ids
=
None
return
llm
,
prompt
,
stop_token_ids
return
llm
,
prompt
,
stop_token_ids
model_example_map
=
{
model_example_map
=
{
"
ultravox"
:
run_ultravox
,
"
minicpmo"
:
run_minicpmo
,
"qwen2_audio"
:
run_qwen2_audio
,
"qwen2_audio"
:
run_qwen2_audio
,
"minicpmo"
:
run_minicpmo
"ultravox"
:
run_ultravox
,
"whisper"
:
run_whisper
,
}
}
...
...
examples/offline_inference/encoder_decoder_multimodal.py
0 → 100644
View file @
fdcc4053
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import
time
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.utils
import
FlexibleArgumentParser
def
run_florence2
():
# Create a Florence-2 encoder/decoder model instance
llm
=
LLM
(
model
=
"microsoft/Florence-2-large"
,
tokenizer
=
"facebook/bart-large"
,
max_num_seqs
=
8
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
1
},
dtype
=
"half"
,
)
prompts
=
[
{
# implicit prompt with task token
"prompt"
:
"<DETAILED_CAPTION>"
,
"multi_modal_data"
:
{
"image"
:
ImageAsset
(
"stop_sign"
).
pil_image
},
},
{
# explicit encoder/decoder prompt
"encoder_prompt"
:
{
"prompt"
:
"Describe in detail what is shown in the image."
,
"multi_modal_data"
:
{
"image"
:
ImageAsset
(
"cherry_blossom"
).
pil_image
},
},
"decoder_prompt"
:
""
,
},
]
return
llm
,
prompts
def
run_mllama
():
# Create a Mllama encoder/decoder model instance
llm
=
LLM
(
model
=
"meta-llama/Llama-3.2-11B-Vision-Instruct"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
"image"
:
1
},
dtype
=
"half"
,
)
prompts
=
[
{
# Implicit prompt
"prompt"
:
"<|image|><|begin_of_text|>What is the content of this image?"
,
# noqa: E501
"multi_modal_data"
:
{
"image"
:
ImageAsset
(
"stop_sign"
).
pil_image
,
},
},
{
# Explicit prompt
"encoder_prompt"
:
{
"prompt"
:
"<|image|>"
,
"multi_modal_data"
:
{
"image"
:
ImageAsset
(
"stop_sign"
).
pil_image
,
},
},
"decoder_prompt"
:
"<|image|><|begin_of_text|>Please describe the image."
,
# noqa: E501
},
]
return
llm
,
prompts
def
run_whisper
():
# Create a Whisper encoder/decoder model instance
llm
=
LLM
(
model
=
"openai/whisper-large-v3-turbo"
,
max_model_len
=
448
,
max_num_seqs
=
16
,
limit_mm_per_prompt
=
{
"audio"
:
1
},
dtype
=
"half"
,
)
prompts
=
[
{
# Test implicit prompt
"prompt"
:
"<|startoftranscript|>"
,
"multi_modal_data"
:
{
"audio"
:
AudioAsset
(
"mary_had_lamb"
).
audio_and_sample_rate
,
},
},
{
# Test explicit encoder/decoder prompt
"encoder_prompt"
:
{
"prompt"
:
""
,
"multi_modal_data"
:
{
"audio"
:
AudioAsset
(
"winning_call"
).
audio_and_sample_rate
,
},
},
"decoder_prompt"
:
"<|startoftranscript|>"
,
}
]
return
llm
,
prompts
model_example_map
=
{
"florence2"
:
run_florence2
,
"mllama"
:
run_mllama
,
"whisper"
:
run_whisper
,
}
def
main
(
args
):
model
=
args
.
model_type
if
model
not
in
model_example_map
:
raise
ValueError
(
f
"Model type
{
model
}
is not supported."
)
llm
,
prompts
=
model_example_map
[
model
]()
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
1.0
,
max_tokens
=
64
,
)
start
=
time
.
time
()
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Decoder prompt:
{
prompt
!
r
}
, "
f
"Generated text:
{
generated_text
!
r
}
"
)
duration
=
time
.
time
()
-
start
print
(
"Duration:"
,
duration
)
print
(
"RPS:"
,
len
(
prompts
)
/
duration
)
if
__name__
==
"__main__"
:
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models for text generation'
)
parser
.
add_argument
(
'--model-type'
,
'-m'
,
type
=
str
,
default
=
"mllama"
,
choices
=
model_example_map
.
keys
(),
help
=
'Huggingface "model_type".'
)
args
=
parser
.
parse_args
()
main
(
args
)
examples/offline_inference/florence2_inference.py
deleted
100644 → 0
View file @
8994dabc
# SPDX-License-Identifier: Apache-2.0
"""
Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2
"""
# TODO(Isotr0py):
# Move to offline_inference/vision_language.py
# after porting vision backbone
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
# Create a Florence-2 encoder/decoder model instance
llm
=
LLM
(
model
=
"microsoft/Florence-2-large"
,
tokenizer
=
"facebook/bart-large"
,
max_num_seqs
=
8
,
trust_remote_code
=
True
,
)
prompts
=
[
{
# implicit prompt with task token
"prompt"
:
"<DETAILED_CAPTION>"
,
"multi_modal_data"
:
{
"image"
:
ImageAsset
(
"stop_sign"
).
pil_image
},
},
{
# explicit encoder/decoder prompt
"encoder_prompt"
:
{
"prompt"
:
"Describe in detail what is shown in the image."
,
"multi_modal_data"
:
{
"image"
:
ImageAsset
(
"cherry_blossom"
).
pil_image
},
},
"decoder_prompt"
:
""
,
},
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
1.0
,
min_tokens
=
0
,
max_tokens
=
128
,
)
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Generated text:
{
generated_text
!
r
}
"
)
examples/offline_inference/whisper.py
deleted
100644 → 0
View file @
8994dabc
# SPDX-License-Identifier: Apache-2.0
import
time
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
# Create a Whisper encoder/decoder model instance
llm
=
LLM
(
model
=
"openai/whisper-large-v3"
,
max_model_len
=
448
,
max_num_seqs
=
400
,
limit_mm_per_prompt
=
{
"audio"
:
1
},
kv_cache_dtype
=
"fp8"
,
)
prompts
=
[
{
"prompt"
:
"<|startoftranscript|>"
,
"multi_modal_data"
:
{
"audio"
:
AudioAsset
(
"mary_had_lamb"
).
audio_and_sample_rate
,
},
},
{
# Test explicit encoder/decoder prompt
"encoder_prompt"
:
{
"prompt"
:
""
,
"multi_modal_data"
:
{
"audio"
:
AudioAsset
(
"winning_call"
).
audio_and_sample_rate
,
},
},
"decoder_prompt"
:
"<|startoftranscript|>"
,
}
]
*
1024
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
1.0
,
max_tokens
=
200
,
)
start
=
time
.
time
()
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
encoder_prompt
=
output
.
encoder_prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Encoder prompt:
{
encoder_prompt
!
r
}
, "
f
"Decoder prompt:
{
prompt
!
r
}
, "
f
"Generated text:
{
generated_text
!
r
}
"
)
duration
=
time
.
time
()
-
start
print
(
"Duration:"
,
duration
)
print
(
"RPS:"
,
len
(
prompts
)
/
duration
)
vllm/model_executor/models/whisper.py
View file @
fdcc4053
...
@@ -748,11 +748,11 @@ def _create_fake_bias_for_k_proj(
...
@@ -748,11 +748,11 @@ def _create_fake_bias_for_k_proj(
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]
)
->
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]:
)
->
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]:
"""
"""
Create full zeros bias for k_proj weight in self-att
entio
n layers.
Create full zeros bias for k_proj weight in self-att
n and x-att
n layers.
So that the bias for k_proj in qkv_proj can be initialized with zeros.
So that the bias for k_proj in qkv_proj can be initialized with zeros.
"""
"""
for
name
,
weight
in
weights
:
for
name
,
weight
in
weights
:
if
name
.
endswith
(
".
self_attn.
k_proj.weight"
):
if
name
.
endswith
(
".k_proj.weight"
):
bias
=
torch
.
zeros
(
weight
.
size
(
0
))
bias
=
torch
.
zeros
(
weight
.
size
(
0
))
bias_name
=
name
.
replace
(
"weight"
,
"bias"
)
bias_name
=
name
.
replace
(
"weight"
,
"bias"
)
yield
from
[(
name
,
weight
),
(
bias_name
,
bias
)]
yield
from
[(
name
,
weight
),
(
bias_name
,
bias
)]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment