Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
469e903b
Commit
469e903b
authored
Mar 28, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-dev
parents
389ebcf7
25f560a6
Changes
535
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
716 additions
and
242 deletions
+716
-242
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+164
-83
examples/offline_inference/whisper.py
examples/offline_inference/whisper.py
+0
-61
examples/online_serving/api_client.py
examples/online_serving/api_client.py
+4
-4
examples/online_serving/disaggregated_prefill.sh
examples/online_serving/disaggregated_prefill.sh
+9
-4
examples/online_serving/gradio_webserver.py
examples/online_serving/gradio_webserver.py
+1
-1
examples/online_serving/multi-node-serving.sh
examples/online_serving/multi-node-serving.sh
+94
-0
examples/online_serving/openai_chat_completion_structured_outputs.py
...line_serving/openai_chat_completion_structured_outputs.py
+24
-1
examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
...enai_chat_completion_structured_outputs_with_reasoning.py
+129
-0
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
...rving/openai_chat_completion_tool_calls_with_reasoning.py
+177
-0
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
...erving/openai_chat_completion_with_reasoning_streaming.py
+33
-56
examples/online_serving/openai_chat_embedding_client_for_multimodal.py
...ne_serving/openai_chat_embedding_client_for_multimodal.py
+1
-1
examples/online_serving/openai_embedding_client.py
examples/online_serving/openai_embedding_client.py
+1
-1
examples/online_serving/openai_transcription_client.py
examples/online_serving/openai_transcription_client.py
+51
-8
examples/online_serving/opentelemetry/dummy_client.py
examples/online_serving/opentelemetry/dummy_client.py
+0
-1
examples/online_serving/prometheus_grafana/grafana.json
examples/online_serving/prometheus_grafana/grafana.json
+4
-4
examples/other/logging_configuration.md
examples/other/logging_configuration.md
+3
-8
examples/other/tensorize_vllm_model.py
examples/other/tensorize_vllm_model.py
+5
-5
examples/template_deepseek_vl2.jinja
examples/template_deepseek_vl2.jinja
+3
-3
examples/template_teleflm.jinja
examples/template_teleflm.jinja
+12
-0
format.sh
format.sh
+1
-1
No files found.
Too many changes to show.
To preserve performance only
535 of 535+
files are displayed.
Plain diff
Email patch
examples/offline_inference/vision_language_multi_image.py
View file @
469e903b
...
...
@@ -4,13 +4,17 @@ This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
import
os
from
argparse
import
Namespace
from
typing
import
List
,
NamedTuple
,
Optional
from
dataclasses
import
asdict
from
typing
import
NamedTuple
,
Optional
from
huggingface_hub
import
snapshot_download
from
PIL.Image
import
Image
from
transformers
import
AutoProcessor
,
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
EngineArgs
,
SamplingParams
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.utils
import
fetch_image
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -22,11 +26,12 @@ IMAGE_URLS = [
class
ModelRequestData
(
NamedTuple
):
llm
:
LLM
engine_args
:
EngineArgs
prompt
:
str
stop_token_ids
:
Optional
[
List
[
int
]]
image_data
:
List
[
Image
]
chat_template
:
Optional
[
str
]
image_data
:
list
[
Image
]
stop_token_ids
:
Optional
[
list
[
int
]]
=
None
chat_template
:
Optional
[
str
]
=
None
lora_requests
:
Optional
[
list
[
LoRARequest
]]
=
None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
...
...
@@ -34,53 +39,91 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4.
def
load_aria
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_aria
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"rhymes-ai/Aria"
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"slow"
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)})
engine_args
=
EngineArgs
(
model
=
model_name
,
tokenizer_mode
=
"slow"
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
placeholders
=
"<fim_prefix><|img|><fim_suffix>
\n
"
*
len
(
image_urls
)
prompt
=
(
f
"<|im_start|>user
\n
{
placeholders
}{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
stop_token_ids
=
[
93532
,
93653
,
944
,
93421
,
1019
,
93653
,
93519
]
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
stop_token_ids
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
chat_template
=
None
,
)
def
load_deepseek_vl2
(
question
:
str
,
image_urls
:
List
[
str
]):
def
load_deepseek_vl2
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"deepseek-ai/deepseek-vl2-tiny"
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]},
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)})
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]},
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
placeholder
=
""
.
join
(
f
"image_
{
i
}
:<image>
\n
"
for
i
,
_
in
enumerate
(
image_urls
,
start
=
1
))
prompt
=
f
"<|User|>:
{
placeholder
}{
question
}
\n\n
<|Assistant|>:"
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
)
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for ``google/gemma-3-4b-it``.

    Args:
        question: Text placed after the image entries in the user message.
        image_urls: URLs of the images; one image entry is emitted per URL
            and the per-prompt image limit is set to ``len(image_urls)``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt
        rendered by the model's Hugging Face processor chat template, and
        the fetched images.
    """
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # User turn: every image first, then the single text entry.
    content = []
    for url in image_urls:
        content.append({"type": "image", "image": url})
    content.append({"type": "text", "text": question})
    chat = [{"role": "user", "content": content}]

    # Render the conversation to a plain string (no tokenization) and append
    # the generation prompt so the model answers as the assistant.
    hf_processor = AutoProcessor.from_pretrained(model_name)
    rendered = hf_processor.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered,
        stop_token_ids=None,
        image_data=images,
        chat_template=None,
    )
def
load_h2ovl
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_h2ovl
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"h2oai/h2ovl-mississippi-800m"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
...
...
@@ -103,19 +146,18 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
stop_token_ids
=
[
tokenizer
.
eos_token_id
]
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
stop_token_ids
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
chat_template
=
None
,
)
def
load_idefics3
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_idefics3
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU.
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
16
,
...
...
@@ -134,18 +176,16 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
for
i
,
_
in
enumerate
(
image_urls
,
start
=
1
))
prompt
=
f
"<|begin_of_text|>User:
{
placeholders
}
\n
{
question
}
<end_of_utterance>
\n
Assistant:"
# noqa: E501
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
None
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
chat_template
=
None
,
)
def
load_internvl
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_internvl
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"OpenGVLab/InternVL2-2B"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
...
...
@@ -171,19 +211,18 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
stop_token_ids
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
chat_template
=
None
,
)
def
load_mllama
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_mllama
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU.
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
16
,
...
...
@@ -193,19 +232,17 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
placeholders
=
"<|image|>"
*
len
(
image_urls
)
prompt
=
f
"
{
placeholders
}
<|begin_of_text|>
{
question
}
"
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
None
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
chat_template
=
None
,
)
def
load_nvlm_d
(
question
:
str
,
image_urls
:
L
ist
[
str
]):
def
load_nvlm_d
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
...
...
@@ -223,22 +260,19 @@ def load_nvlm_d(question: str, image_urls: List[str]):
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
stop_token_ids
=
None
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
stop_token_ids
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
chat_template
=
None
,
)
def
load_pixtral_hf
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_pixtral_hf
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"mistral-community/pixtral-12b"
# Adjust this as necessary to fit in GPU
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
...
...
@@ -248,18 +282,15 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
placeholders
=
"[IMG]"
*
len
(
image_urls
)
prompt
=
f
"<s>[INST]
{
question
}
\n
{
placeholders
}
[/INST]"
stop_token_ids
=
None
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
stop_token_ids
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
chat_template
=
None
,
)
def
load_phi3v
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_phi3v
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame.
...
...
@@ -272,7 +303,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
#
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
...
...
@@ -283,21 +314,50 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
placeholders
=
"
\n
"
.
join
(
f
"<|image_
{
i
}
|>"
for
i
,
_
in
enumerate
(
image_urls
,
start
=
1
))
prompt
=
f
"<|user|>
\n
{
placeholders
}
\n
{
question
}
<|end|>
\n
<|assistant|>
\n
"
stop_token_ids
=
None
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
stop_token_ids
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
chat_template
=
None
,
)
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for Phi-4-multimodal-instruct.

    Phi-4-multimodal-instruct supports both image and audio inputs; this
    loader only shows how to pass several images.

    Args:
        question: Text appended after the image placeholders.
        image_urls: URLs of the images; placeholders ``<|image_1|>`` ..
            ``<|image_N|>`` are generated in the same order.

    Returns:
        A ``ModelRequestData`` with the engine arguments (LoRA enabled), the
        prompt, the fetched images and the vision LoRA request.
    """
    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")

    # The vision-lora and speech-lora weights co-exist with the base model in
    # the snapshot, so the LoRA directory has to be pointed at explicitly.
    vision_lora_path = os.path.join(model_path, "vision-lora")

    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=10000,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
    )

    # Placeholders are numbered from 1, one per image.
    image_tags = [
        f"<|image_{index}|>" for index in range(1, len(image_urls) + 1)
    ]
    placeholders = "".join(image_tags)
    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

    images = [fetch_image(url) for url in image_urls]
    vision_lora = LoRARequest("vision", 1, vision_lora_path)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
        lora_requests=[vision_lora],
    )
def
load_qwen_vl_chat
(
question
:
str
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
model_name
=
"Qwen/Qwen-VL-Chat"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
1024
,
...
...
@@ -328,7 +388,7 @@ def load_qwen_vl_chat(question: str,
stop_token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
i
)
for
i
in
stop_tokens
]
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
stop_token_ids
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
...
...
@@ -336,7 +396,7 @@ def load_qwen_vl_chat(question: str,
)
def
load_qwen2_vl
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_qwen2_vl
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
try
:
from
qwen_vl_utils
import
process_vision_info
except
ModuleNotFoundError
:
...
...
@@ -348,7 +408,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
model_name
=
"Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
32768
if
process_vision_info
is
None
else
4096
,
max_num_seqs
=
5
,
...
...
@@ -377,23 +437,19 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
tokenize
=
False
,
add_generation_prompt
=
True
)
stop_token_ids
=
None
if
process_vision_info
is
None
:
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
]
else
:
image_data
,
_
=
process_vision_info
(
messages
)
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
stop_token_ids
,
image_data
=
image_data
,
chat_template
=
None
,
)
def
load_qwen2_5_vl
(
question
,
image_urls
:
L
ist
[
str
])
->
ModelRequestData
:
def
load_qwen2_5_vl
(
question
:
str
,
image_urls
:
l
ist
[
str
])
->
ModelRequestData
:
try
:
from
qwen_vl_utils
import
process_vision_info
except
ModuleNotFoundError
:
...
...
@@ -404,7 +460,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
model_name
=
"Qwen/Qwen2.5-VL-3B-Instruct"
llm
=
LLM
(
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
32768
if
process_vision_info
is
None
else
4096
,
max_num_seqs
=
5
,
...
...
@@ -433,32 +489,30 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
tokenize
=
False
,
add_generation_prompt
=
True
)
stop_token_ids
=
None
if
process_vision_info
is
None
:
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
]
else
:
image_data
,
_
=
process_vision_info
(
messages
,
return_video_
sample_fp
s
=
False
)
return_video_
kwarg
s
=
False
)
return
ModelRequestData
(
llm
=
llm
,
engine_args
=
engine_args
,
prompt
=
prompt
,
stop_token_ids
=
stop_token_ids
,
image_data
=
image_data
,
chat_template
=
None
,
)
model_example_map
=
{
"aria"
:
load_aria
,
"deepseek_vl_v2"
:
load_deepseek_vl2
,
"gemma3"
:
load_gemma3
,
"h2ovl_chat"
:
load_h2ovl
,
"idefics3"
:
load_idefics3
,
"internvl_chat"
:
load_internvl
,
"mllama"
:
load_mllama
,
"NVLM_D"
:
load_nvlm_d
,
"phi3_v"
:
load_phi3v
,
"phi4_mm"
:
load_phi4mm
,
"pixtral_hf"
:
load_pixtral_hf
,
"qwen_vl_chat"
:
load_qwen_vl_chat
,
"qwen2_vl"
:
load_qwen2_vl
,
...
...
@@ -466,14 +520,25 @@ model_example_map = {
}
def
run_generate
(
model
,
question
:
str
,
image_urls
:
List
[
str
]):
def
run_generate
(
model
,
question
:
str
,
image_urls
:
list
[
str
],
seed
:
Optional
[
int
]):
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
engine_args
=
asdict
(
req_data
.
engine_args
)
|
{
"seed"
:
args
.
seed
}
llm
=
LLM
(
**
engine_args
)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if
req_data
.
lora_requests
:
for
lora_request
in
req_data
.
lora_requests
:
llm
.
llm_engine
.
add_lora
(
lora_request
=
lora_request
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
128
,
stop_token_ids
=
req_data
.
stop_token_ids
)
outputs
=
req_data
.
llm
.
generate
(
outputs
=
llm
.
generate
(
{
"prompt"
:
req_data
.
prompt
,
"multi_modal_data"
:
{
...
...
@@ -487,13 +552,24 @@ def run_generate(model, question: str, image_urls: List[str]):
print
(
generated_text
)
def
run_chat
(
model
:
str
,
question
:
str
,
image_urls
:
List
[
str
]):
def
run_chat
(
model
:
str
,
question
:
str
,
image_urls
:
list
[
str
],
seed
:
Optional
[
int
]):
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
engine_args
=
asdict
(
req_data
.
engine_args
)
|
{
"seed"
:
seed
}
llm
=
LLM
(
**
engine_args
)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if
req_data
.
lora_requests
:
for
lora_request
in
req_data
.
lora_requests
:
llm
.
llm_engine
.
add_lora
(
lora_request
=
lora_request
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
128
,
stop_token_ids
=
req_data
.
stop_token_ids
)
outputs
=
req_data
.
llm
.
chat
(
outputs
=
llm
.
chat
(
[{
"role"
:
"user"
,
...
...
@@ -522,11 +598,12 @@ def run_chat(model: str, question: str, image_urls: List[str]):
def
main
(
args
:
Namespace
):
model
=
args
.
model_type
method
=
args
.
method
seed
=
args
.
seed
if
method
==
"generate"
:
run_generate
(
model
,
QUESTION
,
IMAGE_URLS
)
run_generate
(
model
,
QUESTION
,
IMAGE_URLS
,
seed
)
elif
method
==
"chat"
:
run_chat
(
model
,
QUESTION
,
IMAGE_URLS
)
run_chat
(
model
,
QUESTION
,
IMAGE_URLS
,
seed
)
else
:
raise
ValueError
(
f
"Invalid method:
{
method
}
"
)
...
...
@@ -547,6 +624,10 @@ if __name__ == "__main__":
default
=
"generate"
,
choices
=
[
"generate"
,
"chat"
],
help
=
"The method to run in `vllm.LLM`."
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
args
=
parser
.
parse_args
()
main
(
args
)
examples/offline_inference/whisper.py
deleted
100644 → 0
View file @
389ebcf7
# SPDX-License-Identifier: Apache-2.0
import
time
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
# Build the Whisper encoder/decoder engine used for the whole run.
llm = LLM(
    model="openai/whisper-large-v3",
    max_model_len=448,
    max_num_seqs=400,
    limit_mm_per_prompt={"audio": 1},
    kv_cache_dtype="fp8",
)

# Request 1: decoder-only prompt form with the audio attached directly.
implicit_prompt = {
    "prompt": "<|startoftranscript|>",
    "multi_modal_data": {
        "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
    },
}

# Request 2: explicit encoder/decoder prompt form.
explicit_prompt = {
    "encoder_prompt": {
        "prompt": "",
        "multi_modal_data": {
            "audio": AudioAsset("winning_call").audio_and_sample_rate,
        },
    },
    "decoder_prompt": "<|startoftranscript|>",
}

# The pair is repeated 1024 times to form the batch submitted below.
prompts = [implicit_prompt, explicit_prompt] * 1024

# Sampling configuration shared by every request.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1.0,
    max_tokens=200,
)

start = time.time()

# `generate` returns one RequestOutput per prompt, carrying the prompt,
# the generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Report each result; note that printing is included in the timed section.
for output in outputs:
    print(f"Encoder prompt: {output.encoder_prompt!r}, "
          f"Decoder prompt: {output.prompt!r}, "
          f"Generated text: {output.outputs[0].text!r}")

duration = time.time() - start

print("Duration:", duration)
print("RPS:", len(prompts) / duration)
examples/online_serving/api_client.py
View file @
469e903b
...
...
@@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.
import
argparse
import
json
from
typing
import
Iterable
,
List
from
collections.abc
import
Iterable
import
requests
...
...
@@ -39,17 +39,17 @@ def post_http_request(prompt: str,
return
response
def
get_streaming_response
(
response
:
requests
.
Response
)
->
Iterable
[
L
ist
[
str
]]:
def
get_streaming_response
(
response
:
requests
.
Response
)
->
Iterable
[
l
ist
[
str
]]:
for
chunk
in
response
.
iter_lines
(
chunk_size
=
8192
,
decode_unicode
=
False
,
delimiter
=
b
"
\
0
"
):
delimiter
=
b
"
\
n
"
):
if
chunk
:
data
=
json
.
loads
(
chunk
.
decode
(
"utf-8"
))
output
=
data
[
"text"
]
yield
output
def
get_response
(
response
:
requests
.
Response
)
->
L
ist
[
str
]:
def
get_response
(
response
:
requests
.
Response
)
->
l
ist
[
str
]:
data
=
json
.
loads
(
response
.
content
)
output
=
data
[
"text"
]
return
output
...
...
examples/online_serving/disaggregated_prefill.sh
View file @
469e903b
...
...
@@ -8,6 +8,9 @@ set -xe
echo
"🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep
1
# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
MODEL_NAME
=
${
HF_MODEL_NAME
:-
meta
-llama/Meta-Llama-3.1-8B-Instruct
}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap
'cleanup'
INT
...
...
@@ -44,18 +47,20 @@ wait_for_server() {
# You can also adjust --kv-ip and --kv-port for distributed inference.
# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES
=
0 vllm serve
meta-llama/Meta-Llama-3.1-8B-Instruct
\
CUDA_VISIBLE_DEVICES
=
0 vllm serve
$MODEL_NAME
\
--port
8100
\
--max-model-len
100
\
--gpu-memory-utilization
0.8
\
--trust-remote-code
\
--kv-transfer-config
\
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
&
# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES
=
1 vllm serve
meta-llama/Meta-Llama-3.1-8B-Instruct
\
CUDA_VISIBLE_DEVICES
=
1 vllm serve
$MODEL_NAME
\
--port
8200
\
--max-model-len
100
\
--gpu-memory-utilization
0.8
\
--trust-remote-code
\
--kv-transfer-config
\
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
&
...
...
@@ -78,7 +83,7 @@ sleep 1
output1
=
$(
curl
-X
POST
-s
http://localhost:8000/v1/completions
\
-H
"Content-Type: application/json"
\
-d
'{
"model": "
meta-llama/Meta-Llama-3.1-8B-Instruct
",
"model": "
'
"
$MODEL_NAME
"
'
",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
...
...
@@ -87,7 +92,7 @@ output1=$(curl -X POST -s http://localhost:8000/v1/completions \
output2
=
$(
curl
-X
POST
-s
http://localhost:8000/v1/completions
\
-H
"Content-Type: application/json"
\
-d
'{
"model": "
meta-llama/Meta-Llama-3.1-8B-Instruct
",
"model": "
'
"
$MODEL_NAME
"
'
",
"prompt": "Santa Clara is a",
"max_tokens": 10,
"temperature": 0
...
...
examples/online_serving/gradio_webserver.py
View file @
469e903b
...
...
@@ -21,7 +21,7 @@ def http_bot(prompt):
for
chunk
in
response
.
iter_lines
(
chunk_size
=
8192
,
decode_unicode
=
False
,
delimiter
=
b
"
\
0
"
):
delimiter
=
b
"
\
n
"
):
if
chunk
:
data
=
json
.
loads
(
chunk
.
decode
(
"utf-8"
))
output
=
data
[
"text"
][
0
]
...
...
examples/online_serving/multi-node-serving.sh
0 → 100644
View file @
469e903b
#!/bin/bash
#
# Start one node of a multi-node Ray cluster.
#
# Usage:
#   multi-node-serving.sh leader --ray_cluster_size=<n> \
#       [--ray_port=<port>] [--ray_init_timeout=<seconds>] [ray start args...]
#   multi-node-serving.sh worker --ray_address=<head address> \
#       [--ray_port=<port>] [--ray_init_timeout=<seconds>] [ray start args...]
#
# Any argument that is not recognised above is forwarded to `ray start`.
# Both subcommands retry every 5 seconds until --ray_init_timeout has elapsed
# and exit with status 1 on timeout or on a missing required argument.

subcommand=$1
shift

ray_port=6379
ray_init_timeout=300
declare -a start_params

case "$subcommand" in
  worker)
    ray_address=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_address=*)
          ray_address="${1#*=}"
          ;;
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          # Unknown options are passed through to `ray start` untouched.
          start_params+=("$1")
          ;;
      esac
      shift
    done

    if [ -z "$ray_address" ]; then
      echo "Error: Missing argument --ray_address"
      exit 1
    fi

    # Retry joining the head node; each failed attempt waits 5 seconds.
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      # Test the command directly instead of inspecting $? afterwards, and
      # quote the address so it is always passed as a single argument.
      if ray start --address="$ray_address:$ray_port" --block "${start_params[@]}"; then
        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
        exit 0
      fi
      echo "Waiting until the ray worker is active..."
      sleep 5s
    done
    echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
    exit 1
    ;;

  leader)
    ray_cluster_size=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_cluster_size=*)
          ray_cluster_size="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          # Unknown options are passed through to `ray start` untouched.
          start_params+=("$1")
          ;;
      esac
      shift
    done

    if [ -z "$ray_cluster_size" ]; then
      echo "Error: Missing argument --ray_cluster_size"
      exit 1
    fi

    # start the ray daemon
    ray start --head --port="$ray_port" "${start_params[@]}"

    # wait until all workers are active
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
      # If the probe printed nothing (e.g. python failed), count zero nodes
      # so the comparison below stays a valid integer test.
      active_nodes=${active_nodes:-0}
      if [ "$active_nodes" -eq "$ray_cluster_size" ]; then
        echo "All ray workers are active and the ray cluster is initialized successfully."
        exit 0
      fi
      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
      sleep 5s
    done

    echo "Waiting for all ray workers to be active timed out."
    exit 1
    ;;

  *)
    echo "unknown subcommand: $subcommand"
    exit 1
    ;;
esac
examples/online_serving/openai_chat_completion_structured_outputs.py
View file @
469e903b
...
...
@@ -2,7 +2,7 @@
from
enum
import
Enum
from
openai
import
OpenAI
from
openai
import
BadRequestError
,
OpenAI
from
pydantic
import
BaseModel
client
=
OpenAI
(
...
...
@@ -94,3 +94,26 @@ completion = client.chat.completions.create(
extra_body
=
{
"guided_grammar"
:
simplified_sql_grammar
},
)
print
(
completion
.
choices
[
0
].
message
.
content
)
# Extra backend options
# The adjacent literals are concatenated, so each fragment ends with a space
# to keep the sentences from running together in the final prompt.
prompt = ("Generate an email address for Alan Turing, who works in Enigma. "
          "End in .com and new line. Example result: "
          "alan.turing@enigma.com\n")

try:
    # The no-fallback option forces vLLM to use xgrammar, so when it fails
    # you get a 400 with the reason why
    completion = client.chat.completions.create(
        model="Qwen/Qwen2.5-3B-Instruct",
        messages=[{
            "role": "user",
            "content": prompt,
        }],
        extra_body={
            # Backslashes are doubled so the pattern contains no invalid
            # escape sequences; the value sent to the server is unchanged
            # (literal `\w`, `\.` and a trailing newline character).
            "guided_regex": "\\w+@\\w+\\.com\n",
            "stop": ["\n"],
            "guided_decoding_backend": "xgrammar:no-fallback"
        },
    )
except BadRequestError as e:
    # The request is expected to be rejected; show the server's reason.
    print("This error is expected:", e)
examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
"""
An example shows how to generate structured outputs from reasoning models
like DeepSeekR1. The thinking process will not be guided by the JSON
schema provided by the user. Only the final output will be structured.
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
\
--enable-reasoning --reasoning-parser deepseek_r1
```
This example demonstrates how to generate chat completions from reasoning models
using the OpenAI Python client library.
"""
from
enum
import
Enum
from
openai
import
OpenAI
from
pydantic
import
BaseModel
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
# Guided decoding by Regex
prompt
=
(
"What is the capital of France?"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_regex"
:
"(Paris|London)"
,
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
# Pydantic model describing the JSON object the model is asked to produce:
# a person's name and age.
# NOTE: documented with comments rather than a class docstring on purpose,
# since pydantic copies a model docstring into the generated JSON schema.
class People(BaseModel):
    name: str
    age: int
json_schema
=
People
.
model_json_schema
()
prompt
=
(
"Generate a JSON with the name and age of one random person."
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
# Guided decoding by JSON using Pydantic schema
# String-valued enumeration of the car body types allowed in CarDescription.
# Member values are the exact strings that appear in the JSON schema
# (note the mixed casing: "sedan" vs. "SUV"/"Truck"/"Coupe").
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"
# Pydantic model describing the JSON object the model is asked to produce:
# a car's brand, model name and body type (restricted to CarType values).
# NOTE: documented with comments rather than a class docstring on purpose,
# since pydantic copies a model docstring into the generated JSON schema.
class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType
json_schema
=
CarDescription
.
model_json_schema
()
# Adjacent string literals are concatenated verbatim, so the first fragment
# needs a trailing space; without it the prompt reads "car_type ofthe most".
prompt = ("Generate a JSON with the brand, model and car_type of "
          "the most iconic car from the 90's")
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
# Guided decoding by Grammar
simplified_sql_grammar
=
"""
?start: select_statement
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
?table_name: identifier
?column_name: identifier
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
# Adjacent string literals are concatenated verbatim, so the first fragment
# needs a trailing space; without it the prompt reads "'email'from the".
prompt = ("Generate an SQL query to show the 'username' and 'email' "
          "from the 'users' table.")
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_grammar"
:
simplified_sql_grammar
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
"""
An example demonstrates how to use tool calling with reasoning models
like QwQ-32B. The reasoning_content will not be parsed by the tool
calling process; only the final output will be parsed.
To run this example, you need to start the vLLM server with both
the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B
\
--enable-reasoning --reasoning-parser deepseek_r1
\
--enable-auto-tool-choice --tool-call-parser hermes
```
"""
from
openai
import
OpenAI
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: str) -> str:
    """Return a canned weather report, simulating a real tool implementation.

    The arguments mirror the parameters declared in the tool schema sent to
    the model, but they are ignored: the same fixed sentence is returned for
    every call.

    Args:
        city: City to report on (unused).
        state: State the city is in (unused).
        unit: Temperature unit requested (unused).

    Returns:
        A fixed, human-readable weather description.
    """
    return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
            "partly cloudly, with highs in the 90's.")
available_tools
=
{
"get_current_weather"
:
get_current_weather
}
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
tools
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
,
"description"
:
"Get the current weather in a given location"
,
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
{
"city"
:
{
"type"
:
"string"
,
"description"
:
"The city to find the weather for, e.g. 'San Francisco'"
},
"state"
:
{
"type"
:
"string"
,
"description"
:
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit"
:
{
"type"
:
"string"
,
"description"
:
"The unit to fetch the temperature in"
,
"enum"
:
[
"celsius"
,
"fahrenheit"
]
}
},
"required"
:
[
"city"
,
"state"
,
"unit"
]
}
}
}]
messages
=
[{
"role"
:
"user"
,
"content"
:
"Hi! How are you doing today?"
},
{
"role"
:
"assistant"
,
"content"
:
"I'm doing well! How can I help you?"
},
{
"role"
:
"user"
,
"content"
:
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]
def extract_reasoning_and_calls(chunks: list):
    """Reassemble reasoning text and tool calls from streamed chunks.

    Only the first choice of each chunk, and only the first tool-call delta
    inside it, is inspected.

    Args:
        chunks: Streamed chunk objects; each must expose
            ``choices[0].delta`` with a ``tool_calls`` attribute.

    Returns:
        A ``(reasoning_content, arguments, function_names)`` tuple where
        ``reasoning_content`` is the concatenated reasoning text and the two
        lists hold, per tool call, the concatenated argument string and the
        function name.
    """
    reasoning_content = ""
    tool_call_idx = -1
    arguments = []
    function_names = []
    for chunk in chunks:
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            tool_call = delta.tool_calls[0]
            # A new index marks the start of another tool call: open a fresh
            # slot for its name and its incrementally streamed arguments.
            if tool_call.index != tool_call_idx:
                tool_call_idx = tool_call.index
                arguments.append("")
                function_names.append("")
            if tool_call.function:
                if tool_call.function.name:
                    function_names[tool_call_idx] = tool_call.function.name
                if tool_call.function.arguments:
                    arguments[tool_call_idx] += tool_call.function.arguments
        else:
            # The attribute may be absent or present-but-None; concatenating
            # None onto a str would raise TypeError, so only append real text.
            text = getattr(delta, "reasoning_content", None)
            if text:
                reasoning_content += text
    return reasoning_content, arguments, function_names
print
(
"---------Full Generate With Automatic Function Calling-------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
)
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name: "
f
"
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
name
}
"
)
print
(
f
"function arguments: "
f
"
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
arguments
}
"
)
print
(
"----------Stream Generate With Automatic Function Calling-----------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
True
)
chunks
=
[]
for
chunk
in
tool_calls_stream
:
chunks
.
append
(
chunk
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"----------Full Generate With Named Function Calling-----------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}
})
tool_call
=
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name:
{
tool_call
.
name
}
"
)
print
(
f
"function arguments:
{
tool_call
.
arguments
}
"
)
print
(
"----------Stream Generate With Named Function Calling--------------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}
},
stream
=
True
)
chunks
=
[]
for
chunk
in
tool_calls_stream
:
chunks
.
append
(
chunk
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"
\n\n
"
)
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
View file @
469e903b
...
...
@@ -19,73 +19,50 @@ in real-time as they are generated by the model. This is useful for scenarios
where you want to display chat completions to the user as they are generated
by the model.
Here we do not use the OpenAI Python client library, because it does not support
`reasoning_content` fields
in t
he response
.
Remember to check content and reasoning_content exist in `ChatCompletionChunk`,
content may not exist lead
in
g
t
o errors if you try to access it
.
"""
import
json
import
requests
from
openai
import
OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
models
=
requests
.
get
(
f
"
{
openai_api_base
}
/models"
,
headers
=
{
"Authorization"
:
f
"Bearer
{
openai_api_key
}
"
},
).
json
()
model
=
models
[
"data"
][
0
][
"id"
]
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
# Streaming chat completions
m
essages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
models
=
client
.
models
.
list
()
m
odel
=
models
.
data
[
0
].
id
response
=
requests
.
post
(
f
"
{
openai_api_base
}
/chat/completions"
,
headers
=
{
"Authorization"
:
f
"Bearer
{
openai_api_key
}
"
},
json
=
{
"model"
:
model
,
"messages"
:
messages
,
"stream"
:
True
},
)
messages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
stream
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
,
stream
=
True
)
print
(
"client: Start streaming chat completions..."
)
printed_reasoning_content
=
False
printed_content
=
False
# Make the streaming request
if
response
.
status_code
==
200
:
# Process the streaming response
for
line
in
response
.
iter_lines
():
if
line
:
# Filter out keep-alive new lines
# Decode the line and parse the JSON
decoded_line
=
line
.
decode
(
"utf-8"
)
if
decoded_line
.
startswith
(
"data:"
):
data
=
decoded_line
[
5
:].
strip
()
# Remove "data:" prefix
if
data
==
"[DONE]"
:
# End of stream
print
(
"
\n
client: Stream completed."
)
break
try
:
# Parse the JSON data
chunk
=
json
.
loads
(
data
)
reasoning_content
=
chunk
[
"choices"
][
0
][
"delta"
].
get
(
"reasoning_content"
,
""
)
content
=
chunk
[
"choices"
][
0
][
"delta"
].
get
(
"content"
,
""
)
if
reasoning_content
:
if
not
printed_reasoning_content
:
printed_reasoning_content
=
True
print
(
"reasoning_content:"
,
end
=
""
,
flush
=
True
)
print
(
reasoning_content
,
end
=
""
,
flush
=
True
)
elif
content
:
if
not
printed_content
:
printed_content
=
True
print
(
"
\n
content:"
,
end
=
""
,
flush
=
True
)
# Extract and print the content
print
(
content
,
end
=
""
,
flush
=
True
)
except
json
.
JSONDecodeError
:
print
(
"Error decoding JSON:"
,
decoded_line
)
else
:
print
(
f
"Error:
{
response
.
status_code
}
-
{
response
.
text
}
"
)
for
chunk
in
stream
:
reasoning_content
=
None
content
=
None
# Check the content is reasoning_content or content
if
hasattr
(
chunk
.
choices
[
0
].
delta
,
"reasoning_content"
):
reasoning_content
=
chunk
.
choices
[
0
].
delta
.
reasoning_content
elif
hasattr
(
chunk
.
choices
[
0
].
delta
,
"content"
):
content
=
chunk
.
choices
[
0
].
delta
.
content
if
reasoning_content
is
not
None
:
if
not
printed_reasoning_content
:
printed_reasoning_content
=
True
print
(
"reasoning_content:"
,
end
=
""
,
flush
=
True
)
print
(
reasoning_content
,
end
=
""
,
flush
=
True
)
elif
content
is
not
None
:
if
not
printed_content
:
printed_content
=
True
print
(
"
\n
content:"
,
end
=
""
,
flush
=
True
)
# Extract and print the content
print
(
content
,
end
=
""
,
flush
=
True
)
examples/online_serving/openai_chat_embedding_client_for_multimodal.py
View file @
469e903b
...
...
@@ -102,7 +102,7 @@ if __name__ == '__main__':
parser
=
argparse
.
ArgumentParser
(
"Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this."
)
parser
.
add_argument
(
"model"
,
parser
.
add_argument
(
"
--
model"
,
type
=
str
,
choices
=
[
"vlm2vec"
,
"dse_qwen2_vl"
],
required
=
True
,
...
...
examples/online_serving/openai_embedding_client.py
View file @
469e903b
...
...
@@ -24,4 +24,4 @@ responses = client.embeddings.create(
)
for
data
in
responses
.
data
:
print
(
data
.
embedding
)
#
l
ist of float of len 4096
print
(
data
.
embedding
)
#
L
ist of float of len 4096
examples/online_serving/openai_transcription_client.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
json
import
httpx
from
openai
import
OpenAI
from
vllm.assets.audio
import
AudioAsset
...
...
@@ -13,11 +17,50 @@ client = OpenAI(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
with
open
(
str
(
mary_had_lamb
),
"rb"
)
as
f
:
transcription
=
client
.
audio
.
transcriptions
.
create
(
file
=
f
,
model
=
"openai/whisper-large-v3"
,
language
=
"en"
,
response_format
=
"text"
,
temperature
=
0.0
)
print
(
"transcription result:"
,
transcription
)
def sync_openai():
    """Transcribe the ``mary_had_lamb`` audio asset and print the text.

    Opens the asset in binary mode, submits it through the module-level
    OpenAI ``client`` as a non-streaming transcription request, and prints
    the ``text`` field of the JSON response.
    """
    audio_path = str(mary_had_lamb)
    with open(audio_path, "rb") as audio_file:
        result = client.audio.transcriptions.create(
            file=audio_file,
            model="openai/whisper-small",
            language="en",
            response_format="json",
            temperature=0.0,
        )
        print("transcription result:", result.text)
sync_openai
()
# OpenAI Transcription API client does not support streaming.
async def stream_openai_response():
    """Stream a transcription of ``winning_call`` and print it as it arrives.

    Posts the audio file to ``<openai_api_base>/audio/transcriptions`` with
    ``stream=True`` using httpx, then reads the response line by line. Each
    non-empty line is expected to be a JSON object, optionally prefixed with
    ``'data: '``; the line ``[DONE]`` ends the stream.
    """
    data = {
        "language": "en",
        'stream': True,
        "model": "openai/whisper-large-v3",
    }
    url = openai_api_base + "/audio/transcriptions"
    print("transcription result:", end=' ')
    # Named http_client so the module-level OpenAI `client` is not shadowed.
    async with httpx.AsyncClient() as http_client:
        with open(str(winning_call), "rb") as f:
            async with http_client.stream('POST',
                                          url,
                                          files={'file': f},
                                          data=data) as response:
                async for line in response.aiter_lines():
                    # Skip blank separator lines between events.
                    if not line:
                        continue
                    # Each line is a JSON object prefixed with 'data: '
                    if line.startswith('data: '):
                        line = line[len('data: '):]
                    # Last chunk, stream ends
                    if line.strip() == '[DONE]':
                        break
                    # Parse the JSON response
                    chunk = json.loads(line)
                    content = chunk['choices'][0].get('delta',
                                                      {}).get('content')
                    # A delta without content yields None; printing it would
                    # inject the literal text "None" into the transcript.
                    if content:
                        print(content, end='')
# Run the asynchronous function
asyncio
.
run
(
stream_openai_response
())
examples/online_serving/opentelemetry/dummy_client.py
View file @
469e903b
...
...
@@ -28,7 +28,6 @@ with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
"model"
:
"facebook/opt-125m"
,
"prompt"
:
prompt
,
"max_tokens"
:
10
,
"best_of"
:
20
,
"n"
:
3
,
"use_beam_search"
:
"true"
,
"temperature"
:
0.0
,
...
...
examples/online_serving/prometheus_grafana/grafana.json
View file @
469e903b
...
...
@@ -1260,7 +1260,7 @@
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"
edx8memhpd9tsa
"
"uid"
:
"
${DS_PROMETHEUS}
"
},
"disableTextWrap"
:
false
,
"editorMode"
:
"code"
,
...
...
@@ -1360,7 +1360,7 @@
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"
edx8memhpd9tsa
"
"uid"
:
"
${DS_PROMETHEUS}
"
},
"disableTextWrap"
:
false
,
"editorMode"
:
"code"
,
...
...
@@ -1473,7 +1473,7 @@
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"
edx8memhpd9tsa
"
"uid"
:
"
${DS_PROMETHEUS}
"
},
"disableTextWrap"
:
false
,
"editorMode"
:
"code"
,
...
...
@@ -1523,7 +1523,7 @@
},
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"
edx8memhpd9tsa
"
"uid"
:
"
${DS_PROMETHEUS}
"
},
"definition"
:
"label_values(model_name)"
,
"hide"
:
0
,
...
...
examples/other/logging_configuration.md
View file @
469e903b
...
...
@@ -49,7 +49,8 @@ disabled, an error will occur while starting vLLM.
### Example 1: Customize vLLM root logger
For this example, we will customize the vLLM root logger to use
[
`python-json-logger`
](
https://github.com/madzak/python-json-logger
)
to log to
[
`python-json-logger`
](
https://github.com/nhairs/python-json-logger
)
(which is part of the container image) to log to
STDOUT of the console in JSON format with a log level of
`INFO`
.
To begin, first, create an appropriate JSON logging configuration file:
...
...
@@ -82,12 +83,6 @@ To begin, first, create an appropriate JSON logging configuration file:
}
```
Next, install the
`python-json-logger`
package if it's not already installed:
```
bash
pip
install
python-json-logger
```
Finally, run vLLM with the
`VLLM_LOGGING_CONFIG_PATH`
environment variable set
to the path of the custom logging configuration JSON file:
...
...
@@ -132,7 +127,7 @@ configuration for the root vLLM logger and for the logger you wish to silence:
"vllm"
:
{
"handlers"
:
[
"vllm"
],
"level"
:
"DEBUG"
,
"propaga
g
e"
:
false
"propaga
t
e"
:
false
},
"vllm.example_noisy_logger"
:
{
"propagate"
:
false
...
...
examples/other/tensorize_vllm_model.py
View file @
469e903b
...
...
@@ -27,7 +27,7 @@ https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:
python -m examples.o
ffline_inference
.tensorize_vllm_model
\
python -m examples.o
ther
.tensorize_vllm_model
\
--model facebook/opt-125m
\
serialize
\
--serialized-directory s3://my-bucket
\
...
...
@@ -47,7 +47,7 @@ providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root
level of this repository:
python -m examples.o
ffline_inference
.tensorize_vllm_model
\
python -m examples.o
ther
.tensorize_vllm_model
\
--model EleutherAI/gpt-j-6B
\
--dtype float16
\
deserialize
\
...
...
@@ -65,11 +65,11 @@ shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors
For more information on the available arguments for serializing, run
`python -m examples.o
ffline_inference
.tensorize_vllm_model serialize --help`.
`python -m examples.o
ther
.tensorize_vllm_model serialize --help`.
Or for deserializing:
`python -m examples.o
ffline_inference
.tensorize_vllm_model deserialize --help`.
`python -m examples.o
ther
.tensorize_vllm_model deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:
...
...
@@ -90,7 +90,7 @@ TensorizerConfig arguments desired.
In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:
`python -m examples.o
ffline_inference
.tensorize_vllm_model deserialize --help`
`python -m examples.o
ther
.tensorize_vllm_model deserialize --help`
under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
...
...
examples/template_deepseek_vl2.jinja
View file @
469e903b
...
...
@@ -12,12 +12,12 @@
{%- endif -%}
{%- if message['role'] == 'user' -%}
{{ '<|User|>: ' + message['content'] + '\n' }}
{{ '<|User|>: ' + message['content'] + '\n
\n
' }}
{%- elif message['role'] == 'assistant' -%}
{{ '<|Assistant|>: ' + message['content'] + eos_token + '\n' }}
{{ '<|Assistant|>: ' + message['content'] + eos_token + '\n
\n
' }}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{ '<|Assistant|>: ' }}
{% endif %}
{%
-
endif
-
%}
examples/template_teleflm.jinja
0 → 100644
View file @
469e903b
{%- for message in messages %}
{%- if message['role'] == 'user' %}
{{- '<_user>' + message['content']|trim }}
{%- elif message['role'] == 'system' %}
{{- '<_system>' + message['content']|trim }}
{%- elif message['role'] == 'assistant' %}
{{- '<_bot>' + message['content'] }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<_bot>' }}
{%- endif %}
format.sh
View file @
469e903b
#!/bin/bash
echo
"vLLM linting system has been moved from format.sh to pre-commit hook."
echo
"Please run 'pip install -r requirements
-
lint.txt', followed by"
echo
"Please run 'pip install -r requirements
/
lint.txt', followed by"
echo
"'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook."
echo
"Then linters will run automatically before each commit."
Prev
1
…
9
10
11
12
13
14
15
16
17
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment