Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dcb5624a
Commit
dcb5624a
authored
Apr 29, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.5' into v0.8.5-dev
parents
55880ca2
ba41cc90
Changes
554
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1055 additions
and
609 deletions
+1055
-609
examples/offline_inference/simple_profiling.py
examples/offline_inference/simple_profiling.py
+5
-1
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+120
-53
examples/offline_inference/vision_language_embedding.py
examples/offline_inference/vision_language_embedding.py
+10
-6
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+65
-19
examples/online_serving/api_client.py
examples/online_serving/api_client.py
+11
-7
examples/online_serving/cohere_rerank_client.py
examples/online_serving/cohere_rerank_client.py
+38
-24
examples/online_serving/gradio_openai_chatbot_webserver.py
examples/online_serving/gradio_openai_chatbot_webserver.py
+98
-57
examples/online_serving/gradio_webserver.py
examples/online_serving/gradio_webserver.py
+26
-2
examples/online_serving/jinaai_rerank_client.py
examples/online_serving/jinaai_rerank_client.py
+16
-9
examples/online_serving/openai_chat_completion_client.py
examples/online_serving/openai_chat_completion_client.py
+42
-31
examples/online_serving/openai_chat_completion_client_for_multimodal.py
...e_serving/openai_chat_completion_client_for_multimodal.py
+12
-8
examples/online_serving/openai_chat_completion_client_with_tools.py
...nline_serving/openai_chat_completion_client_with_tools.py
+112
-83
examples/online_serving/openai_chat_completion_client_with_tools_required.py
...ving/openai_chat_completion_client_with_tools_required.py
+32
-26
examples/online_serving/openai_chat_completion_structured_outputs.py
...line_serving/openai_chat_completion_structured_outputs.py
+117
-77
examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
...enai_chat_completion_structured_outputs_structural_tag.py
+85
-0
examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
...enai_chat_completion_structured_outputs_with_reasoning.py
+92
-66
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
...rving/openai_chat_completion_tool_calls_with_reasoning.py
+84
-74
examples/online_serving/openai_chat_completion_with_reasoning.py
...s/online_serving/openai_chat_completion_with_reasoning.py
+36
-27
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
...erving/openai_chat_completion_with_reasoning_streaming.py
+45
-37
examples/online_serving/openai_chat_embedding_client_for_multimodal.py
...ne_serving/openai_chat_embedding_client_for_multimodal.py
+9
-2
No files found.
Too many changes to show.
To preserve performance only
554 of 554+
files are displayed.
Plain diff
Email patch
examples/offline_inference/simple_profiling.py
View file @
dcb5624a
...
...
@@ -18,8 +18,8 @@ prompts = [
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
if
__name__
==
"__main__"
:
def
main
():
# Create an LLM.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
tensor_parallel_size
=
1
)
...
...
@@ -42,3 +42,7 @@ if __name__ == "__main__":
# Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output.
time
.
sleep
(
10
)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/vision_language.py
View file @
dcb5624a
...
...
@@ -150,7 +150,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
engine_args
=
EngineArgs
(
model
=
"microsoft/Florence-2-large"
,
tokenizer
=
"
facebook/bart-large
"
,
tokenizer
=
"
Isotr0py/Florence-2-tokenizer
"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
...
...
@@ -364,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
)
# Kimi-VL
def
run_kimi_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
prompts
=
[
"<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
f
"<|media_pad|><|media_end|>
{
question
}
<|im_end|>"
"<|im_assistant|>assistant<|im_middle|>"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"moonshotai/Kimi-VL-A3B-Instruct"
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# LLaVA-1.5
def
run_llava
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
...
...
@@ -791,10 +814,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
engine_args
=
EngineArgs
(
model
=
model_path
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_model_len
=
5120
,
max_num_seqs
=
2
,
max_num_batched_tokens
=
12800
,
enable_lora
=
True
,
max_lora_rank
=
320
,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs
=
{
"dynamic_hd"
:
16
},
limit_mm_per_prompt
=
{
"image"
:
1
},
)
...
...
@@ -918,6 +944,42 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
)
# Qwen2.5-Omni
def
run_qwen2_5_omni
(
questions
:
list
[
str
],
modality
:
str
):
model_name
=
"Qwen/Qwen2.5-Omni-7B"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
mm_processor_kwargs
=
{
"min_pixels"
:
28
*
28
,
"max_pixels"
:
1280
*
28
*
28
,
"fps"
:
[
1
],
},
limit_mm_per_prompt
=
{
"image"
:
1
},
)
if
modality
==
"image"
:
placeholder
=
"<|IMAGE|>"
elif
modality
==
"video"
:
placeholder
=
"<|VIDEO|>"
default_system
=
(
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech."
)
prompts
=
[(
f
"<|im_start|>system
\n
{
default_system
}
<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_bos|>
{
placeholder
}
<|vision_eos|>"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# SkyworkR1V
def
run_skyworkr1v
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
...
...
@@ -966,6 +1028,7 @@ model_example_map = {
"h2ovl_chat"
:
run_h2ovl
,
"idefics3"
:
run_idefics3
,
"internvl_chat"
:
run_internvl
,
"kimi_vl"
:
run_kimi_vl
,
"llava"
:
run_llava
,
"llava-next"
:
run_llava_next
,
"llava-next-video"
:
run_llava_next_video
,
...
...
@@ -986,6 +1049,7 @@ model_example_map = {
"qwen_vl"
:
run_qwen_vl
,
"qwen2_vl"
:
run_qwen2_vl
,
"qwen2_5_vl"
:
run_qwen2_5_vl
,
"qwen2_5_omni"
:
run_qwen2_5_omni
,
"skywork_chat"
:
run_skyworkr1v
,
"smolvlm"
:
run_smolvlm
,
}
...
...
@@ -1073,6 +1137,59 @@ def time_counter(enable: bool):
yield
def
parse_args
():
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models for text generation'
)
parser
.
add_argument
(
'--model-type'
,
'-m'
,
type
=
str
,
default
=
"llava"
,
choices
=
model_example_map
.
keys
(),
help
=
'Huggingface "model_type".'
)
parser
.
add_argument
(
'--num-prompts'
,
type
=
int
,
default
=
4
,
help
=
'Number of prompts to run.'
)
parser
.
add_argument
(
'--modality'
,
type
=
str
,
default
=
"image"
,
choices
=
[
'image'
,
'video'
],
help
=
'Modality of the input.'
)
parser
.
add_argument
(
'--num-frames'
,
type
=
int
,
default
=
16
,
help
=
'Number of frames to extract from the video.'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
parser
.
add_argument
(
'--image-repeat-prob'
,
type
=
float
,
default
=
None
,
help
=
'Simulates the hit-ratio for multi-modal preprocessor cache'
' (if enabled)'
)
parser
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
help
=
'If True, disables caching of multi-modal preprocessor/mapper.'
)
parser
.
add_argument
(
'--time-generate'
,
action
=
'store_true'
,
help
=
'If True, then print the total generate() call time'
)
parser
.
add_argument
(
'--use-different-prompt-per-request'
,
action
=
'store_true'
,
help
=
'If True, then use different prompt (with the same multi-modal '
'data) for each request.'
)
return
parser
.
parse_args
()
def
main
(
args
):
model
=
args
.
model_type
if
model
not
in
model_example_map
:
...
...
@@ -1151,55 +1268,5 @@ def main(args):
if
__name__
==
"__main__"
:
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models for text generation'
)
parser
.
add_argument
(
'--model-type'
,
'-m'
,
type
=
str
,
default
=
"llava"
,
choices
=
model_example_map
.
keys
(),
help
=
'Huggingface "model_type".'
)
parser
.
add_argument
(
'--num-prompts'
,
type
=
int
,
default
=
4
,
help
=
'Number of prompts to run.'
)
parser
.
add_argument
(
'--modality'
,
type
=
str
,
default
=
"image"
,
choices
=
[
'image'
,
'video'
],
help
=
'Modality of the input.'
)
parser
.
add_argument
(
'--num-frames'
,
type
=
int
,
default
=
16
,
help
=
'Number of frames to extract from the video.'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
parser
.
add_argument
(
'--image-repeat-prob'
,
type
=
float
,
default
=
None
,
help
=
'Simulates the hit-ratio for multi-modal preprocessor cache'
' (if enabled)'
)
parser
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
help
=
'If True, disables caching of multi-modal preprocessor/mapper.'
)
parser
.
add_argument
(
'--time-generate'
,
action
=
'store_true'
,
help
=
'If True, then print the total generate() call time'
)
parser
.
add_argument
(
'--use-different-prompt-per-request'
,
action
=
'store_true'
,
help
=
'If True, then use different prompt (with the same multi-modal '
'data) for each request.'
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
args
)
examples/offline_inference/vision_language_embedding.py
View file @
dcb5624a
...
...
@@ -156,16 +156,13 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
print
(
"-"
*
50
)
def
main
(
args
:
Namespace
):
run_encode
(
args
.
model_name
,
args
.
modality
,
args
.
seed
)
model_example_map
=
{
"e5_v"
:
run_e5_v
,
"vlm2vec"
:
run_vlm2vec
,
}
if
__name__
==
"__main__"
:
def
parse_args
():
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models for multimodal embedding'
)
...
...
@@ -184,6 +181,13 @@ if __name__ == "__main__":
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
return
parser
.
parse_args
()
args
=
parser
.
parse_args
()
def
main
(
args
:
Namespace
):
run_encode
(
args
.
model_name
,
args
.
modality
,
args
.
seed
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/offline_inference/vision_language_multi_image.py
View file @
dcb5624a
...
...
@@ -326,6 +326,44 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
)
def
load_kimi_vl
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"moonshotai/Kimi-VL-A3B-Instruct"
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_num_seqs
=
4
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
placeholders
=
[{
"type"
:
"image"
,
"image"
:
url
}
for
url
in
image_urls
]
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
placeholders
,
{
"type"
:
"text"
,
"text"
:
question
},
],
}]
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
prompt
=
processor
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompt
=
prompt
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
)
def
load_mistral3
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
...
...
@@ -465,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args
=
EngineArgs
(
model
=
model_path
,
trust_remote_code
=
True
,
max_model_len
=
10000
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
enable_lora
=
True
,
max_lora_rank
=
320
,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs
=
{
"dynamic_hd"
:
4
},
)
placeholders
=
""
.
join
(
f
"<|image_
{
i
}
|>"
...
...
@@ -640,6 +680,7 @@ model_example_map = {
"h2ovl_chat"
:
load_h2ovl
,
"idefics3"
:
load_idefics3
,
"internvl_chat"
:
load_internvl
,
"kimi_vl"
:
load_kimi_vl
,
"llama4"
:
load_llama4
,
"mistral3"
:
load_mistral3
,
"mllama"
:
load_mllama
,
...
...
@@ -727,22 +768,7 @@ def run_chat(model: str, question: str, image_urls: list[str],
print
(
"-"
*
50
)
def
main
(
args
:
Namespace
):
model
=
args
.
model_type
method
=
args
.
method
seed
=
args
.
seed
image_urls
=
IMAGE_URLS
[:
args
.
num_images
]
if
method
==
"generate"
:
run_generate
(
model
,
QUESTION
,
image_urls
,
seed
)
elif
method
==
"chat"
:
run_chat
(
model
,
QUESTION
,
image_urls
,
seed
)
else
:
raise
ValueError
(
f
"Invalid method:
{
method
}
"
)
if
__name__
==
"__main__"
:
def
parse_args
():
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models that support multi-image input for text '
...
...
@@ -765,9 +791,29 @@ if __name__ == "__main__":
parser
.
add_argument
(
"--num-images"
,
"-n"
,
choices
=
list
(
range
(
1
,
13
)),
# 12 is the max number of images
type
=
int
,
choices
=
list
(
range
(
1
,
len
(
IMAGE_URLS
)
+
1
)),
# the max number of images
default
=
2
,
help
=
"Number of images to use for the demo."
)
return
parser
.
parse_args
()
args
=
parser
.
parse_args
()
def
main
(
args
:
Namespace
):
model
=
args
.
model_type
method
=
args
.
method
seed
=
args
.
seed
image_urls
=
IMAGE_URLS
[:
args
.
num_images
]
if
method
==
"generate"
:
run_generate
(
model
,
QUESTION
,
image_urls
,
seed
)
elif
method
==
"chat"
:
run_chat
(
model
,
QUESTION
,
image_urls
,
seed
)
else
:
raise
ValueError
(
f
"Invalid method:
{
method
}
"
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/online_serving/api_client.py
View file @
dcb5624a
...
...
@@ -58,6 +58,16 @@ def get_response(response: requests.Response) -> list[str]:
return
output
def
parse_args
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
.
add_argument
(
"--n"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--prompt"
,
type
=
str
,
default
=
"San Francisco is a"
)
parser
.
add_argument
(
"--stream"
,
action
=
"store_true"
)
return
parser
.
parse_args
()
def
main
(
args
:
Namespace
):
prompt
=
args
.
prompt
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/generate"
...
...
@@ -82,11 +92,5 @@ def main(args: Namespace):
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
.
add_argument
(
"--n"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--prompt"
,
type
=
str
,
default
=
"San Francisco is a"
)
parser
.
add_argument
(
"--stream"
,
action
=
"store_true"
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
args
)
examples/online_serving/cohere_rerank_client.py
View file @
dcb5624a
...
...
@@ -2,32 +2,46 @@
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python
Note that `pip install cohere` is needed to run this example.
run: vllm serve BAAI/bge-reranker-base
"""
from
typing
import
Union
import
cohere
from
cohere
import
Client
,
ClientV2
model
=
"BAAI/bge-reranker-base"
query
=
"What is the capital of France?"
documents
=
[
"The capital of France is Paris"
,
"Reranking is fun!"
,
"vLLM is an open-source framework for fast AI serving"
]
def
cohere_rerank
(
client
:
Union
[
Client
,
ClientV2
],
model
:
str
,
query
:
str
,
documents
:
list
[
str
])
->
dict
:
return
client
.
rerank
(
model
=
model
,
query
=
query
,
documents
=
documents
)
def
main
():
# cohere v1 client
cohere_v1
=
cohere
.
Client
(
base_url
=
"http://localhost:8000"
,
api_key
=
"sk-fake-key"
)
rerank_v1_result
=
cohere_rerank
(
cohere_v1
,
model
,
query
,
documents
)
print
(
"-"
*
50
)
print
(
"rerank_v1_result:
\n
"
,
rerank_v1_result
)
print
(
"-"
*
50
)
# or the v2
cohere_v2
=
cohere
.
ClientV2
(
"sk-fake-key"
,
base_url
=
"http://localhost:8000"
)
rerank_v2_result
=
cohere_rerank
(
cohere_v2
,
model
,
query
,
documents
)
print
(
"rerank_v2_result:
\n
"
,
rerank_v2_result
)
print
(
"-"
*
50
)
# cohere v1 client
co
=
cohere
.
Client
(
base_url
=
"http://localhost:8000"
,
api_key
=
"sk-fake-key"
)
rerank_v1_result
=
co
.
rerank
(
model
=
"BAAI/bge-reranker-base"
,
query
=
"What is the capital of France?"
,
documents
=
[
"The capital of France is Paris"
,
"Reranking is fun!"
,
"vLLM is an open-source framework for fast AI serving"
])
print
(
rerank_v1_result
)
# or the v2
co2
=
cohere
.
ClientV2
(
"sk-fake-key"
,
base_url
=
"http://localhost:8000"
)
v2_rerank_result
=
co2
.
rerank
(
model
=
"BAAI/bge-reranker-base"
,
query
=
"What is the capital of France?"
,
documents
=
[
"The capital of France is Paris"
,
"Reranking is fun!"
,
"vLLM is an open-source framework for fast AI serving"
])
print
(
v2_rerank_result
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/gradio_openai_chatbot_webserver.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
Start Gradio OpenAI Chatbot Webserver:
python examples/online_serving/gradio_openai_chatbot_webserver.py
\
-m meta-llama/Llama-2-7b-chat-hf
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import
argparse
import
gradio
as
gr
from
openai
import
OpenAI
# Argument parser setup
parser
=
argparse
.
ArgumentParser
(
description
=
'Chatbot Interface with Customizable Parameters'
)
parser
.
add_argument
(
'--model-url'
,
type
=
str
,
default
=
'http://localhost:8000/v1'
,
help
=
'Model URL'
)
parser
.
add_argument
(
'-m'
,
'--model'
,
type
=
str
,
required
=
True
,
help
=
'Model name for the chatbot'
)
parser
.
add_argument
(
'--temp'
,
type
=
float
,
default
=
0.8
,
help
=
'Temperature for text generation'
)
parser
.
add_argument
(
'--stop-token-ids'
,
type
=
str
,
default
=
''
,
help
=
'Comma-separated stop token IDs'
)
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8001
)
# Parse the arguments
args
=
parser
.
parse_args
()
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key
=
"EMPTY"
openai_api_base
=
args
.
model_url
# Create an OpenAI client to interact with the API server
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
def
predict
(
message
,
history
):
# Convert chat history to OpenAI format
def
format_history_to_openai
(
history
):
history_openai_format
=
[{
"role"
:
"system"
,
"content"
:
"You are a great
ai
assistant."
"content"
:
"You are a great
AI
assistant."
}]
for
human
,
assistant
in
history
:
history_openai_format
.
append
({
"role"
:
"user"
,
"content"
:
human
})
...
...
@@ -54,31 +34,92 @@ def predict(message, history):
"role"
:
"assistant"
,
"content"
:
assistant
})
return
history_openai_format
def
predict
(
message
,
history
,
client
,
model_name
,
temp
,
stop_token_ids
):
# Format history to OpenAI chat format
history_openai_format
=
format_history_to_openai
(
history
)
history_openai_format
.
append
({
"role"
:
"user"
,
"content"
:
message
})
#
Create a chat completion request and send it to the API
server
#
Send request to OpenAI API (vLLM
server
)
stream
=
client
.
chat
.
completions
.
create
(
model
=
args
.
model
,
# Model name to use
messages
=
history_openai_format
,
# Chat history
temperature
=
args
.
temp
,
# Temperature for text generation
stream
=
True
,
# Stream response
model
=
model_name
,
messages
=
history_openai_format
,
temperature
=
temp
,
stream
=
True
,
extra_body
=
{
'repetition_penalty'
:
1
,
'stop_token_ids'
:
[
int
(
id
.
strip
())
for
id
in
args
.
stop_token_ids
.
split
(
','
)
if
id
.
strip
()
]
if
args
.
stop_token_ids
else
[]
'stop_token_ids'
:
[
int
(
id
.
strip
())
for
id
in
stop_token_ids
.
split
(
','
)]
if
stop_token_ids
else
[]
})
#
Read and return gener
ate
d
t
ext from response stream
partia
l_message
=
""
#
Collect all chunks and concaten
ate t
hem into a full message
ful
l_message
=
""
for
chunk
in
stream
:
partial_message
+=
(
chunk
.
choices
[
0
].
delta
.
content
or
""
)
yield
partial_message
full_message
+=
(
chunk
.
choices
[
0
].
delta
.
content
or
""
)
# Return the full message as a single response
return
full_message
def
parse_args
():
parser
=
argparse
.
ArgumentParser
(
description
=
'Chatbot Interface with Customizable Parameters'
)
parser
.
add_argument
(
'--model-url'
,
type
=
str
,
default
=
'http://localhost:8000/v1'
,
help
=
'Model URL'
)
parser
.
add_argument
(
'-m'
,
'--model'
,
type
=
str
,
required
=
True
,
help
=
'Model name for the chatbot'
)
parser
.
add_argument
(
'--temp'
,
type
=
float
,
default
=
0.8
,
help
=
'Temperature for text generation'
)
parser
.
add_argument
(
'--stop-token-ids'
,
type
=
str
,
default
=
''
,
help
=
'Comma-separated stop token IDs'
)
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8001
)
return
parser
.
parse_args
()
def
build_gradio_interface
(
client
,
model_name
,
temp
,
stop_token_ids
):
def
chat_predict
(
message
,
history
):
return
predict
(
message
,
history
,
client
,
model_name
,
temp
,
stop_token_ids
)
return
gr
.
ChatInterface
(
fn
=
chat_predict
,
title
=
"Chatbot Interface"
,
description
=
"A simple chatbot powered by vLLM"
)
def
main
():
# Parse the arguments
args
=
parse_args
()
# Set OpenAI's API key and API base to use vLLM's API server
openai_api_key
=
"EMPTY"
openai_api_base
=
args
.
model_url
# Create an OpenAI client
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
)
# Define the Gradio chatbot interface using the predict function
gradio_interface
=
build_gradio_interface
(
client
,
args
.
model
,
args
.
temp
,
args
.
stop_token_ids
)
gradio_interface
.
queue
().
launch
(
server_name
=
args
.
host
,
server_port
=
args
.
port
,
share
=
True
)
# Create and launch a chat interface with Gradio
gr
.
ChatInterface
(
predict
).
queue
().
launch
(
server_name
=
args
.
host
,
server_port
=
args
.
port
,
share
=
True
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/gradio_webserver.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""Example for starting a Gradio Webserver
Start vLLM API server:
python -m vllm.entrypoints.api_server
\
--model meta-llama/Llama-2-7b-chat-hf
Start Webserver:
python examples/online_serving/gradio_webserver.py
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import
argparse
import
json
...
...
@@ -39,16 +56,23 @@ def build_demo():
return
demo
if
__name__
==
"__main__"
:
def
parse_args
()
:
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8001
)
parser
.
add_argument
(
"--model-url"
,
type
=
str
,
default
=
"http://localhost:8000/generate"
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
(
args
):
demo
=
build_demo
()
demo
.
queue
().
launch
(
server_name
=
args
.
host
,
server_port
=
args
.
port
,
share
=
True
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/online_serving/jinaai_rerank_client.py
View file @
dcb5624a
...
...
@@ -23,12 +23,19 @@ data = {
"The capital of France is Paris."
,
"Horses and cows are both animals"
]
}
response
=
requests
.
post
(
url
,
headers
=
headers
,
json
=
data
)
# Check the response
if
response
.
status_code
==
200
:
print
(
"Request successful!"
)
print
(
json
.
dumps
(
response
.
json
(),
indent
=
2
))
else
:
print
(
f
"Request failed with status code:
{
response
.
status_code
}
"
)
print
(
response
.
text
)
def
main
():
response
=
requests
.
post
(
url
,
headers
=
headers
,
json
=
data
)
# Check the response
if
response
.
status_code
==
200
:
print
(
"Request successful!"
)
print
(
json
.
dumps
(
response
.
json
(),
indent
=
2
))
else
:
print
(
f
"Request failed with status code:
{
response
.
status_code
}
"
)
print
(
response
.
text
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_client.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
"""
from
openai
import
OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"Who won the world series in 2020?"
},
{
"role"
:
"assistant"
,
"content"
:
"The Los Angeles Dodgers won the World Series in 2020."
},
{
"role"
:
"user"
,
"content"
:
"Where was it played?"
}],
model
=
model
,
)
print
(
"Chat completion results:"
)
print
(
chat_completion
)
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"Who won the world series in 2020?"
},
{
"role"
:
"assistant"
,
"content"
:
"The Los Angeles Dodgers won the World Series in 2020."
},
{
"role"
:
"user"
,
"content"
:
"Where was it played?"
}]
def
main
():
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
)
print
(
"-"
*
50
)
print
(
"Chat completion results:"
)
print
(
chat_completion
)
print
(
"-"
*
50
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_client_for_multimodal.py
View file @
dcb5624a
...
...
@@ -9,7 +9,7 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate
\
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image
=2
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt
'{"
image
":2}'
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
...
...
@@ -303,12 +303,7 @@ example_function_map = {
}
def
main
(
args
)
->
None
:
chat_type
=
args
.
chat_type
example_function_map
[
chat_type
]()
if
__name__
==
"__main__"
:
def
parse_args
():
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using OpenAI client for online serving with '
'multimodal language models served with vLLM.'
)
...
...
@@ -318,5 +313,14 @@ if __name__ == "__main__":
default
=
"single-image"
,
choices
=
list
(
example_function_map
.
keys
()),
help
=
'Conversation type with multimodal data.'
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
(
args
)
->
None
:
chat_type
=
args
.
chat_type
example_function_map
[
chat_type
]()
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/online_serving/openai_chat_completion_client_with_tools.py
View file @
dcb5624a
...
...
@@ -17,6 +17,7 @@ vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import
json
from
typing
import
Any
from
openai
import
OpenAI
...
...
@@ -24,15 +25,6 @@ from openai import OpenAI
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
tools
=
[{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -78,86 +70,123 @@ messages = [{
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
)
print
(
"Chat completion results:"
)
print
(
chat_completion
)
print
(
"
\n\n
"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
True
)
chunks
=
[]
for
chunk
in
tool_calls_stream
:
chunks
.
append
(
chunk
)
if
chunk
.
choices
[
0
].
delta
.
tool_calls
:
print
(
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
])
else
:
print
(
chunk
.
choices
[
0
].
delta
)
arguments
=
[]
tool_call_idx
=
-
1
for
chunk
in
chunks
:
if
chunk
.
choices
[
0
].
delta
.
tool_calls
:
tool_call
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
]
if
tool_call
.
index
!=
tool_call_idx
:
if
tool_call_idx
>=
0
:
print
(
f
"streamed tool call arguments:
{
arguments
[
tool_call_idx
]
}
"
)
tool_call_idx
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
].
index
arguments
.
append
(
""
)
if
tool_call
.
id
:
print
(
f
"streamed tool call id:
{
tool_call
.
id
}
"
)
if
tool_call
.
function
:
if
tool_call
.
function
.
name
:
print
(
f
"streamed tool call name:
{
tool_call
.
function
.
name
}
"
)
if
tool_call
.
function
.
arguments
:
arguments
[
tool_call_idx
]
+=
tool_call
.
function
.
arguments
if
len
(
arguments
):
print
(
f
"streamed tool call arguments:
{
arguments
[
-
1
]
}
"
)
print
(
"
\n\n
"
)
messages
.
append
({
"role"
:
"assistant"
,
"tool_calls"
:
chat_completion
.
choices
[
0
].
message
.
tool_calls
})
# Now, simulate a tool call
def
get_current_weather
(
city
:
str
,
state
:
str
,
unit
:
'str'
):
return
(
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's."
)
available_tools
=
{
"get_current_weather"
:
get_current_weather
}
completion_tool_calls
=
chat_completion
.
choices
[
0
].
message
.
tool_calls
for
call
in
completion_tool_calls
:
tool_to_call
=
available_tools
[
call
.
function
.
name
]
args
=
json
.
loads
(
call
.
function
.
arguments
)
result
=
tool_to_call
(
**
args
)
print
(
result
)
def
handle_tool_calls_stream
(
client
:
OpenAI
,
messages
:
list
[
dict
[
str
,
str
]],
model
:
str
,
tools
:
list
[
dict
[
str
,
Any
]],
)
->
list
[
Any
]:
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
True
)
chunks
=
[]
print
(
"chunks: "
)
for
chunk
in
tool_calls_stream
:
chunks
.
append
(
chunk
)
if
chunk
.
choices
[
0
].
delta
.
tool_calls
:
print
(
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
])
else
:
print
(
chunk
.
choices
[
0
].
delta
)
return
chunks
def
handle_tool_calls_arguments
(
chunks
:
list
[
Any
])
->
list
[
str
]:
arguments
=
[]
tool_call_idx
=
-
1
print
(
"arguments: "
)
for
chunk
in
chunks
:
if
chunk
.
choices
[
0
].
delta
.
tool_calls
:
tool_call
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
]
if
tool_call
.
index
!=
tool_call_idx
:
if
tool_call_idx
>=
0
:
print
(
f
"streamed tool call arguments: "
f
"
{
arguments
[
tool_call_idx
]
}
"
)
tool_call_idx
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
].
index
arguments
.
append
(
""
)
if
tool_call
.
id
:
print
(
f
"streamed tool call id:
{
tool_call
.
id
}
"
)
if
tool_call
.
function
:
if
tool_call
.
function
.
name
:
print
(
f
"streamed tool call name:
{
tool_call
.
function
.
name
}
"
)
if
tool_call
.
function
.
arguments
:
arguments
[
tool_call_idx
]
+=
tool_call
.
function
.
arguments
return
arguments
def
main
():
# Initialize OpenAI client
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
# Get available models and select one
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
)
print
(
"-"
*
70
)
print
(
"Chat completion results:"
)
print
(
chat_completion
)
print
(
"-"
*
70
)
# Stream tool calls
chunks
=
handle_tool_calls_stream
(
client
,
messages
,
model
,
tools
)
print
(
"-"
*
70
)
# Handle arguments from streamed tool calls
arguments
=
handle_tool_calls_arguments
(
chunks
)
if
len
(
arguments
):
print
(
f
"streamed tool call arguments:
{
arguments
[
-
1
]
}
\n
"
)
print
(
"-"
*
70
)
# Add tool call results to the conversation
messages
.
append
({
"role"
:
"tool"
,
"content"
:
result
,
"tool_call_id"
:
call
.
id
,
"name"
:
call
.
function
.
name
"role"
:
"assistant"
,
"tool_calls"
:
chat_completion
.
choices
[
0
].
message
.
tool_calls
})
chat_completion_2
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
False
)
print
(
"
\n\n
"
)
print
(
chat_completion_2
)
# Now, simulate a tool call
available_tools
=
{
"get_current_weather"
:
get_current_weather
}
completion_tool_calls
=
chat_completion
.
choices
[
0
].
message
.
tool_calls
for
call
in
completion_tool_calls
:
tool_to_call
=
available_tools
[
call
.
function
.
name
]
args
=
json
.
loads
(
call
.
function
.
arguments
)
result
=
tool_to_call
(
**
args
)
print
(
"tool_to_call result: "
,
result
)
messages
.
append
({
"role"
:
"tool"
,
"content"
:
result
,
"tool_call_id"
:
call
.
id
,
"name"
:
call
.
function
.
name
})
chat_completion_2
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
False
)
print
(
"Chat completion2 results:"
)
print
(
chat_completion_2
)
print
(
"-"
*
70
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_client_with_tools_required.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""
To run this example, you can start the vLLM server
To run this example, you can start the vLLM server
without any specific flags:
```bash
...
...
@@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--guided-decoding-backend outlines
```
This example demonstrates how to generate chat completions
This example demonstrates how to generate chat completions
using the OpenAI Python client library.
"""
...
...
@@ -18,15 +18,6 @@ from openai import OpenAI
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
tools
=
[
{
"type"
:
"function"
,
...
...
@@ -116,21 +107,36 @@ messages = [
},
]
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
"required"
,
stream
=
True
# Enable streaming response
)
for
chunk
in
chat_completion
:
if
chunk
.
choices
and
chunk
.
choices
[
0
].
delta
.
tool_calls
:
print
(
chunk
.
choices
[
0
].
delta
.
tool_calls
)
def
main
():
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
"required"
,
stream
=
True
# Enable streaming response
)
for
chunk
in
chat_completion
:
if
chunk
.
choices
and
chunk
.
choices
[
0
].
delta
.
tool_calls
:
print
(
chunk
.
choices
[
0
].
delta
.
tool_calls
)
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
"required"
)
print
(
chat_completion
.
choices
[
0
].
message
.
tool_calls
)
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
"required"
)
print
(
chat_completion
.
choices
[
0
].
message
.
tool_calls
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_structured_outputs.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""
To run this example, you need to start the vLLM server:
```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```
"""
from
enum
import
Enum
from
openai
import
BadRequestError
,
OpenAI
from
pydantic
import
BaseModel
client
=
OpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"-"
,
)
# Guided decoding by Choice (list of possible options)
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"Classify this sentiment: vLLM is wonderful!"
}],
extra_body
=
{
"guided_choice"
:
[
"positive"
,
"negative"
]},
)
print
(
completion
.
choices
[
0
].
message
.
content
)
def
guided_choice_completion
(
client
:
OpenAI
,
model
:
str
):
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"Classify this sentiment: vLLM is wonderful!"
}],
extra_body
=
{
"guided_choice"
:
[
"positive"
,
"negative"
]},
)
return
completion
.
choices
[
0
].
message
.
content
# Guided decoding by Regex
prompt
=
(
"Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com
\n
"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_regex"
:
"\w+@\w+\.com
\n
"
,
"stop"
:
[
"
\n
"
]
},
)
print
(
completion
.
choices
[
0
].
message
.
content
)
def
guided_regex_completion
(
client
:
OpenAI
,
model
:
str
):
prompt
=
(
"Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com
\n
"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_regex"
:
r
"\w+@\w+\.com\n"
,
"stop"
:
[
"
\n
"
]
},
)
return
completion
.
choices
[
0
].
message
.
content
# Guided decoding by JSON using Pydantic schema
...
...
@@ -54,66 +60,100 @@ class CarDescription(BaseModel):
car_type
:
CarType
json_schema
=
CarDescription
.
model_json_schema
()
prompt
=
(
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print
(
completion
.
choices
[
0
].
message
.
content
)
def
guided_json_completion
(
client
:
OpenAI
,
model
:
str
):
json_schema
=
CarDescription
.
model_json_schema
()
# Guided decoding by Grammar
simplified_sql_grammar
=
"""
?start: select_statement
prompt
=
(
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
return
completion
.
choices
[
0
].
message
.
content
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
# Guided decoding by Grammar
def
guided_grammar_completion
(
client
:
OpenAI
,
model
:
str
):
simplified_sql_grammar
=
"""
root ::= select_statement
?table_name: identifier
select_statement ::= "SELECT " column " from " table " where " condition
?
column
_name: identifier
column
::= "col_1 " | "col_2 "
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
table ::= "table_1 " | "table_2 "
prompt
=
(
"Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table."
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_grammar"
:
simplified_sql_grammar
},
)
print
(
completion
.
choices
[
0
].
message
.
content
)
condition ::= column "= " number
# Extra backend options
prompt
=
(
"Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com
\n
"
)
number ::= "1 " | "2 "
"""
try
:
# The no-fallback option forces vLLM to use xgrammar, so when it fails
# you get a 400 with the reason why
prompt
=
(
"Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table."
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_regex"
:
"\w+@\w+\.com
\n
"
,
"stop"
:
[
"
\n
"
],
"guided_decoding_backend"
:
"xgrammar:no-fallback"
},
extra_body
=
{
"guided_grammar"
:
simplified_sql_grammar
},
)
except
BadRequestError
as
e
:
print
(
"This error is expected:"
,
e
)
return
completion
.
choices
[
0
].
message
.
content
# Extra backend options
def
extra_backend_options_completion
(
client
:
OpenAI
,
model
:
str
):
prompt
=
(
"Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com
\n
"
)
try
:
# The no-fallback option forces vLLM to use xgrammar, so when it fails
# you get a 400 with the reason why
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_regex"
:
r
"\w+@\w+\.com\n"
,
"stop"
:
[
"
\n
"
],
"guided_decoding_backend"
:
"xgrammar:no-fallback"
},
)
return
completion
.
choices
[
0
].
message
.
content
except
BadRequestError
as
e
:
print
(
"This error is expected:"
,
e
)
def
main
():
client
:
OpenAI
=
OpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"-"
,
)
model
=
"Qwen/Qwen2.5-3B-Instruct"
print
(
"Guided Choice Completion:"
)
print
(
guided_choice_completion
(
client
,
model
))
print
(
"
\n
Guided Regex Completion:"
)
print
(
guided_regex_completion
(
client
,
model
))
print
(
"
\n
Guided JSON Completion:"
)
print
(
guided_json_completion
(
client
,
model
))
print
(
"
\n
Guided Grammar Completion:"
)
print
(
guided_grammar_completion
(
client
,
model
))
print
(
"
\n
Extra Backend Options Completion:"
)
print
(
extra_backend_options_completion
(
client
,
model
))
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
from
openai
import
OpenAI
# This example demonstrates the `structural_tag` response format.
# It can be used to specify a structured output format that occurs between
# specific tags in the response. This example shows how it could be used
# to enforce the format of a tool call response, but it could be used for
# any structured output within a subset of the response.
def
main
():
client
=
OpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"-"
,
)
messages
=
[{
"role"
:
"user"
,
"content"
:
"""
You have access to the following function to retrieve the weather in a city:
{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}
If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.
Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?
"""
}]
response
=
client
.
chat
.
completions
.
create
(
model
=
"meta-llama/Llama-3.1-8B-Instruct"
,
messages
=
messages
,
response_format
=
{
"type"
:
"structural_tag"
,
"structures"
:
[{
"begin"
:
"<function=get_weather>"
,
"schema"
:
{
"type"
:
"object"
,
"properties"
:
{
"city"
:
{
"type"
:
"string"
}
}
},
"end"
:
"</function>"
}],
"triggers"
:
[
"<function="
]
})
print
(
response
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
View file @
dcb5624a
...
...
@@ -25,29 +25,28 @@ from pydantic import BaseModel
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
def
print_completion_details
(
completion
):
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
# Guided decoding by Regex
prompt
=
(
"What is the capital of France?"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
m
essages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}]
,
extra_body
=
{
"guided_regex"
:
"(Paris|London)"
,
}
,
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
def
guided_regex_completion
(
client
:
OpenAI
,
model
:
str
):
prompt
=
(
"What is the capital of France?"
)
completion
=
client
.
chat
.
completions
.
create
(
m
odel
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_regex"
:
"(Paris|London)"
,
},
)
print
_completion_details
(
completion
)
class
People
(
BaseModel
):
...
...
@@ -55,19 +54,19 @@ class People(BaseModel):
age
:
int
json_schema
=
People
.
model_json_schema
()
def
guided_json_completion
(
client
:
OpenAI
,
model
:
str
):
json_schema
=
People
.
model_json_schema
()
prompt
=
(
"Generate a JSON with the name and age of one random person."
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
prompt
=
(
"Generate a JSON with the name and age of one random person."
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print_completion_details
(
completion
)
# Guided decoding by JSON using Pydantic schema
...
...
@@ -84,46 +83,73 @@ class CarDescription(BaseModel):
car_type
:
CarType
json_schema
=
CarDescription
.
model_json_schema
()
def
guided_car_json_completion
(
client
:
OpenAI
,
model
:
str
):
json_schema
=
CarDescription
.
model_json_schema
()
prompt
=
(
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print_completion_details
(
completion
)
prompt
=
(
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
# Guided decoding by Grammar
simplified_sql_grammar
=
"""
?start: select_statement
def
guided_grammar_completion
(
client
:
OpenAI
,
model
:
str
):
simplified_sql_grammar
=
"""
root ::= select_statement
?
select_statement
:
"SELECT " column
_list " FROM " table_name
select_statement
::=
"SELECT " column
" from " table " where " condition
?
column
_list: column_name ("," column_name)*
column
::= "col_1 " | "col_2 "
?
table
_name: identifier
table
::= "table_1 " | "table_2 "
?column_name: identifier
condition ::= column "= " number
number ::= "1 " | "2 "
"""
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
prompt
=
(
"Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table."
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_grammar"
:
simplified_sql_grammar
},
)
print_completion_details
(
completion
)
def
main
():
client
:
OpenAI
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
:
str
=
models
.
data
[
0
].
id
print
(
"Guided Regex Completion:"
)
guided_regex_completion
(
client
,
model
)
print
(
"
\n
Guided JSON Completion (People):"
)
guided_json_completion
(
client
,
model
)
print
(
"
\n
Guided JSON Completion (CarDescription):"
)
guided_car_json_completion
(
client
,
model
)
print
(
"
\n
Guided Grammar Completion:"
)
guided_grammar_completion
(
client
,
model
)
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
prompt
=
(
"Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table."
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_grammar"
:
simplified_sql_grammar
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
View file @
dcb5624a
...
...
@@ -31,14 +31,6 @@ available_tools = {"get_current_weather": get_current_weather}
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
tools
=
[{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -109,69 +101,87 @@ def extract_reasoning_and_calls(chunks: list):
return
reasoning_content
,
arguments
,
function_names
print
(
"---------Full Generate With Automatic Function Calling-------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
)
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name: "
f
"
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
name
}
"
)
print
(
f
"function arguments: "
f
"
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
arguments
}
"
)
print
(
"----------Stream Generate With Automatic Function Calling-----------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
True
)
chunks
=
[]
for
chunk
in
tool_calls_stream
:
chunks
.
append
(
chunk
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"----------Full Generate With Named Function Calling-----------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}
})
tool_call
=
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name:
{
tool_call
.
name
}
"
)
print
(
f
"function arguments:
{
tool_call
.
arguments
}
"
)
print
(
"----------Stream Generate With Named Function Calling--------------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}
},
stream
=
True
)
chunks
=
[]
for
chunk
in
tool_calls_stream
:
chunks
.
append
(
chunk
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"
\n\n
"
)
def
main
():
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
print
(
"---------Full Generate With Automatic Function Calling-------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
)
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name: "
f
"
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
name
}
"
)
print
(
f
"function arguments: "
f
"
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
arguments
}
"
)
print
(
"----------Stream Generate With Automatic Function Calling-----------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
True
)
chunks
=
list
(
tool_calls_stream
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"----------Full Generate With Named Function Calling-----------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}
})
tool_call
=
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name:
{
tool_call
.
name
}
"
)
print
(
f
"function arguments:
{
tool_call
.
arguments
}
"
)
print
(
"----------Stream Generate With Named Function Calling--------------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}
},
stream
=
True
)
chunks
=
list
(
tool_calls_stream
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"
\n\n
"
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_with_reasoning.py
View file @
dcb5624a
...
...
@@ -3,8 +3,8 @@
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.
To run this example, you need to start the vLLM server
with the reasoning
parser:
To run this example, you need to start the vLLM server
with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
\
...
...
@@ -21,35 +21,44 @@ from openai import OpenAI
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
def
main
():
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
# Round 1
messages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
reasoning_content
=
response
.
choices
[
0
].
message
.
reasoning_content
content
=
response
.
choices
[
0
].
message
.
content
# Round 1
messages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
)
print
(
"
reasoning_content
for Round 1:"
,
reasoning_content
)
print
(
"
content
for Round 1:"
,
content
)
reasoning_content
=
response
.
choices
[
0
].
message
.
reasoning_content
content
=
response
.
choices
[
0
].
message
.
content
# Round 2
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
content
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"How many Rs are there in the word 'strawberry'?"
,
})
response
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
)
print
(
"reasoning_content for Round 1:"
,
reasoning_content
)
print
(
"content for Round 1:"
,
content
)
reasoning_content
=
response
.
choices
[
0
].
message
.
reasoning_content
content
=
response
.
choices
[
0
].
message
.
content
# Round 2
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
content
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"How many Rs are there in the word 'strawberry'?"
,
})
response
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
)
print
(
"reasoning_content for Round 2:"
,
reasoning_content
)
print
(
"content for Round 2:"
,
content
)
reasoning_content
=
response
.
choices
[
0
].
message
.
reasoning_content
content
=
response
.
choices
[
0
].
message
.
content
print
(
"reasoning_content for Round 2:"
,
reasoning_content
)
print
(
"content for Round 2:"
,
content
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
View file @
dcb5624a
...
...
@@ -3,7 +3,7 @@
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.
To run this example, you need to start the vLLM server with the reasoning
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
...
...
@@ -29,41 +29,49 @@ from openai import OpenAI
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
messages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
messages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
,
stream
=
True
)
print
(
"client: Start streaming chat completions..."
)
printed_reasoning_content
=
False
printed_content
=
False
for
chunk
in
stream
:
reasoning_content
=
None
content
=
None
# Check the content is reasoning_content or content
if
hasattr
(
chunk
.
choices
[
0
].
delta
,
"reasoning_content"
):
reasoning_content
=
chunk
.
choices
[
0
].
delta
.
reasoning_content
elif
hasattr
(
chunk
.
choices
[
0
].
delta
,
"content"
):
content
=
chunk
.
choices
[
0
].
delta
.
content
if
reasoning_content
is
not
None
:
if
not
printed_reasoning_content
:
printed_reasoning_content
=
True
print
(
"reasoning_content:"
,
end
=
""
,
flush
=
True
)
print
(
reasoning_content
,
end
=
""
,
flush
=
True
)
elif
content
is
not
None
:
if
not
printed_content
:
printed_content
=
True
print
(
"
\n
content:"
,
end
=
""
,
flush
=
True
)
# Extract and print the content
print
(
content
,
end
=
""
,
flush
=
True
)
def
main
():
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
# ruff: noqa: E501
# For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
,
stream
=
True
)
print
(
"client: Start streaming chat completions..."
)
printed_reasoning_content
=
False
printed_content
=
False
for
chunk
in
stream
:
reasoning_content
=
None
content
=
None
# Check the content is reasoning_content or content
if
hasattr
(
chunk
.
choices
[
0
].
delta
,
"reasoning_content"
):
reasoning_content
=
chunk
.
choices
[
0
].
delta
.
reasoning_content
elif
hasattr
(
chunk
.
choices
[
0
].
delta
,
"content"
):
content
=
chunk
.
choices
[
0
].
delta
.
content
if
reasoning_content
is
not
None
:
if
not
printed_reasoning_content
:
printed_reasoning_content
=
True
print
(
"reasoning_content:"
,
end
=
""
,
flush
=
True
)
print
(
reasoning_content
,
end
=
""
,
flush
=
True
)
elif
content
is
not
None
:
if
not
printed_content
:
printed_content
=
True
print
(
"
\n
content:"
,
end
=
""
,
flush
=
True
)
# Extract and print the content
print
(
content
,
end
=
""
,
flush
=
True
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_embedding_client_for_multimodal.py
View file @
dcb5624a
...
...
@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
print
(
"Embedding output:"
,
response_json
[
"data"
][
0
][
"embedding"
])
if
__name__
==
'__main__'
:
def
parse_args
()
:
parser
=
argparse
.
ArgumentParser
(
"Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this."
)
...
...
@@ -107,8 +107,10 @@ if __name__ == '__main__':
choices
=
[
"vlm2vec"
,
"dse_qwen2_vl"
],
required
=
True
,
help
=
"Which model to call."
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
(
args
):
if
args
.
model
==
"vlm2vec"
:
vlm2vec
()
elif
args
.
model
==
"dse_qwen2_vl"
:
...
...
@@ -120,3 +122,8 @@ if __name__ == '__main__':
"type"
:
"text"
,
"content"
:
"What is the weather like today?"
,
})
if
__name__
==
'__main__'
:
args
=
parse_args
()
main
(
args
)
Prev
1
…
5
6
7
8
9
10
11
12
13
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment