Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dcb5624a
Commit
dcb5624a
authored
Apr 29, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.5' into v0.8.5-dev
parents
55880ca2
ba41cc90
Changes
571
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1055 additions
and
609 deletions
+1055
-609
examples/offline_inference/simple_profiling.py
examples/offline_inference/simple_profiling.py
+5
-1
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+120
-53
examples/offline_inference/vision_language_embedding.py
examples/offline_inference/vision_language_embedding.py
+10
-6
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+65
-19
examples/online_serving/api_client.py
examples/online_serving/api_client.py
+11
-7
examples/online_serving/cohere_rerank_client.py
examples/online_serving/cohere_rerank_client.py
+38
-24
examples/online_serving/gradio_openai_chatbot_webserver.py
examples/online_serving/gradio_openai_chatbot_webserver.py
+98
-57
examples/online_serving/gradio_webserver.py
examples/online_serving/gradio_webserver.py
+26
-2
examples/online_serving/jinaai_rerank_client.py
examples/online_serving/jinaai_rerank_client.py
+16
-9
examples/online_serving/openai_chat_completion_client.py
examples/online_serving/openai_chat_completion_client.py
+42
-31
examples/online_serving/openai_chat_completion_client_for_multimodal.py
...e_serving/openai_chat_completion_client_for_multimodal.py
+12
-8
examples/online_serving/openai_chat_completion_client_with_tools.py
...nline_serving/openai_chat_completion_client_with_tools.py
+112
-83
examples/online_serving/openai_chat_completion_client_with_tools_required.py
...ving/openai_chat_completion_client_with_tools_required.py
+32
-26
examples/online_serving/openai_chat_completion_structured_outputs.py
...line_serving/openai_chat_completion_structured_outputs.py
+117
-77
examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
...enai_chat_completion_structured_outputs_structural_tag.py
+85
-0
examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
...enai_chat_completion_structured_outputs_with_reasoning.py
+92
-66
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
...rving/openai_chat_completion_tool_calls_with_reasoning.py
+84
-74
examples/online_serving/openai_chat_completion_with_reasoning.py
...s/online_serving/openai_chat_completion_with_reasoning.py
+36
-27
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
...erving/openai_chat_completion_with_reasoning_streaming.py
+45
-37
examples/online_serving/openai_chat_embedding_client_for_multimodal.py
...ne_serving/openai_chat_embedding_client_for_multimodal.py
+9
-2
No files found.
Too many changes to show.
To preserve performance only
571 of 571+
files are displayed.
Plain diff
Email patch
examples/offline_inference/simple_profiling.py
View file @
dcb5624a
...
...
@@ -18,8 +18,8 @@ prompts = [
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
if
__name__
==
"__main__"
:
def
main
():
# Create an LLM.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
tensor_parallel_size
=
1
)
...
...
@@ -42,3 +42,7 @@ if __name__ == "__main__":
# Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output.
time
.
sleep
(
10
)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/vision_language.py
View file @
dcb5624a
...
...
@@ -150,7 +150,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
engine_args
=
EngineArgs
(
model
=
"microsoft/Florence-2-large"
,
tokenizer
=
"
facebook/bart-large
"
,
tokenizer
=
"
Isotr0py/Florence-2-tokenizer
"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
...
...
@@ -364,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
)
# Kimi-VL
def
run_kimi_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
prompts
=
[
"<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
f
"<|media_pad|><|media_end|>
{
question
}
<|im_end|>"
"<|im_assistant|>assistant<|im_middle|>"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"moonshotai/Kimi-VL-A3B-Instruct"
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# LLaVA-1.5
def
run_llava
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
...
...
@@ -791,10 +814,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
engine_args
=
EngineArgs
(
model
=
model_path
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_model_len
=
5120
,
max_num_seqs
=
2
,
max_num_batched_tokens
=
12800
,
enable_lora
=
True
,
max_lora_rank
=
320
,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs
=
{
"dynamic_hd"
:
16
},
limit_mm_per_prompt
=
{
"image"
:
1
},
)
...
...
@@ -918,6 +944,42 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
)
# Qwen2.5-Omni
def
run_qwen2_5_omni
(
questions
:
list
[
str
],
modality
:
str
):
model_name
=
"Qwen/Qwen2.5-Omni-7B"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
mm_processor_kwargs
=
{
"min_pixels"
:
28
*
28
,
"max_pixels"
:
1280
*
28
*
28
,
"fps"
:
[
1
],
},
limit_mm_per_prompt
=
{
"image"
:
1
},
)
if
modality
==
"image"
:
placeholder
=
"<|IMAGE|>"
elif
modality
==
"video"
:
placeholder
=
"<|VIDEO|>"
default_system
=
(
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech."
)
prompts
=
[(
f
"<|im_start|>system
\n
{
default_system
}
<|im_end|>
\n
"
f
"<|im_start|>user
\n
<|vision_bos|>
{
placeholder
}
<|vision_eos|>"
f
"
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# SkyworkR1V
def
run_skyworkr1v
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
...
...
@@ -966,6 +1028,7 @@ model_example_map = {
"h2ovl_chat"
:
run_h2ovl
,
"idefics3"
:
run_idefics3
,
"internvl_chat"
:
run_internvl
,
"kimi_vl"
:
run_kimi_vl
,
"llava"
:
run_llava
,
"llava-next"
:
run_llava_next
,
"llava-next-video"
:
run_llava_next_video
,
...
...
@@ -986,6 +1049,7 @@ model_example_map = {
"qwen_vl"
:
run_qwen_vl
,
"qwen2_vl"
:
run_qwen2_vl
,
"qwen2_5_vl"
:
run_qwen2_5_vl
,
"qwen2_5_omni"
:
run_qwen2_5_omni
,
"skywork_chat"
:
run_skyworkr1v
,
"smolvlm"
:
run_smolvlm
,
}
...
...
@@ -1073,6 +1137,59 @@ def time_counter(enable: bool):
yield
def
parse_args
():
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models for text generation'
)
parser
.
add_argument
(
'--model-type'
,
'-m'
,
type
=
str
,
default
=
"llava"
,
choices
=
model_example_map
.
keys
(),
help
=
'Huggingface "model_type".'
)
parser
.
add_argument
(
'--num-prompts'
,
type
=
int
,
default
=
4
,
help
=
'Number of prompts to run.'
)
parser
.
add_argument
(
'--modality'
,
type
=
str
,
default
=
"image"
,
choices
=
[
'image'
,
'video'
],
help
=
'Modality of the input.'
)
parser
.
add_argument
(
'--num-frames'
,
type
=
int
,
default
=
16
,
help
=
'Number of frames to extract from the video.'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
parser
.
add_argument
(
'--image-repeat-prob'
,
type
=
float
,
default
=
None
,
help
=
'Simulates the hit-ratio for multi-modal preprocessor cache'
' (if enabled)'
)
parser
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
help
=
'If True, disables caching of multi-modal preprocessor/mapper.'
)
parser
.
add_argument
(
'--time-generate'
,
action
=
'store_true'
,
help
=
'If True, then print the total generate() call time'
)
parser
.
add_argument
(
'--use-different-prompt-per-request'
,
action
=
'store_true'
,
help
=
'If True, then use different prompt (with the same multi-modal '
'data) for each request.'
)
return
parser
.
parse_args
()
def
main
(
args
):
model
=
args
.
model_type
if
model
not
in
model_example_map
:
...
...
@@ -1151,55 +1268,5 @@ def main(args):
if
__name__
==
"__main__"
:
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models for text generation'
)
parser
.
add_argument
(
'--model-type'
,
'-m'
,
type
=
str
,
default
=
"llava"
,
choices
=
model_example_map
.
keys
(),
help
=
'Huggingface "model_type".'
)
parser
.
add_argument
(
'--num-prompts'
,
type
=
int
,
default
=
4
,
help
=
'Number of prompts to run.'
)
parser
.
add_argument
(
'--modality'
,
type
=
str
,
default
=
"image"
,
choices
=
[
'image'
,
'video'
],
help
=
'Modality of the input.'
)
parser
.
add_argument
(
'--num-frames'
,
type
=
int
,
default
=
16
,
help
=
'Number of frames to extract from the video.'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
parser
.
add_argument
(
'--image-repeat-prob'
,
type
=
float
,
default
=
None
,
help
=
'Simulates the hit-ratio for multi-modal preprocessor cache'
' (if enabled)'
)
parser
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
help
=
'If True, disables caching of multi-modal preprocessor/mapper.'
)
parser
.
add_argument
(
'--time-generate'
,
action
=
'store_true'
,
help
=
'If True, then print the total generate() call time'
)
parser
.
add_argument
(
'--use-different-prompt-per-request'
,
action
=
'store_true'
,
help
=
'If True, then use different prompt (with the same multi-modal '
'data) for each request.'
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
args
)
examples/offline_inference/vision_language_embedding.py
View file @
dcb5624a
...
...
@@ -156,16 +156,13 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
print
(
"-"
*
50
)
def
main
(
args
:
Namespace
):
run_encode
(
args
.
model_name
,
args
.
modality
,
args
.
seed
)
model_example_map
=
{
"e5_v"
:
run_e5_v
,
"vlm2vec"
:
run_vlm2vec
,
}
if
__name__
==
"__main__"
:
def
parse_args
():
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models for multimodal embedding'
)
...
...
@@ -184,6 +181,13 @@ if __name__ == "__main__":
type
=
int
,
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
return
parser
.
parse_args
()
args
=
parser
.
parse_args
()
def
main
(
args
:
Namespace
):
run_encode
(
args
.
model_name
,
args
.
modality
,
args
.
seed
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/offline_inference/vision_language_multi_image.py
View file @
dcb5624a
...
...
@@ -326,6 +326,44 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
)
def
load_kimi_vl
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"moonshotai/Kimi-VL-A3B-Instruct"
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_num_seqs
=
4
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
)
placeholders
=
[{
"type"
:
"image"
,
"image"
:
url
}
for
url
in
image_urls
]
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
placeholders
,
{
"type"
:
"text"
,
"text"
:
question
},
],
}]
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
prompt
=
processor
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompt
=
prompt
,
image_data
=
[
fetch_image
(
url
)
for
url
in
image_urls
],
)
def
load_mistral3
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
...
...
@@ -465,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args
=
EngineArgs
(
model
=
model_path
,
trust_remote_code
=
True
,
max_model_len
=
10000
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
enable_lora
=
True
,
max_lora_rank
=
320
,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs
=
{
"dynamic_hd"
:
4
},
)
placeholders
=
""
.
join
(
f
"<|image_
{
i
}
|>"
...
...
@@ -640,6 +680,7 @@ model_example_map = {
"h2ovl_chat"
:
load_h2ovl
,
"idefics3"
:
load_idefics3
,
"internvl_chat"
:
load_internvl
,
"kimi_vl"
:
load_kimi_vl
,
"llama4"
:
load_llama4
,
"mistral3"
:
load_mistral3
,
"mllama"
:
load_mllama
,
...
...
@@ -727,22 +768,7 @@ def run_chat(model: str, question: str, image_urls: list[str],
print
(
"-"
*
50
)
def
main
(
args
:
Namespace
):
model
=
args
.
model_type
method
=
args
.
method
seed
=
args
.
seed
image_urls
=
IMAGE_URLS
[:
args
.
num_images
]
if
method
==
"generate"
:
run_generate
(
model
,
QUESTION
,
image_urls
,
seed
)
elif
method
==
"chat"
:
run_chat
(
model
,
QUESTION
,
image_urls
,
seed
)
else
:
raise
ValueError
(
f
"Invalid method:
{
method
}
"
)
if
__name__
==
"__main__"
:
def
parse_args
():
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'vision language models that support multi-image input for text '
...
...
@@ -765,9 +791,29 @@ if __name__ == "__main__":
parser
.
add_argument
(
"--num-images"
,
"-n"
,
choices
=
list
(
range
(
1
,
13
)),
# 12 is the max number of images
type
=
int
,
choices
=
list
(
range
(
1
,
len
(
IMAGE_URLS
)
+
1
)),
# the max number of images
default
=
2
,
help
=
"Number of images to use for the demo."
)
return
parser
.
parse_args
()
args
=
parser
.
parse_args
()
def
main
(
args
:
Namespace
):
model
=
args
.
model_type
method
=
args
.
method
seed
=
args
.
seed
image_urls
=
IMAGE_URLS
[:
args
.
num_images
]
if
method
==
"generate"
:
run_generate
(
model
,
QUESTION
,
image_urls
,
seed
)
elif
method
==
"chat"
:
run_chat
(
model
,
QUESTION
,
image_urls
,
seed
)
else
:
raise
ValueError
(
f
"Invalid method:
{
method
}
"
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/online_serving/api_client.py
View file @
dcb5624a
...
...
@@ -58,6 +58,16 @@ def get_response(response: requests.Response) -> list[str]:
return
output
def
parse_args
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
.
add_argument
(
"--n"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--prompt"
,
type
=
str
,
default
=
"San Francisco is a"
)
parser
.
add_argument
(
"--stream"
,
action
=
"store_true"
)
return
parser
.
parse_args
()
def
main
(
args
:
Namespace
):
prompt
=
args
.
prompt
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/generate"
...
...
@@ -82,11 +92,5 @@ def main(args: Namespace):
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
.
add_argument
(
"--n"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--prompt"
,
type
=
str
,
default
=
"San Francisco is a"
)
parser
.
add_argument
(
"--stream"
,
action
=
"store_true"
)
args
=
parser
.
parse_args
()
args
=
parse_args
()
main
(
args
)
examples/online_serving/cohere_rerank_client.py
View file @
dcb5624a
...
...
@@ -2,32 +2,46 @@
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python
Note that `pip install cohere` is needed to run this example.
run: vllm serve BAAI/bge-reranker-base
"""
from
typing
import
Union
import
cohere
from
cohere
import
Client
,
ClientV2
model
=
"BAAI/bge-reranker-base"
# cohere v1 client
co
=
cohere
.
Client
(
base_url
=
"http://localhost:8000"
,
api_key
=
"sk-fake-key"
)
rerank_v1_result
=
co
.
rerank
(
model
=
"BAAI/bge-reranker-base"
,
query
=
"What is the capital of France?"
,
documents
=
[
query
=
"What is the capital of France?"
documents
=
[
"The capital of France is Paris"
,
"Reranking is fun!"
,
"vLLM is an open-source framework for fast AI serving"
])
]
print
(
rerank_v1_result
)
# or the v2
co2
=
cohere
.
ClientV2
(
"sk-fake-key"
,
base_url
=
"http://localhost:8000"
)
def
cohere_rerank
(
client
:
Union
[
Client
,
ClientV2
],
model
:
str
,
query
:
str
,
documents
:
list
[
str
])
->
dict
:
return
client
.
rerank
(
model
=
model
,
query
=
query
,
documents
=
documents
)
def
main
():
# cohere v1 client
cohere_v1
=
cohere
.
Client
(
base_url
=
"http://localhost:8000"
,
api_key
=
"sk-fake-key"
)
rerank_v1_result
=
cohere_rerank
(
cohere_v1
,
model
,
query
,
documents
)
print
(
"-"
*
50
)
print
(
"rerank_v1_result:
\n
"
,
rerank_v1_result
)
print
(
"-"
*
50
)
# or the v2
cohere_v2
=
cohere
.
ClientV2
(
"sk-fake-key"
,
base_url
=
"http://localhost:8000"
)
rerank_v2_result
=
cohere_rerank
(
cohere_v2
,
model
,
query
,
documents
)
print
(
"rerank_v2_result:
\n
"
,
rerank_v2_result
)
print
(
"-"
*
50
)
v2_rerank_result
=
co2
.
rerank
(
model
=
"BAAI/bge-reranker-base"
,
query
=
"What is the capital of France?"
,
documents
=
[
"The capital of France is Paris"
,
"Reranking is fun!"
,
"vLLM is an open-source framework for fast AI serving"
])
print
(
v2_rerank_result
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/gradio_openai_chatbot_webserver.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
Start Gradio OpenAI Chatbot Webserver:
python examples/online_serving/gradio_openai_chatbot_webserver.py
\
-m meta-llama/Llama-2-7b-chat-hf
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import
argparse
import
gradio
as
gr
from
openai
import
OpenAI
# Argument parser setup
parser
=
argparse
.
ArgumentParser
(
def
format_history_to_openai
(
history
):
history_openai_format
=
[{
"role"
:
"system"
,
"content"
:
"You are a great AI assistant."
}]
for
human
,
assistant
in
history
:
history_openai_format
.
append
({
"role"
:
"user"
,
"content"
:
human
})
history_openai_format
.
append
({
"role"
:
"assistant"
,
"content"
:
assistant
})
return
history_openai_format
def
predict
(
message
,
history
,
client
,
model_name
,
temp
,
stop_token_ids
):
# Format history to OpenAI chat format
history_openai_format
=
format_history_to_openai
(
history
)
history_openai_format
.
append
({
"role"
:
"user"
,
"content"
:
message
})
# Send request to OpenAI API (vLLM server)
stream
=
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
history_openai_format
,
temperature
=
temp
,
stream
=
True
,
extra_body
=
{
'repetition_penalty'
:
1
,
'stop_token_ids'
:
[
int
(
id
.
strip
())
for
id
in
stop_token_ids
.
split
(
','
)]
if
stop_token_ids
else
[]
})
# Collect all chunks and concatenate them into a full message
full_message
=
""
for
chunk
in
stream
:
full_message
+=
(
chunk
.
choices
[
0
].
delta
.
content
or
""
)
# Return the full message as a single response
return
full_message
def
parse_args
():
parser
=
argparse
.
ArgumentParser
(
description
=
'Chatbot Interface with Customizable Parameters'
)
parser
.
add_argument
(
'--model-url'
,
parser
.
add_argument
(
'--model-url'
,
type
=
str
,
default
=
'http://localhost:8000/v1'
,
help
=
'Model URL'
)
parser
.
add_argument
(
'-m'
,
parser
.
add_argument
(
'-m'
,
'--model'
,
type
=
str
,
required
=
True
,
help
=
'Model name for the chatbot'
)
parser
.
add_argument
(
'--temp'
,
parser
.
add_argument
(
'--temp'
,
type
=
float
,
default
=
0.8
,
help
=
'Temperature for text generation'
)
parser
.
add_argument
(
'--stop-token-ids'
,
parser
.
add_argument
(
'--stop-token-ids'
,
type
=
str
,
default
=
''
,
help
=
'Comma-separated stop token IDs'
)
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8001
)
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8001
)
return
parser
.
parse_args
()
# Parse the arguments
args
=
parser
.
parse_args
()
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key
=
"EMPTY"
openai_api_base
=
args
.
model_url
def
build_gradio_interface
(
client
,
model_name
,
temp
,
stop_token_ids
):
# Create an OpenAI client to interact with the API server
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
def
chat_predict
(
message
,
history
):
return
predict
(
message
,
history
,
client
,
model_name
,
temp
,
stop_token_ids
)
return
gr
.
ChatInterface
(
fn
=
chat_predict
,
title
=
"Chatbot Interface"
,
description
=
"A simple chatbot powered by vLLM"
)
def
predict
(
message
,
history
):
# Convert chat history to OpenAI format
history_openai_format
=
[{
"role"
:
"system"
,
"content"
:
"You are a great ai assistant."
}]
for
human
,
assistant
in
history
:
history_openai_format
.
append
({
"role"
:
"user"
,
"content"
:
human
})
history_openai_format
.
append
({
"role"
:
"assistant"
,
"content"
:
assistant
})
history_openai_format
.
append
({
"role"
:
"user"
,
"content"
:
message
})
# Create a chat completion request and send it to the API server
stream
=
client
.
chat
.
completions
.
create
(
model
=
args
.
model
,
# Model name to use
messages
=
history_openai_format
,
# Chat history
temperature
=
args
.
temp
,
# Temperature for text generation
stream
=
True
,
# Stream response
extra_body
=
{
'repetition_penalty'
:
1
,
'stop_token_ids'
:
[
int
(
id
.
strip
())
for
id
in
args
.
stop_token_ids
.
split
(
','
)
if
id
.
strip
()
]
if
args
.
stop_token_ids
else
[]
})
def
main
():
# Parse the arguments
args
=
parse_args
()
# Read and return generated text from response stream
partial_message
=
""
for
chunk
in
stream
:
partial_message
+=
(
chunk
.
choices
[
0
].
delta
.
content
or
""
)
yield
partial_message
# Set OpenAI's API key and API base to use vLLM's API server
openai_api_key
=
"EMPTY"
openai_api_base
=
args
.
model_url
# Create an OpenAI client
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
)
# Define the Gradio chatbot interface using the predict function
gradio_interface
=
build_gradio_interface
(
client
,
args
.
model
,
args
.
temp
,
args
.
stop_token_ids
)
# Create and launch a chat interface with Gradio
gr
.
ChatInterface
(
predict
).
queue
().
launch
(
server_name
=
args
.
host
,
gradio_interface
.
queue
().
launch
(
server_name
=
args
.
host
,
server_port
=
args
.
port
,
share
=
True
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/gradio_webserver.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""Example for starting a Gradio Webserver
Start vLLM API server:
python -m vllm.entrypoints.api_server
\
--model meta-llama/Llama-2-7b-chat-hf
Start Webserver:
python examples/online_serving/gradio_webserver.py
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import
argparse
import
json
...
...
@@ -39,16 +56,23 @@ def build_demo():
return
demo
if
__name__
==
"__main__"
:
def
parse_args
()
:
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8001
)
parser
.
add_argument
(
"--model-url"
,
type
=
str
,
default
=
"http://localhost:8000/generate"
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
(
args
):
demo
=
build_demo
()
demo
.
queue
().
launch
(
server_name
=
args
.
host
,
server_port
=
args
.
port
,
share
=
True
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/online_serving/jinaai_rerank_client.py
View file @
dcb5624a
...
...
@@ -23,12 +23,19 @@ data = {
"The capital of France is Paris."
,
"Horses and cows are both animals"
]
}
response
=
requests
.
post
(
url
,
headers
=
headers
,
json
=
data
)
# Check the response
if
response
.
status_code
==
200
:
def
main
():
response
=
requests
.
post
(
url
,
headers
=
headers
,
json
=
data
)
# Check the response
if
response
.
status_code
==
200
:
print
(
"Request successful!"
)
print
(
json
.
dumps
(
response
.
json
(),
indent
=
2
))
else
:
else
:
print
(
f
"Request failed with status code:
{
response
.
status_code
}
"
)
print
(
response
.
text
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_client.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
"""
from
openai
import
OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
},
{
"role"
:
"user"
,
"content"
:
"Who won the world series in 2020?"
},
{
"role"
:
"assistant"
,
"content"
:
"The Los Angeles Dodgers won the World Series in 2020."
},
{
},
{
"role"
:
"assistant"
,
"content"
:
"The Los Angeles Dodgers won the World Series in 2020."
},
{
"role"
:
"user"
,
"content"
:
"Where was it played?"
}],
}]
def
main
():
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
)
)
print
(
"-"
*
50
)
print
(
"Chat completion results:"
)
print
(
chat_completion
)
print
(
"-"
*
50
)
print
(
"Chat completion results:"
)
print
(
chat_completion
)
if
__name__
==
"__main__"
:
main
(
)
examples/online_serving/openai_chat_completion_client_for_multimodal.py
View file @
dcb5624a
...
...
@@ -9,7 +9,7 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate
\
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image
=2
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt
'{"
image
":2}'
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
...
...
@@ -303,12 +303,7 @@ example_function_map = {
}
def
main
(
args
)
->
None
:
chat_type
=
args
.
chat_type
example_function_map
[
chat_type
]()
if
__name__
==
"__main__"
:
def
parse_args
():
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using OpenAI client for online serving with '
'multimodal language models served with vLLM.'
)
...
...
@@ -318,5 +313,14 @@ if __name__ == "__main__":
default
=
"single-image"
,
choices
=
list
(
example_function_map
.
keys
()),
help
=
'Conversation type with multimodal data.'
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
(
args
)
->
None
:
chat_type
=
args
.
chat_type
example_function_map
[
chat_type
]()
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/online_serving/openai_chat_completion_client_with_tools.py
View file @
dcb5624a
...
...
@@ -17,6 +17,7 @@ vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import
json
from
typing
import
Any
from
openai
import
OpenAI
...
...
@@ -24,15 +25,6 @@ from openai import OpenAI
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
tools
=
[{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -78,39 +70,44 @@ messages = [{
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
)
print
(
"Chat completion results:"
)
print
(
chat_completion
)
print
(
"
\n\n
"
)
def
get_current_weather
(
city
:
str
,
state
:
str
,
unit
:
'str'
):
return
(
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's.
"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
def
handle_tool_calls_stream
(
client
:
OpenAI
,
messages
:
list
[
dict
[
str
,
str
]],
model
:
str
,
tools
:
list
[
dict
[
str
,
Any
]],
)
->
list
[
Any
]:
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
True
)
chunks
=
[]
for
chunk
in
tool_calls_stream
:
chunks
=
[]
print
(
"
chunks
: "
)
for
chunk
in
tool_calls_stream
:
chunks
.
append
(
chunk
)
if
chunk
.
choices
[
0
].
delta
.
tool_calls
:
print
(
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
])
else
:
print
(
chunk
.
choices
[
0
].
delta
)
return
chunks
arguments
=
[]
tool_call_idx
=
-
1
for
chunk
in
chunks
:
def
handle_tool_calls_arguments
(
chunks
:
list
[
Any
])
->
list
[
str
]:
arguments
=
[]
tool_call_idx
=
-
1
print
(
"arguments: "
)
for
chunk
in
chunks
:
if
chunk
.
choices
[
0
].
delta
.
tool_calls
:
tool_call
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
]
if
tool_call
.
index
!=
tool_call_idx
:
if
tool_call_idx
>=
0
:
print
(
f
"streamed tool call arguments:
{
arguments
[
tool_call_idx
]
}
"
)
print
(
f
"streamed tool call arguments: "
f
"
{
arguments
[
tool_call_idx
]
}
"
)
tool_call_idx
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
].
index
arguments
.
append
(
""
)
if
tool_call
.
id
:
...
...
@@ -118,36 +115,63 @@ for chunk in chunks:
if
tool_call
.
function
:
if
tool_call
.
function
.
name
:
print
(
f
"streamed tool call name:
{
tool_call
.
function
.
name
}
"
)
print
(
f
"streamed tool call name:
{
tool_call
.
function
.
name
}
"
)
if
tool_call
.
function
.
arguments
:
arguments
[
tool_call_idx
]
+=
tool_call
.
function
.
arguments
if
len
(
arguments
):
print
(
f
"streamed tool call arguments:
{
arguments
[
-
1
]
}
"
)
return
arguments
print
(
"
\n\n
"
)
messages
.
append
({
"role"
:
"assistant"
,
"tool_calls"
:
chat_completion
.
choices
[
0
].
message
.
tool_calls
})
def
main
():
# Initialize OpenAI client
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
# Get available models and select one
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
# Now, simulate a tool call
def
get_current_weather
(
city
:
str
,
state
:
str
,
unit
:
'str'
):
return
(
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's."
)
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
)
print
(
"-"
*
70
)
print
(
"Chat completion results:"
)
print
(
chat_completion
)
print
(
"-"
*
70
)
# Stream tool calls
chunks
=
handle_tool_calls_stream
(
client
,
messages
,
model
,
tools
)
print
(
"-"
*
70
)
# Handle arguments from streamed tool calls
arguments
=
handle_tool_calls_arguments
(
chunks
)
available_tools
=
{
"get_current_weather"
:
get_current_weather
}
if
len
(
arguments
):
print
(
f
"streamed tool call arguments:
{
arguments
[
-
1
]
}
\n
"
)
completion_tool_calls
=
chat_completion
.
choices
[
0
].
message
.
tool_calls
for
call
in
completion_tool_calls
:
print
(
"-"
*
70
)
# Add tool call results to the conversation
messages
.
append
({
"role"
:
"assistant"
,
"tool_calls"
:
chat_completion
.
choices
[
0
].
message
.
tool_calls
})
# Now, simulate a tool call
available_tools
=
{
"get_current_weather"
:
get_current_weather
}
completion_tool_calls
=
chat_completion
.
choices
[
0
].
message
.
tool_calls
for
call
in
completion_tool_calls
:
tool_to_call
=
available_tools
[
call
.
function
.
name
]
args
=
json
.
loads
(
call
.
function
.
arguments
)
result
=
tool_to_call
(
**
args
)
print
(
result
)
print
(
"tool_to_call result: "
,
result
)
messages
.
append
({
"role"
:
"tool"
,
"content"
:
result
,
...
...
@@ -155,9 +179,14 @@ for call in completion_tool_calls:
"name"
:
call
.
function
.
name
})
chat_completion_2
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
chat_completion_2
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
False
)
print
(
"
\n\n
"
)
print
(
chat_completion_2
)
print
(
"Chat completion2 results:"
)
print
(
chat_completion_2
)
print
(
"-"
*
70
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_client_with_tools_required.py
View file @
dcb5624a
...
...
@@ -18,15 +18,6 @@ from openai import OpenAI
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
tools
=
[
{
"type"
:
"function"
,
...
...
@@ -116,21 +107,36 @@ messages = [
},
]
chat_completion
=
client
.
chat
.
completions
.
create
(
def
main
():
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
"required"
,
stream
=
True
# Enable streaming response
)
)
for
chunk
in
chat_completion
:
for
chunk
in
chat_completion
:
if
chunk
.
choices
and
chunk
.
choices
[
0
].
delta
.
tool_calls
:
print
(
chunk
.
choices
[
0
].
delta
.
tool_calls
)
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
"required"
)
print
(
chat_completion
.
choices
[
0
].
message
.
tool_calls
)
print
(
chat_completion
.
choices
[
0
].
message
.
tool_calls
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_structured_outputs.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
"""
To run this example, you need to start the vLLM server:
```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```
"""
from
enum
import
Enum
from
openai
import
BadRequestError
,
OpenAI
from
pydantic
import
BaseModel
client
=
OpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"-"
,
)
# Guided decoding by Choice (list of possible options)
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
def
guided_choice_completion
(
client
:
OpenAI
,
model
:
str
):
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"Classify this sentiment: vLLM is wonderful!"
}],
extra_body
=
{
"guided_choice"
:
[
"positive"
,
"negative"
]},
)
print
(
completion
.
choices
[
0
].
message
.
content
)
)
return
completion
.
choices
[
0
].
message
.
content
# Guided decoding by Regex
prompt
=
(
"Generate an email address for Alan Turing, who works in Enigma."
def
guided_regex_completion
(
client
:
OpenAI
,
model
:
str
):
prompt
=
(
"Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com
\n
"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_regex"
:
"\w+@\w+\.com
\n
"
,
"guided_regex"
:
r
"\w+@\w+\.com\n"
,
"stop"
:
[
"
\n
"
]
},
)
print
(
completion
.
choices
[
0
].
message
.
content
)
)
return
completion
.
choices
[
0
].
message
.
content
# Guided decoding by JSON using Pydantic schema
...
...
@@ -54,66 +60,100 @@ class CarDescription(BaseModel):
car_type
:
CarType
json_schema
=
CarDescription
.
model_json_schema
()
def
guided_json_completion
(
client
:
OpenAI
,
model
:
str
):
json_schema
=
CarDescription
.
model_json_schema
()
prompt
=
(
"Generate a JSON with the brand, model and car_type of"
prompt
=
(
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print
(
completion
.
choices
[
0
].
message
.
content
)
)
return
completion
.
choices
[
0
].
message
.
content
# Guided decoding by Grammar
simplified_sql_grammar
=
"""
?start: select_statement
def
guided_grammar_completion
(
client
:
OpenAI
,
model
:
str
):
simplified_sql_grammar
=
"""
root ::= select_statement
?
select_statement
:
"SELECT " column
_list " FROM " table_name
select_statement
::=
"SELECT " column
" from " table " where " condition
?
column
_list: column_name ("," column_name)*
column
::= "col_1 " | "col_2 "
?
table
_name: identifier
table
::= "table_1 " | "table_2 "
?column_name: identifi
er
condition ::= column "= " numb
er
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
number ::= "1 " | "2 "
"""
prompt
=
(
"Generate an SQL query to show the 'username' and 'email'"
prompt
=
(
"Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table."
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_grammar"
:
simplified_sql_grammar
},
)
print
(
completion
.
choices
[
0
].
message
.
content
)
)
return
completion
.
choices
[
0
].
message
.
content
# Extra backend options
prompt
=
(
"Generate an email address for Alan Turing, who works in Enigma."
def
extra_backend_options_completion
(
client
:
OpenAI
,
model
:
str
):
prompt
=
(
"Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com
\n
"
)
try
:
try
:
# The no-fallback option forces vLLM to use xgrammar, so when it fails
# you get a 400 with the reason why
completion
=
client
.
chat
.
completions
.
create
(
model
=
"Qwen/Qwen2.5-3B-Instruct"
,
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_regex"
:
"\w+@\w+\.com
\n
"
,
"guided_regex"
:
r
"\w+@\w+\.com\n"
,
"stop"
:
[
"
\n
"
],
"guided_decoding_backend"
:
"xgrammar:no-fallback"
},
)
except
BadRequestError
as
e
:
return
completion
.
choices
[
0
].
message
.
content
except
BadRequestError
as
e
:
print
(
"This error is expected:"
,
e
)
def
main
():
client
:
OpenAI
=
OpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"-"
,
)
model
=
"Qwen/Qwen2.5-3B-Instruct"
print
(
"Guided Choice Completion:"
)
print
(
guided_choice_completion
(
client
,
model
))
print
(
"
\n
Guided Regex Completion:"
)
print
(
guided_regex_completion
(
client
,
model
))
print
(
"
\n
Guided JSON Completion:"
)
print
(
guided_json_completion
(
client
,
model
))
print
(
"
\n
Guided Grammar Completion:"
)
print
(
guided_grammar_completion
(
client
,
model
))
print
(
"
\n
Extra Backend Options Completion:"
)
print
(
extra_backend_options_completion
(
client
,
model
))
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
0 → 100644
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
from
openai
import
OpenAI
# This example demonstrates the `structural_tag` response format.
# It can be used to specify a structured output format that occurs between
# specific tags in the response. This example shows how it could be used
# to enforce the format of a tool call response, but it could be used for
# any structured output within a subset of the response.
def
main
():
client
=
OpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"-"
,
)
messages
=
[{
"role"
:
"user"
,
"content"
:
"""
You have access to the following function to retrieve the weather in a city:
{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}
If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.
Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?
"""
}]
response
=
client
.
chat
.
completions
.
create
(
model
=
"meta-llama/Llama-3.1-8B-Instruct"
,
messages
=
messages
,
response_format
=
{
"type"
:
"structural_tag"
,
"structures"
:
[{
"begin"
:
"<function=get_weather>"
,
"schema"
:
{
"type"
:
"object"
,
"properties"
:
{
"city"
:
{
"type"
:
"string"
}
}
},
"end"
:
"</function>"
}],
"triggers"
:
[
"<function="
]
})
print
(
response
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
View file @
dcb5624a
...
...
@@ -25,18 +25,18 @@ from pydantic import BaseModel
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
def
print_completion_details
(
completion
):
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
# Guided decoding by Regex
prompt
=
(
"What is the capital of France?"
)
def
guided_regex_completion
(
client
:
OpenAI
,
model
:
str
):
prompt
=
(
"What is the capital of France?"
)
completion
=
client
.
chat
.
completions
.
create
(
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
...
...
@@ -45,9 +45,8 @@ completion = client.chat.completions.create(
extra_body
=
{
"guided_regex"
:
"(Paris|London)"
,
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
)
print_completion_details
(
completion
)
class
People
(
BaseModel
):
...
...
@@ -55,19 +54,19 @@ class People(BaseModel):
age
:
int
json_schema
=
People
.
model_json_schema
()
def
guided_json_completion
(
client
:
OpenAI
,
model
:
str
):
json_schema
=
People
.
model_json_schema
()
prompt
=
(
"Generate a JSON with the name and age of one random person."
)
completion
=
client
.
chat
.
completions
.
create
(
prompt
=
(
"Generate a JSON with the name and age of one random person."
)
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
)
print_completion_details
(
completion
)
# Guided decoding by JSON using Pydantic schema
...
...
@@ -84,46 +83,73 @@ class CarDescription(BaseModel):
car_type
:
CarType
json_schema
=
CarDescription
.
model_json_schema
()
def
guided_car_json_completion
(
client
:
OpenAI
,
model
:
str
):
json_schema
=
CarDescription
.
model_json_schema
()
prompt
=
(
"Generate a JSON with the brand, model and car_type of"
prompt
=
(
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
)
completion
=
client
.
chat
.
completions
.
create
(
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_json"
:
json_schema
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
)
print
_completion_details
(
completion
)
# Guided decoding by Grammar
simplified_sql_grammar
=
"""
?start: select_statement
def
guided_grammar_completion
(
client
:
OpenAI
,
model
:
str
):
simplified_sql_grammar
=
"""
root ::= select_statement
?
select_statement
:
"SELECT " column
_list " FROM " table_name
select_statement
::=
"SELECT " column
" from " table " where " condition
?
column
_list: column_name ("," column_name)*
column
::= "col_1 " | "col_2 "
?
table
_name: identifier
table
::= "table_1 " | "table_2 "
?column_name: identifi
er
condition ::= column "= " numb
er
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
number ::= "1 " | "2 "
"""
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
prompt
=
(
"Generate an SQL query to show the 'username' and 'email'"
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
prompt
=
(
"Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table."
)
completion
=
client
.
chat
.
completions
.
create
(
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
,
}],
extra_body
=
{
"guided_grammar"
:
simplified_sql_grammar
},
)
print
(
"reasoning_content: "
,
completion
.
choices
[
0
].
message
.
reasoning_content
)
print
(
"content: "
,
completion
.
choices
[
0
].
message
.
content
)
)
print_completion_details
(
completion
)
def
main
():
client
:
OpenAI
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
:
str
=
models
.
data
[
0
].
id
print
(
"Guided Regex Completion:"
)
guided_regex_completion
(
client
,
model
)
print
(
"
\n
Guided JSON Completion (People):"
)
guided_json_completion
(
client
,
model
)
print
(
"
\n
Guided JSON Completion (CarDescription):"
)
guided_car_json_completion
(
client
,
model
)
print
(
"
\n
Guided Grammar Completion:"
)
guided_grammar_completion
(
client
,
model
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
View file @
dcb5624a
...
...
@@ -31,14 +31,6 @@ available_tools = {"get_current_weather": get_current_weather}
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
tools
=
[{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -109,34 +101,47 @@ def extract_reasoning_and_calls(chunks: list):
return
reasoning_content
,
arguments
,
function_names
print
(
"---------Full Generate With Automatic Function Calling-------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
def
main
():
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
print
(
"---------Full Generate With Automatic Function Calling-------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
)
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name: "
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name: "
f
"
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
name
}
"
)
print
(
f
"function arguments: "
print
(
f
"function arguments: "
f
"
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
arguments
}
"
)
print
(
"----------Stream Generate With Automatic Function Calling-----------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
print
(
"----------Stream Generate With Automatic Function Calling-----------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
True
)
chunks
=
[]
for
chunk
in
tool_calls_stream
:
chunks
.
append
(
chunk
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
=
list
(
tool_calls_stream
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"----------Full Generate With Named Function Calling-----------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
print
(
"----------Full Generate With Named Function Calling-----------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
{
...
...
@@ -147,13 +152,16 @@ tool_calls = client.chat.completions.create(messages=messages,
}
})
tool_call
=
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name:
{
tool_call
.
name
}
"
)
print
(
f
"function arguments:
{
tool_call
.
arguments
}
"
)
print
(
"----------Stream Generate With Named Function Calling--------------"
)
tool_call
=
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name:
{
tool_call
.
name
}
"
)
print
(
f
"function arguments:
{
tool_call
.
arguments
}
"
)
print
(
"----------Stream Generate With Named Function Calling--------------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
...
...
@@ -165,13 +173,15 @@ tool_calls_stream = client.chat.completions.create(
},
stream
=
True
)
chunks
=
[]
for
chunk
in
tool_calls_stream
:
chunks
.
append
(
chunk
)
chunks
=
list
(
tool_calls_stream
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"
\n\n
"
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"
\n\n
"
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_completion_with_reasoning.py
View file @
dcb5624a
...
...
@@ -3,8 +3,8 @@
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.
To run this example, you need to start the vLLM server
with the reasoning
parser:
To run this example, you need to start the vLLM server
with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
\
...
...
@@ -21,35 +21,44 @@ from openai import OpenAI
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
def
main
():
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
# Round 1
messages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
reasoning_content
=
response
.
choices
[
0
].
message
.
reasoning_content
content
=
response
.
choices
[
0
].
message
.
content
# Round 1
messages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
)
print
(
"reasoning_content for Round 1:"
,
reasoning_content
)
print
(
"content for Round 1:"
,
content
)
reasoning_content
=
response
.
choices
[
0
].
message
.
reasoning_content
content
=
response
.
choices
[
0
].
message
.
content
# Round 2
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
content
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"How many Rs are there in the word 'strawberry'?"
,
})
response
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
)
print
(
"
reasoning_content
for Round 1:"
,
reasoning_content
)
print
(
"
content
for Round 1:"
,
content
)
reasoning_content
=
response
.
choices
[
0
].
message
.
reasoning_content
content
=
response
.
choices
[
0
].
message
.
content
# Round 2
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
content
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"How many Rs are there in the word 'strawberry'?"
,
})
response
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
)
print
(
"reasoning_content for Round 2:"
,
reasoning_content
)
print
(
"content for Round 2:"
,
content
)
reasoning_content
=
response
.
choices
[
0
].
message
.
reasoning_content
content
=
response
.
choices
[
0
].
message
.
content
print
(
"reasoning_content for Round 2:"
,
reasoning_content
)
print
(
"content for Round 2:"
,
content
)
if
__name__
==
"__main__"
:
main
(
)
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
View file @
dcb5624a
...
...
@@ -29,25 +29,29 @@ from openai import OpenAI
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
messages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
def
main
():
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
messages
=
[{
"role"
:
"user"
,
"content"
:
"9.11 and 9.8, which is greater?"
}]
# For granite
,
add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream
=
client
.
chat
.
completions
.
create
(
model
=
model
,
# ruff: noqa: E501
# For granite
:
add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
,
stream
=
True
)
print
(
"client: Start streaming chat completions..."
)
printed_reasoning_content
=
False
printed_content
=
False
print
(
"client: Start streaming chat completions..."
)
printed_reasoning_content
=
False
printed_content
=
False
for
chunk
in
stream
:
for
chunk
in
stream
:
reasoning_content
=
None
content
=
None
# Check the content is reasoning_content or content
...
...
@@ -67,3 +71,7 @@ for chunk in stream:
print
(
"
\n
content:"
,
end
=
""
,
flush
=
True
)
# Extract and print the content
print
(
content
,
end
=
""
,
flush
=
True
)
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_chat_embedding_client_for_multimodal.py
View file @
dcb5624a
...
...
@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
print
(
"Embedding output:"
,
response_json
[
"data"
][
0
][
"embedding"
])
if
__name__
==
'__main__'
:
def
parse_args
()
:
parser
=
argparse
.
ArgumentParser
(
"Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this."
)
...
...
@@ -107,8 +107,10 @@ if __name__ == '__main__':
choices
=
[
"vlm2vec"
,
"dse_qwen2_vl"
],
required
=
True
,
help
=
"Which model to call."
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
(
args
):
if
args
.
model
==
"vlm2vec"
:
vlm2vec
()
elif
args
.
model
==
"dse_qwen2_vl"
:
...
...
@@ -120,3 +122,8 @@ if __name__ == '__main__':
"type"
:
"text"
,
"content"
:
"What is the weather like today?"
,
})
if
__name__
==
'__main__'
:
args
=
parse_args
()
main
(
args
)
Prev
1
…
5
6
7
8
9
10
11
12
13
…
29
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment