ModelZoo / LLaMA_vllm · Commits

Commit 25f39502, authored Aug 16, 2025 by laibao
Update README.md: change the Docker image version and the deep-learning library dependencies, and remove several example files to simplify the codebase.
Parent: 951558c2
Showing 20 changed files with 2468 additions and 0 deletions (+2468 -0)
examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py  +155 -0
examples/online_serving/gradio_openai_chatbot_webserver.py  +120 -0
examples/online_serving/gradio_webserver.py  +75 -0
examples/online_serving/jinaai_rerank_client.py  +42 -0
examples/online_serving/kv_events_subscriber.py  +113 -0
examples/online_serving/multi-node-serving.sh  +94 -0
examples/online_serving/multi_instance_data_parallel.py  +59 -0
examples/online_serving/openai_chat_completion_client.py  +64 -0
examples/online_serving/openai_chat_completion_client_for_multimodal.py  +302 -0
examples/online_serving/openai_chat_completion_client_with_tools.py  +194 -0
examples/online_serving/openai_chat_completion_client_with_tools_required.py  +130 -0
examples/online_serving/openai_chat_completion_client_with_tools_xlam.py  +245 -0
examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py  +273 -0
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py  +170 -0
examples/online_serving/openai_chat_completion_with_reasoning.py  +65 -0
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py  +76 -0
examples/online_serving/openai_chat_embedding_client_for_multimodal.py  +127 -0
examples/online_serving/openai_classification_client.py  +48 -0
examples/online_serving/openai_completion_client.py  +53 -0
examples/online_serving/openai_cross_encoder_score.py  +63 -0
examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import socket
import threading
import uuid

import aiohttp
import msgpack
import zmq
from quart import Quart, make_response, request

count = 0
prefill_instances: dict[str, str] = {}  # http_address: zmq_address
decode_instances: dict[str, str] = {}  # http_address: zmq_address

prefill_cv = threading.Condition()
decode_cv = threading.Condition()


def _listen_for_register(poller, router_socket):
    while True:
        socks = dict(poller.poll())
        if router_socket in socks:
            remote_address, message = router_socket.recv_multipart()
            # data: {"type": "P", "http_address": "ip:port",
            #        "zmq_address": "ip:port"}
            data = msgpack.loads(message)
            if data["type"] == "P":
                global prefill_instances
                global prefill_cv
                with prefill_cv:
                    prefill_instances[data["http_address"]] = data["zmq_address"]
            elif data["type"] == "D":
                global decode_instances
                global decode_cv
                with decode_cv:
                    decode_instances[data["http_address"]] = data["zmq_address"]
            else:
                print(
                    "Unexpected, Received message from %s, data: %s",
                    remote_address,
                    data,
                )


def start_service_discovery(hostname, port):
    if not hostname:
        hostname = socket.gethostname()
    if port == 0:
        raise ValueError("Port cannot be 0")

    context = zmq.Context()
    router_socket = context.socket(zmq.ROUTER)
    router_socket.bind(f"tcp://{hostname}:{port}")

    poller = zmq.Poller()
    poller.register(router_socket, zmq.POLLIN)

    _listener_thread = threading.Thread(
        target=_listen_for_register, args=[poller, router_socket], daemon=True
    )
    _listener_thread.start()
    return _listener_thread


AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

app = Quart(__name__)


def random_uuid() -> str:
    return str(uuid.uuid4().hex)


async def forward_request(url, data, request_id):
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
            "X-Request-Id": request_id,
        }
        async with session.post(url=url, json=data, headers=headers) as response:
            if response.status == 200:
                if True:
                    async for chunk_bytes in response.content.iter_chunked(1024):
                        yield chunk_bytes
                else:
                    content = await response.read()
                    yield content


@app.route("/v1/completions", methods=["POST"])
async def handle_request():
    try:
        original_request_data = await request.get_json()

        prefill_request = original_request_data.copy()
        # change max_tokens = 1 to let it only do prefill
        prefill_request["max_tokens"] = 1

        global count
        global prefill_instances
        global prefill_cv
        with prefill_cv:
            prefill_list = list(prefill_instances.items())
            prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]

        global decode_instances
        global decode_cv
        with decode_cv:
            decode_list = list(decode_instances.items())
            decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]

        print(
            f"handle_request count: {count}, [HTTP:{prefill_addr}, "
            f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
            f"ZMQ:{decode_zmq_addr}]"
        )
        count += 1

        request_id = (
            f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
            f"{decode_zmq_addr}_{random_uuid()}"
        )

        # finish prefill
        async for _ in forward_request(
            f"http://{prefill_addr}/v1/completions", prefill_request, request_id
        ):
            continue

        # return decode
        generator = forward_request(
            f"http://{decode_addr}/v1/completions", original_request_data, request_id
        )
        response = await make_response(generator)
        response.timeout = None

        return response

    except Exception as e:
        import sys
        import traceback

        exc_info = sys.exc_info()
        print("Error occurred in disagg prefill proxy server")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))


if __name__ == "__main__":
    t = start_service_discovery("0.0.0.0", 30001)
    app.run(host="0.0.0.0", port=10001)
    t.join()
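
For quick manual testing, the sketch below (not part of this commit) sends a streaming /v1/completions request through the proxy started above, which binds its HTTP side to 0.0.0.0:10001. The model name is an assumption; it should match whatever the registered prefill/decode instances are serving.

# Minimal client sketch, assuming the proxy is running on localhost:10001 and
# at least one prefill ("P") and one decode ("D") instance have registered.
import requests

payload = {
    "model": "meta-llama/Llama-2-7b-chat-hf",  # assumed model name
    "prompt": "San Francisco is a",
    "max_tokens": 64,
    "stream": True,
}

with requests.post(
    "http://localhost:10001/v1/completions", json=payload, stream=True
) as resp:
    resp.raise_for_status()
    # The proxy streams the decode instance's output in 1024-byte chunks.
    for chunk in resp.iter_content(chunk_size=1024):
        if chunk:
            print(chunk.decode("utf-8", errors="replace"), end="")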
examples/online_serving/gradio_openai_chatbot_webserver.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio OpenAI Chatbot Webserver

Start vLLM API server:
    vllm serve meta-llama/Llama-2-7b-chat-hf

Start Gradio OpenAI Chatbot Webserver:
    python examples/online_serving/gradio_openai_chatbot_webserver.py \
        -m meta-llama/Llama-2-7b-chat-hf

Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio

If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""

import argparse

import gradio as gr
from openai import OpenAI


def format_history_to_openai(history):
    history_openai_format = [
        {"role": "system", "content": "You are a great AI assistant."}
    ]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    return history_openai_format


def predict(message, history, client, model_name, temp, stop_token_ids):
    # Format history to OpenAI chat format
    history_openai_format = format_history_to_openai(history)
    history_openai_format.append({"role": "user", "content": message})

    # Send request to OpenAI API (vLLM server)
    stream = client.chat.completions.create(
        model=model_name,
        messages=history_openai_format,
        temperature=temp,
        stream=True,
        extra_body={
            "repetition_penalty": 1,
            "stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")]
            if stop_token_ids
            else [],
        },
    )

    # Collect all chunks and concatenate them into a full message
    full_message = ""
    for chunk in stream:
        full_message += chunk.choices[0].delta.content or ""

    # Return the full message as a single response
    return full_message


def parse_args():
    parser = argparse.ArgumentParser(
        description="Chatbot Interface with Customizable Parameters"
    )
    parser.add_argument(
        "--model-url", type=str, default="http://localhost:8000/v1", help="Model URL"
    )
    parser.add_argument(
        "-m", "--model", type=str, required=True, help="Model name for the chatbot"
    )
    parser.add_argument(
        "--temp", type=float, default=0.8, help="Temperature for text generation"
    )
    parser.add_argument(
        "--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs"
    )
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
    return parser.parse_args()


def build_gradio_interface(client, model_name, temp, stop_token_ids):
    def chat_predict(message, history):
        return predict(message, history, client, model_name, temp, stop_token_ids)

    return gr.ChatInterface(
        fn=chat_predict,
        title="Chatbot Interface",
        description="A simple chatbot powered by vLLM",
    )


def main():
    # Parse the arguments
    args = parse_args()

    # Set OpenAI's API key and API base to use vLLM's API server
    openai_api_key = "EMPTY"
    openai_api_base = args.model_url

    # Create an OpenAI client
    client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

    # Define the Gradio chatbot interface using the predict function
    gradio_interface = build_gradio_interface(
        client, args.model, args.temp, args.stop_token_ids
    )

    gradio_interface.queue().launch(
        server_name=args.host, server_port=args.port, share=True
    )


if __name__ == "__main__":
    main()
examples/online_serving/gradio_webserver.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example for starting a Gradio Webserver

Start vLLM API server:
    python -m vllm.entrypoints.api_server \
        --model meta-llama/Llama-2-7b-chat-hf

Start Webserver:
    python examples/online_serving/gradio_webserver.py

Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio

If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""

import argparse
import json

import gradio as gr
import requests


def http_bot(prompt):
    headers = {"User-Agent": "vLLM Client"}
    pload = {
        "prompt": prompt,
        "stream": True,
        "max_tokens": 128,
    }
    response = requests.post(args.model_url, headers=headers, json=pload, stream=True)

    for chunk in response.iter_lines(
        chunk_size=8192, decode_unicode=False, delimiter=b"\n"
    ):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"][0]
            yield output


def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# vLLM text completion demo\n")
        inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
        outputbox = gr.Textbox(
            label="Output", placeholder="Generated result from the model"
        )
        inputbox.submit(http_bot, [inputbox], [outputbox])
    return demo


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
    parser.add_argument(
        "--model-url", type=str, default="http://localhost:8000/generate"
    )
    return parser.parse_args()


def main(args):
    demo = build_demo()
    demo.queue().launch(server_name=args.host, server_port=args.port, share=True)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/online_serving/jinaai_rerank_client.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
Jina and Cohere https://jina.ai/reranker

run: vllm serve BAAI/bge-reranker-base
"""

import json

import requests

url = "http://127.0.0.1:8000/rerank"

headers = {"accept": "application/json", "Content-Type": "application/json"}

data = {
    "model": "BAAI/bge-reranker-base",
    "query": "What is the capital of France?",
    "documents": [
        "The capital of Brazil is Brasilia.",
        "The capital of France is Paris.",
        "Horses and cows are both animals",
    ],
}


def main():
    response = requests.post(url, headers=headers, json=data)

    # Check the response
    if response.status_code == 200:
        print("Request successful!")
        print(json.dumps(response.json(), indent=2))
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(response.text)


if __name__ == "__main__":
    main()
examples/online_serving/kv_events_subscriber.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Optional, Union

import msgspec
import zmq
from msgspec.msgpack import Decoder


#
# Types copied from vllm.distributed.kv_events
#
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
    ts: float
    events: list[Any]


class KVCacheEvent(
    msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True
):
    """Base class for all KV cache-related events"""


class BlockStored(KVCacheEvent):
    block_hashes: list[int]
    parent_block_hash: Optional[int]
    token_ids: list[int]
    block_size: int
    lora_id: Optional[int]


class BlockRemoved(KVCacheEvent):
    block_hashes: list[int]


class AllBlocksCleared(KVCacheEvent):
    pass


class KVEventBatch(EventBatch):
    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]


def process_event(event_batch):
    print(f"Received event batch at {event_batch.ts}:")
    for event in event_batch.events:
        print(f"  - {event}")


def main():
    decoder = Decoder(type=KVEventBatch)
    last_seq = -1

    context = zmq.Context()

    # Set up the main subscription socket
    sub = context.socket(zmq.SUB)
    sub.connect("tcp://localhost:5557")
    topic = "kv-events"
    sub.setsockopt_string(zmq.SUBSCRIBE, topic)

    # Initialize replay socket
    replay = context.socket(zmq.REQ)
    replay.connect("tcp://localhost:5558")
    poller = zmq.Poller()
    poller.register(replay, zmq.POLLIN)

    print("Listening for KV cache events on topic:", topic)

    while True:
        try:
            if sub.poll(50):
                _, seq_bytes, payload = sub.recv_multipart()
                seq = int.from_bytes(seq_bytes, "big")
                if last_seq >= 0 and seq > last_seq + 1:
                    missed = seq - last_seq - 1
                    print(f"Missed {missed} messages (last: {last_seq}, current: {seq})")

                    replay.send((last_seq + 1).to_bytes(8, "big"))

                    while poller.poll(timeout=200):
                        seq_bytes, replay_payload = replay.recv_multipart()
                        if not replay_payload:
                            # End of replay marker is sent as an empty frame
                            # for the payload
                            break

                        replay_seq = int.from_bytes(seq_bytes, "big")

                        if replay_seq > last_seq:
                            event_batch = decoder.decode(replay_payload)
                            process_event(event_batch)
                            last_seq = replay_seq
                            if replay_seq >= seq - 1:
                                break

                event_batch = decoder.decode(payload)
                process_event(event_batch)

            # ... do other periodic work or check for shutdown ...

        except KeyboardInterrupt:
            print("Interrupted")
            break
        except Exception as e:
            print("Error decoding message:", e)


if __name__ == "__main__":
    main()
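
To make the wire format above concrete, the following sketch (not part of this commit) round-trips a KVEventBatch through msgspec's msgpack encoder and the same Decoder configuration the subscriber uses. It assumes kv_events_subscriber.py is importable from the current directory; the numeric field values are made up for illustration.

# Illustrative round-trip of the subscriber's event types through msgpack.
import msgspec
from msgspec.msgpack import Decoder

from kv_events_subscriber import BlockRemoved, BlockStored, KVEventBatch  # assumed importable

batch = KVEventBatch(
    ts=1723800000.0,
    events=[
        BlockStored(
            block_hashes=[12345],
            parent_block_hash=None,
            token_ids=[1, 2, 3, 4],
            block_size=16,
            lora_id=None,
        ),
        BlockRemoved(block_hashes=[67890]),
    ],
)

payload = msgspec.msgpack.encode(batch)  # what a publisher would put on the wire
decoded = Decoder(type=KVEventBatch).decode(payload)
print(decoded.ts, decoded.events)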
examples/online_serving/multi-node-serving.sh  0 → 100644

#!/bin/bash

subcommand=$1
shift

ray_port=6379
ray_init_timeout=300
declare -a start_params

case "$subcommand" in
  worker)
    ray_address=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_address=*)
          ray_address="${1#*=}"
          ;;
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          start_params+=("$1")
      esac
      shift
    done

    if [ -z "$ray_address" ]; then
      echo "Error: Missing argument --ray_address"
      exit 1
    fi

    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
      if [ $? -eq 0 ]; then
        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
        exit 0
      fi
      echo "Waiting until the ray worker is active..."
      sleep 5s
    done
    echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
    exit 1
    ;;

  leader)
    ray_cluster_size=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_cluster_size=*)
          ray_cluster_size="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          start_params+=("$1")
      esac
      shift
    done

    if [ -z "$ray_cluster_size" ]; then
      echo "Error: Missing argument --ray_cluster_size"
      exit 1
    fi

    # start the ray daemon
    ray start --head --port=$ray_port "${start_params[@]}"

    # wait until all workers are active
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
      if [ $active_nodes -eq $ray_cluster_size ]; then
        echo "All ray workers are active and the ray cluster is initialized successfully."
        exit 0
      fi
      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
      sleep 5s
    done
    echo "Waiting for all ray workers to be active timed out."
    exit 1
    ;;

  *)
    echo "unknown subcommand: $subcommand"
    exit 1
    ;;
esac
examples/online_serving/multi_instance_data_parallel.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from typing import Optional

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams

"""
To run this example, run the following commands simultaneously with
different CUDA_VISIBLE_DEVICES:
    python examples/online_serving/multi_instance_data_parallel.py

    vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
        --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
        --data-parallel-size-local 1 --enforce-eager --headless

Once both instances have completed the handshake, this example will
send a request to the instance with DP rank 1.
"""


async def main():
    engine_args = AsyncEngineArgs(
        model="ibm-research/PowerMoE-3b",
        data_parallel_size=2,
        dtype="auto",
        max_model_len=2048,
        data_parallel_address="127.0.0.1",
        data_parallel_rpc_port=62300,
        data_parallel_size_local=1,
        enforce_eager=True,
    )
    engine_client = AsyncLLMEngine.from_engine_args(engine_args)

    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.9,
        max_tokens=100,
    )
    prompt = "Who won the 2004 World Series?"
    final_output: Optional[RequestOutput] = None
    async for output in engine_client.generate(
        prompt=prompt,
        sampling_params=sampling_params,
        request_id="abcdef",
        data_parallel_rank=1,
    ):
        final_output = output
    if final_output:
        print(final_output.outputs[0].text)


if __name__ == "__main__":
    asyncio.run(main())
examples/online_serving/openai_chat_completion_client.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
    vllm serve meta-llama/Llama-2-7b-chat-hf
"""

import argparse

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
    {
        "role": "assistant",
        "content": "The Los Angeles Dodgers won the World Series in 2020.",
    },
    {"role": "user", "content": "Where was it played?"},
]


def parse_args():
    parser = argparse.ArgumentParser(description="Client for vLLM API server")
    parser.add_argument(
        "--stream", action="store_true", help="Enable streaming response"
    )
    return parser.parse_args()


def main(args):
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Chat Completion API
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
        stream=args.stream,
    )

    print("-" * 50)
    print("Chat completion results:")
    if args.stream:
        for c in chat_completion:
            print(c)
    else:
        print(chat_completion)
    print("-" * 50)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/online_serving/openai_chat_completion_client_for_multimodal.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.

Launch the vLLM server with the following command:

(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf

(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'

(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
    --max-model-len 4096 --trust-remote-code

run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
"""

import base64

import requests
from openai import OpenAI
from utils import get_first_model

from vllm.utils import FlexibleArgumentParser

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)


def encode_base64_content_from_url(content_url: str) -> str:
    """Encode a content retrieved from a remote url to base64 format."""

    with requests.get(content_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode("utf-8")

    return result


# Text-only inference
def run_text_only(model: str) -> None:
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": "What's the capital of France?"}],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion.choices[0].message.content
    print("Chat completion output:", result)


# Single-image input inference
def run_single_image(model: str) -> None:
    ## Use image url in the payload
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    chat_completion_from_url = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                    },
                ],
            }
        ],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from image url:", result)

    ## Use base64 encoded image in the payload
    image_base64 = encode_base64_content_from_url(image_url)
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
                    },
                ],
            }
        ],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded image:", result)


# Multi-image input inference
def run_multi_image(model: str) -> None:
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
    chat_completion_from_url = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What are the animals in these images?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url_duck},
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url_lion},
                    },
                ],
            }
        ],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output:", result)


# Video input inference
def run_video(model: str) -> None:
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
    video_base64 = encode_base64_content_from_url(video_url)

    ## Use video url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this video?"},
                    {
                        "type": "video_url",
                        "video_url": {"url": video_url},
                    },
                ],
            }
        ],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from image url:", result)

    ## Use base64 encoded video in the payload
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this video?"},
                    {
                        "type": "video_url",
                        "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                    },
                ],
            }
        ],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded image:", result)


# Audio input inference
def run_audio(model: str) -> None:
    from vllm.assets.audio import AudioAsset

    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)

    # OpenAI-compatible schema (`input_audio`)
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this audio?"},
                    {
                        "type": "input_audio",
                        "input_audio": {
                            # Any format supported by librosa is supported
                            "data": audio_base64,
                            "format": "wav",
                        },
                    },
                ],
            }
        ],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from input audio:", result)

    # HTTP URL
    chat_completion_from_url = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this audio?"},
                    {
                        "type": "audio_url",
                        "audio_url": {
                            # Any format supported by librosa is supported
                            "url": audio_url
                        },
                    },
                ],
            }
        ],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from audio url:", result)

    # base64 URL
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this audio?"},
                    {
                        "type": "audio_url",
                        "audio_url": {
                            # Any format supported by librosa is supported
                            "url": f"data:audio/ogg;base64,{audio_base64}"
                        },
                    },
                ],
            }
        ],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded audio:", result)


example_function_map = {
    "text-only": run_text_only,
    "single-image": run_single_image,
    "multi-image": run_multi_image,
    "video": run_video,
    "audio": run_audio,
}


def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using OpenAI client for online serving with "
        "multimodal language models served with vLLM."
    )
    parser.add_argument(
        "--chat-type",
        "-c",
        type=str,
        default="single-image",
        choices=list(example_function_map.keys()),
        help="Conversation type with multimodal data.",
    )
    return parser.parse_args()


def main(args) -> None:
    chat_type = args.chat_type
    model = get_first_model(client)
    example_function_map[chat_type](model)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/online_serving/openai_chat_completion_client_with_tools.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example:

IMPORTANT: for mistral, you must use one of the provided mistral tool call
templates, or your own - the model default doesn't work for tool calls with vLLM
See the vLLM docs on OpenAI server & tool calling for more details.

vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
    --chat-template examples/tool_chat_template_mistral.jinja \
    --enable-auto-tool-choice --tool-call-parser mistral

OR
vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B \
    --chat-template examples/tool_chat_template_hermes.jinja \
    --enable-auto-tool-choice --tool-call-parser hermes
"""

import json
from typing import Any

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

properties = {
    "city": {
        "type": "string",
        "description": "The city to find the weather for, e.g. 'San Francisco'",
    },
    "state": {
        "type": "string",
        "description": "the two-letter abbreviation for the state that the city is"
        " in, e.g. 'CA' which would mean 'California'",
    },
    "unit": {
        "type": "string",
        "description": "The unit to fetch the temperature in",
        "enum": ["celsius", "fahrenheit"],
    },
}

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": ["city", "state", "unit"],
            },
        },
    }
]

messages = [
    {"role": "user", "content": "Hi! How are you doing today?"},
    {"role": "assistant", "content": "I'm doing well! How can I help you?"},
    {
        "role": "user",
        "content": (
            "Can you tell me what the temperature will be in Dallas, in fahrenheit?"
        ),
    },
]


def get_current_weather(city: str, state: str, unit: "str"):
    return (
        "The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
        "partly cloudy, with highs in the 90's."
    )


def handle_tool_calls_stream(
    client: OpenAI,
    messages: list[dict[str, str]],
    model: str,
    tools: list[dict[str, Any]],
) -> list[Any]:
    tool_calls_stream = client.chat.completions.create(
        messages=messages, model=model, tools=tools, stream=True
    )
    chunks = []
    print("chunks: ")
    for chunk in tool_calls_stream:
        chunks.append(chunk)
        if chunk.choices[0].delta.tool_calls:
            print(chunk.choices[0].delta.tool_calls[0])
        else:
            print(chunk.choices[0].delta)
    return chunks


def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
    arguments = []
    tool_call_idx = -1
    print("arguments: ")
    for chunk in chunks:
        if chunk.choices[0].delta.tool_calls:
            tool_call = chunk.choices[0].delta.tool_calls[0]
            if tool_call.index != tool_call_idx:
                if tool_call_idx >= 0:
                    print(f"streamed tool call arguments: {arguments[tool_call_idx]}")
                tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
                arguments.append("")
            if tool_call.id:
                print(f"streamed tool call id: {tool_call.id}")
            if tool_call.function:
                if tool_call.function.name:
                    print(f"streamed tool call name: {tool_call.function.name}")
                if tool_call.function.arguments:
                    arguments[tool_call_idx] += tool_call.function.arguments

    return arguments


def main():
    # Initialize OpenAI client
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Get available models and select one
    models = client.models.list()
    model = models.data[0].id

    chat_completion = client.chat.completions.create(
        messages=messages, model=model, tools=tools
    )

    print("-" * 70)
    print("Chat completion results:")
    print(chat_completion)
    print("-" * 70)

    # Stream tool calls
    chunks = handle_tool_calls_stream(client, messages, model, tools)
    print("-" * 70)

    # Handle arguments from streamed tool calls
    arguments = handle_tool_calls_arguments(chunks)

    if len(arguments):
        print(f"streamed tool call arguments: {arguments[-1]}\n")

    print("-" * 70)

    # Add tool call results to the conversation
    messages.append(
        {
            "role": "assistant",
            "tool_calls": chat_completion.choices[0].message.tool_calls,
        }
    )

    # Now, simulate a tool call
    available_tools = {"get_current_weather": get_current_weather}

    completion_tool_calls = chat_completion.choices[0].message.tool_calls
    for call in completion_tool_calls:
        tool_to_call = available_tools[call.function.name]
        args = json.loads(call.function.arguments)
        result = tool_to_call(**args)
        print("tool_to_call result: ", result)
        messages.append(
            {
                "role": "tool",
                "content": result,
                "tool_call_id": call.id,
                "name": call.function.name,
            }
        )

    chat_completion_2 = client.chat.completions.create(
        messages=messages, model=model, tools=tools, stream=False
    )
    print("Chat completion2 results:")
    print(chat_completion_2)
    print("-" * 70)


if __name__ == "__main__":
    main()
examples/online_serving/openai_chat_completion_client_with_tools_required.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
To run this example, you can start the vLLM server
without any specific flags:

```bash
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
    --guided-decoding-backend outlines
```

This example demonstrates how to generate chat completions
using the OpenAI Python client library.
"""

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "The city to find the weather for"
                        ", e.g. 'San Francisco'",
                    },
                    "state": {
                        "type": "string",
                        "description": (
                            "the two-letter abbreviation for the state that the "
                            "city is in, e.g. 'CA' which would mean 'California'"
                        ),
                    },
                    "unit": {
                        "type": "string",
                        "description": "The unit to fetch the temperature in",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["city", "state", "unit"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_forecast",
            "description": "Get the weather forecast for a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": (
                            "The city to get the forecast for, e.g. 'New York'"
                        ),
                    },
                    "state": {
                        "type": "string",
                        "description": (
                            "The two-letter abbreviation for the state, e.g. 'NY'"
                        ),
                    },
                    "days": {
                        "type": "integer",
                        "description": "Number of days to get the forecast for (1-7)",
                    },
                    "unit": {
                        "type": "string",
                        "description": "The unit to fetch the temperature in",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["city", "state", "days", "unit"],
            },
        },
    },
]

messages = [
    {"role": "user", "content": "Hi! How are you doing today?"},
    {"role": "assistant", "content": "I'm doing well! How can I help you?"},
    {
        "role": "user",
        "content": "Can you tell me what the current weather is in Dallas \
            and the forecast for the next 5 days, in fahrenheit?",
    },
]


def main():
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
        tools=tools,
        tool_choice="required",
        stream=True,  # Enable streaming response
    )

    for chunk in chat_completion:
        if chunk.choices and chunk.choices[0].delta.tool_calls:
            print(chunk.choices[0].delta.tool_calls)

    chat_completion = client.chat.completions.create(
        messages=messages, model=model, tools=tools, tool_choice="required"
    )

    print(chat_completion.choices[0].message.tool_calls)


if __name__ == "__main__":
    main()
examples/online_serving/openai_chat_completion_client_with_tools_xlam.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:

vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam

OR

vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
"""

import json
import time

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"


# Define tool functions
def get_weather(location: str, unit: str):
    return f"Weather in {location} is 22 degrees {unit}."


def calculate_expression(expression: str):
    try:
        result = eval(expression)
        return f"The result of {expression} is {result}"
    except Exception as e:
        return f"Could not calculate {expression}: {e}"


def translate_text(text: str, target_language: str):
    return f"Translation of '{text}' to {target_language}: [translated content]"


# Define tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and state, e.g., 'San Francisco, CA'",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location", "unit"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "calculate_expression",
            "description": "Calculate a mathematical expression",
            "parameters": {
                "type": "object",
                "properties": {
                    "expression": {
                        "type": "string",
                        "description": "Mathematical expression to evaluate, needs to be a valid python expression",
                    }
                },
                "required": ["expression"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "translate_text",
            "description": "Translate text to another language",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {"type": "string", "description": "Text to translate"},
                    "target_language": {
                        "type": "string",
                        "description": "Target language for translation",
                    },
                },
                "required": ["text", "target_language"],
            },
        },
    },
]

# Map of function names to implementations
tool_functions = {
    "get_weather": get_weather,
    "calculate_expression": calculate_expression,
    "translate_text": translate_text,
}


def process_response(response, tool_functions, original_query):
    """Process a non-streaming response with possible tool calls"""
    print("\n--- Response Output ---")

    # Check if the response has content
    if response.choices[0].message.content:
        print(f"Content: {response.choices[0].message.content}")

    # Check if the response has tool calls
    if response.choices[0].message.tool_calls:
        print("--------------------------------")
        print(f"Tool calls: {response.choices[0].message.tool_calls}")
        print("--------------------------------")

        # Collect all tool calls and results before making follow-up request
        tool_results = []
        assistant_message = {"role": "assistant"}
        if response.choices[0].message.content:
            assistant_message["content"] = response.choices[0].message.content

        assistant_tool_calls = []

        # Process each tool call
        for tool_call in response.choices[0].message.tool_calls:
            function_name = tool_call.function.name
            function_args = tool_call.function.arguments
            function_id = tool_call.id

            print(f"Function called: {function_name}")
            print(f"Arguments: {function_args}")
            print(f"Function ID: {function_id}")

            # Execute the function
            try:
                # Parse the JSON arguments
                args = json.loads(function_args)

                # Call the function with the arguments
                function_result = tool_functions[function_name](**args)
                print(f"\n--- Function Result ---\n{function_result}\n")

                # Add tool call to assistant message
                assistant_tool_calls.append(
                    {
                        "id": function_id,
                        "type": "function",
                        "function": {"name": function_name, "arguments": function_args},
                    }
                )

                # Add tool result to tool_results
                tool_results.append(
                    {
                        "role": "tool",
                        "tool_call_id": function_id,
                        "content": function_result,
                    }
                )

            except Exception as e:
                print(f"Error executing function: {e}")

        # Add tool_calls to assistant message
        assistant_message["tool_calls"] = assistant_tool_calls

        # Create a follow-up message with all function results
        follow_up_messages = [
            {"role": "user", "content": original_query},
            assistant_message,
        ]

        # Add all tool results to the messages
        follow_up_messages.extend(tool_results)

        # Get completion with all tool results in a single follow-up
        follow_up_response = client.chat.completions.create(
            model=client.models.list().data[0].id,
            messages=follow_up_messages,
            stream=False,
        )

        print("\n--- Follow-up Response ---")
        print(follow_up_response.choices[0].message.content)
        print("--- End Follow-up ---\n")

    print("--- End Response ---\n")


def run_test_case(query, test_name):
    """Run a single test case with the given query"""
    print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
    print(f"Query: '{query}'")

    start_time = time.time()

    # Create non-streaming chat completion request
    response = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=[{"role": "user", "content": query}],
        tools=tools,
        tool_choice="auto",
        stream=False,
    )

    # Process the non-streaming response, passing the original query
    process_response(response, tool_functions, query)

    end_time = time.time()
    print(f"Test completed in {end_time - start_time:.2f} seconds")


def main():
    # Initialize OpenAI client
    global client
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Run test cases
    test_cases = [
        ("I want to know the weather in San Francisco", "Weather Information"),
        ("Calculate 25 * 17 + 31", "Math Calculation"),
        ("Translate 'Hello world' to Spanish", "Text Translation"),
        (
            "What is the weather in Tokyo and New York in celsius",
            "Multiple Tool Usage",
        ),
    ]

    # Execute all test cases
    for query, test_name in test_cases:
        run_test_case(query, test_name)
        time.sleep(1)  # Small delay between tests

    print("\nAll tests completed.")


if __name__ == "__main__":
    main()
examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py  0 → 100644

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:

vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam

OR

vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam

This example demonstrates streaming tool calls with xLAM models.
"""

import json
import time

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"


# Define tool functions
def get_weather(location: str, unit: str):
    return f"Weather in {location} is 22 degrees {unit}."


def calculate_expression(expression: str):
    try:
        result = eval(expression)
        return f"The result of {expression} is {result}"
    except Exception as e:
        return f"Could not calculate {expression}: {e}"


def translate_text(text: str, target_language: str):
    return f"Translation of '{text}' to {target_language}: [translated content]"


# Define tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and state, e.g., 'San Francisco, CA'",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location", "unit"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "calculate_expression",
            "description": "Calculate a mathematical expression",
            "parameters": {
                "type": "object",
                "properties": {
                    "expression": {
                        "type": "string",
                        "description": "Mathematical expression to evaluate, needs to be a valid Python expression",
                    }
                },
                "required": ["expression"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "translate_text",
            "description": "Translate text to another language",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {"type": "string", "description": "Text to translate"},
                    "target_language": {
                        "type": "string",
                        "description": "Target language for translation",
                    },
                },
                "required": ["text", "target_language"],
            },
        },
    },
]

# Map of function names to implementations
tool_functions = {
    "get_weather": get_weather,
    "calculate_expression": calculate_expression,
    "translate_text": translate_text,
}


def process_stream(response, tool_functions, original_query):
    """Process a streaming response with possible tool calls"""
    # Track multiple tool calls
    tool_calls = {}  # Dictionary to store tool calls by ID
    current_id = None

    print("\n--- Stream Output ---")
    for chunk in response:
        # Handle tool calls in the stream
        if chunk.choices[0].delta.tool_calls:
            for tool_call_chunk in chunk.choices[0].delta.tool_calls:
                # Get the tool call ID
                if hasattr(tool_call_chunk, "id") and tool_call_chunk.id:
                    current_id = tool_call_chunk.id
                    if current_id not in tool_calls:
                        tool_calls[current_id] = {
                            "function_name": None,
                            "function_args": "",
                            "function_id": current_id,
                        }

                # Extract function information as it comes in chunks
                if (
                    hasattr(tool_call_chunk, "function")
                    and current_id
                    and current_id in tool_calls
                ):
                    if (
                        hasattr(tool_call_chunk.function, "name")
                        and tool_call_chunk.function.name
                    ):
                        tool_calls[current_id]["function_name"] = (
                            tool_call_chunk.function.name
                        )
                        print(f"Function called: {tool_call_chunk.function.name}")

                    if (
                        hasattr(tool_call_chunk.function, "arguments")
                        and tool_call_chunk.function.arguments
                    ):
                        tool_calls[current_id]["function_args"] += (
                            tool_call_chunk.function.arguments
                        )
                        print(f"Arguments chunk: {tool_call_chunk.function.arguments}")

        # Handle regular content in the stream
        elif chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")

    print("\n--- End Stream ---\n")

    # Execute each function call and build messages for follow-up
    follow_up_messages = [{"role": "user", "content": original_query}]

    for tool_id, tool_data in tool_calls.items():
        function_name = tool_data["function_name"]
        function_args = tool_data["function_args"]
        function_id = tool_data["function_id"]

        if function_name and function_args:
            try:
                # Parse the JSON arguments
                args = json.loads(function_args)

                # Call the function with the arguments
                function_result = tool_functions[function_name](**args)
                print(
                    f"\n--- Function Result ({function_name}) ---\n{function_result}\n"
                )

                # Add the assistant message with tool call
                follow_up_messages.append(
                    {
                        "role": "assistant",
                        "tool_calls": [
                            {
                                "id": function_id,
                                "type": "function",
                                "function": {
                                    "name": function_name,
                                    "arguments": function_args,
                                },
                            }
                        ],
                    }
                )

                # Add the tool message with function result
                follow_up_messages.append(
                    {
                        "role": "tool",
                        "tool_call_id": function_id,
                        "content": function_result,
                    }
                )
            except Exception as e:
                print(f"Error executing function: {e}")

    # Only send follow-up if we have results to process
    if len(follow_up_messages) > 1:
        # Create a follow-up message with all the function results
        follow_up_response = client.chat.completions.create(
            model=client.models.list().data[0].id,
            messages=follow_up_messages,
            stream=True,
        )

        print("\n--- Follow-up Response ---")
        for chunk in follow_up_response:
            if chunk.choices[0].delta.content:
                print(chunk.choices[0].delta.content, end="")
        print("\n--- End Follow-up ---\n")


def run_test_case(query, test_name):
    """Run a single test case with the given query"""
    print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
    print(f"Query: '{query}'")

    start_time = time.time()

    # Create streaming chat completion request
    response = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=[{"role": "user", "content": query}],
        tools=tools,
        tool_choice="auto",
        stream=True,
    )

    # Process the streaming response
    process_stream(response, tool_functions, query)

    end_time = time.time()
    print(f"Test completed in {end_time - start_time:.2f} seconds")


def main():
    # Initialize OpenAI client
    global client
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Run test cases
    test_cases = [
        ("I want to know the weather in San Francisco", "Weather Information"),
        ("Calculate 25 * 17 + 31", "Math Calculation"),
        ("Translate 'Hello world' to Spanish", "Text Translation"),
        (
            "What is the weather in Tokyo and New York in celsius",
            "Multiple Tool Usage",
        ),
    ]

    # Execute all test cases
    for query, test_name in test_cases:
        run_test_case(query, test_name)
        time.sleep(1)  # Small delay between tests

    print("\nAll tests completed.")


if __name__ == "__main__":
    main()
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
0 → 100644
View file @
25f39502
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
An example demonstrates how to use tool calling with reasoning models
like QwQ-32B. The reasoning_content will not be parsed by the tool
calling process; only the final output will be parsed.
To run this example, you need to start the vLLM server with both
the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B
\
--reasoning-parser deepseek_r1
\
--enable-auto-tool-choice --tool-call-parser hermes
```
"""
from
openai
import
OpenAI
# Now, simulate a tool call
def
get_current_weather
(
city
:
str
,
state
:
str
,
unit
:
"str"
):
return
(
"The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's."
)
available_tools
=
{
"get_current_weather"
:
get_current_weather
}
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
properties
=
{
"city"
:
{
"type"
:
"string"
,
"description"
:
"The city to find the weather for, e.g. 'San Francisco'"
,
},
"state"
:
{
"type"
:
"string"
,
"description"
:
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
,
},
"unit"
:
{
"type"
:
"string"
,
"description"
:
"The unit to fetch the temperature in"
,
"enum"
:
[
"celsius"
,
"fahrenheit"
],
},
}
tools
=
[
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
,
"description"
:
"Get the current weather in a given location"
,
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
properties
,
"required"
:
[
"city"
,
"state"
,
"unit"
],
},
},
}
]
messages
=
[
{
"role"
:
"user"
,
"content"
:
"Hi! How are you doing today?"
},
{
"role"
:
"assistant"
,
"content"
:
"I'm doing well! How can I help you?"
},
{
"role"
:
"user"
,
"content"
:
(
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
def
extract_reasoning_and_calls
(
chunks
:
list
):
reasoning_content
=
""
tool_call_idx
=
-
1
arguments
=
[]
function_names
=
[]
for
chunk
in
chunks
:
if
chunk
.
choices
[
0
].
delta
.
tool_calls
:
tool_call
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
]
if
tool_call
.
index
!=
tool_call_idx
:
tool_call_idx
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
].
index
arguments
.
append
(
""
)
function_names
.
append
(
""
)
if
tool_call
.
function
:
if
tool_call
.
function
.
name
:
function_names
[
tool_call_idx
]
=
tool_call
.
function
.
name
if
tool_call
.
function
.
arguments
:
arguments
[
tool_call_idx
]
+=
tool_call
.
function
.
arguments
else
:
if
hasattr
(
chunk
.
choices
[
0
].
delta
,
"reasoning_content"
):
reasoning_content
+=
chunk
.
choices
[
0
].
delta
.
reasoning_content
return
reasoning_content
,
arguments
,
function_names
def
main
():
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
print
(
"---------Full Generate With Automatic Function Calling-------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
)
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name:
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
name
}
"
)
print
(
f
"function arguments: "
f
"
{
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
.
arguments
}
"
)
print
(
"----------Stream Generate With Automatic Function Calling-----------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
stream
=
True
)
chunks
=
list
(
tool_calls_stream
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"----------Full Generate With Named Function Calling-----------------"
)
tool_calls
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}},
)
tool_call
=
tool_calls
.
choices
[
0
].
message
.
tool_calls
[
0
].
function
print
(
f
"reasoning_content:
{
tool_calls
.
choices
[
0
].
message
.
reasoning_content
}
"
)
print
(
f
"function name:
{
tool_call
.
name
}
"
)
print
(
f
"function arguments:
{
tool_call
.
arguments
}
"
)
print
(
"----------Stream Generate With Named Function Calling--------------"
)
tool_calls_stream
=
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model
,
tools
=
tools
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
}},
stream
=
True
,
)
chunks
=
list
(
tool_calls_stream
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
print
(
f
"reasoning_content:
{
reasoning_content
}
"
)
print
(
f
"function name:
{
function_names
[
0
]
}
"
)
print
(
f
"function arguments:
{
arguments
[
0
]
}
"
)
print
(
"
\n\n
"
)
if
__name__
==
"__main__"
:
main
()
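The example above only prints the parsed tool call; a natural follow-up is to execute it through `available_tools` and feed the result back so the model can produce a final answer. A minimal sketch, reusing the `messages`, `tools`, and `available_tools` objects defined in the file, assuming the same local server is running, that the model returns valid JSON arguments, and that `get_current_weather` accepts the city/state/unit keyword arguments declared in the tool schema (this is not part of the repository file):

import json

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
model = client.models.list().data[0].id

# Ask for a tool call, then run it locally and send the result back.
first = client.chat.completions.create(messages=messages, model=model, tools=tools)
call = first.choices[0].message.tool_calls[0]

# Look up and execute the local implementation registered in available_tools.
result = available_tools[call.function.name](**json.loads(call.function.arguments))

followup = messages + [
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": call.id,
                "type": "function",
                "function": {
                    "name": call.function.name,
                    "arguments": call.function.arguments,
                },
            }
        ],
    },
    {"role": "tool", "content": result, "tool_call_id": call.id},
]
final = client.chat.completions.create(messages=followup, model=model, tools=tools)
print(final.choices[0].message.content)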
examples/online_serving/openai_chat_completion_with_reasoning.py
0 → 100644
View file @
25f39502
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to generate chat completions from reasoning models
such as DeepSeek-R1.

To run this example, start the vLLM server with the reasoning parser:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --reasoning-parser deepseek_r1
```

It uses the OpenAI Python client library to request chat completions and read
both the reasoning content and the final answer.
"""

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def main():
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Round 1
    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
    # ruff: noqa: E501
    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    response = client.chat.completions.create(model=model, messages=messages)

    reasoning_content = response.choices[0].message.reasoning_content
    content = response.choices[0].message.content

    print("reasoning_content for Round 1:", reasoning_content)
    print("content for Round 1:", content)

    # Round 2
    messages.append({"role": "assistant", "content": content})
    messages.append(
        {
            "role": "user",
            "content": "How many Rs are there in the word 'strawberry'?",
        }
    )
    response = client.chat.completions.create(model=model, messages=messages)

    reasoning_content = response.choices[0].message.reasoning_content
    content = response.choices[0].message.content

    print("reasoning_content for Round 2:", reasoning_content)
    print("content for Round 2:", content)


if __name__ == "__main__":
    main()
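The same two fields are visible in the raw HTTP response, which can be handy when you are not using the OpenAI client. A minimal sketch with `requests`, assuming the server from the docstring above is running locally and serving the model named in the serve command (`reasoning_content` is a vLLM-specific extension field alongside the standard `content`):

import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "messages": [{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
    },
)
resp.raise_for_status()
message = resp.json()["choices"][0]["message"]

# The reasoning parser separates the chain of thought from the final answer.
print("reasoning_content:", message.get("reasoning_content"))
print("content:", message.get("content"))

Note that the example above appends only `content`, not `reasoning_content`, back into the conversation for Round 2.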
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
0 → 100644
View file @
25f39502
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to generate streaming chat completions from reasoning
models such as DeepSeek-R1.

To run this example, start the vLLM server with the reasoning parser:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --reasoning-parser deepseek_r1
```

Unlike openai_chat_completion_with_reasoning.py, this example demonstrates
streaming chat completions, which return output in real time as the model
generates it. This is useful when you want to display the response to the user
while it is still being generated.

Remember to check that `content` and `reasoning_content` exist on each
`ChatCompletionChunk`; either field may be absent on a given chunk, and
accessing a missing field raises an error.
"""

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]


def main():
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # ruff: noqa: E501
    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    stream = client.chat.completions.create(model=model, messages=messages, stream=True)

    print("client: Start streaming chat completions...")
    printed_reasoning_content = False
    printed_content = False

    for chunk in stream:
        reasoning_content = None
        content = None
        # Check whether the chunk carries reasoning_content or content
        if hasattr(chunk.choices[0].delta, "reasoning_content"):
            reasoning_content = chunk.choices[0].delta.reasoning_content
        elif hasattr(chunk.choices[0].delta, "content"):
            content = chunk.choices[0].delta.content

        if reasoning_content is not None:
            if not printed_reasoning_content:
                printed_reasoning_content = True
                print("reasoning_content:", end="", flush=True)
            print(reasoning_content, end="", flush=True)
        elif content is not None:
            if not printed_content:
                printed_content = True
                print("\ncontent:", end="", flush=True)
            # Extract and print the content
            print(content, end="", flush=True)


if __name__ == "__main__":
    main()
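If you need the complete reasoning and answer as strings rather than incremental console output, you can accumulate the streamed deltas instead of printing them. A small sketch reusing the module-level `messages` list above, assuming the same server is running; `getattr` guards against chunks where a field is missing or None:

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
model = client.models.list().data[0].id

reasoning_parts: list[str] = []
answer_parts: list[str] = []

for chunk in client.chat.completions.create(model=model, messages=messages, stream=True):
    delta = chunk.choices[0].delta
    # Either field may be absent or None on any given chunk.
    if getattr(delta, "reasoning_content", None):
        reasoning_parts.append(delta.reasoning_content)
    if getattr(delta, "content", None):
        answer_parts.append(delta.content)

print("full reasoning:", "".join(reasoning_parts))
print("full answer:", "".join(answer_parts))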
examples/online_serving/openai_chat_embedding_client_for_multimodal.py
0 → 100644
View file @
25f39502
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import base64
import io

import requests
from PIL import Image

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"


def vlm2vec():
    response = requests.post(
        "http://localhost:8000/v1/embeddings",
        json={
            "model": "TIGER-Lab/VLM2Vec-Full",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "text", "text": "Represent the given image."},
                    ],
                }
            ],
            "encoding_format": "float",
        },
    )
    response.raise_for_status()
    response_json = response.json()

    print("Embedding output:", response_json["data"][0]["embedding"])


def dse_qwen2_vl(inp: dict):
    # Embedding an Image
    if inp["type"] == "image":
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": inp["image_url"],
                        },
                    },
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            }
        ]
    # Embedding a Text Query
    else:
        # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
        # of the minimum input size
        buffer = io.BytesIO()
        image_placeholder = Image.new("RGB", (56, 56))
        image_placeholder.save(buffer, "png")
        buffer.seek(0)
        image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            # The placeholder is PNG-encoded, so use the matching MIME type.
                            "url": f"data:image/png;base64,{image_placeholder}",
                        },
                    },
                    {"type": "text", "text": f"Query: {inp['content']}"},
                ],
            }
        ]

    response = requests.post(
        "http://localhost:8000/v1/embeddings",
        json={
            "model": "MrLight/dse-qwen2-2b-mrl-v1",
            "messages": messages,
            "encoding_format": "float",
        },
    )
    response.raise_for_status()
    response_json = response.json()

    print("Embedding output:", response_json["data"][0]["embedding"])


def parse_args():
    parser = argparse.ArgumentParser(
        "Script to call a specified VLM through the API. Make sure to serve "
        "the model with --task embed before running this."
    )
    parser.add_argument(
        "--model",
        type=str,
        choices=["vlm2vec", "dse_qwen2_vl"],
        required=True,
        help="Which model to call.",
    )
    return parser.parse_args()


def main(args):
    if args.model == "vlm2vec":
        vlm2vec()
    elif args.model == "dse_qwen2_vl":
        dse_qwen2_vl(
            {
                "type": "image",
                "image_url": image_url,
            }
        )
        dse_qwen2_vl(
            {
                "type": "text",
                "content": "What is the weather like today?",
            }
        )


if __name__ == "__main__":
    args = parse_args()
    main(args)
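Once the embeddings come back, a typical next step is ranking images against a text query by cosine similarity. A minimal sketch under the same assumptions as the file above (a local server with `--task embed` serving MrLight/dse-qwen2-2b-mrl-v1); `image_messages` and `query_messages` are hypothetical names for message lists built exactly the way `dse_qwen2_vl()` builds them:

import math

import requests


def fetch_embedding(messages: list[dict]) -> list[float]:
    # Same endpoint and payload shape as dse_qwen2_vl() above.
    resp = requests.post(
        "http://localhost:8000/v1/embeddings",
        json={
            "model": "MrLight/dse-qwen2-2b-mrl-v1",
            "messages": messages,
            "encoding_format": "float",
        },
    )
    resp.raise_for_status()
    return resp.json()["data"][0]["embedding"]


def cosine_similarity(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)


# Hypothetical usage: compare an image embedding with a text-query embedding.
# score = cosine_similarity(fetch_embedding(image_messages), fetch_embedding(query_messages))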
examples/online_serving/openai_classification_client.py
0 → 100644
View file @
25f39502
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import pprint

import requests


def post_http_request(payload: dict, api_url: str) -> requests.Response:
    headers = {"User-Agent": "Test Client"}
    response = requests.post(api_url, headers=headers, json=payload)
    return response


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach")
    return parser.parse_args()


def main(args):
    host = args.host
    port = args.port
    model_name = args.model

    api_url = f"http://{host}:{port}/classify"
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    payload = {
        "model": model_name,
        "input": prompts,
    }

    classify_response = post_http_request(payload=payload, api_url=api_url)
    pprint.pprint(classify_response.json())


if __name__ == "__main__":
    args = parse_args()
    main(args)
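To pair each prompt with its classification result rather than dumping the whole response, you can iterate over the response's `data` list. A minimal sketch, assuming the classification server above was started with something like `vllm serve jason9693/Qwen2.5-1.5B-apeach --task classify`, and that the response returns one entry per input prompt in order (the exact fields on each entry depend on the vLLM version, so they are printed generically here):

import requests

prompts = ["Hello, my name is", "The future of AI is"]
resp = requests.post(
    "http://localhost:8000/classify",
    headers={"User-Agent": "Test Client"},
    json={"model": "jason9693/Qwen2.5-1.5B-apeach", "input": prompts},
)
resp.raise_for_status()

# One result entry per input prompt, in the same order as the request.
for prompt, entry in zip(prompts, resp.json().get("data", [])):
    print(prompt, "->", entry)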
examples/online_serving/openai_completion_client.py
0 → 100644
View file @
25f39502
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def parse_args():
    parser = argparse.ArgumentParser(description="Client for vLLM API server")
    parser.add_argument(
        "--stream", action="store_true", help="Enable streaming response"
    )
    return parser.parse_args()


def main(args):
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Completion API
    completion = client.completions.create(
        model=model,
        prompt="A robot may not injure a human being",
        echo=False,
        n=2,
        stream=args.stream,
        logprobs=3,
    )

    print("-" * 50)
    print("Completion results:")
    if args.stream:
        for c in completion:
            print(c)
    else:
        print(completion)
    print("-" * 50)


if __name__ == "__main__":
    args = parse_args()
    main(args)
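The client requests `logprobs=3` but only prints the whole response object. If you want to inspect per-token log-probabilities, a sketch along these lines works against the non-streaming response; the field names follow the legacy Completions API (`tokens`, `token_logprobs`), so treat them as an assumption if your client version differs:

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
model = client.models.list().data[0].id

completion = client.completions.create(
    model=model,
    prompt="A robot may not injure a human being",
    n=1,
    logprobs=3,
    max_tokens=16,
)

choice = completion.choices[0]
print("text:", choice.text)
# Each generated token comes with its log-probability; logprobs=3 also returns
# the top-3 alternatives per position in choice.logprobs.top_logprobs.
for token, logprob in zip(choice.logprobs.tokens, choice.logprobs.token_logprobs):
    print(f"{token!r}: {logprob:.3f}")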
examples/online_serving/openai_cross_encoder_score.py
0 → 100644
View file @
25f39502
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example online usage of the Score API.

Run `vllm serve <model> --task score` to start up the server in vLLM.
"""

import argparse
import pprint

import requests


def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    headers = {"User-Agent": "Test Client"}
    response = requests.post(api_url, headers=headers, json=prompt)
    return response


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
    return parser.parse_args()


def main(args):
    api_url = f"http://{args.host}:{args.port}/score"
    model_name = args.model

    text_1 = "What is the capital of Brazil?"
    text_2 = "The capital of Brazil is Brasilia."
    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
    score_response = post_http_request(prompt=prompt, api_url=api_url)
    print("\nPrompt when text_1 and text_2 are both strings:")
    pprint.pprint(prompt)
    print("\nScore Response:")
    pprint.pprint(score_response.json())

    text_1 = "What is the capital of France?"
    text_2 = [
        "The capital of Brazil is Brasilia.",
        "The capital of France is Paris.",
    ]
    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
    score_response = post_http_request(prompt=prompt, api_url=api_url)
    print("\nPrompt when text_1 is string and text_2 is a list:")
    pprint.pprint(prompt)
    print("\nScore Response:")
    pprint.pprint(score_response.json())

    text_1 = [
        "What is the capital of Brazil?",
        "What is the capital of France?",
    ]
    text_2 = [
        "The capital of Brazil is Brasilia.",
        "The capital of France is Paris.",
    ]
    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
    score_response = post_http_request(prompt=prompt, api_url=api_url)
    print("\nPrompt when text_1 and text_2 are both lists:")
    pprint.pprint(prompt)
    print("\nScore Response:")
    pprint.pprint(score_response.json())


if __name__ == "__main__":
    args = parse_args()
    main(args)
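A common use of the Score API is reranking candidate documents against a query. A minimal sketch against the same `/score` endpoint, assuming each entry in the response's `data` list carries a `score` field aligned with the order of `text_2` (the response fields are an assumption; inspect the pretty-printed responses above to confirm them on your vLLM version):

import requests

query = "What is the capital of France?"
documents = [
    "The capital of Brazil is Brasilia.",
    "The capital of France is Paris.",
]

resp = requests.post(
    "http://localhost:8000/score",
    json={"model": "BAAI/bge-reranker-v2-m3", "text_1": query, "text_2": documents},
)
resp.raise_for_status()

# Pair each document with its relevance score and sort from most to least relevant.
scores = [entry["score"] for entry in resp.json()["data"]]
for score, doc in sorted(zip(scores, documents), reverse=True):
    print(f"{score:.4f}  {doc}")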