Commit dcb5624a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.5' into v0.8.5-dev

parents 55880ca2 ba41cc90
......@@ -18,8 +18,8 @@ prompts = [
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
if __name__ == "__main__":
def main():
# Create an LLM.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
......@@ -42,3 +42,7 @@ if __name__ == "__main__":
# Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output.
time.sleep(10)
if __name__ == "__main__":
main()
......@@ -150,7 +150,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large",
tokenizer="Isotr0py/Florence-2-tokenizer",
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
......@@ -364,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
)
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare the Kimi-VL request: one chat-formatted prompt per question.

    Only the ``"image"`` modality is accepted. Every prompt carries a single
    image placeholder followed by the question and the assistant header.
    """
    assert modality == "image"

    # Wrap each question in the user turn, then open the assistant turn.
    prompts = []
    for question in questions:
        prompts.append(
            "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
            f"<|media_pad|><|media_end|>{question}<|im_end|>"
            "<|im_assistant|>assistant<|im_middle|>")

    return ModelRequestData(
        engine_args=EngineArgs(
            model="moonshotai/Kimi-VL-A3B-Instruct",
            trust_remote_code=True,
            max_model_len=4096,
            limit_mm_per_prompt={"image": 1},
        ),
        prompts=prompts,
    )
# LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -791,10 +814,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
max_model_len=4096,
max_model_len=5120,
max_num_seqs=2,
max_num_batched_tokens=12800,
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 16},
limit_mm_per_prompt={"image": 1},
)
......@@ -918,6 +944,42 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
)
# Qwen2.5-Omni
def run_qwen2_5_omni(questions: list[str], modality: str):
    """Build the engine arguments and prompts for Qwen2.5-Omni.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: ``"image"`` or ``"video"``; selects the placeholder token
            inserted between the vision begin/end markers.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate up front: previously an unsupported modality left
    # ``placeholder`` unbound and surfaced as an UnboundLocalError while the
    # prompts were being formatted.
    placeholders = {"image": "<|IMAGE|>", "video": "<|VIDEO|>"}
    if modality not in placeholders:
        raise ValueError(
            f"Unsupported modality for Qwen2.5-Omni: {modality!r} "
            f"(expected one of {sorted(placeholders)})")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
        limit_mm_per_prompt={"image": 1},
    )

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
                f"{question}<|im_end|>\n"
                "<|im_start|>assistant\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -966,6 +1028,7 @@ model_example_map = {
"h2ovl_chat": run_h2ovl,
"idefics3": run_idefics3,
"internvl_chat": run_internvl,
"kimi_vl": run_kimi_vl,
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
......@@ -986,6 +1049,7 @@ model_example_map = {
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl,
"qwen2_5_omni": run_qwen2_5_omni,
"skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
}
......@@ -1073,6 +1137,59 @@ def time_counter(enable: bool):
yield
def parse_args():
    """Parse the command-line options of the vision-language demo.

    The options are declared in a table (flags plus ``add_argument`` keyword
    arguments) and registered in declaration order.
    """
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')

    option_table = [
        (('--model-type', '-m'), {
            'type': str,
            'default': "llava",
            'choices': model_example_map.keys(),
            'help': 'Huggingface "model_type".',
        }),
        (('--num-prompts', ), {
            'type': int,
            'default': 4,
            'help': 'Number of prompts to run.',
        }),
        (('--modality', ), {
            'type': str,
            'default': "image",
            'choices': ['image', 'video'],
            'help': 'Modality of the input.',
        }),
        (('--num-frames', ), {
            'type': int,
            'default': 16,
            'help': 'Number of frames to extract from the video.',
        }),
        (("--seed", ), {
            'type': int,
            'default': None,
            'help': "Set the seed when initializing `vllm.LLM`.",
        }),
        (('--image-repeat-prob', ), {
            'type': float,
            'default': None,
            'help': 'Simulates the hit-ratio for multi-modal preprocessor cache'
            ' (if enabled)',
        }),
        (('--disable-mm-preprocessor-cache', ), {
            'action': 'store_true',
            'help':
            'If True, disables caching of multi-modal preprocessor/mapper.',
        }),
        (('--time-generate', ), {
            'action': 'store_true',
            'help': 'If True, then print the total generate() call time',
        }),
        (('--use-different-prompt-per-request', ), {
            'action': 'store_true',
            'help':
            'If True, then use different prompt (with the same multi-modal '
            'data) for each request.',
        }),
    ]
    for flags, options in option_table:
        parser.add_argument(*flags, **options)

    return parser.parse_args()
def main(args):
model = args.model_type
if model not in model_example_map:
......@@ -1151,55 +1268,5 @@ def main(args):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models for text generation')
parser.add_argument('--model-type',
'-m',
type=str,
default="llava",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument('--num-prompts',
type=int,
default=4,
help='Number of prompts to run.')
parser.add_argument('--modality',
type=str,
default="image",
choices=['image', 'video'],
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
default=16,
help='Number of frames to extract from the video.')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
parser.add_argument(
'--image-repeat-prob',
type=float,
default=None,
help='Simulates the hit-ratio for multi-modal preprocessor cache'
' (if enabled)')
parser.add_argument(
'--disable-mm-preprocessor-cache',
action='store_true',
help='If True, disables caching of multi-modal preprocessor/mapper.')
parser.add_argument(
'--time-generate',
action='store_true',
help='If True, then print the total generate() call time')
parser.add_argument(
'--use-different-prompt-per-request',
action='store_true',
help='If True, then use different prompt (with the same multi-modal '
'data) for each request.')
args = parser.parse_args()
args = parse_args()
main(args)
......@@ -156,16 +156,13 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
print("-" * 50)
def main(args: Namespace):
run_encode(args.model_name, args.modality, args.seed)
model_example_map = {
"e5_v": run_e5_v,
"vlm2vec": run_vlm2vec,
}
if __name__ == "__main__":
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models for multimodal embedding')
......@@ -184,6 +181,13 @@ if __name__ == "__main__":
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
return parser.parse_args()
args = parser.parse_args()
def main(args: Namespace):
    """Run the embedding demo with the model, modality and seed from ``args``."""
    model_name = args.model_name
    modality = args.modality
    seed = args.seed
    run_encode(model_name, modality, seed)
if __name__ == "__main__":
args = parse_args()
main(args)
......@@ -326,6 +326,44 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image Kimi-VL request.

    The prompt is produced by the model's own chat template from a single
    user turn that lists every image followed by the question text; the
    images themselves are fetched from ``image_urls``.
    """
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        # Allow exactly as many images per prompt as were requested.
        limit_mm_per_prompt={"image": num_images},
    )

    # One image entry per URL, then the question as the final text entry.
    content = []
    for url in image_urls:
        content.append({"type": "image", "image": url})
    content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": content}]

    chat_processor = AutoProcessor.from_pretrained(model_name,
                                                   trust_remote_code=True)
    prompt = chat_processor.apply_chat_template(messages,
                                                tokenize=False,
                                                add_generation_prompt=True)

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
......@@ -465,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
max_model_len=10000,
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 4},
)
placeholders = "".join(f"<|image_{i}|>"
......@@ -640,6 +680,7 @@ model_example_map = {
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
"kimi_vl": load_kimi_vl,
"llama4": load_llama4,
"mistral3": load_mistral3,
"mllama": load_mllama,
......@@ -727,22 +768,7 @@ def run_chat(model: str, question: str, image_urls: list[str],
print("-" * 50)
def main(args: Namespace):
model = args.model_type
method = args.method
seed = args.seed
image_urls = IMAGE_URLS[:args.num_images]
if method == "generate":
run_generate(model, QUESTION, image_urls, seed)
elif method == "chat":
run_chat(model, QUESTION, image_urls, seed)
else:
raise ValueError(f"Invalid method: {method}")
if __name__ == "__main__":
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models that support multi-image input for text '
......@@ -765,9 +791,29 @@ if __name__ == "__main__":
parser.add_argument(
"--num-images",
"-n",
choices=list(range(1, 13)), # 12 is the max number of images
type=int,
choices=list(range(1,
len(IMAGE_URLS) + 1)), # the max number of images
default=2,
help="Number of images to use for the demo.")
return parser.parse_args()
args = parser.parse_args()
def main(args: Namespace):
    """Run the multi-image demo through the inference method chosen in ``args``.

    Raises:
        ValueError: If ``args.method`` is neither ``"generate"`` nor ``"chat"``.
    """
    model = args.model_type
    method = args.method
    seed = args.seed

    # Use only the first ``num_images`` demo URLs.
    image_urls = IMAGE_URLS[:args.num_images]

    if method not in ("generate", "chat"):
        raise ValueError(f"Invalid method: {method}")

    runner = run_generate if method == "generate" else run_chat
    runner(model, QUESTION, image_urls, seed)
if __name__ == "__main__":
args = parse_args()
main(args)
......@@ -58,6 +58,16 @@ def get_response(response: requests.Response) -> list[str]:
return output
def parse_args():
    """Parse the API client's command-line options from ``sys.argv``.

    Options: ``--host``, ``--port``, ``--n``, ``--prompt`` and the boolean
    ``--stream`` flag, each with the default listed below.
    """
    option_specs = (
        ("--host", {"type": str, "default": "localhost"}),
        ("--port", {"type": int, "default": 8000}),
        ("--n", {"type": int, "default": 1}),
        ("--prompt", {"type": str, "default": "San Francisco is a"}),
        ("--stream", {"action": "store_true"}),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def main(args: Namespace):
prompt = args.prompt
api_url = f"http://{args.host}:{args.port}/generate"
......@@ -82,11 +92,5 @@ def main(args: Namespace):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--n", type=int, default=1)
parser.add_argument("--prompt", type=str, default="San Francisco is a")
parser.add_argument("--stream", action="store_true")
args = parser.parse_args()
args = parse_args()
main(args)
......@@ -2,32 +2,46 @@
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python
Note that `pip install cohere` is needed to run this example.
run: vllm serve BAAI/bge-reranker-base
"""
from typing import Union
import cohere
from cohere import Client, ClientV2
model = "BAAI/bge-reranker-base"
# cohere v1 client
co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
rerank_v1_result = co.rerank(
model="BAAI/bge-reranker-base",
query="What is the capital of France?",
documents=[
query = "What is the capital of France?"
documents = [
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
])
]
print(rerank_v1_result)
# or the v2
co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str,
                  documents: list[str]) -> dict:
    """Issue a rerank request through ``client`` and return its response."""
    request = {"model": model, "query": query, "documents": documents}
    return client.rerank(**request)
def main():
    """Rerank the sample documents with the Cohere v1 and v2 clients.

    Both clients talk to the server at http://localhost:8000 and each result
    is printed between separator lines.
    """
    separator = "-" * 50

    # cohere v1 client
    v1_client = cohere.Client(base_url="http://localhost:8000",
                              api_key="sk-fake-key")
    v1_result = cohere_rerank(v1_client, model, query, documents)
    print(separator)
    print("rerank_v1_result:\n", v1_result)
    print(separator)

    # or the v2
    v2_client = cohere.ClientV2("sk-fake-key",
                                base_url="http://localhost:8000")
    v2_result = cohere_rerank(v2_client, model, query, documents)
    print("rerank_v2_result:\n", v2_result)
    print(separator)
v2_rerank_result = co2.rerank(
model="BAAI/bge-reranker-base",
query="What is the capital of France?",
documents=[
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
])
print(v2_rerank_result)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
"""Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
Start Gradio OpenAI Chatbot Webserver:
python examples/online_serving/gradio_openai_chatbot_webserver.py \
-m meta-llama/Llama-2-7b-chat-hf
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import gradio as gr
from openai import OpenAI
# Argument parser setup
parser = argparse.ArgumentParser(
def format_history_to_openai(history):
    """Convert chat history into OpenAI chat-completion messages.

    Args:
        history: Iterable of ``(user_message, assistant_message)`` pairs.

    Returns:
        A list of message dicts that starts with the system prompt and is
        followed by the user/assistant messages in conversation order.
    """
    history_openai_format = [{
        "role": "system",
        "content": "You are a great AI assistant."
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({
            "role": "assistant",
            "content": assistant
        })
    return history_openai_format


def predict(message, history, client, model_name, temp, stop_token_ids):
    """Send the conversation to the server and return the complete reply.

    Args:
        message: The new user message.
        history: Previous ``(user, assistant)`` message pairs.
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        model_name: Name of the model to query.
        temp: Sampling temperature.
        stop_token_ids: Comma-separated stop token IDs; may be empty or None.

    Returns:
        The concatenated text of every streamed chunk.

    Raises:
        ValueError: If a non-empty stop token ID is not an integer.
    """
    # Format history to OpenAI chat format
    history_openai_format = format_history_to_openai(history)
    history_openai_format.append({"role": "user", "content": message})

    # Ignore empty pieces so input such as "1,2," or "1,,2" does not crash
    # on int("").
    pieces = (piece.strip() for piece in (stop_token_ids or "").split(","))
    parsed_stop_token_ids = [int(piece) for piece in pieces if piece]

    # Send request to OpenAI API (vLLM server)
    stream = client.chat.completions.create(
        model=model_name,
        messages=history_openai_format,
        temperature=temp,
        stream=True,
        extra_body={
            'repetition_penalty': 1,
            'stop_token_ids': parsed_stop_token_ids,
        })

    # Chunks without text carry ``None`` content; treat them as empty and
    # return the whole reply as a single string.
    return "".join(chunk.choices[0].delta.content or "" for chunk in stream)
def parse_args():
parser = argparse.ArgumentParser(
description='Chatbot Interface with Customizable Parameters')
parser.add_argument('--model-url',
parser.add_argument('--model-url',
type=str,
default='http://localhost:8000/v1',
help='Model URL')
parser.add_argument('-m',
parser.add_argument('-m',
'--model',
type=str,
required=True,
help='Model name for the chatbot')
parser.add_argument('--temp',
parser.add_argument('--temp',
type=float,
default=0.8,
help='Temperature for text generation')
parser.add_argument('--stop-token-ids',
parser.add_argument('--stop-token-ids',
type=str,
default='',
help='Comma-separated stop token IDs')
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
return parser.parse_args()
# Parse the arguments
args = parser.parse_args()
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = args.model_url
def build_gradio_interface(client, model_name, temp, stop_token_ids):
# Create an OpenAI client to interact with the API server
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
def chat_predict(message, history):
return predict(message, history, client, model_name, temp,
stop_token_ids)
return gr.ChatInterface(fn=chat_predict,
title="Chatbot Interface",
description="A simple chatbot powered by vLLM")
def predict(message, history):
# Convert chat history to OpenAI format
history_openai_format = [{
"role": "system",
"content": "You are a great ai assistant."
}]
for human, assistant in history:
history_openai_format.append({"role": "user", "content": human})
history_openai_format.append({
"role": "assistant",
"content": assistant
})
history_openai_format.append({"role": "user", "content": message})
# Create a chat completion request and send it to the API server
stream = client.chat.completions.create(
model=args.model, # Model name to use
messages=history_openai_format, # Chat history
temperature=args.temp, # Temperature for text generation
stream=True, # Stream response
extra_body={
'repetition_penalty':
1,
'stop_token_ids': [
int(id.strip()) for id in args.stop_token_ids.split(',')
if id.strip()
] if args.stop_token_ids else []
})
def main():
# Parse the arguments
args = parse_args()
# Read and return generated text from response stream
partial_message = ""
for chunk in stream:
partial_message += (chunk.choices[0].delta.content or "")
yield partial_message
# Set OpenAI's API key and API base to use vLLM's API server
openai_api_key = "EMPTY"
openai_api_base = args.model_url
# Create an OpenAI client
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
# Define the Gradio chatbot interface using the predict function
gradio_interface = build_gradio_interface(client, args.model, args.temp,
args.stop_token_ids)
# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch(server_name=args.host,
gradio_interface.queue().launch(server_name=args.host,
server_port=args.port,
share=True)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
"""Example for starting a Gradio Webserver
Start vLLM API server:
python -m vllm.entrypoints.api_server \
--model meta-llama/Llama-2-7b-chat-hf
Start Webserver:
python examples/online_serving/gradio_webserver.py
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse
import json
......@@ -39,16 +56,23 @@ def build_demo():
return demo
if __name__ == "__main__":
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
parser.add_argument("--model-url",
type=str,
default="http://localhost:8000/generate")
args = parser.parse_args()
return parser.parse_args()
def main(args):
demo = build_demo()
demo.queue().launch(server_name=args.host,
server_port=args.port,
share=True)
if __name__ == "__main__":
args = parse_args()
main(args)
......@@ -23,12 +23,19 @@ data = {
"The capital of France is Paris.", "Horses and cows are both animals"
]
}
response = requests.post(url, headers=headers, json=data)
# Check the response
if response.status_code == 200:
def main():
response = requests.post(url, headers=headers, json=data)
# Check the response
if response.status_code == 200:
print("Request successful!")
print(json.dumps(response.json(), indent=2))
else:
else:
print(f"Request failed with status code: {response.status_code}")
print(response.text)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=[{
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
}, {
"role": "user",
"content": "Who won the world series in 2020?"
}, {
"role":
"assistant",
"content":
"The Los Angeles Dodgers won the World Series in 2020."
}, {
}, {
"role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020."
}, {
"role": "user",
"content": "Where was it played?"
}],
}]
def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
)
)
print("-" * 50)
print("Chat completion results:")
print(chat_completion)
print("-" * 50)
print("Chat completion results:")
print(chat_completion)
if __name__ == "__main__":
main()
......@@ -9,7 +9,7 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
......@@ -303,12 +303,7 @@ example_function_map = {
}
def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
if __name__ == "__main__":
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using OpenAI client for online serving with '
'multimodal language models served with vLLM.')
......@@ -318,5 +313,14 @@ if __name__ == "__main__":
default="single-image",
choices=list(example_function_map.keys()),
help='Conversation type with multimodal data.')
args = parser.parse_args()
return parser.parse_args()
def main(args) -> None:
    """Run the example registered for the chat type selected in ``args``."""
    run_example = example_function_map[args.chat_type]
    run_example()
if __name__ == "__main__":
args = parse_args()
main(args)
......@@ -17,6 +17,7 @@ vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import json
from typing import Any
from openai import OpenAI
......@@ -24,15 +25,6 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [{
"type": "function",
"function": {
......@@ -78,39 +70,44 @@ messages = [{
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools)
print("Chat completion results:")
print(chat_completion)
print("\n\n")
def get_current_weather(city: str, state: str, unit: 'str'):
    """Return a canned weather report used to simulate a tool call.

    The arguments are accepted but not used; the same fixed report is
    returned for every call.
    """
    report = ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
              "partly cloudly, with highs in the 90's.")
    return report
tool_calls_stream = client.chat.completions.create(messages=messages,
def handle_tool_calls_stream(
client: OpenAI,
messages: list[dict[str, str]],
model: str,
tools: list[dict[str, Any]],
) -> list[Any]:
tool_calls_stream = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
stream=True)
chunks = []
for chunk in tool_calls_stream:
chunks = []
print("chunks: ")
for chunk in tool_calls_stream:
chunks.append(chunk)
if chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls[0])
else:
print(chunk.choices[0].delta)
return chunks
arguments = []
tool_call_idx = -1
for chunk in chunks:
def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
arguments = []
tool_call_idx = -1
print("arguments: ")
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
if tool_call_idx >= 0:
print(
f"streamed tool call arguments: {arguments[tool_call_idx]}"
)
print(f"streamed tool call arguments: "
f"{arguments[tool_call_idx]}")
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
if tool_call.id:
......@@ -118,36 +115,63 @@ for chunk in chunks:
if tool_call.function:
if tool_call.function.name:
print(f"streamed tool call name: {tool_call.function.name}")
print(
f"streamed tool call name: {tool_call.function.name}")
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
if len(arguments):
print(f"streamed tool call arguments: {arguments[-1]}")
return arguments
print("\n\n")
messages.append({
"role": "assistant",
"tool_calls": chat_completion.choices[0].message.tool_calls
})
def main():
# Initialize OpenAI client
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
# Get available models and select one
models = client.models.list()
model = models.data[0].id
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: 'str'):
return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's.")
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools)
print("-" * 70)
print("Chat completion results:")
print(chat_completion)
print("-" * 70)
# Stream tool calls
chunks = handle_tool_calls_stream(client, messages, model, tools)
print("-" * 70)
# Handle arguments from streamed tool calls
arguments = handle_tool_calls_arguments(chunks)
available_tools = {"get_current_weather": get_current_weather}
if len(arguments):
print(f"streamed tool call arguments: {arguments[-1]}\n")
completion_tool_calls = chat_completion.choices[0].message.tool_calls
for call in completion_tool_calls:
print("-" * 70)
# Add tool call results to the conversation
messages.append({
"role": "assistant",
"tool_calls": chat_completion.choices[0].message.tool_calls
})
# Now, simulate a tool call
available_tools = {"get_current_weather": get_current_weather}
completion_tool_calls = chat_completion.choices[0].message.tool_calls
for call in completion_tool_calls:
tool_to_call = available_tools[call.function.name]
args = json.loads(call.function.arguments)
result = tool_to_call(**args)
print(result)
print("tool_to_call result: ", result)
messages.append({
"role": "tool",
"content": result,
......@@ -155,9 +179,14 @@ for call in completion_tool_calls:
"name": call.function.name
})
chat_completion_2 = client.chat.completions.create(messages=messages,
chat_completion_2 = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
stream=False)
print("\n\n")
print(chat_completion_2)
print("Chat completion2 results:")
print(chat_completion_2)
print("-" * 70)
if __name__ == "__main__":
main()
......@@ -18,15 +18,6 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [
{
"type": "function",
......@@ -116,21 +107,36 @@ messages = [
},
]
chat_completion = client.chat.completions.create(
def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True # Enable streaming response
)
)
for chunk in chat_completion:
for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)
chat_completion = client.chat.completions.create(messages=messages,
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice="required")
print(chat_completion.choices[0].message.tool_calls)
print(chat_completion.choices[0].message.tool_calls)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
"""
To run this example, you need to start the vLLM server:
```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```
"""
from enum import Enum
from openai import BadRequestError, OpenAI
from pydantic import BaseModel
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="-",
)
# Guided decoding by Choice (list of possible options)
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
def guided_choice_completion(client: OpenAI, model: str):
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": "Classify this sentiment: vLLM is wonderful!"
}],
extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
)
return completion.choices[0].message.content
# Guided decoding by Regex
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
def guided_regex_completion(client: OpenAI, model: str):
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={
"guided_regex": "\w+@\w+\.com\n",
"guided_regex": r"\w+@\w+\.com\n",
"stop": ["\n"]
},
)
print(completion.choices[0].message.content)
)
return completion.choices[0].message.content
# Guided decoding by JSON using Pydantic schema
......@@ -54,66 +60,100 @@ class CarDescription(BaseModel):
car_type: CarType
json_schema = CarDescription.model_json_schema()
def guided_json_completion(client: OpenAI, model: str):
json_schema = CarDescription.model_json_schema()
prompt = ("Generate a JSON with the brand, model and car_type of"
prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print(completion.choices[0].message.content)
)
return completion.choices[0].message.content
# Guided decoding by Grammar
simplified_sql_grammar = """
?start: select_statement
def guided_grammar_completion(client: OpenAI, model: str):
simplified_sql_grammar = """
root ::= select_statement
?select_statement: "SELECT " column_list " FROM " table_name
select_statement ::= "SELECT " column " from " table " where " condition
?column_list: column_name ("," column_name)*
column ::= "col_1 " | "col_2 "
?table_name: identifier
table ::= "table_1 " | "table_2 "
?column_name: identifier
condition ::= column "= " number
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
number ::= "1 " | "2 "
"""
prompt = ("Generate an SQL query to show the 'username' and 'email'"
prompt = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)
)
return completion.choices[0].message.content
# Extra backend options
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
def extra_backend_options_completion(client: OpenAI, model: str):
    """Request a regex-guided completion with an explicit backend option.

    The request sends ``guided_regex``, ``stop`` and
    ``guided_decoding_backend`` through ``extra_body``.

    Args:
        client: OpenAI client used to send the request.
        model: Name of the model to query.

    Returns:
        The message content of the first choice, or ``None`` when the request
        is rejected with ``BadRequestError`` (the error is printed instead).
    """
    # Adjacent string literals are joined verbatim, so each fragment carries
    # its own trailing space to keep the sentences apart.
    prompt = ("Generate an email address for Alan Turing, who works in Enigma. "
              "End in .com and new line. Example result: "
              "alan.turing@enigma.com\n")

    try:
        # The no-fallback option forces vLLM to use xgrammar, so when it fails
        # you get a 400 with the reason why
        completion = client.chat.completions.create(
            model=model,
            messages=[{
                "role": "user",
                "content": prompt,
            }],
            extra_body={
                "guided_regex": r"\w+@\w+\.com\n",
                "stop": ["\n"],
                "guided_decoding_backend": "xgrammar:no-fallback"
            },
        )
    except BadRequestError as e:
        print("This error is expected:", e)
        # Make the "no result" outcome explicit for callers that print it.
        return None
    return completion.choices[0].message.content
def main():
    """Run each guided-decoding example in turn and print what it returns.

    A client pointed at ``http://localhost:8000/v1`` is created once and
    shared by all examples, together with a fixed model name.
    """
    client: OpenAI = OpenAI(base_url="http://localhost:8000/v1", api_key="-")
    model = "Qwen/Qwen2.5-3B-Instruct"

    # Heading printed before each example, paired with the function to call.
    examples = (
        ("Guided Choice Completion:", guided_choice_completion),
        ("\nGuided Regex Completion:", guided_regex_completion),
        ("\nGuided JSON Completion:", guided_json_completion),
        ("\nGuided Grammar Completion:", guided_grammar_completion),
        ("\nExtra Backend Options Completion:",
         extra_backend_options_completion),
    )
    for heading, run_example in examples:
        print(heading)
        print(run_example(client, model))
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
from openai import OpenAI
# This example demonstrates the `structural_tag` response format.
# It can be used to specify a structured output format that occurs between
# specific tags in the response. This example shows how it could be used
# to enforce the format of a tool call response, but it could be used for
# any structured output within a subset of the response.
def main():
    """Send one chat request that uses the ``structural_tag`` response format.

    The request declares a single structure that begins with
    ``<function=get_weather>``, ends with ``</function>`` and is triggered by
    ``<function=``; the response object is printed as returned.
    """
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="-")

    # Prompt describing the available function and the reply format.
    user_prompt = """
You have access to the following function to retrieve the weather in a city:
{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}
If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.
Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?
"""

    # Schema for the text that appears between the begin and end tags.
    city_arguments_schema = {
        "type": "object",
        "properties": {
            "city": {
                "type": "string"
            }
        }
    }
    weather_call_structure = {
        "begin": "<function=get_weather>",
        "schema": city_arguments_schema,
        "end": "</function>",
    }
    structural_tag_format = {
        "type": "structural_tag",
        "structures": [weather_call_structure],
        "triggers": ["<function="],
    }

    response = client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=[{"role": "user", "content": user_prompt}],
        response_format=structural_tag_format,
    )
    print(response)
if __name__ == "__main__":
main()
......@@ -25,18 +25,18 @@ from pydantic import BaseModel
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
def print_completion_details(completion):
    """Print the reasoning content and the content of the first choice."""
    first_choice = completion.choices[0]
    print("reasoning_content: ", first_choice.message.reasoning_content)
    print("content: ", first_choice.message.content)
# Guided decoding by Regex
prompt = ("What is the capital of France?")
def guided_regex_completion(client: OpenAI, model: str):
prompt = ("What is the capital of France?")
completion = client.chat.completions.create(
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
......@@ -45,9 +45,8 @@ completion = client.chat.completions.create(
extra_body={
"guided_regex": "(Paris|London)",
},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
)
print_completion_details(completion)
class People(BaseModel):
......@@ -55,19 +54,19 @@ class People(BaseModel):
age: int
json_schema = People.model_json_schema()
def guided_json_completion(client: OpenAI, model: str):
    """Ask for a JSON reply guided by the ``People`` schema and print it.

    The schema from ``People.model_json_schema()`` is sent through
    ``extra_body`` under ``guided_json``; the response is handed to
    ``print_completion_details``.
    """
    question = "Generate a JSON with the name and age of one random person."
    user_message = {"role": "user", "content": question}

    response = client.chat.completions.create(
        model=model,
        messages=[user_message],
        extra_body={"guided_json": People.model_json_schema()},
    )
    print_completion_details(response)
# Guided decoding by JSON using Pydantic schema
......@@ -84,46 +83,73 @@ class CarDescription(BaseModel):
car_type: CarType
json_schema = CarDescription.model_json_schema()
def guided_car_json_completion(client: OpenAI, model: str):
    """Ask for a JSON reply guided by the ``CarDescription`` schema.

    The schema from ``CarDescription.model_json_schema()`` is sent through
    ``extra_body`` under ``guided_json``; the response is handed to
    ``print_completion_details``.

    Args:
        client: OpenAI client used to send the request.
        model: Name of the model to query.
    """
    json_schema = CarDescription.model_json_schema()

    # Adjacent string literals are joined verbatim, so the first fragment
    # must carry the separating space ("... car_type of the most ...").
    prompt = ("Generate a JSON with the brand, model and car_type of "
              "the most iconic car from the 90's")
    completion = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": prompt,
        }],
        extra_body={"guided_json": json_schema},
    )
    print_completion_details(completion)
# Guided decoding by Grammar
simplified_sql_grammar = """
?start: select_statement
def guided_grammar_completion(client: OpenAI, model: str):
    """Ask for a reply guided by a small SQL-like grammar and print it.

    The grammar text is sent through ``extra_body`` under ``guided_grammar``;
    the response is handed to ``print_completion_details``.

    Args:
        client: OpenAI client used to send the request.
        model: Name of the model to query.
    """
    simplified_sql_grammar = """
        root ::= select_statement

        select_statement ::= "SELECT " column " from " table " where " condition

        column ::= "col_1 " | "col_2 "

        table ::= "table_1 " | "table_2 "

        condition ::= column "= " number

        number ::= "1 " | "2 "
    """

    # This may be very slow https://github.com/vllm-project/vllm/issues/12122
    # Adjacent string literals are joined verbatim, so the first fragment
    # must carry the separating space ("... 'email' from the ...").
    prompt = ("Generate an SQL query to show the 'username' and 'email' "
              "from the 'users' table.")
    completion = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": prompt,
        }],
        extra_body={"guided_grammar": simplified_sql_grammar},
    )
    print_completion_details(completion)
def main():
    """Run each structured-output example against the first served model.

    The client is built from the module-level API key and base URL, and the
    model id is taken from the first entry of the server's model list.
    """
    client: OpenAI = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
    model: str = client.models.list().data[0].id

    # Heading printed before each example, paired with the function to call.
    examples = (
        ("Guided Regex Completion:", guided_regex_completion),
        ("\nGuided JSON Completion (People):", guided_json_completion),
        ("\nGuided JSON Completion (CarDescription):",
         guided_car_json_completion),
        ("\nGuided Grammar Completion:", guided_grammar_completion),
    )
    for heading, run_example in examples:
        print(heading)
        run_example(client, model)
if __name__ == "__main__":
main()
......@@ -31,14 +31,6 @@ available_tools = {"get_current_weather": get_current_weather}
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [{
"type": "function",
"function": {
......@@ -109,34 +101,47 @@ def extract_reasoning_and_calls(chunks: list):
return reasoning_content, arguments, function_names
print("---------Full Generate With Automatic Function Calling-------------")
tool_calls = client.chat.completions.create(messages=messages,
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
print(
"---------Full Generate With Automatic Function Calling-------------")
tool_calls = client.chat.completions.create(messages=messages,
model=model,
tools=tools)
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: "
print(
f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}"
)
print(f"function name: "
f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
print(f"function arguments: "
print(f"function arguments: "
f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
print("----------Stream Generate With Automatic Function Calling-----------")
tool_calls_stream = client.chat.completions.create(messages=messages,
print(
"----------Stream Generate With Automatic Function Calling-----------")
tool_calls_stream = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
stream=True)
chunks = []
for chunk in tool_calls_stream:
chunks.append(chunk)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
chunks = list(tool_calls_stream)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("----------Full Generate With Named Function Calling-----------------")
tool_calls = client.chat.completions.create(messages=messages,
print(
"----------Full Generate With Named Function Calling-----------------")
tool_calls = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice={
......@@ -147,13 +152,16 @@ tool_calls = client.chat.completions.create(messages=messages,
}
})
tool_call = tool_calls.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: {tool_call.name}")
print(f"function arguments: {tool_call.arguments}")
print("----------Stream Generate With Named Function Calling--------------")
tool_call = tool_calls.choices[0].message.tool_calls[0].function
print(
f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}"
)
print(f"function name: {tool_call.name}")
print(f"function arguments: {tool_call.arguments}")
print(
"----------Stream Generate With Named Function Calling--------------")
tool_calls_stream = client.chat.completions.create(
tool_calls_stream = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
......@@ -165,13 +173,15 @@ tool_calls_stream = client.chat.completions.create(
},
stream=True)
chunks = []
for chunk in tool_calls_stream:
chunks.append(chunk)
chunks = list(tool_calls_stream)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")
if __name__ == "__main__":
main()
......@@ -3,8 +3,8 @@
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.
To run this example, you need to start the vLLM server with the reasoning
parser:
To run this example, you need to start the vLLM server
with the reasoning parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
......@@ -21,35 +21,44 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
)
models = client.models.list()
model = models.data[0].id
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
models = client.models.list()
model = models.data[0].id
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 1:", reasoning_content)
print("content for Round 1:", content)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
# Round 2
messages.append({"role": "assistant", "content": content})
messages.append({
"role":
"user",
"content":
"How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 1:", reasoning_content)
print("content for Round 1:", content)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
# Round 2
messages.append({"role": "assistant", "content": content})
messages.append({
"role": "user",
"content": "How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 2:", reasoning_content)
print("content for Round 2:", content)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
print("reasoning_content for Round 2:", reasoning_content)
print("content for Round 2:", content)
if __name__ == "__main__":
main()
......@@ -29,25 +29,29 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
)
models = client.models.list()
model = models.data[0].id
models = client.models.list()
model = models.data[0].id
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model,
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)
print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
for chunk in stream:
for chunk in stream:
reasoning_content = None
content = None
# Check the content is reasoning_content or content
......@@ -67,3 +71,7 @@ for chunk in stream:
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
if __name__ == "__main__":
main()
......@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
print("Embedding output:", response_json["data"][0]["embedding"])
if __name__ == '__main__':
def parse_args():
parser = argparse.ArgumentParser(
"Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this.")
......@@ -107,8 +107,10 @@ if __name__ == '__main__':
choices=["vlm2vec", "dse_qwen2_vl"],
required=True,
help="Which model to call.")
args = parser.parse_args()
return parser.parse_args()
def main(args):
if args.model == "vlm2vec":
vlm2vec()
elif args.model == "dse_qwen2_vl":
......@@ -120,3 +122,8 @@ if __name__ == '__main__':
"type": "text",
"content": "What is the weather like today?",
})
if __name__ == '__main__':
args = parse_args()
main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment