Commit 081057de authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.5' into v0.8.5-ori

parents 7cf5d5c4 ba41cc90
...@@ -150,7 +150,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -150,7 +150,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model="microsoft/Florence-2-large", model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large", tokenizer="Isotr0py/Florence-2-tokenizer",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
...@@ -364,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -364,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
) )
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for the Kimi-VL example.

    Only the "image" modality is accepted. Every question is wrapped in the
    chat markup below, with one image placeholder ahead of the question.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    # Fixed markup that surrounds each question in the final prompt.
    user_prefix = (
        "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
        "<|media_pad|><|media_end|>")
    assistant_suffix = "<|im_end|><|im_assistant|>assistant<|im_middle|>"

    prompts = []
    for question in questions:
        prompts.append(f"{user_prefix}{question}{assistant_suffix}")

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
# LLaVA-1.5 # LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData: def run_llava(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -791,10 +814,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: ...@@ -791,10 +814,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=5120,
max_num_seqs=2, max_num_seqs=2,
max_num_batched_tokens=12800,
enable_lora=True, enable_lora=True,
max_lora_rank=320, max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 16},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},
) )
...@@ -918,6 +944,42 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -918,6 +944,42 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
) )
# Qwen2.5-Omni
def run_qwen2_5_omni(questions: list[str],
                     modality: str) -> ModelRequestData:
    """Prepare engine arguments and prompts for the Qwen2.5-Omni example.

    Args:
        questions: Questions to ask; one prompt is built per question.
        modality: Either "image" or "video"; selects the placeholder token
            inserted between the vision markers of each prompt.

    Returns:
        A ModelRequestData holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    placeholders = {"image": "<|IMAGE|>", "video": "<|VIDEO|>"}
    if modality not in placeholders:
        # Fail with a clear message instead of an unbound-variable error
        # later, when the prompts are formatted.
        raise ValueError(
            f"Unsupported modality for Qwen2.5-Omni: {modality!r}")
    placeholder = placeholders[modality]

    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
        # NOTE(review): the limit names only "image" even when modality is
        # "video" - confirm whether it should follow `modality`.
        limit_mm_per_prompt={"image": 1},
    )

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
                f"{question}<|im_end|>\n"
                "<|im_start|>assistant\n") for question in questions]
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# SkyworkR1V # SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -966,6 +1028,7 @@ model_example_map = { ...@@ -966,6 +1028,7 @@ model_example_map = {
"h2ovl_chat": run_h2ovl, "h2ovl_chat": run_h2ovl,
"idefics3": run_idefics3, "idefics3": run_idefics3,
"internvl_chat": run_internvl, "internvl_chat": run_internvl,
"kimi_vl": run_kimi_vl,
"llava": run_llava, "llava": run_llava,
"llava-next": run_llava_next, "llava-next": run_llava_next,
"llava-next-video": run_llava_next_video, "llava-next-video": run_llava_next_video,
...@@ -986,6 +1049,7 @@ model_example_map = { ...@@ -986,6 +1049,7 @@ model_example_map = {
"qwen_vl": run_qwen_vl, "qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl, "qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl, "qwen2_5_vl": run_qwen2_5_vl,
"qwen2_5_omni": run_qwen2_5_omni,
"skywork_chat": run_skyworkr1v, "skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm, "smolvlm": run_smolvlm,
} }
...@@ -1073,6 +1137,59 @@ def time_counter(enable: bool): ...@@ -1073,6 +1137,59 @@ def time_counter(enable: bool):
yield yield
def parse_args():
    """Build the command-line parser for this demo and parse the arguments.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')

    # Options that carry a value, registered in the order listed here.
    value_options = [
        (('--model-type', '-m'),
         dict(type=str,
              default="llava",
              choices=model_example_map.keys(),
              help='Huggingface "model_type".')),
        (('--num-prompts', ),
         dict(type=int, default=4, help='Number of prompts to run.')),
        (('--modality', ),
         dict(type=str,
              default="image",
              choices=['image', 'video'],
              help='Modality of the input.')),
        (('--num-frames', ),
         dict(type=int,
              default=16,
              help='Number of frames to extract from the video.')),
        (("--seed", ),
         dict(type=int,
              default=None,
              help="Set the seed when initializing `vllm.LLM`.")),
        (('--image-repeat-prob', ),
         dict(type=float,
              default=None,
              help='Simulates the hit-ratio for multi-modal preprocessor cache'
              ' (if enabled)')),
    ]
    for flags, options in value_options:
        parser.add_argument(*flags, **options)

    # Boolean switches: each stays False unless given on the command line.
    switches = [
        ('--disable-mm-preprocessor-cache',
         'If True, disables caching of multi-modal preprocessor/mapper.'),
        ('--time-generate',
         'If True, then print the total generate() call time'),
        ('--use-different-prompt-per-request',
         'If True, then use different prompt (with the same multi-modal '
         'data) for each request.'),
    ]
    for flag, help_text in switches:
        parser.add_argument(flag, action='store_true', help=help_text)

    return parser.parse_args()
def main(args): def main(args):
model = args.model_type model = args.model_type
if model not in model_example_map: if model not in model_example_map:
...@@ -1151,55 +1268,5 @@ def main(args): ...@@ -1151,55 +1268,5 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( args = parse_args()
description='Demo on using vLLM for offline inference with '
'vision language models for text generation')
parser.add_argument('--model-type',
'-m',
type=str,
default="llava",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument('--num-prompts',
type=int,
default=4,
help='Number of prompts to run.')
parser.add_argument('--modality',
type=str,
default="image",
choices=['image', 'video'],
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
default=16,
help='Number of frames to extract from the video.')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
parser.add_argument(
'--image-repeat-prob',
type=float,
default=None,
help='Simulates the hit-ratio for multi-modal preprocessor cache'
' (if enabled)')
parser.add_argument(
'--disable-mm-preprocessor-cache',
action='store_true',
help='If True, disables caching of multi-modal preprocessor/mapper.')
parser.add_argument(
'--time-generate',
action='store_true',
help='If True, then print the total generate() call time')
parser.add_argument(
'--use-different-prompt-per-request',
action='store_true',
help='If True, then use different prompt (with the same multi-modal '
'data) for each request.')
args = parser.parse_args()
main(args) main(args)
...@@ -156,16 +156,13 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): ...@@ -156,16 +156,13 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
print("-" * 50) print("-" * 50)
def main(args: Namespace):
run_encode(args.model_name, args.modality, args.seed)
model_example_map = { model_example_map = {
"e5_v": run_e5_v, "e5_v": run_e5_v,
"vlm2vec": run_vlm2vec, "vlm2vec": run_vlm2vec,
} }
if __name__ == "__main__":
def parse_args():
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with ' description='Demo on using vLLM for offline inference with '
'vision language models for multimodal embedding') 'vision language models for multimodal embedding')
...@@ -184,6 +181,13 @@ if __name__ == "__main__": ...@@ -184,6 +181,13 @@ if __name__ == "__main__":
type=int, type=int,
default=None, default=None,
help="Set the seed when initializing `vllm.LLM`.") help="Set the seed when initializing `vllm.LLM`.")
return parser.parse_args()
args = parser.parse_args()
def main(args: Namespace):
run_encode(args.model_name, args.modality, args.seed)
if __name__ == "__main__":
args = parse_args()
main(args) main(args)
...@@ -326,6 +326,44 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -326,6 +326,44 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for the Kimi-VL example.

    The prompt is rendered with the model processor's chat template from a
    single user message: one image entry per URL, followed by the question.
    """
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": num_images},
    )

    # User message content: every image first, then the text question.
    content = []
    for url in image_urls:
        content.append({"type": "image", "image": url})
    content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": content}]

    processor = AutoProcessor.from_pretrained(model_name,
                                              trust_remote_code=True)
    prompt = processor.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData: def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
...@@ -465,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -465,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=10000, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True, enable_lora=True,
max_lora_rank=320, max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 4},
) )
placeholders = "".join(f"<|image_{i}|>" placeholders = "".join(f"<|image_{i}|>"
...@@ -640,6 +680,7 @@ model_example_map = { ...@@ -640,6 +680,7 @@ model_example_map = {
"h2ovl_chat": load_h2ovl, "h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3, "idefics3": load_idefics3,
"internvl_chat": load_internvl, "internvl_chat": load_internvl,
"kimi_vl": load_kimi_vl,
"llama4": load_llama4, "llama4": load_llama4,
"mistral3": load_mistral3, "mistral3": load_mistral3,
"mllama": load_mllama, "mllama": load_mllama,
...@@ -727,22 +768,7 @@ def run_chat(model: str, question: str, image_urls: list[str], ...@@ -727,22 +768,7 @@ def run_chat(model: str, question: str, image_urls: list[str],
print("-" * 50) print("-" * 50)
def main(args: Namespace): def parse_args():
model = args.model_type
method = args.method
seed = args.seed
image_urls = IMAGE_URLS[:args.num_images]
if method == "generate":
run_generate(model, QUESTION, image_urls, seed)
elif method == "chat":
run_chat(model, QUESTION, image_urls, seed)
else:
raise ValueError(f"Invalid method: {method}")
if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with ' description='Demo on using vLLM for offline inference with '
'vision language models that support multi-image input for text ' 'vision language models that support multi-image input for text '
...@@ -765,9 +791,29 @@ if __name__ == "__main__": ...@@ -765,9 +791,29 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--num-images", "--num-images",
"-n", "-n",
choices=list(range(1, 13)), # 12 is the max number of images type=int,
choices=list(range(1,
len(IMAGE_URLS) + 1)), # the max number of images
default=2, default=2,
help="Number of images to use for the demo.") help="Number of images to use for the demo.")
return parser.parse_args()
args = parser.parse_args()
def main(args: Namespace):
    """Run the multi-image demo using the method named in ``args.method``.

    Raises:
        ValueError: If ``args.method`` is neither "generate" nor "chat".
    """
    model, method, seed = args.model_type, args.method, args.seed

    # Use only the first `num_images` of the predefined image URLs.
    image_urls = IMAGE_URLS[:args.num_images]

    if method == "generate":
        runner = run_generate
    elif method == "chat":
        runner = run_chat
    else:
        raise ValueError(f"Invalid method: {method}")

    runner(model, QUESTION, image_urls, seed)
if __name__ == "__main__":
args = parse_args()
main(args) main(args)
...@@ -58,6 +58,16 @@ def get_response(response: requests.Response) -> list[str]: ...@@ -58,6 +58,16 @@ def get_response(response: requests.Response) -> list[str]:
return output return output
def parse_args(argv=None):
    """Parse the command-line options of the API client example.

    Args:
        argv: Optional list of argument strings to parse. When left as
            None, argparse reads the process command line, which is what a
            plain ``parse_args()`` call did before this parameter existed.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port``, ``n``,
        ``prompt`` and ``stream``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--n", type=int, default=1)
    parser.add_argument("--prompt", type=str, default="San Francisco is a")
    parser.add_argument("--stream", action="store_true")
    return parser.parse_args(argv)
def main(args: Namespace): def main(args: Namespace):
prompt = args.prompt prompt = args.prompt
api_url = f"http://{args.host}:{args.port}/generate" api_url = f"http://{args.host}:{args.port}/generate"
...@@ -82,11 +92,5 @@ def main(args: Namespace): ...@@ -82,11 +92,5 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() args = parse_args()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--n", type=int, default=1)
parser.add_argument("--prompt", type=str, default="San Francisco is a")
parser.add_argument("--stream", action="store_true")
args = parser.parse_args()
main(args) main(args)
...@@ -2,32 +2,46 @@ ...@@ -2,32 +2,46 @@
""" """
Example of using the OpenAI entrypoint's rerank API which is compatible with Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python the Cohere SDK: https://github.com/cohere-ai/cohere-python
Note that `pip install cohere` is needed to run this example.
run: vllm serve BAAI/bge-reranker-base run: vllm serve BAAI/bge-reranker-base
""" """
from typing import Union
import cohere import cohere
from cohere import Client, ClientV2
model = "BAAI/bge-reranker-base"
query = "What is the capital of France?"
documents = [
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
]
def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str,
                  documents: list[str]) -> dict:
    """Send one rerank request through ``client`` and return its result."""
    request = {"model": model, "query": query, "documents": documents}
    return client.rerank(**request)
def main():
    """Rerank the sample documents with a v1 and then a v2 cohere client."""
    base_url = "http://localhost:8000"
    api_key = "sk-fake-key"
    divider = "-" * 50

    # cohere v1 client
    v1_client = cohere.Client(base_url=base_url, api_key=api_key)
    v1_result = cohere_rerank(v1_client, model, query, documents)
    print(divider)
    print("rerank_v1_result:\n", v1_result)
    print(divider)

    # or the v2 client, which takes the API key as its first positional
    # argument
    v2_client = cohere.ClientV2(api_key, base_url=base_url)
    v2_result = cohere_rerank(v2_client, model, query, documents)
    print("rerank_v2_result:\n", v2_result)
    print(divider)
# cohere v1 client if __name__ == "__main__":
co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key") main()
rerank_v1_result = co.rerank(
model="BAAI/bge-reranker-base",
query="What is the capital of France?",
documents=[
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
])
print(rerank_v1_result)
# or the v2
co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
v2_rerank_result = co2.rerank(
model="BAAI/bge-reranker-base",
query="What is the capital of France?",
documents=[
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
])
print(v2_rerank_result)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Example for starting a Gradio OpenAI Chatbot Webserver
Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
Start Gradio OpenAI Chatbot Webserver:
python examples/online_serving/gradio_openai_chatbot_webserver.py \
-m meta-llama/Llama-2-7b-chat-hf
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse import argparse
import gradio as gr import gradio as gr
from openai import OpenAI from openai import OpenAI
# Argument parser setup
parser = argparse.ArgumentParser( def format_history_to_openai(history):
description='Chatbot Interface with Customizable Parameters')
parser.add_argument('--model-url',
type=str,
default='http://localhost:8000/v1',
help='Model URL')
parser.add_argument('-m',
'--model',
type=str,
required=True,
help='Model name for the chatbot')
parser.add_argument('--temp',
type=float,
default=0.8,
help='Temperature for text generation')
parser.add_argument('--stop-token-ids',
type=str,
default='',
help='Comma-separated stop token IDs')
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
# Parse the arguments
args = parser.parse_args()
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = args.model_url
# Create an OpenAI client to interact with the API server
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
def predict(message, history):
# Convert chat history to OpenAI format
history_openai_format = [{ history_openai_format = [{
"role": "system", "role": "system",
"content": "You are a great ai assistant." "content": "You are a great AI assistant."
}] }]
for human, assistant in history: for human, assistant in history:
history_openai_format.append({"role": "user", "content": human}) history_openai_format.append({"role": "user", "content": human})
...@@ -54,31 +34,92 @@ def predict(message, history): ...@@ -54,31 +34,92 @@ def predict(message, history):
"role": "assistant", "role": "assistant",
"content": assistant "content": assistant
}) })
return history_openai_format
def predict(message, history, client, model_name, temp, stop_token_ids):
# Format history to OpenAI chat format
history_openai_format = format_history_to_openai(history)
history_openai_format.append({"role": "user", "content": message}) history_openai_format.append({"role": "user", "content": message})
# Create a chat completion request and send it to the API server # Send request to OpenAI API (vLLM server)
stream = client.chat.completions.create( stream = client.chat.completions.create(
model=args.model, # Model name to use model=model_name,
messages=history_openai_format, # Chat history messages=history_openai_format,
temperature=args.temp, # Temperature for text generation temperature=temp,
stream=True, # Stream response stream=True,
extra_body={ extra_body={
'repetition_penalty': 'repetition_penalty':
1, 1,
'stop_token_ids': [ 'stop_token_ids':
int(id.strip()) for id in args.stop_token_ids.split(',') [int(id.strip())
if id.strip() for id in stop_token_ids.split(',')] if stop_token_ids else []
] if args.stop_token_ids else []
}) })
# Read and return generated text from response stream # Collect all chunks and concatenate them into a full message
partial_message = "" full_message = ""
for chunk in stream: for chunk in stream:
partial_message += (chunk.choices[0].delta.content or "") full_message += (chunk.choices[0].delta.content or "")
yield partial_message
# Return the full message as a single response
return full_message
def parse_args(argv=None):
    """Parse the command-line options of the Gradio chatbot example.

    Args:
        argv: Optional list of argument strings to parse. When left as
            None, argparse reads the process command line, which is what a
            plain ``parse_args()`` call did before this parameter existed.

    Returns:
        An ``argparse.Namespace`` with ``model_url``, ``model``, ``temp``,
        ``stop_token_ids``, ``host`` and ``port``.
    """
    parser = argparse.ArgumentParser(
        description='Chatbot Interface with Customizable Parameters')
    parser.add_argument('--model-url',
                        type=str,
                        default='http://localhost:8000/v1',
                        help='Model URL')
    parser.add_argument('-m',
                        '--model',
                        type=str,
                        required=True,
                        help='Model name for the chatbot')
    parser.add_argument('--temp',
                        type=float,
                        default=0.8,
                        help='Temperature for text generation')
    parser.add_argument('--stop-token-ids',
                        type=str,
                        default='',
                        help='Comma-separated stop token IDs')
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
    return parser.parse_args(argv)
def build_gradio_interface(client, model_name, temp, stop_token_ids):
    """Create the Gradio chat interface bound to the given client settings.

    Gradio calls the chat function with ``(message, history)`` only, so the
    remaining arguments of ``predict`` are captured in a closure.
    """

    def chat_predict(message, history):
        reply = predict(message,
                        history,
                        client=client,
                        model_name=model_name,
                        temp=temp,
                        stop_token_ids=stop_token_ids)
        return reply

    interface = gr.ChatInterface(
        fn=chat_predict,
        title="Chatbot Interface",
        description="A simple chatbot powered by vLLM",
    )
    return interface
def main():
    """Parse the options, build the chat interface and launch it."""
    args = parse_args()

    # The client is pointed at the URL given by --model-url, with a
    # placeholder API key.
    client = OpenAI(api_key="EMPTY", base_url=args.model_url)

    # Chat interface whose replies come from the predict function.
    interface = build_gradio_interface(client, args.model, args.temp,
                                       args.stop_token_ids)

    queued = interface.queue()
    queued.launch(server_name=args.host, server_port=args.port, share=True)
# Create and launch a chat interface with Gradio if __name__ == "__main__":
gr.ChatInterface(predict).queue().launch(server_name=args.host, main()
server_port=args.port,
share=True)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Example for starting a Gradio Webserver
Start vLLM API server:
python -m vllm.entrypoints.api_server \
--model meta-llama/Llama-2-7b-chat-hf
Start Webserver:
python examples/online_serving/gradio_webserver.py
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
If your antivirus software blocks the download of frpc for gradio,
you can install it manually by following these steps:
1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.3
3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
"""
import argparse import argparse
import json import json
...@@ -39,16 +56,23 @@ def build_demo(): ...@@ -39,16 +56,23 @@ def build_demo():
return demo return demo
if __name__ == "__main__": def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None) parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001) parser.add_argument("--port", type=int, default=8001)
parser.add_argument("--model-url", parser.add_argument("--model-url",
type=str, type=str,
default="http://localhost:8000/generate") default="http://localhost:8000/generate")
args = parser.parse_args() return parser.parse_args()
def main(args):
demo = build_demo() demo = build_demo()
demo.queue().launch(server_name=args.host, demo.queue().launch(server_name=args.host,
server_port=args.port, server_port=args.port,
share=True) share=True)
if __name__ == "__main__":
args = parse_args()
main(args)
...@@ -23,12 +23,19 @@ data = { ...@@ -23,12 +23,19 @@ data = {
"The capital of France is Paris.", "Horses and cows are both animals" "The capital of France is Paris.", "Horses and cows are both animals"
] ]
} }
response = requests.post(url, headers=headers, json=data)
# Check the response def main():
if response.status_code == 200: response = requests.post(url, headers=headers, json=data)
print("Request successful!")
print(json.dumps(response.json(), indent=2)) # Check the response
else: if response.status_code == 200:
print(f"Request failed with status code: {response.status_code}") print("Request successful!")
print(response.text) print(json.dumps(response.json(), indent=2))
else:
print(f"Request failed with status code: {response.status_code}")
print(response.text)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
"""
from openai import OpenAI from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server. # Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI( messages = [{
# defaults to os.environ.get("OPENAI_API_KEY") "role": "system",
api_key=openai_api_key, "content": "You are a helpful assistant."
base_url=openai_api_base, }, {
) "role": "user",
"content": "Who won the world series in 2020?"
models = client.models.list() }, {
model = models.data[0].id "role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020."
chat_completion = client.chat.completions.create( }, {
messages=[{ "role": "user",
"role": "system", "content": "Where was it played?"
"content": "You are a helpful assistant." }]
}, {
"role": "user",
"content": "Who won the world series in 2020?" def main():
}, { client = OpenAI(
"role": # defaults to os.environ.get("OPENAI_API_KEY")
"assistant", api_key=openai_api_key,
"content": base_url=openai_api_base,
"The Los Angeles Dodgers won the World Series in 2020." )
}, {
"role": "user", models = client.models.list()
"content": "Where was it played?" model = models.data[0].id
}],
model=model, chat_completion = client.chat.completions.create(
) messages=messages,
model=model,
print("Chat completion results:") )
print(chat_completion)
print("-" * 50)
print("Chat completion results:")
print(chat_completion)
print("-" * 50)
if __name__ == "__main__":
main()
...@@ -9,7 +9,7 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja ...@@ -9,7 +9,7 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct) (multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox) (audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096 vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
...@@ -303,12 +303,7 @@ example_function_map = { ...@@ -303,12 +303,7 @@ example_function_map = {
} }
def main(args) -> None: def parse_args():
chat_type = args.chat_type
example_function_map[chat_type]()
if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using OpenAI client for online serving with ' description='Demo on using OpenAI client for online serving with '
'multimodal language models served with vLLM.') 'multimodal language models served with vLLM.')
...@@ -318,5 +313,14 @@ if __name__ == "__main__": ...@@ -318,5 +313,14 @@ if __name__ == "__main__":
default="single-image", default="single-image",
choices=list(example_function_map.keys()), choices=list(example_function_map.keys()),
help='Conversation type with multimodal data.') help='Conversation type with multimodal data.')
args = parser.parse_args() return parser.parse_args()
def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
if __name__ == "__main__":
args = parse_args()
main(args) main(args)
...@@ -17,6 +17,7 @@ vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \ ...@@ -17,6 +17,7 @@ vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \
--enable-auto-tool-choice --tool-call-parser hermes --enable-auto-tool-choice --tool-call-parser hermes
""" """
import json import json
from typing import Any
from openai import OpenAI from openai import OpenAI
...@@ -24,15 +25,6 @@ from openai import OpenAI ...@@ -24,15 +25,6 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [{ tools = [{
"type": "function", "type": "function",
"function": { "function": {
...@@ -78,86 +70,123 @@ messages = [{ ...@@ -78,86 +70,123 @@ messages = [{
"Can you tell me what the temperate will be in Dallas, in fahrenheit?" "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}] }]
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools)
print("Chat completion results:")
print(chat_completion)
print("\n\n")
tool_calls_stream = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
stream=True)
chunks = []
for chunk in tool_calls_stream:
chunks.append(chunk)
if chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls[0])
else:
print(chunk.choices[0].delta)
arguments = []
tool_call_idx = -1
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
if tool_call_idx >= 0:
print(
f"streamed tool call arguments: {arguments[tool_call_idx]}"
)
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
if tool_call.id:
print(f"streamed tool call id: {tool_call.id} ")
if tool_call.function:
if tool_call.function.name:
print(f"streamed tool call name: {tool_call.function.name}")
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
if len(arguments):
print(f"streamed tool call arguments: {arguments[-1]}")
print("\n\n")
messages.append({
"role": "assistant",
"tool_calls": chat_completion.choices[0].message.tool_calls
})
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: 'str'): def get_current_weather(city: str, state: str, unit: 'str'):
return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is " return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's.") "partly cloudly, with highs in the 90's.")
available_tools = {"get_current_weather": get_current_weather} def handle_tool_calls_stream(
client: OpenAI,
completion_tool_calls = chat_completion.choices[0].message.tool_calls messages: list[dict[str, str]],
for call in completion_tool_calls: model: str,
tool_to_call = available_tools[call.function.name] tools: list[dict[str, Any]],
args = json.loads(call.function.arguments) ) -> list[Any]:
result = tool_to_call(**args) tool_calls_stream = client.chat.completions.create(messages=messages,
print(result) model=model,
tools=tools,
stream=True)
chunks = []
print("chunks: ")
for chunk in tool_calls_stream:
chunks.append(chunk)
if chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls[0])
else:
print(chunk.choices[0].delta)
return chunks
def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
    """Reassemble the argument string of every streamed tool call.

    Walks the chunks in order and inspects the first tool-call delta of each
    one. A change in the delta's ``index`` marks the start of a new tool
    call; argument fragments are concatenated onto the call currently being
    built. The call id, the function name and each finished argument string
    are printed as they are encountered.

    Args:
        chunks: Streamed chat-completion chunks, in arrival order.

    Returns:
        One concatenated argument string per tool call, in the order the
        calls first appeared. The final call's arguments are returned but
        not printed by this function.
    """
    arguments: list[str] = []
    # ``None`` means "no tool call seen yet", so any first index (not just 0)
    # opens a new accumulator.
    current_index = None
    print("arguments: ")
    for chunk in chunks:
        # A chunk may arrive with an empty ``choices`` list; skip it rather
        # than failing on ``choices[0]``.
        if not chunk.choices:
            continue
        tool_calls = chunk.choices[0].delta.tool_calls
        if not tool_calls:
            continue

        tool_call = tool_calls[0]
        if tool_call.index != current_index:
            # The previous call is complete once the index changes.
            if arguments:
                print(f"streamed tool call arguments: "
                      f"{arguments[-1]}")
            current_index = tool_call.index
            arguments.append("")

        if tool_call.id:
            print(f"streamed tool call id: {tool_call.id} ")

        function = tool_call.function
        if function:
            if function.name:
                print(f"streamed tool call name: {function.name}")
            if function.arguments:
                # The call being built is always the last accumulator, so
                # this does not depend on the server's index values being
                # zero-based or contiguous.
                arguments[-1] += function.arguments
    return arguments
def main():
# Initialize OpenAI client
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
# Get available models and select one
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools)
print("-" * 70)
print("Chat completion results:")
print(chat_completion)
print("-" * 70)
# Stream tool calls
chunks = handle_tool_calls_stream(client, messages, model, tools)
print("-" * 70)
# Handle arguments from streamed tool calls
arguments = handle_tool_calls_arguments(chunks)
if len(arguments):
print(f"streamed tool call arguments: {arguments[-1]}\n")
print("-" * 70)
# Add tool call results to the conversation
messages.append({ messages.append({
"role": "tool", "role": "assistant",
"content": result, "tool_calls": chat_completion.choices[0].message.tool_calls
"tool_call_id": call.id,
"name": call.function.name
}) })
chat_completion_2 = client.chat.completions.create(messages=messages, # Now, simulate a tool call
model=model, available_tools = {"get_current_weather": get_current_weather}
tools=tools,
stream=False) completion_tool_calls = chat_completion.choices[0].message.tool_calls
print("\n\n") for call in completion_tool_calls:
print(chat_completion_2) tool_to_call = available_tools[call.function.name]
args = json.loads(call.function.arguments)
result = tool_to_call(**args)
print("tool_to_call result: ", result)
messages.append({
"role": "tool",
"content": result,
"tool_call_id": call.id,
"name": call.function.name
})
chat_completion_2 = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
stream=False)
print("Chat completion2 results:")
print(chat_completion_2)
print("-" * 70)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
To run this example, you can start the vLLM server To run this example, you can start the vLLM server
without any specific flags: without any specific flags:
```bash ```bash
...@@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \ ...@@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--guided-decoding-backend outlines --guided-decoding-backend outlines
``` ```
This example demonstrates how to generate chat completions This example demonstrates how to generate chat completions
using the OpenAI Python client library. using the OpenAI Python client library.
""" """
...@@ -18,15 +18,6 @@ from openai import OpenAI ...@@ -18,15 +18,6 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [ tools = [
{ {
"type": "function", "type": "function",
...@@ -116,21 +107,36 @@ messages = [ ...@@ -116,21 +107,36 @@ messages = [
}, },
] ]
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True # Enable streaming response
)
for chunk in chat_completion: def main():
if chunk.choices and chunk.choices[0].delta.tool_calls: client = OpenAI(
print(chunk.choices[0].delta.tool_calls) # defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True # Enable streaming response
)
for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice="required")
print(chat_completion.choices[0].message.tool_calls)
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice="required")
print(chat_completion.choices[0].message.tool_calls) if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""
To run this example, you need to start the vLLM server:
```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```
"""
from enum import Enum from enum import Enum
from openai import BadRequestError, OpenAI from openai import BadRequestError, OpenAI
from pydantic import BaseModel from pydantic import BaseModel
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="-",
)
# Guided decoding by Choice (list of possible options) # Guided decoding by Choice (list of possible options)
completion = client.chat.completions.create( def guided_choice_completion(client: OpenAI, model: str):
model="Qwen/Qwen2.5-3B-Instruct", completion = client.chat.completions.create(
messages=[{ model=model,
"role": "user", messages=[{
"content": "Classify this sentiment: vLLM is wonderful!" "role": "user",
}], "content": "Classify this sentiment: vLLM is wonderful!"
extra_body={"guided_choice": ["positive", "negative"]}, }],
) extra_body={"guided_choice": ["positive", "negative"]},
print(completion.choices[0].message.content) )
return completion.choices[0].message.content
# Guided decoding by Regex # Guided decoding by Regex
prompt = ("Generate an email address for Alan Turing, who works in Enigma." def guided_regex_completion(client: OpenAI, model: str):
"End in .com and new line. Example result:" prompt = ("Generate an email address for Alan Turing, who works in Enigma."
"alan.turing@enigma.com\n") "End in .com and new line. Example result:"
"alan.turing@enigma.com\n")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct", completion = client.chat.completions.create(
messages=[{ model=model,
"role": "user", messages=[{
"content": prompt, "role": "user",
}], "content": prompt,
extra_body={ }],
"guided_regex": "\w+@\w+\.com\n", extra_body={
"stop": ["\n"] "guided_regex": r"\w+@\w+\.com\n",
}, "stop": ["\n"]
) },
print(completion.choices[0].message.content) )
return completion.choices[0].message.content
# Guided decoding by JSON using Pydantic schema # Guided decoding by JSON using Pydantic schema
...@@ -54,66 +60,100 @@ class CarDescription(BaseModel): ...@@ -54,66 +60,100 @@ class CarDescription(BaseModel):
car_type: CarType car_type: CarType
json_schema = CarDescription.model_json_schema() def guided_json_completion(client: OpenAI, model: str):
json_schema = CarDescription.model_json_schema()
prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print(completion.choices[0].message.content)
# Guided decoding by Grammar prompt = ("Generate a JSON with the brand, model and car_type of"
simplified_sql_grammar = """ "the most iconic car from the 90's")
?start: select_statement completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
return completion.choices[0].message.content
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)* # Guided decoding by Grammar
def guided_grammar_completion(client: OpenAI, model: str):
simplified_sql_grammar = """
root ::= select_statement
?table_name: identifier select_statement ::= "SELECT " column " from " table " where " condition
?column_name: identifier column ::= "col_1 " | "col_2 "
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ table ::= "table_1 " | "table_2 "
"""
prompt = ("Generate an SQL query to show the 'username' and 'email'" condition ::= column "= " number
"from the 'users' table.")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)
# Extra backend options number ::= "1 " | "2 "
prompt = ("Generate an email address for Alan Turing, who works in Enigma." """
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n")
try: prompt = ("Generate an SQL query to show the 'username' and 'email'"
# The no-fallback option forces vLLM to use xgrammar, so when it fails "from the 'users' table.")
# you get a 400 with the reason why
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct", model=model,
messages=[{ messages=[{
"role": "user", "role": "user",
"content": prompt, "content": prompt,
}], }],
extra_body={ extra_body={"guided_grammar": simplified_sql_grammar},
"guided_regex": "\w+@\w+\.com\n",
"stop": ["\n"],
"guided_decoding_backend": "xgrammar:no-fallback"
},
) )
except BadRequestError as e: return completion.choices[0].message.content
print("This error is expected:", e)
# Extra backend options
def extra_backend_options_completion(client: OpenAI, model: str):
    """Issue a regex-guided request pinned to the xgrammar backend.

    Args:
        client: OpenAI client used to send the chat completion request.
        model: Name of the model to query.

    Returns:
        The content of the first choice's message, or ``None`` when the
        request is rejected with ``BadRequestError`` (the error is printed).
    """
    prompt = ("Generate an email address for Alan Turing, who works in Enigma."
              "End in .com and new line. Example result:"
              "alan.turing@enigma.com\n")
    user_message = {
        "role": "user",
        "content": prompt,
    }
    backend_options = {
        "guided_regex": r"\w+@\w+\.com\n",
        "stop": ["\n"],
        "guided_decoding_backend": "xgrammar:no-fallback",
    }

    try:
        # The no-fallback option forces vLLM to use xgrammar, so when it
        # fails you get a 400 with the reason why
        response = client.chat.completions.create(
            model=model,
            messages=[user_message],
            extra_body=backend_options,
        )
        return response.choices[0].message.content
    except BadRequestError as err:
        print("This error is expected:", err)
def main():
    """Run each guided-decoding demo in turn and print what it returns."""
    client: OpenAI = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="-",
    )
    model = "Qwen/Qwen2.5-3B-Instruct"

    # (heading, demo) pairs, executed in this order.
    demos = (
        ("Guided Choice Completion:", guided_choice_completion),
        ("\nGuided Regex Completion:", guided_regex_completion),
        ("\nGuided JSON Completion:", guided_json_completion),
        ("\nGuided Grammar Completion:", guided_grammar_completion),
        ("\nExtra Backend Options Completion:",
         extra_backend_options_completion),
    )
    for heading, run_demo in demos:
        print(heading)
        print(run_demo(client, model))
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
from openai import OpenAI
# This example demonstrates the `structural_tag` response format.
# It can be used to specify a structured output format that occurs between
# specific tags in the response. This example shows how it could be used
# to enforce the format of a tool call response, but it could be used for
# any structured output within a subset of the response.
def main():
    """Request a completion constrained by a ``structural_tag`` format.

    Builds a single user message describing a ``get_weather`` function and
    the expected ``<function=...>`` call syntax, sends it with a
    ``structural_tag`` response format, and prints the raw response.
    """
    client = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="-",
    )

    prompt = """
You have access to the following function to retrieve the weather in a city:
{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}
If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.
Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?
"""
    messages = [{"role": "user", "content": prompt}]

    # JSON schema for the text between the begin and end tags.
    city_schema = {
        "type": "object",
        "properties": {
            "city": {
                "type": "string"
            }
        }
    }
    weather_structure = {
        "begin": "<function=get_weather>",
        "schema": city_schema,
        "end": "</function>",
    }
    structural_tag_format = {
        "type": "structural_tag",
        "structures": [weather_structure],
        "triggers": ["<function="],
    }

    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=messages,
        response_format=structural_tag_format,
    )
    print(completion)
if __name__ == "__main__":
main()
...@@ -25,29 +25,28 @@ from pydantic import BaseModel ...@@ -25,29 +25,28 @@ from pydantic import BaseModel
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list() def print_completion_details(completion):
model = models.data[0].id print("reasoning_content: ",
completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
# Guided decoding by Regex # Guided decoding by Regex
prompt = ("What is the capital of France?") def guided_regex_completion(client: OpenAI, model: str):
prompt = ("What is the capital of France?")
completion = client.chat.completions.create(
model=model, completion = client.chat.completions.create(
messages=[{ model=model,
"role": "user", messages=[{
"content": prompt, "role": "user",
}], "content": prompt,
extra_body={ }],
"guided_regex": "(Paris|London)", extra_body={
}, "guided_regex": "(Paris|London)",
) },
print("reasoning_content: ", completion.choices[0].message.reasoning_content) )
print("content: ", completion.choices[0].message.content) print_completion_details(completion)
class People(BaseModel): class People(BaseModel):
...@@ -55,19 +54,19 @@ class People(BaseModel): ...@@ -55,19 +54,19 @@ class People(BaseModel):
age: int age: int
json_schema = People.model_json_schema() def guided_json_completion(client: OpenAI, model: str):
json_schema = People.model_json_schema()
prompt = ("Generate a JSON with the name and age of one random person.") prompt = ("Generate a JSON with the name and age of one random person.")
completion = client.chat.completions.create( completion = client.chat.completions.create(
model=model, model=model,
messages=[{ messages=[{
"role": "user", "role": "user",
"content": prompt, "content": prompt,
}], }],
extra_body={"guided_json": json_schema}, extra_body={"guided_json": json_schema},
) )
print("reasoning_content: ", completion.choices[0].message.reasoning_content) print_completion_details(completion)
print("content: ", completion.choices[0].message.content)
# Guided decoding by JSON using Pydantic schema # Guided decoding by JSON using Pydantic schema
...@@ -84,46 +83,73 @@ class CarDescription(BaseModel): ...@@ -84,46 +83,73 @@ class CarDescription(BaseModel):
car_type: CarType car_type: CarType
json_schema = CarDescription.model_json_schema() def guided_car_json_completion(client: OpenAI, model: str):
json_schema = CarDescription.model_json_schema()
prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print_completion_details(completion)
prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
# Guided decoding by Grammar # Guided decoding by Grammar
simplified_sql_grammar = """ def guided_grammar_completion(client: OpenAI, model: str):
?start: select_statement simplified_sql_grammar = """
root ::= select_statement
?select_statement: "SELECT " column_list " FROM " table_name select_statement ::= "SELECT " column " from " table " where " condition
?column_list: column_name ("," column_name)* column ::= "col_1 " | "col_2 "
?table_name: identifier table ::= "table_1 " | "table_2 "
?column_name: identifier condition ::= column "= " number
number ::= "1 " | "2 "
"""
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
prompt = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print_completion_details(completion)
def main():
    """Run every guided-decoding demo against the server's first model."""
    client: OpenAI = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    # Use whichever model the server lists first.
    model: str = client.models.list().data[0].id

    # (heading, demo) pairs, executed in this order; each demo prints its
    # own results.
    demos = (
        ("Guided Regex Completion:", guided_regex_completion),
        ("\nGuided JSON Completion (People):", guided_json_completion),
        ("\nGuided JSON Completion (CarDescription):",
         guided_car_json_completion),
        ("\nGuided Grammar Completion:", guided_grammar_completion),
    )
    for heading, run_demo in demos:
        print(heading)
        run_demo(client, model)
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
# This may be very slow https://github.com/vllm-project/vllm/issues/12122 if __name__ == "__main__":
prompt = ("Generate an SQL query to show the 'username' and 'email'" main()
"from the 'users' table.")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
...@@ -31,14 +31,6 @@ available_tools = {"get_current_weather": get_current_weather} ...@@ -31,14 +31,6 @@ available_tools = {"get_current_weather": get_current_weather}
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [{ tools = [{
"type": "function", "type": "function",
"function": { "function": {
...@@ -109,69 +101,87 @@ def extract_reasoning_and_calls(chunks: list): ...@@ -109,69 +101,87 @@ def extract_reasoning_and_calls(chunks: list):
return reasoning_content, arguments, function_names return reasoning_content, arguments, function_names
print("---------Full Generate With Automatic Function Calling-------------") def main():
tool_calls = client.chat.completions.create(messages=messages, client = OpenAI(
model=model, api_key=openai_api_key,
tools=tools) base_url=openai_api_base,
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}") )
print(f"function name: "
f"{tool_calls.choices[0].message.tool_calls[0].function.name}") models = client.models.list()
print(f"function arguments: " model = models.data[0].id
f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
print(
print("----------Stream Generate With Automatic Function Calling-----------") "---------Full Generate With Automatic Function Calling-------------")
tool_calls_stream = client.chat.completions.create(messages=messages, tool_calls = client.chat.completions.create(messages=messages,
model=model, model=model,
tools=tools, tools=tools)
stream=True) print(
chunks = [] f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}"
for chunk in tool_calls_stream: )
chunks.append(chunk) print(f"function name: "
f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
reasoning_content, arguments, function_names = extract_reasoning_and_calls( print(f"function arguments: "
chunks) f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
print(f"reasoning_content: {reasoning_content}") print(
print(f"function name: {function_names[0]}") "----------Stream Generate With Automatic Function Calling-----------")
print(f"function arguments: {arguments[0]}") tool_calls_stream = client.chat.completions.create(messages=messages,
model=model,
print("----------Full Generate With Named Function Calling-----------------") tools=tools,
tool_calls = client.chat.completions.create(messages=messages, stream=True)
model=model,
tools=tools, chunks = list(tool_calls_stream)
tool_choice={
"type": "function", reasoning_content, arguments, function_names = extract_reasoning_and_calls(
"function": { chunks)
"name":
"get_current_weather" print(f"reasoning_content: {reasoning_content}")
} print(f"function name: {function_names[0]}")
}) print(f"function arguments: {arguments[0]}")
tool_call = tool_calls.choices[0].message.tool_calls[0].function print(
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}") "----------Full Generate With Named Function Calling-----------------")
print(f"function name: {tool_call.name}") tool_calls = client.chat.completions.create(messages=messages,
print(f"function arguments: {tool_call.arguments}") model=model,
print("----------Stream Generate With Named Function Calling--------------") tools=tools,
tool_choice={
tool_calls_stream = client.chat.completions.create( "type": "function",
messages=messages, "function": {
model=model, "name":
tools=tools, "get_current_weather"
tool_choice={ }
"type": "function", })
"function": {
"name": "get_current_weather" tool_call = tool_calls.choices[0].message.tool_calls[0].function
} print(
}, f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}"
stream=True) )
print(f"function name: {tool_call.name}")
chunks = [] print(f"function arguments: {tool_call.arguments}")
for chunk in tool_calls_stream: print(
chunks.append(chunk) "----------Stream Generate With Named Function Calling--------------")
reasoning_content, arguments, function_names = extract_reasoning_and_calls( tool_calls_stream = client.chat.completions.create(
chunks) messages=messages,
print(f"reasoning_content: {reasoning_content}") model=model,
print(f"function name: {function_names[0]}") tools=tools,
print(f"function arguments: {arguments[0]}") tool_choice={
print("\n\n") "type": "function",
"function": {
"name": "get_current_weather"
}
},
stream=True)
chunks = list(tool_calls_stream)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")
if __name__ == "__main__":
main()
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
An example shows how to generate chat completions from reasoning models An example shows how to generate chat completions from reasoning models
like DeepSeekR1. like DeepSeekR1.
To run this example, you need to start the vLLM server with the reasoning To run this example, you need to start the vLLM server
parser: with the reasoning parser:
```bash ```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
...@@ -21,35 +21,44 @@ from openai import OpenAI ...@@ -21,35 +21,44 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list() def main():
model = models.data[0].id client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Round 1 models = client.models.list()
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] model = models.data[0].id
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content # Round 1
content = response.choices[0].message.content messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 1:", reasoning_content) reasoning_content = response.choices[0].message.reasoning_content
print("content for Round 1:", content) content = response.choices[0].message.content
# Round 2 print("reasoning_content for Round 1:", reasoning_content)
messages.append({"role": "assistant", "content": content}) print("content for Round 1:", content)
messages.append({
"role": "user",
"content": "How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content # Round 2
content = response.choices[0].message.content messages.append({"role": "assistant", "content": content})
messages.append({
"role":
"user",
"content":
"How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 2:", reasoning_content) reasoning_content = response.choices[0].message.reasoning_content
print("content for Round 2:", content) content = response.choices[0].message.content
print("reasoning_content for Round 2:", reasoning_content)
print("content for Round 2:", content)
if __name__ == "__main__":
main()
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
An example shows how to generate chat completions from reasoning models An example shows how to generate chat completions from reasoning models
like DeepSeekR1. like DeepSeekR1.
To run this example, you need to start the vLLM server with the reasoning To run this example, you need to start the vLLM server with the reasoning
parser: parser:
```bash ```bash
...@@ -29,41 +29,49 @@ from openai import OpenAI ...@@ -29,41 +29,49 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI( messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] def main():
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` client = OpenAI(
stream = client.chat.completions.create(model=model, api_key=openai_api_key,
messages=messages, base_url=openai_api_base,
stream=True) )
print("client: Start streaming chat completions...") models = client.models.list()
printed_reasoning_content = False model = models.data[0].id
printed_content = False
# ruff: noqa: E501
for chunk in stream: # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
reasoning_content = None stream = client.chat.completions.create(model=model,
content = None messages=messages,
# Check the content is reasoning_content or content stream=True)
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content print("client: Start streaming chat completions...")
elif hasattr(chunk.choices[0].delta, "content"): printed_reasoning_content = False
content = chunk.choices[0].delta.content printed_content = False
if reasoning_content is not None: for chunk in stream:
if not printed_reasoning_content: reasoning_content = None
printed_reasoning_content = True content = None
print("reasoning_content:", end="", flush=True) # Check the content is reasoning_content or content
print(reasoning_content, end="", flush=True) if hasattr(chunk.choices[0].delta, "reasoning_content"):
elif content is not None: reasoning_content = chunk.choices[0].delta.reasoning_content
if not printed_content: elif hasattr(chunk.choices[0].delta, "content"):
printed_content = True content = chunk.choices[0].delta.content
print("\ncontent:", end="", flush=True)
# Extract and print the content if reasoning_content is not None:
print(content, end="", flush=True) if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
if __name__ == "__main__":
main()
...@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict): ...@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
print("Embedding output:", response_json["data"][0]["embedding"]) print("Embedding output:", response_json["data"][0]["embedding"])
if __name__ == '__main__': def parse_args():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
"Script to call a specified VLM through the API. Make sure to serve " "Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this.") "the model with --task embed before running this.")
...@@ -107,8 +107,10 @@ if __name__ == '__main__': ...@@ -107,8 +107,10 @@ if __name__ == '__main__':
choices=["vlm2vec", "dse_qwen2_vl"], choices=["vlm2vec", "dse_qwen2_vl"],
required=True, required=True,
help="Which model to call.") help="Which model to call.")
args = parser.parse_args() return parser.parse_args()
def main(args):
if args.model == "vlm2vec": if args.model == "vlm2vec":
vlm2vec() vlm2vec()
elif args.model == "dse_qwen2_vl": elif args.model == "dse_qwen2_vl":
...@@ -120,3 +122,8 @@ if __name__ == '__main__': ...@@ -120,3 +122,8 @@ if __name__ == '__main__':
"type": "text", "type": "text",
"content": "What is the weather like today?", "content": "What is the weather like today?",
}) })
if __name__ == '__main__':
args = parse_args()
main(args)
...@@ -6,28 +6,36 @@ from openai import OpenAI ...@@ -6,28 +6,36 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY") def main():
api_key=openai_api_key, client = OpenAI(
base_url=openai_api_base, # defaults to os.environ.get("OPENAI_API_KEY")
) api_key=openai_api_key,
base_url=openai_api_base,
models = client.models.list() )
model = models.data[0].id
models = client.models.list()
# Completion API model = models.data[0].id
stream = False
completion = client.completions.create( # Completion API
model=model, stream = False
prompt="A robot may not injure a human being", completion = client.completions.create(
echo=False, model=model,
n=2, prompt="A robot may not injure a human being",
stream=stream, echo=False,
logprobs=3) n=2,
stream=stream,
print("Completion results:") logprobs=3)
if stream:
for c in completion: print("-" * 50)
print(c) print("Completion results:")
else: if stream:
print(completion) for c in completion:
print(c)
else:
print(completion)
print("-" * 50)
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment