Commit ad385667 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.6.3.post1-dev'

parents be0967c1 903593d3
# ruff: noqa
import argparse
from vllm import LLM
from vllm.sampling_params import SamplingParams
# This script is an offline demo for running Pixtral.
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
#
# - Client:
#
# ```bash
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
# "model": "mistralai/Pixtral-12B-2409",
# "messages": [
# {
# "role": "user",
# "content": [
# {"type" : "text", "text": "Describe this image in detail please."},
# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
# {"type" : "text", "text": "and this one as well. Answer in French."},
# {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
# ]
# }
# ]
# }'
# ```
#
# Usage:
# python demo.py simple
# python demo.py advanced
def run_simple_demo():
model_name = "mistralai/Pixtral-12B-2409"
sampling_params = SamplingParams(max_tokens=8192)
# Lower max_num_seqs or max_model_len on low-VRAM GPUs.
llm = LLM(model=model_name, tokenizer_mode="mistral")
prompt = "Describe this image in one sentence."
image_url = "https://picsum.photos/id/237/200/300"
messages = [
{
"role":
"user",
"content": [
{
"type": "text",
"text": prompt
},
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
],
},
]
outputs = llm.chat(messages, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
def run_advanced_demo():
model_name = "mistralai/Pixtral-12B-2409"
max_img_per_msg = 5
max_tokens_per_img = 4096
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
llm = LLM(
model=model_name,
tokenizer_mode="mistral",
limit_mm_per_prompt={"image": max_img_per_msg},
max_model_len=max_img_per_msg * max_tokens_per_img,
)
prompt = "Describe the following image."
url_1 = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
url_2 = "https://picsum.photos/seed/picsum/200/300"
url_3 = "https://picsum.photos/id/32/512/512"
messages = [
{
"role":
"user",
"content": [
{
"type": "text",
"text": prompt
},
{
"type": "image_url",
"image_url": {
"url": url_1
}
},
{
"type": "image_url",
"image_url": {
"url": url_2
}
},
],
},
{
"role": "assistant",
"content": "The images show nature.",
},
{
"role": "user",
"content": "More details please and answer only in French!.",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": url_3
}
},
],
},
]
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
def main():
parser = argparse.ArgumentParser(
description="Run a demo in simple or advanced mode.")
parser.add_argument(
"mode",
choices=["simple", "advanced"],
help="Specify the demo mode: 'simple' or 'advanced'",
)
args = parser.parse_args()
if args.mode == "simple":
print("Running simple demo...")
run_simple_demo()
elif args.mode == "advanced":
print("Running advanced demo...")
run_advanced_demo()
if __name__ == "__main__":
main()
......@@ -9,43 +9,76 @@ from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser
# Input image and question
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
question = "What is the content of this image?"
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# LLaVA-1.5
def run_llava(question):
def run_llava(question: str, modality: str):
assert modality == "image"
prompt = f"USER: <image>\n{question}\nASSISTANT:"
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
return llm, prompt
llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
stop_token_ids = None
return llm, prompt, stop_token_ids
# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question):
def run_llava_next(question: str, modality: str):
assert modality == "image"
prompt = f"[INST] <image>\n{question} [/INST]"
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
stop_token_ids = None
return llm, prompt, stop_token_ids
# LlaVA-NeXT-Video
# Currently only support for video input
def run_llava_next_video(question: str, modality: str):
assert modality == "video"
prompt = f"USER: <video>\n{question} ASSISTANT:"
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
stop_token_ids = None
return llm, prompt, stop_token_ids
# LLaVA-OneVision
def run_llava_onevision(question: str, modality: str):
return llm, prompt
if modality == "video":
prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
<|im_start|>assistant\n"
elif modality == "image":
prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n"
llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=16384)
stop_token_ids = None
return llm, prompt, stop_token_ids
# Fuyu
def run_fuyu(question):
def run_fuyu(question: str, modality: str):
assert modality == "image"
prompt = f"{question}\n"
llm = LLM(model="adept/fuyu-8b")
return llm, prompt
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
stop_token_ids = None
return llm, prompt, stop_token_ids
# Phi-3-Vision
def run_phi3v(question):
def run_phi3v(question: str, modality: str):
assert modality == "image"
prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501
# Note: The default setting of max_num_seqs (256) and
......@@ -54,34 +87,55 @@ def run_phi3v(question):
# In this example, we override max_num_seqs to 5 while
# keeping the original context length of 128k.
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame.
#
# Generally speaking, a larger value for num_crops results in more
# tokens per image instance, because it may scale the image more in
# the image preprocessing. Some references in the model docs and the
# formula for image tokens after the preprocessing
# transform can be found below.
#
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM(
model="microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True,
max_num_seqs=5,
max_model_len=4096,
max_num_seqs=2,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"num_crops": 16},
)
return llm, prompt
stop_token_ids = None
return llm, prompt, stop_token_ids
# PaliGemma
def run_paligemma(question):
def run_paligemma(question: str, modality: str):
assert modality == "image"
# PaliGemma has special prompt format for VQA
prompt = "caption en"
llm = LLM(model="google/paligemma-3b-mix-224")
return llm, prompt
stop_token_ids = None
return llm, prompt, stop_token_ids
# Chameleon
def run_chameleon(question):
def run_chameleon(question: str, modality: str):
assert modality == "image"
prompt = f"{question}<image>"
llm = LLM(model="facebook/chameleon-7b")
return llm, prompt
llm = LLM(model="facebook/chameleon-7b", max_model_len=4096)
stop_token_ids = None
return llm, prompt, stop_token_ids
# MiniCPM-V
def run_minicpmv(question):
def run_minicpmv(question: str, modality: str):
assert modality == "image"
# 2.0
# The official repo doesn't work yet, so we need to use a fork for now
......@@ -89,13 +143,28 @@ def run_minicpmv(question):
# model_name = "HwwwH/MiniCPM-V-2"
# 2.5
model_name = "openbmb/MiniCPM-Llama3-V-2_5"
# model_name = "openbmb/MiniCPM-Llama3-V-2_5"
#2.6
model_name = "openbmb/MiniCPM-V-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
llm = LLM(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
)
# NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0
# stop_token_ids = [tokenizer.eos_id]
# 2.5
# stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
# 2.6
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
messages = [{
'role': 'user',
......@@ -104,36 +173,170 @@ def run_minicpmv(question):
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
return llm, prompt
return llm, prompt, stop_token_ids
# InternVL
def run_internvl(question):
# Generally, InternVL can use chatml template for conversation
TEMPLATE = "<|im_start|>User\n{prompt}<|im_end|>\n<|im_start|>Assistant\n"
prompt = f"<image>\n{question}\n"
prompt = TEMPLATE.format(prompt=prompt)
def run_internvl(question: str, modality: str):
assert modality == "image"
model_name = "OpenGVLab/InternVL2-2B"
llm = LLM(
model="OpenGVLab/InternVL2-4B",
model=model_name,
trust_remote_code=True,
max_num_seqs=5,
max_model_len=4096,
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
# Stop tokens for InternVL
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B#service
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompt, stop_token_ids
# NVLM-D
def run_nvlm_d(question: str, modality: str):
assert modality == "image"
model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU
llm = LLM(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
tensor_parallel_size=4,
)
return llm, prompt
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
stop_token_ids = None
return llm, prompt, stop_token_ids
# BLIP-2
def run_blip2(question):
def run_blip2(question: str, modality: str):
assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompt = f"Question: {question} Answer:"
llm = LLM(model="Salesforce/blip2-opt-2.7b")
return llm, prompt
stop_token_ids = None
return llm, prompt, stop_token_ids
# Qwen
def run_qwen_vl(question: str, modality: str):
assert modality == "image"
llm = LLM(
model="Qwen/Qwen-VL",
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=2,
)
prompt = f"{question}Picture 1: <img></img>\n"
stop_token_ids = None
return llm, prompt, stop_token_ids
# Qwen2-VL
def run_qwen2_vl(question: str, modality: str):
assert modality == "image"
model_name = "Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40
llm = LLM(
model=model_name,
max_model_len=8192,
max_num_seqs=5,
)
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n")
stop_token_ids = None
return llm, prompt, stop_token_ids
# LLama 3.2
def run_mllama(question: str, modality: str):
assert modality == "image"
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# Note: The default setting of max_num_seqs (256) and
# max_model_len (131072) for this model may cause OOM.
# You may lower either to run this example on lower-end GPUs.
# The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM(
model=model_name,
max_model_len=4096,
max_num_seqs=16,
enforce_eager=True,
)
prompt = f"<|image|><|begin_of_text|>{question}"
stop_token_ids = None
return llm, prompt, stop_token_ids
# Molmo
def run_molmo(question, modality):
assert modality == "image"
model_name = "allenai/Molmo-7B-D-0924"
llm = LLM(
model=model_name,
trust_remote_code=True,
dtype="bfloat16",
)
prompt = question
stop_token_ids = None
return llm, prompt, stop_token_ids
# GLM-4v
def run_glm4v(question: str, modality: str):
assert modality == "image"
model_name = "THUDM/glm-4v-9b"
llm = LLM(model=model_name,
max_model_len=2048,
max_num_seqs=2,
trust_remote_code=True,
enforce_eager=True)
prompt = question
stop_token_ids = [151329, 151336, 151338]
return llm, prompt, stop_token_ids
model_example_map = {
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
"llava-onevision": run_llava_onevision,
"fuyu": run_fuyu,
"phi3_v": run_phi3v,
"paligemma": run_paligemma,
......@@ -141,19 +344,65 @@ model_example_map = {
"minicpmv": run_minicpmv,
"blip-2": run_blip2,
"internvl_chat": run_internvl,
"NVLM_D": run_nvlm_d,
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,
"mllama": run_mllama,
"molmo": run_molmo,
"glm4v": run_glm4v,
}
def get_multi_modal_input(args):
"""
return {
"data": image or video,
"question": question,
}
"""
if args.modality == "image":
# Input image and question
image = ImageAsset("cherry_blossom") \
.pil_image.convert("RGB")
img_question = "What is the content of this image?"
return {
"data": image,
"question": img_question,
}
if args.modality == "video":
# Input video and question
video = VideoAsset(name="sample_demo_1.mp4",
num_frames=args.num_frames).np_ndarrays
vid_question = "Why is this video funny?"
return {
"data": video,
"question": vid_question,
}
msg = f"Modality {args.modality} is not supported."
raise ValueError(msg)
def main(args):
model = args.model_type
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
llm, prompt = model_example_map[model](question)
modality = args.modality
mm_input = get_multi_modal_input(args)
data = mm_input["data"]
question = mm_input["question"]
llm, prompt, stop_token_ids = model_example_map[model](question, modality)
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, max_tokens=64)
sampling_params = SamplingParams(temperature=0.2,
max_tokens=64,
stop_token_ids=stop_token_ids)
assert args.num_prompts > 0
if args.num_prompts == 1:
......@@ -161,7 +410,7 @@ def main(args):
inputs = {
"prompt": prompt,
"multi_modal_data": {
"image": image
modality: data
},
}
......@@ -170,7 +419,7 @@ def main(args):
inputs = [{
"prompt": prompt,
"multi_modal_data": {
"image": image
modality: data
},
} for _ in range(args.num_prompts)]
......@@ -193,8 +442,16 @@ if __name__ == "__main__":
help='Huggingface "model_type".')
parser.add_argument('--num-prompts',
type=int,
default=1,
default=4,
help='Number of prompts to run.')
parser.add_argument('--modality',
type=str,
default="image",
choices=['image', 'video'],
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
default=16,
help='Number of frames to extract from the video.')
args = parser.parse_args()
main(args)
from vllm import LLM
from vllm.assets.image import ImageAsset
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501
# Create an LLM.
llm = LLM(
model="TIGER-Lab/VLM2Vec-Full",
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=2,
mm_processor_kwargs={"num_crops": 16},
)
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})
# Print the outputs.
for output in outputs:
print(output.outputs.embedding) # list of 3072 floats
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models, using the chat template defined
by the model.
"""
from argparse import Namespace
from typing import List, NamedTuple, Optional
from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
QUESTION = "What is the content of each image?"
IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
]
class ModelRequestData(NamedTuple):
llm: LLM
prompt: str
stop_token_ids: Optional[List[str]]
image_data: List[Image]
chat_template: Optional[str]
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat"
llm = LLM(
model=model_name,
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "".join(f"Picture {i}: <img></img>\n"
for i, _ in enumerate(image_urls, start=1))
# This model does not have a chat_template attribute on its tokenizer,
# so we need to explicitly pass it. We use ChatML since it's used in the
# generation utils of the model:
# https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
# Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True,
chat_template=chat_template)
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=chat_template,
)
def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame.
#
# Generally speaking, a larger value for num_crops results in more
# tokens per image instance, because it may scale the image more in
# the image preprocessing. Some references in the model docs and the
# formula for image tokens after the preprocessing
# transform can be found below.
#
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"num_crops": 4},
)
placeholders = "\n".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
stop_token_ids = None
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)
def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B"
llm = LLM(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"max_dynamic_patch": 4},
)
placeholders = "\n".join(f"Image-{i}: <image>\n"
for i, _ in enumerate(image_urls, start=1))
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
# Stop tokens for InternVL
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B#service
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)
def load_nvlm_d(question: str, image_urls: List[str]):
model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU
llm = LLM(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
tensor_parallel_size=4,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"max_dynamic_patch": 4},
)
placeholders = "\n".join(f"Image-{i}: <image>\n"
for i, _ in enumerate(image_urls, start=1))
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
stop_token_ids = None
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)
def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
try:
from qwen_vl_utils import process_vision_info
except ModuleNotFoundError:
print('WARNING: `qwen-vl-utils` not installed, input images will not '
'be automatically resized. You can enable this functionality by '
'`pip install qwen-vl-utils`.')
process_vision_info = None
model_name = "Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40
llm = LLM(
model=model_name,
max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls]
else:
image_data, _ = process_vision_info(messages)
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data,
chat_template=None,
)
def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM(
model=model_name,
max_model_len=4096,
max_num_seqs=16,
enforce_eager=True,
limit_mm_per_prompt={"image": len(image_urls)},
)
prompt = f"<|image|><|image|><|begin_of_text|>{question}"
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)
model_example_map = {
"phi3_v": load_phi3v,
"internvl_chat": load_internvl,
"NVLM_D": load_nvlm_d,
"qwen2_vl": load_qwen2_vl,
"qwen_vl_chat": load_qwenvl_chat,
"mllama": load_mllama,
}
def run_generate(model, question: str, image_urls: List[str]):
req_data = model_example_map[model](question, image_urls)
sampling_params = SamplingParams(temperature=0.0,
max_tokens=128,
stop_token_ids=req_data.stop_token_ids)
outputs = req_data.llm.generate(
{
"prompt": req_data.prompt,
"multi_modal_data": {
"image": req_data.image_data
},
},
sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def run_chat(model: str, question: str, image_urls: List[str]):
req_data = model_example_map[model](question, image_urls)
sampling_params = SamplingParams(temperature=0.0,
max_tokens=128,
stop_token_ids=req_data.stop_token_ids)
outputs = req_data.llm.chat(
[{
"role":
"user",
"content": [
{
"type": "text",
"text": question,
},
*({
"type": "image_url",
"image_url": {
"url": image_url
},
} for image_url in image_urls),
],
}],
sampling_params=sampling_params,
chat_template=req_data.chat_template,
)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def main(args: Namespace):
model = args.model_type
method = args.method
if method == "generate":
run_generate(model, QUESTION, IMAGE_URLS)
elif method == "chat":
run_chat(model, QUESTION, IMAGE_URLS)
else:
raise ValueError(f"Invalid method: {method}")
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models that support multi-image input')
parser.add_argument('--model-type',
'-m',
type=str,
default="phi3_v",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument("--method",
type=str,
default="generate",
choices=["generate", "chat"],
help="The method to run in `vllm.LLM`.")
args = parser.parse_args()
main(args)
from time import time
from vllm import LLM, SamplingParams
# NOTE: This is just a running example. For benchmarking purpose,
# please see benchmarks/benchmark_prefix_caching.py
# Common prefix.
prefix = (
"You are an expert school principal, skilled in effectively managing "
......@@ -37,9 +38,7 @@ print("Results without `enable_prefix_caching`")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
start_time_regular = time()
outputs = regular_llm.generate(generating_prompts, sampling_params)
duration_regular = time() - start_time_regular
regular_generated_texts = []
# Print the outputs.
......@@ -55,9 +54,7 @@ print("-" * 80)
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
# Generate with prefix caching.
start_time_cached = time()
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
duration_cached = time() - start_time_cached
print("Results with `enable_prefix_caching`")
......@@ -77,6 +74,3 @@ generated_same = all([
for i in range(len(prompts))
])
print(f"Generated answers are the same: {generated_same}")
speedup = round(duration_regular / duration_cached, 2)
print(f"Speed up of cached generation compared to the regular is: {speedup}")
import os
from vllm import LLM, SamplingParams
# enable torch profiler, can also be set on cmd line
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
llm.start_profile()
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
llm.stop_profile()
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
import inspect
import json
import os
import sys
from argparse import RawTextHelpFormatter
from dataclasses import asdict, dataclass
from typing import Optional
import torch
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler import layerwise_profile
from vllm.utils import FlexibleArgumentParser
BATCH_SIZE_DEFAULT = 1
PROMPT_LEN_DEFAULT = 256
OUTPUT_LEN_DEFAULT = 2
@dataclass
class ProfileContext:
engine_args: EngineArgs
prompt_len: int
output_len: int
batch_size: int
save_chrome_traces_folder: Optional[str]
def get_dtype(dtype: str):
if dtype == "torch.float":
return torch.float
else:
return dtype
def run_profile(context: ProfileContext, csv_output: Optional[str],
json_output: Optional[str]):
print("Run profile with:")
for key, value in asdict(context).items():
print(f" {key} = {value}")
# Create sampling params
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=args.output_len,
ignore_eos=True)
# Create LLM
llm = LLM(**asdict(context.engine_args))
batch_size = context.batch_size
prompt_len = context.prompt_len
output_len = context.output_len
scheduler_config = llm.llm_engine.scheduler_config
max_model_len = llm.llm_engine.model_config.max_model_len
max_num_batched_tokens = scheduler_config.max_num_batched_tokens
max_num_seqs = scheduler_config.max_num_seqs
if batch_size * prompt_len > max_num_batched_tokens:
print(f"ERROR: chosen batch_size * prompt_len "
f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
f"and therefore cannot be run in a single profile step, please "
f"choose a smaller batch size or prompt length, or increase "
f"--max-num-batched-tokens")
sys.exit(-1)
if batch_size >= max_num_seqs:
print(
f"ERROR: chosen batch_size ({batch_size}) is larger than "
f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
f"single profile step, please choose a smaller batch size")
sys.exit(-1)
print("llm.llm_engine.model_config.max_model_len: ",
llm.llm_engine.model_config.max_model_len)
if prompt_len + output_len > llm.llm_engine.model_config.max_model_len:
print(
f"ERROR: chosen prompt_len + output_len ({prompt_len} + "
f"{output_len} = {prompt_len + output_len}) is larger than the "
f"model's max_model_len ({max_model_len}), please choose a smaller "
f"prompt_len or output_len, or increase --max-model-len")
sys.exit(-1)
def add_requests():
for i in range(batch_size):
prompt_token_ids = torch.randint(
llm.llm_engine.model_config.get_vocab_size(),
size=(prompt_len, )).tolist()
llm.llm_engine.add_request(
request_id=f"seq{i}",
prompt={'prompt_token_ids': prompt_token_ids},
params=sampling_params)
def abort_requests():
for i in range(batch_size):
llm.llm_engine.abort_request(f"seq{i}")
# Warm up run
print("Warm up run ...")
add_requests()
llm.llm_engine.step() # Prefill
llm.llm_engine.step() # Decode
abort_requests()
print("Profile run ...")
add_requests()
with layerwise_profile() as prefill_prof:
llm.llm_engine.step() # First step is prefill
decode_profs = []
for x in range(args.output_len - 1):
with layerwise_profile() as decode_prof:
llm.llm_engine.step()
decode_profs.append(decode_prof)
decode_results_list = [prof.results for prof in decode_profs]
prefill_results = prefill_prof.results
has_decode = len(decode_results_list) > 0
LINE_WIDTH = 80
print("=" * LINE_WIDTH)
print(f"= Prefill Model Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_model_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(f"= First Decode Step Model Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_model_table()
print()
print("=" * LINE_WIDTH)
print(f"= Prefill Summary Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_summary_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(f"= First Decode Step Summary Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_summary_table()
if csv_output:
csv_filename_base = csv_output.rstrip(".csv")
prefill_results.export_model_stats_table_csv(
csv_filename_base + "_prefill_model_table.csv")
prefill_results.export_summary_stats_table_csv(
csv_filename_base + "_prefill_summary_table.csv")
if has_decode:
decode_results_list[0].export_model_stats_table_csv(\
csv_filename_base + "_decode_model_table.csv")
decode_results_list[0].export_summary_stats_table_csv(
csv_filename_base + "_decode_summary_table.csv")
if json_output:
cuda_devices = [
torch.cuda.get_device_properties(dev_idx)
for dev_idx in range(torch.cuda.device_count())
]
json_dict = {
"context": {
"python_version": f"{sys.version}",
"torch_version": f"{torch.__version__}",
"torch_cuda_version": f"{torch.version.cuda}",
"cuda_devices": f"{cuda_devices}",
**asdict(context)
},
"prefill": prefill_results.convert_stats_to_dict(),
}
if has_decode:
for idx, dr in enumerate(decode_results_list):
json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
for idx, dr in enumerate(decode_results_list[1:]):
json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
with open(json_output.rstrip(".json") + ".json", "w+") as f:
json.dump(json_dict, f, indent=2)
pass
if context.save_chrome_traces_folder is not None:
os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
prefill_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + "/prefill.json")
for idx, decode_prof in enumerate(decode_profs):
decode_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + f"/decode_{idx + 1}.json")
print("Traces saved as prefill.json and decode_1.json, etc."
f" in folder {context.save_chrome_traces_folder}")
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="""
Profile a model
example:
```
python examples/offline_profile.py \\
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
--enforce-eager
```
then you can use various tools to analyze the json output
terminal ascii tables:
```
python tools/profiler/print_layerwise_table.py \\
--json-trace Llama31-8b-FP8.json --phase prefill --table summary
```
or create matplotlib stacked bar charts:
```
python tools/profiler/visualize_layerwise_profile.py \\
--json-trace Llama31-8b-FP8.json \\
--output-directory profile_breakdown --plot-metric pct_cuda_time
```
""",
formatter_class=RawTextHelpFormatter)
parser.add_argument(
"--csv",
type=str,
default=None,
help="Export the results as multiple csv file. This should be the root "
"filename, will create <filename>_prefill_model_table.csv, "
"<filename>_prefill_summary_table.csv, "
"<filename>_decode_model_table.csv, and "
"<filename>_decode_summary_table.csv")
parser.add_argument(
"--json",
type=str,
default=None,
help="Export the results as a json file. This should be the filename")
parser.add_argument("--save-chrome-traces-folder",
type=str,
help="Save chrome traces for the prefill and decode "
"will save traces as prefill.json and decode_1.json, "
"etc. inside this folder")
parser.add_argument(
"--prompt-len",
type=int,
default=PROMPT_LEN_DEFAULT,
help=f"Length of the random prompt to use when profiling, all batched "
f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}")
parser.add_argument("--batch-size",
type=int,
default=BATCH_SIZE_DEFAULT,
help=f"Number of requests to run as a single batch, "
f"default={BATCH_SIZE_DEFAULT}")
parser.add_argument(
"--output-len",
type=int,
default=OUTPUT_LEN_DEFAULT,
help="Number of llm steps to run (includes prefill and decode) "
"- default={OUTPUT_LEN_DEFAULT}")
EngineArgs.add_cli_args(parser)
args = parser.parse_args()
context = ProfileContext(
engine_args=EngineArgs.from_cli_args(args),
**{
k: v
for k, v in vars(args).items()
if k in inspect.signature(ProfileContext).parameters
})
run_profile(context, csv_output=args.csv, json_output=args.json)
"""An example showing how to use vLLM to serve multimodal models
and run online inference with OpenAI client.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
--trust-remote-code --limit-mm-per-prompt image=2
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
"""
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
return result
# Text-only inference
def run_text_only() -> None:
chat_completion = client.chat.completions.create(
messages=[{
"role": "user",
"content": "What's the capital of France?"
}],
model=model,
max_tokens=64,
)
result = chat_completion.choices[0].message.content
print("Chat completion output:", result)
# Single-image input inference
def run_single_image() -> None:
## Use image url in the payload
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this image?"
},
{
"type": "image_url",
"image_url": {
"url": image_url
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:", result)
## Use base64 encoded image in the payload
image_base64 = encode_base64_content_from_url(image_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this image?"
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded image:", result)
# Multi-image input inference
def run_multi_image() -> None:
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What are the animals in these images?"
},
{
"type": "image_url",
"image_url": {
"url": image_url_duck
},
},
{
"type": "image_url",
"image_url": {
"url": image_url_lion
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)
# Audio input inference
def run_audio() -> None:
# Any format supported by librosa is supported
audio_url = AudioAsset("winning_call").url
# Use audio url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result)
audio_base64 = encode_base64_content_from_url(audio_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": f"data:audio/ogg;base64,{audio_base64}"
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded audio:", result)
example_function_map = {
"text-only": run_text_only,
"single-image": run_single_image,
"multi-image": run_multi_image,
"audio": run_audio,
}
def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using OpenAI client for online inference with '
'multimodal language models served with vLLM.')
parser.add_argument(
'--chat-type',
'-c',
type=str,
default="single-image",
choices=["text-only", "single-image", "multi-image", "audio"],
help='Conversation type with multimodal data.')
args = parser.parse_args()
main(args)
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example:
IMPORTANT: for mistral, you must use one of the provided mistral tool call
templates, or your own - the model default doesn't work for tool calls with vLLM
See the vLLM docs on OpenAI server & tool calling for more details.
vllm serve --model mistralai/Mistral-7B-Instruct-v0.3 \
--chat-template examples/tool_chat_template_mistral.jinja \
--enable-auto-tool-choice --tool-call-parser mistral
OR
vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \
--chat-template examples/tool_chat_template_hermes.jinja \
--enable-auto-tool-choice --tool-call-parser hermes
"""
import json
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type":
"string",
"description":
"The city to find the weather for, e.g. 'San Francisco'"
},
"state": {
"type":
"string",
"description":
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["city", "state", "unit"]
}
}
}]
messages = [{
"role": "user",
"content": "Hi! How are you doing today?"
}, {
"role": "assistant",
"content": "I'm doing well! How can I help you?"
}, {
"role":
"user",
"content":
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools)
print("Chat completion results:")
print(chat_completion)
print("\n\n")
tool_calls_stream = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
stream=True)
chunks = []
for chunk in tool_calls_stream:
chunks.append(chunk)
if chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls[0])
else:
print(chunk.choices[0].delta)
arguments = []
tool_call_idx = -1
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
if tool_call_idx >= 0:
print(
f"streamed tool call arguments: {arguments[tool_call_idx]}"
)
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
if tool_call.id:
print(f"streamed tool call id: {tool_call.id} ")
if tool_call.function:
if tool_call.function.name:
print(f"streamed tool call name: {tool_call.function.name}")
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
if len(arguments):
print(f"streamed tool call arguments: {arguments[-1]}")
print("\n\n")
messages.append({
"role": "assistant",
"tool_calls": chat_completion.choices[0].message.tool_calls
})
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: 'str'):
return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
"partly cloudly, with highs in the 90's.")
available_tools = {"get_current_weather": get_current_weather}
completion_tool_calls = chat_completion.choices[0].message.tool_calls
for call in completion_tool_calls:
tool_to_call = available_tools[call.function.name]
args = json.loads(call.function.arguments)
result = tool_to_call(**args)
print(result)
messages.append({
"role": "tool",
"content": result,
"tool_call_id": call.id,
"name": call.function.name
})
chat_completion_2 = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
stream=False)
print("\n\n")
print(chat_completion_2)
......@@ -19,7 +19,6 @@ responses = client.embeddings.create(
"The best thing about vLLM is that it supports many different models"
],
model=model,
encoding_format="float",
)
for data in responses.data:
......
"""An example showing how to use vLLM to serve VLMs.
Launch the vLLM server with the following command:
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
"""
import base64
import requests
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
# Use image url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What’s in this image?"
},
{
"type": "image_url",
"image_url": {
"url": image_url
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print(f"Chat completion output:{result}")
# Use base64 encoded image in the payload
def encode_image_base64_from_url(image_url: str) -> str:
"""Encode an image retrieved from a remote url to base64 format."""
with requests.get(image_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
return result
image_base64 = encode_image_base64_from_url(image_url=image_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What’s in this image?"
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print(f"Chat completion output:{result}")
......@@ -3,10 +3,10 @@
1. Install OpenTelemetry packages:
```
pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'
```
1. Start Jaeger in a docker container:
......
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- '<_user>' + message['content'] +'<_bot>' -}}
{%- elif message['role'] == 'assistant' -%}
{{- message['content'] + '<_end>' -}}
{%- endif -%}
{%- endfor -%}
{%- macro json_to_python_type(json_spec) %}
{%- set basic_type_map = {
"string": "str",
"number": "float",
"integer": "int",
"boolean": "bool"
} %}
{%- if basic_type_map[json_spec.type] is defined %}
{{- basic_type_map[json_spec.type] }}
{%- elif json_spec.type == "array" %}
{{- "list[" + json_to_python_type(json_spec|items) + "]" }}
{%- elif json_spec.type == "object" %}
{%- if json_spec.additionalProperties is defined %}
{{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }}
{%- else %}
{{- "dict" }}
{%- endif %}
{%- elif json_spec.type is iterable %}
{{- "Union[" }}
{%- for t in json_spec.type %}
{{- json_to_python_type({"type": t}) }}
{%- if not loop.last %}
{{- "," }}
{%- endif %}
{%- endfor %}
{{- "]" }}
{%- else %}
{{- "Any" }}
{%- endif %}
{%- endmacro %}
{{- bos_token }}
{{- "<|im_start|>system\nYou are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> " }}
{%- if tools is iterable and tools | length > 0 %}
{%- for tool in tools %}
{%- if tool.function is defined %}
{%- set tool = tool.function %}
{%- endif %}
{{- '{"type": "function", "function": ' }}
{{- '{"name": "' + tool.name + '", ' }}
{{- '"description": "' + tool.name + '(' }}
{%- for param_name, param_fields in tool.parameters.properties|items %}
{{- param_name + ": " + json_to_python_type(param_fields) }}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- ")" }}
{%- if tool.return is defined %}
{{- " -> " + json_to_python_type(tool.return) }}
{%- endif %}
{{- " - " + tool.description + "\n\n" }}
{%- for param_name, param_fields in tool.parameters.properties|items %}
{%- if loop.first %}
{{- " Args:\n" }}
{%- endif %}
{{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }}
{%- endfor %}
{%- if tool.return is defined and tool.return.description is defined %}
{{- "\n Returns:\n " + tool.return.description }}
{%- endif %}
{{- '"' }}
{{- ', "parameters": ' }}
{%- if tool.parameters.properties | length == 0 %}
{{- "{}" }}
{%- else %}
{{- tool.parameters|tojson }}
{%- endif %}
{{- "}" }}
{%- if not loop.last %}
{{- "\n" }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- " </tools>" }}
{{- 'Use the following pydantic model json schema for each tool call you will make: {"properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"], "title": "FunctionCall", "type": "object"}}
' }}
{{- "For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
" }}
{{- "<tool_call>
" }}
{{- '{"name": <function-name>, "arguments": <args-dict>}
' }}
{{- '</tool_call><|im_end|>' }}
{%- for message in messages %}
{%- if message.role == "user" or message.role == "system" or (message.role == "assistant" and message.tool_calls is not defined) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" and message.tool_calls is defined %}
{{- '<|im_start|>' + message.role }}
{%- for tool_call in message.tool_calls %}
{{- '\n<tool_call>\n' }}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '{' }}
{{- '"name": "' }}
{{- tool_call.name }}
{{- '"' }}
{%- if tool_call.arguments is defined %}
{{- ', ' }}
{{- '"arguments": ' }}
{{- tool_call.arguments|tojson }}
{%- endif %}
{{- '}' }}
{{- '\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.previtem and loop.previtem.role != "tool" %}
{{- '<|im_start|>tool\n' }}
{%- endif %}
{{- '<tool_response>\n' }}
{{- message.content }}
{%- if not loop.last %}
{{- '\n</tool_response>\n' }}
{%- else %}
{{- '\n</tool_response>' }}
{%- endif %}
{%- if not loop.last and loop.nextitem.role != "tool" %}
{{- '<|im_end|>' }}
{%- elif loop.last %}
{{- '<|im_end|>' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}
{%- if messages[0]["role"] == "system" %}
{%- set system_message = messages[0]["content"] %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set loop_messages = messages %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{{- bos_token }}
{%- if system_message is defined %}
{{- "<|im_start|>system\n" + system_message + "<|im_end|>\n" }}
{%- endif %}
{%- if tools is not none %}
{{- "<|im_start|>system name=<|plugin|>\n[" }}
{%- for tool in tools %}
{{- tool.function|tojson }}
{%- if not loop.last %}
{{- ", " }}
{%- else %}
{{- "]" }}
{%- endif %}
{%- endfor %}
{{- "<|im_end|>\n" }}
{%- endif %}
{%- for message in loop_messages %}
{%- if message["role"] == "user" %}
{{- "<|im_start|>user\n" + message["content"] + "<|im_end|>\n"}}
{%- elif message.tool_calls is defined and message.tool_calls is not none %}
{%- set content = message["content"] if message["content"] else "" %}
{{- "<|im_start|>assistant\n" + content }}
{%- for tool_call in message.tool_calls %}
{%- set function=tool_call.function %}
{{- "<|action_start|><|plugin|>\n" }}
{{- '{"name": "' + function.name + '", '}}
{{- '"arguments": ' + function.arguments|tojson + '}' }}
{{- "<|action_end|>" }}
{%- endfor %}
{{- "<|im_end|>\n" }}
{%- elif message["role"] == "assistant" %}
{{- "<|im_start|>assistant\n" + message["content"] + "<|im_end|>\n"}}
{%- elif message["role"] == "tool_results" or message["role"] == "tool" or message["role"] == "function" %}
{%- if message.content is defined and message.content.content is defined %}
{%- set content = message.content.content %}
{%- else %}
{%- set content = message.content %}
{%- endif %}
{{- "<|im_start|>environment name=<|plugin|>\n" + content|string + "<|im_end|>\n" }}
{%- else %}
{{- raise_exception("Only user and assistant and tool_results and tool and function roles are supported, with the exception of an initial optional system message!") }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}
\ No newline at end of file
{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
{#- Llama 3.1 doesn't pass all tests if the tools are in the system prompt #}
{%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
{%- if strftime_now is defined %}
{%- set date_string = strftime_now("%d %b %Y") %}
{%- else %}
{%- set date_string = "26 Jul 2024" %}
{%- endif %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %}
{%- endif %}
{#- System message #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if tools is not none %}
{{- "Environment: ipython\n" }}
{%- endif %}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- "Today Date: " + date_string + "\n\n" }}
{%- if tools is not none and not tools_in_user_message %}
{{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- system_message }}
{{- "<|eot_id|>" }}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- set first_user_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
{{- "Given the following functions, please respond with a JSON for a function call " }}
{{- "with its proper arguments that best answers the given prompt.\n\n" }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- first_user_message + "<|eot_id|>"}}
{%- endif %}
{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
{%- elif 'tool_calls' in message %}
{%- if not message.tool_calls|length == 1 %}
{{- raise_exception("This model only supports single tool-calls at once!") }}
{%- endif %}
{%- set tool_call = message.tool_calls[0].function %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{{- '{"name": "' + tool_call.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.arguments | tojson }}
{{- "}" }}
{{- "<|eot_id|>" }}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
{%- if message.content is mapping %}
{{- message.content | tojson }}
{%- else %}
{{- { "output": message.content } | tojson }}
{%- endif %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = false %}
{%- endif %}
{%- if not date_string is defined %}
{%- if strftime_now is defined %}
{%- set date_string = strftime_now("%d %b %Y") %}
{%- else %}
{%- set date_string = "26 Jul 2024" %}
{%- endif %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %}
{%- endif %}
{#- System message #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if tools is not none %}
{{- "Environment: ipython\n" }}
{%- endif %}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- "Today Date: " + date_string + "\n\n" }}
{%- if tools is not none and not tools_in_user_message %}
{{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- system_message }}
{{- "<|eot_id|>" }}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- set first_user_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
{{- "Given the following functions, please respond with a JSON for a function call " }}
{{- "with its proper arguments that best answers the given prompt.\n\n" }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- first_user_message + "<|eot_id|>"}}
{%- endif %}
{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
{%- elif 'tool_calls' in message %}
{%- if not message.tool_calls|length == 1 %}
{{- raise_exception("This model only supports single tool-calls at once!") }}
{%- endif %}
{%- set tool_call = message.tool_calls[0].function %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{{- '{"name": "' + tool_call.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.arguments | tojson }}
{{- "}" }}
{{- "<|eot_id|>" }}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
{%- if message.content is mapping %}
{{- message.content | tojson }}
{%- else %}
{{- { "output": message.content } | tojson }}
{%- endif %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
{%- if messages[0]["role"] == "system" %}
{%- set system_message = messages[0]["content"] %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set loop_messages = messages %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}
{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("role", "equalto", "tool_results") | selectattr("tool_calls", "undefined") %}
{%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %}
{{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}
{%- endif %}
{%- endfor %}
{{- bos_token }}
{%- for message in loop_messages %}
{%- if message["role"] == "user" %}
{%- if tools is not none and (message == user_messages[-1]) %}
{{- "[AVAILABLE_TOOLS] [" }}
{%- for tool in tools %}
{%- set tool = tool.function %}
{{- '{"type": "function", "function": {' }}
{%- for key, val in tool.items() if key != "return" %}
{%- if val is string %}
{{- '"' + key + '": "' + val + '"' }}
{%- else %}
{{- '"' + key + '": ' + val|tojson }}
{%- endif %}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- "}}" }}
{%- if not loop.last %}
{{- ", " }}
{%- else %}
{{- "]" }}
{%- endif %}
{%- endfor %}
{{- "[/AVAILABLE_TOOLS]" }}
{%- endif %}
{%- if loop.last and system_message is defined %}
{{- "[INST] " + system_message + "\n\n" + message["content"] + "[/INST]" }}
{%- else %}
{{- "[INST] " + message["content"] + "[/INST]" }}
{%- endif %}
{%- elif message["role"] == "tool_calls" or message.tool_calls is defined %}
{%- if message.tool_calls is defined %}
{%- set tool_calls = message.tool_calls %}
{%- else %}
{%- set tool_calls = message.content %}
{%- endif %}
{{- "[TOOL_CALLS] [" }}
{%- for tool_call in tool_calls %}
{%- set out = tool_call.function|tojson %}
{{- out[:-1] }}
{%- if not tool_call.id is defined or tool_call.id|length < 9 %}
{{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (1)" + tool_call.id) }}
{%- endif %}
{{- ', "id": "' + tool_call.id[-9:] + '"}' }}
{%- if not loop.last %}
{{- ", " }}
{%- else %}
{{- "]" + eos_token }}
{%- endif %}
{%- endfor %}
{%- elif message["role"] == "assistant" %}
{{- " " + message["content"] + eos_token }}
{%- elif message["role"] == "tool_results" or message["role"] == "tool" %}
{%- if message.content is defined and message.content.content is defined %}
{%- set content = message.content.content %}
{%- else %}
{%- set content = message.content %}
{%- endif %}
{{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }}
{%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %}
{{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (2)" + message.tool_call_id) }}
{%- endif %}
{{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }}
{%- else %}
{{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}
{%- endif %}
{%- endfor %}
{%- if messages[0]["role"] == "system" %}
{%- set system_message = messages[0]["content"] %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set loop_messages = messages %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- elif tools is not none %}
{%- set parallel_tool_prompt = "You are a helpful assistant that can call tools. If you call one or more tools, format them in a single JSON array or objects, where each object is a tool call, not as separate objects outside of an array or multiple arrays. Use the format [{\"name\": tool call name, \"arguments\": tool call arguments}, additional tool calls] if you call more than one tool. If you call tools, do not attempt to interpret them or otherwise provide a response until you receive a tool call result that you can interpret for the user." %}
{%- if system_message is defined %}
{%- set system_message = parallel_tool_prompt + "\n\n" + system_message %}
{%- else %}
{%- set system_message = parallel_tool_prompt %}
{%- endif %}
{%- endif %}
{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}
{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("role", "equalto", "tool_results") | selectattr("tool_calls", "undefined") %}
{%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %}
{{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}
{%- endif %}
{%- endfor %}
{{- bos_token }}
{%- for message in loop_messages %}
{%- if message["role"] == "user" %}
{%- if tools is not none and (message == user_messages[-1]) %}
{{- "[AVAILABLE_TOOLS] [" }}
{%- for tool in tools %}
{%- set tool = tool.function %}
{{- '{"type": "function", "function": {' }}
{%- for key, val in tool.items() if key != "return" %}
{%- if val is string %}
{{- '"' + key + '": "' + val + '"' }}
{%- else %}
{{- '"' + key + '": ' + val|tojson }}
{%- endif %}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- "}}" }}
{%- if not loop.last %}
{{- ", " }}
{%- else %}
{{- "]" }}
{%- endif %}
{%- endfor %}
{{- "[/AVAILABLE_TOOLS]" }}
{%- endif %}
{%- if loop.last and system_message is defined %}
{{- "[INST] " + system_message + "\n\n" + message["content"] + "[/INST]" }}
{%- else %}
{{- "[INST] " + message["content"] + "[/INST]" }}
{%- endif %}
{%- elif message["role"] == "tool_calls" or message.tool_calls is defined %}
{%- if message.tool_calls is defined %}
{%- set tool_calls = message.tool_calls %}
{%- else %}
{%- set tool_calls = message.content %}
{%- endif %}
{{- "[TOOL_CALLS] [" }}
{%- for tool_call in tool_calls %}
{%- set out = tool_call.function|tojson %}
{{- out[:-1] }}
{%- if not tool_call.id is defined or tool_call.id|length < 9 %}
{{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (1)" + tool_call.id) }}
{%- endif %}
{{- ', "id": "' + tool_call.id[-9:] + '"}' }}
{%- if not loop.last %}
{{- ", " }}
{%- else %}
{{- "]" + eos_token }}
{%- endif %}
{%- endfor %}
{%- elif message["role"] == "assistant" %}
{{- " " + message["content"] + eos_token }}
{%- elif message["role"] == "tool_results" or message["role"] == "tool" %}
{%- if message.content is defined and message.content.content is defined %}
{%- set content = message.content.content %}
{%- else %}
{%- set content = message.content %}
{%- endif %}
{{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }}
{%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %}
{{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (2)" + message.tool_call_id) }}
{%- endif %}
{{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }}
{%- else %}
{{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}
{%- endif %}
{%- endfor %}
import importlib
import traceback
from typing import Callable
from unittest.mock import patch
def find_cuda_init(fn: Callable[[], object]) -> None:
"""
Helper function to debug CUDA re-initialization errors.
If `fn` initializes CUDA, prints the stack trace of how this happens.
"""
from torch.cuda import _lazy_init
stack = None
def wrapper():
nonlocal stack
stack = traceback.extract_stack()
return _lazy_init()
with patch("torch.cuda._lazy_init", wrapper):
fn()
if stack is not None:
print("==== CUDA Initialized ====")
print("".join(traceback.format_list(stack)).strip())
print("==========================")
if __name__ == "__main__":
find_cuda_init(
lambda: importlib.import_module("vllm.model_executor.models.llava"))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment