Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
...@@ -4,13 +4,17 @@ This example shows how to use vLLM for running offline inference with ...@@ -4,13 +4,17 @@ This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation, multi-image input on vision language models for text generation,
using the chat template defined by the model. using the chat template defined by the model.
""" """
import os
from argparse import Namespace from argparse import Namespace
from typing import List, NamedTuple, Optional from dataclasses import asdict
from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download
from PIL.Image import Image from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest
from vllm.multimodal.utils import fetch_image from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
...@@ -22,11 +26,12 @@ IMAGE_URLS = [ ...@@ -22,11 +26,12 @@ IMAGE_URLS = [
class ModelRequestData(NamedTuple): class ModelRequestData(NamedTuple):
llm: LLM engine_args: EngineArgs
prompt: str prompt: str
stop_token_ids: Optional[List[int]] image_data: list[Image]
image_data: List[Image] stop_token_ids: Optional[list[int]] = None
chat_template: Optional[str] chat_template: Optional[str] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
...@@ -34,53 +39,91 @@ class ModelRequestData(NamedTuple): ...@@ -34,53 +39,91 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
def load_aria(question, image_urls: List[str]) -> ModelRequestData: def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "rhymes-ai/Aria" model_name = "rhymes-ai/Aria"
llm = LLM(model=model_name, engine_args = EngineArgs(
tokenizer_mode="slow", model=model_name,
trust_remote_code=True, tokenizer_mode="slow",
dtype="bfloat16", trust_remote_code=True,
limit_mm_per_prompt={"image": len(image_urls)}) dtype="bfloat16",
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls) placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
"<|im_start|>assistant\n") "<|im_start|>assistant\n")
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_deepseek_vl2(question: str, image_urls: List[str]): def load_deepseek_vl2(question: str,
image_urls: list[str]) -> ModelRequestData:
model_name = "deepseek-ai/deepseek-vl2-tiny" model_name = "deepseek-ai/deepseek-vl2-tiny"
llm = LLM(model=model_name, engine_args = EngineArgs(
max_model_len=4096, model=model_name,
max_num_seqs=2, max_model_len=4096,
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)}) hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholder = "".join(f"image_{i}:<image>\n" placeholder = "".join(f"image_{i}:<image>\n"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:" prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "google/gemma-3-4b-it"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-800m" model_name = "h2oai/h2ovl-mississippi-800m"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
...@@ -103,19 +146,18 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -103,19 +146,18 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
stop_token_ids = [tokenizer.eos_token_id] stop_token_ids = [tokenizer.eos_token_id]
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceM4/Idefics3-8B-Llama3" model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=16, max_num_seqs=16,
...@@ -134,18 +176,16 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: ...@@ -134,18 +176,16 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
...@@ -171,19 +211,18 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -171,19 +211,18 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_mllama(question, image_urls: List[str]) -> ModelRequestData: def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=16, max_num_seqs=16,
...@@ -193,19 +232,17 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ...@@ -193,19 +232,17 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
placeholders = "<|image|>" * len(image_urls) placeholders = "<|image|>" * len(image_urls)
prompt = f"{placeholders}<|begin_of_text|>{question}" prompt = f"{placeholders}<|begin_of_text|>{question}"
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_nvlm_d(question: str, image_urls: List[str]): def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "nvidia/NVLM-D-72B" model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
...@@ -223,22 +260,19 @@ def load_nvlm_d(question: str, image_urls: List[str]): ...@@ -223,22 +260,19 @@ def load_nvlm_d(question: str, image_urls: List[str]):
prompt = tokenizer.apply_chat_template(messages, prompt = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistral-community/pixtral-12b" model_name = "mistral-community/pixtral-12b"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
...@@ -248,18 +282,15 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -248,18 +282,15 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
placeholders = "[IMG]" * len(image_urls) placeholders = "[IMG]" * len(image_urls)
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]" prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
# num_crops is an override kwarg to the multimodal image processor; # num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame. # to use 16 for single frame scenarios, and 4 for multi-frame.
...@@ -272,7 +303,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -272,7 +303,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
# #
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM( engine_args = EngineArgs(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
...@@ -283,21 +314,50 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -283,21 +314,50 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
placeholders = "\n".join(f"<|image_{i}|>" placeholders = "\n".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None, )
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process multi-image inputs.
"""
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
max_model_len=10000,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
max_lora_rank=320,
)
placeholders = "".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
) )
def load_qwen_vl_chat(question: str, def load_qwen_vl_chat(question: str,
image_urls: List[str]) -> ModelRequestData: image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat" model_name = "Qwen/Qwen-VL-Chat"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
...@@ -328,7 +388,7 @@ def load_qwen_vl_chat(question: str, ...@@ -328,7 +388,7 @@ def load_qwen_vl_chat(question: str,
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
...@@ -336,7 +396,7 @@ def load_qwen_vl_chat(question: str, ...@@ -336,7 +396,7 @@ def load_qwen_vl_chat(question: str,
) )
def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
...@@ -348,7 +408,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ...@@ -348,7 +408,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40 # Tested on L40
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5, max_num_seqs=5,
...@@ -377,23 +437,19 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ...@@ -377,23 +437,19 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None: if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
image_data, _ = process_vision_info(messages) image_data, _ = process_vision_info(messages)
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data, image_data=image_data,
chat_template=None,
) )
def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
...@@ -404,7 +460,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: ...@@ -404,7 +460,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
model_name = "Qwen/Qwen2.5-VL-3B-Instruct" model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5, max_num_seqs=5,
...@@ -433,32 +489,30 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: ...@@ -433,32 +489,30 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None: if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
image_data, _ = process_vision_info(messages, image_data, _ = process_vision_info(messages,
return_video_sample_fps=False) return_video_kwargs=False)
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data, image_data=image_data,
chat_template=None,
) )
model_example_map = { model_example_map = {
"aria": load_aria, "aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2, "deepseek_vl_v2": load_deepseek_vl2,
"gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl, "h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3, "idefics3": load_idefics3,
"internvl_chat": load_internvl, "internvl_chat": load_internvl,
"mllama": load_mllama, "mllama": load_mllama,
"NVLM_D": load_nvlm_d, "NVLM_D": load_nvlm_d,
"phi3_v": load_phi3v, "phi3_v": load_phi3v,
"phi4_mm": load_phi4mm,
"pixtral_hf": load_pixtral_hf, "pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat, "qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl, "qwen2_vl": load_qwen2_vl,
...@@ -466,14 +520,25 @@ model_example_map = { ...@@ -466,14 +520,25 @@ model_example_map = {
} }
def run_generate(model, question: str, image_urls: List[str]): def run_generate(model, question: str, image_urls: list[str],
seed: Optional[int]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=128,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = req_data.llm.generate( outputs = llm.generate(
{ {
"prompt": req_data.prompt, "prompt": req_data.prompt,
"multi_modal_data": { "multi_modal_data": {
...@@ -487,13 +552,24 @@ def run_generate(model, question: str, image_urls: List[str]): ...@@ -487,13 +552,24 @@ def run_generate(model, question: str, image_urls: List[str]):
print(generated_text) print(generated_text)
def run_chat(model: str, question: str, image_urls: List[str]): def run_chat(model: str, question: str, image_urls: list[str],
seed: Optional[int]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=128,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = req_data.llm.chat( outputs = llm.chat(
[{ [{
"role": "role":
"user", "user",
...@@ -522,11 +598,12 @@ def run_chat(model: str, question: str, image_urls: List[str]): ...@@ -522,11 +598,12 @@ def run_chat(model: str, question: str, image_urls: List[str]):
def main(args: Namespace): def main(args: Namespace):
model = args.model_type model = args.model_type
method = args.method method = args.method
seed = args.seed
if method == "generate": if method == "generate":
run_generate(model, QUESTION, IMAGE_URLS) run_generate(model, QUESTION, IMAGE_URLS, seed)
elif method == "chat": elif method == "chat":
run_chat(model, QUESTION, IMAGE_URLS) run_chat(model, QUESTION, IMAGE_URLS, seed)
else: else:
raise ValueError(f"Invalid method: {method}") raise ValueError(f"Invalid method: {method}")
...@@ -547,6 +624,10 @@ if __name__ == "__main__": ...@@ -547,6 +624,10 @@ if __name__ == "__main__":
default="generate", default="generate",
choices=["generate", "chat"], choices=["generate", "chat"],
help="The method to run in `vllm.LLM`.") help="The method to run in `vllm.LLM`.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
# SPDX-License-Identifier: Apache-2.0
import time
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
# Create a Whisper encoder/decoder model instance.
llm = LLM(
    model="openai/whisper-large-v3",
    max_model_len=448,
    max_num_seqs=400,
    limit_mm_per_prompt={"audio": 1},
    kv_cache_dtype="fp8",
)

# Two prompt styles, repeated 1024 times each (2048 requests in total) so the
# run is long enough for the throughput figure printed at the end.
prompts = [
    {
        # Implicit form: one prompt dict that carries the audio directly.
        "prompt": "<|startoftranscript|>",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    },
    {  # Test explicit encoder/decoder prompt
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                "audio": AudioAsset("winning_call").audio_and_sample_rate,
            },
        },
        "decoder_prompt": "<|startoftranscript|>",
    },
] * 1024

# Create a sampling params object.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1.0,
    max_tokens=200,
)

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
#
# Only the generate() call is timed: stopping the clock after the print loop
# would fold the cost of writing every result to stdout into the reported
# duration and RPS. perf_counter() is used because it is a monotonic,
# high-resolution clock intended for measuring intervals.
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
duration = time.perf_counter() - start

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    encoder_prompt = output.encoder_prompt
    generated_text = output.outputs[0].text
    print(f"Encoder prompt: {encoder_prompt!r}, "
          f"Decoder prompt: {prompt!r}, "
          f"Generated text: {generated_text!r}")

print("Duration:", duration)
print("RPS:", len(prompts) / duration)
...@@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API. ...@@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.
import argparse import argparse
import json import json
from typing import Iterable, List from collections.abc import Iterable
import requests import requests
...@@ -39,17 +39,17 @@ def post_http_request(prompt: str, ...@@ -39,17 +39,17 @@ def post_http_request(prompt: str,
return response return response
def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
for chunk in response.iter_lines(chunk_size=8192, for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False, decode_unicode=False,
delimiter=b"\0"): delimiter=b"\n"):
if chunk: if chunk:
data = json.loads(chunk.decode("utf-8")) data = json.loads(chunk.decode("utf-8"))
output = data["text"] output = data["text"]
yield output yield output
def get_response(response: requests.Response) -> List[str]: def get_response(response: requests.Response) -> list[str]:
data = json.loads(response.content) data = json.loads(response.content)
output = data["text"] output = data["text"]
return output return output
......
...@@ -8,6 +8,9 @@ set -xe ...@@ -8,6 +8,9 @@ set -xe
echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧" echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1 sleep 1
# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
# Trap the SIGINT signal (triggered by Ctrl+C) # Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT trap 'cleanup' INT
...@@ -44,18 +47,20 @@ wait_for_server() { ...@@ -44,18 +47,20 @@ wait_for_server() {
# You can also adjust --kv-ip and --kv-port for distributed inference. # You can also adjust --kv-ip and --kv-port for distributed inference.
# prefilling instance, which is the KV producer # prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
--port 8100 \ --port 8100 \
--max-model-len 100 \ --max-model-len 100 \
--gpu-memory-utilization 0.8 \ --gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
# decoding instance, which is the KV consumer # decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
--port 8200 \ --port 8200 \
--max-model-len 100 \ --max-model-len 100 \
--gpu-memory-utilization 0.8 \ --gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
...@@ -78,7 +83,7 @@ sleep 1 ...@@ -78,7 +83,7 @@ sleep 1
output1=$(curl -X POST -s http://localhost:8000/v1/completions \ output1=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model": "'"$MODEL_NAME"'",
"prompt": "San Francisco is a", "prompt": "San Francisco is a",
"max_tokens": 10, "max_tokens": 10,
"temperature": 0 "temperature": 0
...@@ -87,7 +92,7 @@ output1=$(curl -X POST -s http://localhost:8000/v1/completions \ ...@@ -87,7 +92,7 @@ output1=$(curl -X POST -s http://localhost:8000/v1/completions \
output2=$(curl -X POST -s http://localhost:8000/v1/completions \ output2=$(curl -X POST -s http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model": "'"$MODEL_NAME"'",
"prompt": "Santa Clara is a", "prompt": "Santa Clara is a",
"max_tokens": 10, "max_tokens": 10,
"temperature": 0 "temperature": 0
......
...@@ -21,7 +21,7 @@ def http_bot(prompt): ...@@ -21,7 +21,7 @@ def http_bot(prompt):
for chunk in response.iter_lines(chunk_size=8192, for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False, decode_unicode=False,
delimiter=b"\0"): delimiter=b"\n"):
if chunk: if chunk:
data = json.loads(chunk.decode("utf-8")) data = json.loads(chunk.decode("utf-8"))
output = data["text"][0] output = data["text"][0]
......
#!/bin/bash
# Bootstrap helper for a multi-node Ray cluster.
#
# Usage:
#   <script> leader --ray_cluster_size=<n> [--ray_port=<p>] [--ray_init_timeout=<s>] [ray start args...]
#   <script> worker --ray_address=<host> [--ray_port=<p>] [--ray_init_timeout=<s>] [ray start args...]
#
# Any argument that is not one of the recognised --ray_* options is passed
# through unchanged to `ray start`.
subcommand=$1
shift

ray_port=6379          # default port of the Ray head node
ray_init_timeout=300   # seconds to keep retrying before giving up
declare -a start_params

case "$subcommand" in
  worker)
    ray_address=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_address=*)
          ray_address="${1#*=}"
          ;;
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          start_params+=("$1")
          ;;
      esac
      shift
    done

    if [ -z "$ray_address" ]; then
      echo "Error: Missing argument --ray_address"
      exit 1
    fi

    # Retry every 5 seconds until `ray start` succeeds or the timeout elapses.
    # The address is quoted so an unusual value cannot be word-split.
    for (( i=0; i < ray_init_timeout; i+=5 )); do
      if ray start --address="$ray_address:$ray_port" --block "${start_params[@]}"; then
        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
        exit 0
      fi
      echo "Waiting until the ray worker is active..."
      sleep 5s
    done
    echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
    exit 1
    ;;

  leader)
    ray_cluster_size=""
    while [ $# -gt 0 ]; do
      case "$1" in
        --ray_port=*)
          ray_port="${1#*=}"
          ;;
        --ray_cluster_size=*)
          ray_cluster_size="${1#*=}"
          ;;
        --ray_init_timeout=*)
          ray_init_timeout="${1#*=}"
          ;;
        *)
          start_params+=("$1")
          ;;
      esac
      shift
    done

    if [ -z "$ray_cluster_size" ]; then
      echo "Error: Missing argument --ray_cluster_size"
      exit 1
    fi

    # start the ray daemon
    ray start --head --port="$ray_port" "${start_params[@]}"

    # wait until all workers are active
    for (( i=0; i < ray_init_timeout; i+=5 )); do
      active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
      # If the probe printed nothing (e.g. python failed), count zero nodes
      # instead of letting `[` abort with "unary operator expected".
      active_nodes=${active_nodes:-0}
      if [ "$active_nodes" -eq "$ray_cluster_size" ]; then
        echo "All ray workers are active and the ray cluster is initialized successfully."
        exit 0
      fi
      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
      sleep 5s
    done

    echo "Waiting for all ray workers to be active timed out."
    exit 1
    ;;

  *)
    echo "unknown subcommand: $subcommand"
    exit 1
    ;;
esac
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
from enum import Enum from enum import Enum
from openai import OpenAI from openai import BadRequestError, OpenAI
from pydantic import BaseModel from pydantic import BaseModel
client = OpenAI( client = OpenAI(
...@@ -94,3 +94,26 @@ completion = client.chat.completions.create( ...@@ -94,3 +94,26 @@ completion = client.chat.completions.create(
extra_body={"guided_grammar": simplified_sql_grammar}, extra_body={"guided_grammar": simplified_sql_grammar},
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
# Extra backend options
# NOTE: each fragment ends with a space so the implicitly concatenated
# sentences do not run together.
prompt = ("Generate an email address for Alan Turing, who works in Enigma. "
          "End in .com and new line. Example result: "
          "alan.turing@enigma.com\n")

try:
    # The no-fallback option forces vLLM to use xgrammar, so when it fails
    # you get a 400 with the reason why
    completion = client.chat.completions.create(
        model="Qwen/Qwen2.5-3B-Instruct",
        messages=[{
            "role": "user",
            "content": prompt,
        }],
        extra_body={
            # Raw string: `\w` and `\.` are regex escapes, not Python string
            # escapes, and are invalid escape sequences in a normal literal.
            "guided_regex": r"\w+@\w+\.com\n",
            "stop": ["\n"],
            "guided_decoding_backend": "xgrammar:no-fallback"
        },
    )
except BadRequestError as e:
    print("This error is expected:", e)
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to generate structured outputs from reasoning models
like DeepSeekR1. The thinking process will not be guided by the JSON
schema provided by the user. Only the final output will be structured.
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
```
This example demonstrates how to generate chat completions from reasoning models
using the OpenAI Python client library.
"""
from enum import Enum
from openai import OpenAI
from pydantic import BaseModel
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

# Every request below targets the first model the server lists.
models = client.models.list()
model = models.data[0].id
# Guided decoding by Regex: the final answer is constrained to match the
# pattern passed as `guided_regex`.
prompt = "What is the capital of France?"

completion = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": prompt}],
    extra_body={"guided_regex": "(Paris|London)"},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
# Guided decoding by JSON: the schema is generated from this model's fields.
class People(BaseModel):
    name: str
    age: int


json_schema = People.model_json_schema()

prompt = "Generate a JSON with the name and age of one random person."
completion = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": prompt}],
    extra_body={"guided_json": json_schema},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
# Guided decoding by JSON using Pydantic schema
# The enum members below become the only values allowed for `car_type`.
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType


json_schema = CarDescription.model_json_schema()

# NOTE: the first fragment ends with a space so that the implicitly
# concatenated prompt reads "... car_type of the most iconic car ...".
prompt = ("Generate a JSON with the brand, model and car_type of "
          "the most iconic car from the 90's")
completion = client.chat.completions.create(
    model=model,
    messages=[{
        "role": "user",
        "content": prompt,
    }],
    extra_body={"guided_json": json_schema},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
# Guided decoding by Grammar
# The grammar below only admits `SELECT <columns> FROM <table>` statements.
simplified_sql_grammar = """
?start: select_statement
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
?table_name: identifier
?column_name: identifier
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""

# This may be very slow https://github.com/vllm-project/vllm/issues/12122
# NOTE: the first fragment ends with a space so that the implicitly
# concatenated prompt reads "... 'email' from the 'users' table."
prompt = ("Generate an SQL query to show the 'username' and 'email' "
          "from the 'users' table.")
completion = client.chat.completions.create(
    model=model,
    messages=[{
        "role": "user",
        "content": prompt,
    }],
    extra_body={"guided_grammar": simplified_sql_grammar},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
# SPDX-License-Identifier: Apache-2.0
"""
This example demonstrates how to use tool calling with reasoning models
like QwQ-32B. The reasoning_content will not be parsed by the tool
calling process; only the final output will be parsed.
To run this example, you need to start the vLLM server with both
the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B \
--enable-reasoning --reasoning-parser deepseek_r1 \
--enable-auto-tool-choice --tool-call-parser hermes
```
"""
from openai import OpenAI
# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: 'str'):
    """Simulated weather tool.

    The arguments are accepted to match the tool schema but are not used:
    the same canned Dallas forecast is returned for every call.
    """
    canned_forecast = (
        "The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
        "partly cloudly, with highs in the 90's.")
    return canned_forecast
# Maps each tool name to the local callable that implements it.
available_tools = {
    "get_current_weather": get_current_weather,
}

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

# Every request below targets the first model the server lists.
models = client.models.list()
model = models.data[0].id
# Tool schema sent with each request: one function, `get_current_weather`,
# taking three required string parameters.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": ("The city to find the weather for, "
                                        "e.g. 'San Francisco'"),
                    },
                    "state": {
                        "type": "string",
                        "description": ("the two-letter abbreviation for the "
                                        "state that the city is in, e.g. 'CA' "
                                        "which would mean 'California'"),
                    },
                    "unit": {
                        "type": "string",
                        "description": "The unit to fetch the temperature in",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["city", "state", "unit"],
            },
        },
    },
]
# Conversation sent with every request; the last user turn is the one that
# should lead to a weather tool call.
messages = [
    {
        "role": "user",
        "content": "Hi! How are you doing today?",
    },
    {
        "role": "assistant",
        "content": "I'm doing well! How can I help you?",
    },
    {
        "role": "user",
        "content": ("Can you tell me what the temperate will be in Dallas, "
                    "in fahrenheit?"),
    },
]
def extract_reasoning_and_calls(chunks: list):
    """Reassemble reasoning text and tool calls from streamed chunks.

    Only the first choice of each chunk and the first tool-call delta of
    each choice are inspected.

    Args:
        chunks: Streamed chat-completion chunks, in arrival order.

    Returns:
        A tuple ``(reasoning_content, arguments, function_names)``:
        the concatenated reasoning text, and two parallel lists holding the
        accumulated argument string and the function name of each tool call.
    """
    reasoning_content = ""
    tool_call_idx = -1
    arguments = []
    function_names = []
    for chunk in chunks:
        # A chunk without choices carries nothing to extract.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            tool_call = delta.tool_calls[0]
            # A new index marks the start of another tool call.
            if tool_call.index != tool_call_idx:
                tool_call_idx = tool_call.index
                arguments.append("")
                function_names.append("")
            if tool_call.function:
                if tool_call.function.name:
                    function_names[tool_call_idx] = tool_call.function.name
                # Arguments arrive in pieces and are concatenated.
                if tool_call.function.arguments:
                    arguments[tool_call_idx] += tool_call.function.arguments
        else:
            # The attribute may be missing or None; appending None to a
            # str would raise TypeError.
            reasoning = getattr(delta, "reasoning_content", None)
            if reasoning:
                reasoning_content += reasoning
    return reasoning_content, arguments, function_names
# Non-streaming request; no `tool_choice` is passed.
print("---------Full Generate With Automatic Function Calling-------------")
tool_calls = client.chat.completions.create(
    model=model,
    messages=messages,
    tools=tools,
)
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print("function name: "
      f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
print("function arguments: "
      f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
# Streaming variant: collect every chunk first, then rebuild the reasoning
# text and the tool call from the collected deltas.
print("----------Stream Generate With Automatic Function Calling-----------")
tool_calls_stream = client.chat.completions.create(
    model=model,
    messages=messages,
    tools=tools,
    stream=True,
)

chunks = list(tool_calls_stream)

reasoning_content, arguments, function_names = extract_reasoning_and_calls(
    chunks)

print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
# Non-streaming request with `tool_choice` naming `get_current_weather`.
print("----------Full Generate With Named Function Calling-----------------")
named_tool_choice = {
    "type": "function",
    "function": {
        "name": "get_current_weather"
    },
}
tool_calls = client.chat.completions.create(
    model=model,
    messages=messages,
    tools=tools,
    tool_choice=named_tool_choice,
)

tool_call = tool_calls.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
print(f"function name: {tool_call.name}")
print(f"function arguments: {tool_call.arguments}")
# Streaming request with `tool_choice` naming `get_current_weather`.
print("----------Stream Generate With Named Function Calling--------------")
tool_calls_stream = client.chat.completions.create(
    model=model,
    messages=messages,
    tools=tools,
    tool_choice={"type": "function",
                 "function": {"name": "get_current_weather"}},
    stream=True,
)

chunks = list(tool_calls_stream)

reasoning_content, arguments, function_names = extract_reasoning_and_calls(
    chunks)
print(f"reasoning_content: {reasoning_content}")
print(f"function name: {function_names[0]}")
print(f"function arguments: {arguments[0]}")
print("\n\n")
...@@ -19,73 +19,50 @@ in real-time as they are generated by the model. This is useful for scenarios ...@@ -19,73 +19,50 @@ in real-time as they are generated by the model. This is useful for scenarios
where you want to display chat completions to the user as they are generated where you want to display chat completions to the user as they are generated
by the model. by the model.
Here we do not use the OpenAI Python client library, because it does not support Remember to check content and reasoning_content exist in `ChatCompletionChunk`,
`reasoning_content` fields in the response. content may not exist leading to errors if you try to access it.
""" """
import json from openai import OpenAI
import requests
# Modify OpenAI's API key and API base to use vLLM's API server. # Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
models = requests.get( client = OpenAI(
f"{openai_api_base}/models", api_key=openai_api_key,
headers={ base_url=openai_api_base,
"Authorization": f"Bearer {openai_api_key}" )
},
).json()
model = models["data"][0]["id"]
# Streaming chat completions models = client.models.list()
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] model = models.data[0].id
response = requests.post( messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
f"{openai_api_base}/chat/completions", stream = client.chat.completions.create(model=model,
headers={"Authorization": f"Bearer {openai_api_key}"}, messages=messages,
json={ stream=True)
"model": model,
"messages": messages,
"stream": True
},
)
print("client: Start streaming chat completions...") print("client: Start streaming chat completions...")
printed_reasoning_content = False printed_reasoning_content = False
printed_content = False printed_content = False
# Make the streaming request
if response.status_code == 200:
# Process the streaming response
for line in response.iter_lines():
if line: # Filter out keep-alive new lines
# Decode the line and parse the JSON
decoded_line = line.decode("utf-8")
if decoded_line.startswith("data:"):
data = decoded_line[5:].strip() # Remove "data:" prefix
if data == "[DONE]": # End of stream
print("\nclient: Stream completed.")
break
try:
# Parse the JSON data
chunk = json.loads(data)
reasoning_content = chunk["choices"][0]["delta"].get(
"reasoning_content", "")
content = chunk["choices"][0]["delta"].get("content", "")
if reasoning_content: for chunk in stream:
if not printed_reasoning_content: reasoning_content = None
printed_reasoning_content = True content = None
print("reasoning_content:", end="", flush=True) # Check the content is reasoning_content or content
print(reasoning_content, end="", flush=True) if hasattr(chunk.choices[0].delta, "reasoning_content"):
elif content: reasoning_content = chunk.choices[0].delta.reasoning_content
if not printed_content: elif hasattr(chunk.choices[0].delta, "content"):
printed_content = True content = chunk.choices[0].delta.content
print("\ncontent:", end="", flush=True)
# Extract and print the content if reasoning_content is not None:
print(content, end="", flush=True) if not printed_reasoning_content:
except json.JSONDecodeError: printed_reasoning_content = True
print("Error decoding JSON:", decoded_line) print("reasoning_content:", end="", flush=True)
else: print(reasoning_content, end="", flush=True)
print(f"Error: {response.status_code} - {response.text}") elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
...@@ -102,7 +102,7 @@ if __name__ == '__main__': ...@@ -102,7 +102,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
"Script to call a specified VLM through the API. Make sure to serve " "Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this.") "the model with --task embed before running this.")
parser.add_argument("model", parser.add_argument("--model",
type=str, type=str,
choices=["vlm2vec", "dse_qwen2_vl"], choices=["vlm2vec", "dse_qwen2_vl"],
required=True, required=True,
......
...@@ -24,4 +24,4 @@ responses = client.embeddings.create( ...@@ -24,4 +24,4 @@ responses = client.embeddings.create(
) )
for data in responses.data: for data in responses.data:
print(data.embedding) # list of float of len 4096 print(data.embedding) # List of float of len 4096
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import asyncio
import json
import httpx
from openai import OpenAI from openai import OpenAI
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
...@@ -13,11 +17,50 @@ client = OpenAI( ...@@ -13,11 +17,50 @@ client = OpenAI(
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
with open(str(mary_had_lamb), "rb") as f:
transcription = client.audio.transcriptions.create(
file=f, def sync_openai():
model="openai/whisper-large-v3", with open(str(mary_had_lamb), "rb") as f:
language="en", transcription = client.audio.transcriptions.create(
response_format="text", file=f,
temperature=0.0) model="openai/whisper-small",
print("transcription result:", transcription) language="en",
response_format="json",
temperature=0.0)
print("transcription result:", transcription.text)
sync_openai()
# OpenAI Transcription API client does not support streaming.
async def stream_openai_response():
    """Stream a transcription over raw HTTP and print it as it arrives.

    Posts the audio file to the `/audio/transcriptions` endpoint with
    `stream` enabled, then reads the response line by line. Lines may be
    prefixed with `data: `, and a `[DONE]` payload ends the stream. Chunks
    whose delta has no content are skipped rather than printed as "None".
    """
    data = {
        "language": "en",
        'stream': True,
        "model": "openai/whisper-large-v3",
    }
    url = openai_api_base + "/audio/transcriptions"
    print("transcription result:", end=' ')
    # Named `http_client` so it does not shadow the module-level OpenAI
    # `client`.
    async with httpx.AsyncClient() as http_client:
        with open(str(winning_call), "rb") as f:
            async with http_client.stream('POST',
                                          url,
                                          files={'file': f},
                                          data=data) as response:
                async for line in response.aiter_lines():
                    # Skip blank separator lines.
                    if not line:
                        continue
                    # Each line is a JSON object prefixed with 'data: '
                    if line.startswith('data: '):
                        line = line[len('data: '):]
                    # Last chunk, stream ends
                    if line.strip() == '[DONE]':
                        break
                    # Parse the JSON response
                    chunk = json.loads(line)
                    # Extract the content; it is None when the delta
                    # carries no text.
                    content = chunk['choices'][0].get('delta',
                                                      {}).get('content')
                    if content:
                        print(content, end='')
# Run the asynchronous function
asyncio.run(stream_openai_response())
...@@ -28,7 +28,6 @@ with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span: ...@@ -28,7 +28,6 @@ with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
"model": "facebook/opt-125m", "model": "facebook/opt-125m",
"prompt": prompt, "prompt": prompt,
"max_tokens": 10, "max_tokens": 10,
"best_of": 20,
"n": 3, "n": 3,
"use_beam_search": "true", "use_beam_search": "true",
"temperature": 0.0, "temperature": 0.0,
......
...@@ -1260,7 +1260,7 @@ ...@@ -1260,7 +1260,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "edx8memhpd9tsa" "uid": "${DS_PROMETHEUS}"
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
...@@ -1360,7 +1360,7 @@ ...@@ -1360,7 +1360,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "edx8memhpd9tsa" "uid": "${DS_PROMETHEUS}"
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
...@@ -1473,7 +1473,7 @@ ...@@ -1473,7 +1473,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "edx8memhpd9tsa" "uid": "${DS_PROMETHEUS}"
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
...@@ -1523,7 +1523,7 @@ ...@@ -1523,7 +1523,7 @@
}, },
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "edx8memhpd9tsa" "uid": "${DS_PROMETHEUS}"
}, },
"definition": "label_values(model_name)", "definition": "label_values(model_name)",
"hide": 0, "hide": 0,
......
...@@ -49,7 +49,8 @@ disabled, an error will occur while starting vLLM. ...@@ -49,7 +49,8 @@ disabled, an error will occur while starting vLLM.
### Example 1: Customize vLLM root logger ### Example 1: Customize vLLM root logger
For this example, we will customize the vLLM root logger to use For this example, we will customize the vLLM root logger to use
[`python-json-logger`](https://github.com/madzak/python-json-logger) to log to [`python-json-logger`](https://github.com/nhairs/python-json-logger)
(which is part of the container image) to log to
STDOUT of the console in JSON format with a log level of `INFO`. STDOUT of the console in JSON format with a log level of `INFO`.
To begin, first, create an appropriate JSON logging configuration file: To begin, first, create an appropriate JSON logging configuration file:
...@@ -82,12 +83,6 @@ To begin, first, create an appropriate JSON logging configuration file: ...@@ -82,12 +83,6 @@ To begin, first, create an appropriate JSON logging configuration file:
} }
``` ```
Next, install the `python-json-logger` package if it's not already installed:
```bash
pip install python-json-logger
```
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file: to the path of the custom logging configuration JSON file:
...@@ -132,7 +127,7 @@ configuration for the root vLLM logger and for the logger you wish to silence: ...@@ -132,7 +127,7 @@ configuration for the root vLLM logger and for the logger you wish to silence:
"vllm": { "vllm": {
"handlers": ["vllm"], "handlers": ["vllm"],
"level": "DEBUG", "level": "DEBUG",
"propagage": false "propagate": false
}, },
"vllm.example_noisy_logger": { "vllm.example_noisy_logger": {
"propagate": false "propagate": false
......
...@@ -27,7 +27,7 @@ https://github.com/coreweave/tensorizer ...@@ -27,7 +27,7 @@ https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something To serialize a model, install vLLM from source, then run something
like this from the root level of this repository: like this from the root level of this repository:
python -m examples.offline_inference.tensorize_vllm_model \ python -m examples.other.tensorize_vllm_model \
--model facebook/opt-125m \ --model facebook/opt-125m \
serialize \ serialize \
--serialized-directory s3://my-bucket \ --serialized-directory s3://my-bucket \
...@@ -47,7 +47,7 @@ providing a `--keyfile` argument. ...@@ -47,7 +47,7 @@ providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root To deserialize a model, you can run something like this from the root
level of this repository: level of this repository:
python -m examples.offline_inference.tensorize_vllm_model \ python -m examples.other.tensorize_vllm_model \
--model EleutherAI/gpt-j-6B \ --model EleutherAI/gpt-j-6B \
--dtype float16 \ --dtype float16 \
deserialize \ deserialize \
...@@ -65,11 +65,11 @@ shard's rank. Sharded models serialized with this script will be named as ...@@ -65,11 +65,11 @@ shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors model-rank-%03d.tensors
For more information on the available arguments for serializing, run For more information on the available arguments for serializing, run
`python -m examples.offline_inference.tensorize_vllm_model serialize --help`. `python -m examples.other.tensorize_vllm_model serialize --help`.
Or for deserializing: Or for deserializing:
`python -m examples.offline_inference.tensorize_vllm_model deserialize --help`. `python -m examples.other.tensorize_vllm_model deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models: directly to load models:
...@@ -90,7 +90,7 @@ TensorizerConfig arguments desired. ...@@ -90,7 +90,7 @@ TensorizerConfig arguments desired.
In order to see all of the available arguments usable to configure In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run: loading with tensorizer that are given to `TensorizerConfig`, run:
`python -m examples.offline_inference.tensorize_vllm_model deserialize --help` `python -m examples.other.tensorize_vllm_model deserialize --help`
under the `tensorizer options` section. These can also be used for under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and deserialization in this example script, although `--tensorizer-uri` and
......
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
{%- endif -%} {%- endif -%}
{%- if message['role'] == 'user' -%} {%- if message['role'] == 'user' -%}
{{ '<|User|>: ' + message['content'] + '\n' }} {{ '<|User|>: ' + message['content'] + '\n\n' }}
{%- elif message['role'] == 'assistant' -%} {%- elif message['role'] == 'assistant' -%}
{{ '<|Assistant|>: ' + message['content'] + eos_token + '\n' }} {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }}
{%- endif -%} {%- endif -%}
{%- endfor -%} {%- endfor -%}
{%- if add_generation_prompt -%} {%- if add_generation_prompt -%}
{{ '<|Assistant|>: ' }} {{ '<|Assistant|>: ' }}
{% endif %} {%- endif -%}
{#- Chat template: each message is emitted as a role tag immediately followed
by its content. 'user' -> '<_user>' and 'system' -> '<_system>' (content
trimmed); 'assistant' -> '<_bot>' (content emitted as-is). Messages with any
other role emit nothing. -#}
{%- for message in messages %}
{%- if message['role'] == 'user' %}
{{- '<_user>' + message['content']|trim }}
{%- elif message['role'] == 'system' %}
{{- '<_system>' + message['content']|trim }}
{%- elif message['role'] == 'assistant' %}
{{- '<_bot>' + message['content'] }}
{%- endif %}
{%- endfor %}
{#- When a generation prompt is requested, end with an open '<_bot>' tag. -#}
{%- if add_generation_prompt %}
{{- '<_bot>' }}
{%- endif %}
#!/bin/bash #!/bin/bash
echo "vLLM linting system has been moved from format.sh to pre-commit hook." echo "vLLM linting system has been moved from format.sh to pre-commit hook."
echo "Please run 'pip install -r requirements-lint.txt', followed by" echo "Please run 'pip install -r requirements/lint.txt', followed by"
echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook." echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook."
echo "Then linters will run automatically before each commit." echo "Then linters will run automatically before each commit."
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment