"lib/bindings/vscode:/vscode.git/clone" did not exist on "22fbc022e12bbc043f4fe6f814d13dedd3e9df14"
Commit 7a985548 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
...@@ -14,7 +14,7 @@ import tqdm ...@@ -14,7 +14,7 @@ import tqdm
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.profiler import layerwise_profile from vllm.profiler.layerwise_profile import layerwise_profile
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
BATCH_SIZE_DEFAULT = 1 BATCH_SIZE_DEFAULT = 1
...@@ -193,7 +193,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], ...@@ -193,7 +193,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
batch_size = context.batch_size batch_size = context.batch_size
prompt_len = context.prompt_len prompt_len = context.prompt_len
scheduler_config = llm.llm_engine.scheduler_config scheduler_config = llm.llm_engine.vllm_config.scheduler_config
max_model_len = llm.llm_engine.model_config.max_model_len max_model_len = llm.llm_engine.model_config.max_model_len
max_num_batched_tokens = scheduler_config.max_num_batched_tokens max_num_batched_tokens = scheduler_config.max_num_batched_tokens
max_num_seqs = scheduler_config.max_num_seqs max_num_seqs = scheduler_config.max_num_seqs
......
...@@ -47,8 +47,7 @@ def get_mixed_modalities_query() -> QueryResult: ...@@ -47,8 +47,7 @@ def get_mixed_modalities_query() -> QueryResult:
"image": "image":
ImageAsset("cherry_blossom").pil_image.convert("RGB"), ImageAsset("cherry_blossom").pil_image.convert("RGB"),
"video": "video":
VideoAsset(name="sample_demo_1.mp4", VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
num_frames=16).np_ndarrays,
}, },
}, },
limit_mm_per_prompt={ limit_mm_per_prompt={
...@@ -66,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult: ...@@ -66,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult:
"<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>" "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n") f"<|im_start|>assistant\n")
asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16) asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000) audio = asset.get_audio(sampling_rate=16000)
assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. " assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
"Please launch this example with " "Please launch this example with "
...@@ -141,7 +140,7 @@ def main(args): ...@@ -141,7 +140,7 @@ def main(args):
print(generated_text) print(generated_text)
if __name__ == "__main__": def parse_args():
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with ' description='Demo on using vLLM for offline inference with '
'audio language models') 'audio language models')
...@@ -156,5 +155,9 @@ if __name__ == "__main__": ...@@ -156,5 +155,9 @@ if __name__ == "__main__":
default=None, default=None,
help="Set the seed when initializing `vllm.LLM`.") help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
main(args) main(args)
# SPDX-License-Identifier: Apache-2.0
import os
from urllib.request import urlopen
from vllm import LLM, SamplingParams
# Select the "DUAL_CHUNK_FLASH_ATTN" attention backend through vLLM's
# environment variable.
os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
# NOTE(review): presumably needed because this example requests a
# max_model_len of 1048576 further down - confirm against vLLM's env docs.
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
def load_prompt() -> str:
    """Download the ``600k.txt`` test case and return it as a string.

    The response body is read in full and decoded as UTF-8. The request
    uses a 5 second timeout.

    Returns:
        The decoded text of the downloaded test file.
    """
    # Test cases with various lengths can be found at:
    #
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt
    url = ("https://qianwen-res.oss-cn-beijing.aliyuncs.com"
           "/Qwen2.5-1M/test-data/600k.txt")
    with urlopen(url, timeout=5) as response:
        raw_bytes = response.read()
    return raw_bytes.decode('utf-8')
# Processing the prompt.
def process_requests(llm: LLM, prompts: list[str]) -> None:
    """Generate a completion for every prompt and print a summary line each.

    Args:
        llm: The engine used to run generation.
        prompts: The prompt strings to generate from.
    """
    # Sampling configuration shared by all prompts.
    params = SamplingParams(temperature=0.7,
                            top_p=0.8,
                            top_k=20,
                            repetition_penalty=1.05,
                            detokenize=True,
                            max_tokens=256)

    # Run generation, then report the prompt length (in token ids) and the
    # text of the first returned completion for each result.
    for result in llm.generate(prompts, params):
        token_ids = result.prompt_token_ids
        completion = result.outputs[0].text
        print(f"Prompt length: {len(token_ids)}, "
              f"Generated text: {completion!r}")
# Create an LLM.
def initialize_engine() -> LLM:
    """Build and return the ``LLM`` instance used by this example.

    The engine is configured for the ``Qwen/Qwen2.5-7B-Instruct-1M`` model
    with a 1048576-token maximum model length, tensor parallelism of 4,
    eager execution, and chunked prefill with up to 131072 batched tokens.
    """
    engine_options = {
        "model": "Qwen/Qwen2.5-7B-Instruct-1M",
        "max_model_len": 1048576,
        "tensor_parallel_size": 4,
        "enforce_eager": True,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 131072,
    }
    return LLM(**engine_options)
def main():
    """Create the engine, fetch the prompt, and run generation on it."""
    engine = initialize_engine()
    # The engine is created before the prompt is downloaded.
    process_requests(engine, [load_prompt()])


if __name__ == '__main__':
    main()
...@@ -8,6 +8,8 @@ the argument 2 should match the `tensor_parallel_size` below. ...@@ -8,6 +8,8 @@ the argument 2 should match the `tensor_parallel_size` below.
see `tests/distributed/test_torchrun_example.py` for the unit test. see `tests/distributed/test_torchrun_example.py` for the unit test.
""" """
import torch.distributed as dist
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
# Create prompts, the same across all ranks # Create prompts, the same across all ranks
...@@ -27,23 +29,26 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ...@@ -27,23 +29,26 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# all ranks have the same random seed, so that sampling can be # all ranks have the same random seed, so that sampling can be
# deterministic across ranks. # deterministic across ranks.
llm = LLM( llm = LLM(
model="facebook/opt-125m", model="meta-llama/Llama-3.1-8B",
tensor_parallel_size=2, tensor_parallel_size=2,
pipeline_parallel_size=2,
distributed_executor_backend="external_launcher", distributed_executor_backend="external_launcher",
seed=0, max_model_len=32768,
seed=1,
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
# all ranks will have the same outputs # all ranks will have the same outputs
print("-" * 50) if dist.get_rank() == 0:
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50) print("-" * 50)
""" for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}\n")
print("-" * 50)
"""
Further tips: Further tips:
1. to communicate control messages across all ranks, use the cpu group, 1. to communicate control messages across all ranks, use the cpu group,
......
...@@ -22,7 +22,8 @@ def main(): ...@@ -22,7 +22,8 @@ def main():
# In real workloads, `enforce_eager` should be `False`. # In real workloads, `enforce_eager` should be `False`.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=64, max_num_batched_tokens=64,
max_num_seqs=4) max_num_seqs=4,
max_model_len=128)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
print("-" * 50) print("-" * 50)
for output, answer in zip(outputs, answers): for output, answer in zip(outputs, answers):
......
...@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: ...@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="bfloat16", dtype="bfloat16",
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}" prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
...@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData: ...@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
mm_processor_kwargs={"crop_to_patches": True}, mm_processor_kwargs={"crop_to_patches": True},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [ prompts = [
f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
...@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
prompts = [f"Question: {question} Answer:" for question in questions] prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs( engine_args = EngineArgs(
model="Salesforce/blip2-opt-6.7b", model="Salesforce/blip2-opt-6.7b",
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: ...@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
model="facebook/chameleon-7b", model="facebook/chameleon-7b",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -130,7 +130,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -130,7 +130,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [ prompts = [
...@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2, max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions] prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
...@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: ...@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
model="adept/fuyu-8b", model="adept/fuyu-8b",
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
mm_processor_kwargs={"do_pan_and_scan": True}, mm_processor_kwargs={"do_pan_and_scan": True},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [("<bos><start_of_turn>user\n" prompts = [("<bos><start_of_turn>user\n"
...@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True, trust_remote_code=True,
enforce_eager=True, enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}, hf_overrides={"architectures": ["GLM4VForCausalLM"]},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [ prompts = [
...@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
...@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge": 3 * 364 "longest_edge": 3 * 364
}, },
}, },
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [( prompts = [(
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:" f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
...@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: ...@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge": 384 "longest_edge": 384
}, },
}, },
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [ prompts = [
(f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:") (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
...@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
...@@ -378,7 +378,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -378,7 +378,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
model="moonshotai/Kimi-VL-A3B-Instruct", model="moonshotai/Kimi-VL-A3B-Instruct",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -398,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData: ...@@ -398,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model="llava-hf/llava-1.5-7b-hf", model="llava-hf/llava-1.5-7b-hf",
max_model_len=4096, max_model_len=4096,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -415,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: ...@@ -415,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model="llava-hf/llava-v1.6-mistral-7b-hf", model="llava-hf/llava-v1.6-mistral-7b-hf",
max_model_len=8192, max_model_len=8192,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -437,7 +437,7 @@ def run_llava_next_video(questions: list[str], ...@@ -437,7 +437,7 @@ def run_llava_next_video(questions: list[str],
model="llava-hf/LLaVA-NeXT-Video-7B-hf", model="llava-hf/LLaVA-NeXT-Video-7B-hf",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -465,7 +465,7 @@ def run_llava_onevision(questions: list[str], ...@@ -465,7 +465,7 @@ def run_llava_onevision(questions: list[str],
engine_args = EngineArgs( engine_args = EngineArgs(
model="llava-hf/llava-onevision-qwen2-7b-ov-hf", model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=16384, max_model_len=16384,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -488,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData: ...@@ -488,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
model="TIGER-Lab/Mantis-8B-siglip-llama3", model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096, max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
stop_token_ids = [128009] stop_token_ids = [128009]
...@@ -529,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): ...@@ -529,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
# NOTE The stop_token_ids are different for various versions of MiniCPM-V # NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0 # 2.0
...@@ -584,7 +584,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -584,7 +584,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
tensor_parallel_size=2, tensor_parallel_size=2,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions] prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
...@@ -610,7 +610,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ...@@ -610,7 +610,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
...@@ -645,7 +645,7 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData: ...@@ -645,7 +645,7 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=4, max_num_seqs=4,
tensor_parallel_size=8, tensor_parallel_size=8,
gpu_memory_utilization=0.4, gpu_memory_utilization=0.4,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
...@@ -680,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: ...@@ -680,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [ prompts = [
...@@ -706,7 +706,38 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: ...@@ -706,7 +706,38 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
tensor_parallel_size=4, tensor_parallel_size=4,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
messages = [[{
'role': 'user',
'content': f"<image>\n{question}"
}] for question in questions]
prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Ovis
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "AIDC-AI/Ovis2-1B"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
dtype="half",
limit_mm_per_prompt={modality: 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
...@@ -733,7 +764,7 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData: ...@@ -733,7 +764,7 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions] prompts = ["caption en" for _ in questions]
engine_args = EngineArgs( engine_args = EngineArgs(
model="google/paligemma-3b-mix-224", model="google/paligemma-3b-mix-224",
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -750,7 +781,7 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -750,7 +781,7 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions] prompts = ["caption en" for _ in questions]
engine_args = EngineArgs( engine_args = EngineArgs(
model="google/paligemma2-3b-ft-docci-448", model="google/paligemma2-3b-ft-docci-448",
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -787,7 +818,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -787,7 +818,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2, max_num_seqs=2,
# Note - mm_processor_kwargs can also be passed to generate/chat calls # Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"num_crops": 16}, mm_processor_kwargs={"num_crops": 16},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -821,7 +852,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: ...@@ -821,7 +852,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
max_lora_rank=320, max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls # Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 16}, mm_processor_kwargs={"dynamic_hd": 16},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -842,7 +873,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: ...@@ -842,7 +873,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
max_model_len=6144, max_model_len=6144,
max_num_seqs=2, max_num_seqs=2,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions] prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
...@@ -863,7 +894,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -863,7 +894,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=1024, max_model_len=1024,
max_num_seqs=2, max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions] prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
...@@ -888,7 +919,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -888,7 +919,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
"min_pixels": 28 * 28, "min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
}, },
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
if modality == "image": if modality == "image":
...@@ -923,7 +954,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -923,7 +954,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
"fps": 1, "fps": 1,
}, },
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
if modality == "image": if modality == "image":
...@@ -957,7 +988,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ...@@ -957,7 +988,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
"fps": [1], "fps": [1],
}, },
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
if modality == "image": if modality == "image":
...@@ -990,7 +1021,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -990,7 +1021,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={modality: 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
...@@ -1041,6 +1072,7 @@ model_example_map = { ...@@ -1041,6 +1072,7 @@ model_example_map = {
"llama4": run_llama4, "llama4": run_llama4,
"molmo": run_molmo, "molmo": run_molmo,
"NVLM_D": run_nvlm_d, "NVLM_D": run_nvlm_d,
"ovis": run_ovis,
"paligemma": run_paligemma, "paligemma": run_paligemma,
"paligemma2": run_paligemma2, "paligemma2": run_paligemma2,
"phi3_v": run_phi3v, "phi3_v": run_phi3v,
...@@ -1080,7 +1112,7 @@ def get_multi_modal_input(args): ...@@ -1080,7 +1112,7 @@ def get_multi_modal_input(args):
if args.modality == "video": if args.modality == "video":
# Input video and question # Input video and question
video = VideoAsset(name="sample_demo_1.mp4", video = VideoAsset(name="baby_reading",
num_frames=args.num_frames).np_ndarrays num_frames=args.num_frames).np_ndarrays
vid_questions = ["Why is this video funny?"] vid_questions = ["Why is this video funny?"]
......
...@@ -436,6 +436,36 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -436,6 +436,36 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
) )
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "AIDC-AI/Ovis2-1B"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
trust_remote_code=True,
dtype="half",
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "\n".join(f"Image-{i}: <image>\n"
for i, _ in enumerate(image_urls, start=1))
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData: def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistral-community/pixtral-12b" model_name = "mistral-community/pixtral-12b"
...@@ -685,6 +715,7 @@ model_example_map = { ...@@ -685,6 +715,7 @@ model_example_map = {
"mistral3": load_mistral3, "mistral3": load_mistral3,
"mllama": load_mllama, "mllama": load_mllama,
"NVLM_D": load_nvlm_d, "NVLM_D": load_nvlm_d,
"ovis": load_ovis,
"phi3_v": load_phi3v, "phi3_v": load_phi3v,
"phi4_mm": load_phi4mm, "phi4_mm": load_phi4mm,
"pixtral_hf": load_pixtral_hf, "pixtral_hf": load_pixtral_hf,
......
...@@ -8,7 +8,7 @@ image: ...@@ -8,7 +8,7 @@ image:
# -- Image tag # -- Image tag
tag: "latest" tag: "latest"
# -- Container launch command # -- Container launch command
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"] command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
# -- Container port # -- Container port
containerPort: 8000 containerPort: 8000
......
# Disaggregated Serving
This example contains scripts that demonstrate the disaggregated serving features of vLLM.
## Files
- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances).
- `kv_events.sh` - Demonstrates KV cache event publishing.
...@@ -4,7 +4,7 @@ This file provides a disaggregated prefilling proxy demo to demonstrate an ...@@ -4,7 +4,7 @@ This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling. example usage of XpYd disaggregated prefilling.
We can launch multiple vllm instances (2 for prefill and 2 for decode), and We can launch multiple vllm instances (2 for prefill and 2 for decode), and
launch this proxy demo through: launch this proxy demo through:
python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py \ python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \
--model $model_name \ --model $model_name \
--prefill localhost:8100 localhost:8101 \ --prefill localhost:8100 localhost:8101 \
--decode localhost:8200 localhost:8201 \ --decode localhost:8200 localhost:8201 \
...@@ -414,7 +414,7 @@ class ProxyServer: ...@@ -414,7 +414,7 @@ class ProxyServer:
server.run() server.run()
if __name__ == "__main__": def parse_args():
# Todo: allow more config # Todo: allow more config
parser = argparse.ArgumentParser("vLLM disaggregated proxy server.") parser = argparse.ArgumentParser("vLLM disaggregated proxy server.")
parser.add_argument("--model", parser.add_argument("--model",
...@@ -445,6 +445,10 @@ if __name__ == "__main__": ...@@ -445,6 +445,10 @@ if __name__ == "__main__":
default=8000, default=8000,
help="Server port number", help="Server port number",
) )
args = parser.parse_args() return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
proxy_server = ProxyServer(args=args) proxy_server = ProxyServer(args=args)
proxy_server.run_server() proxy_server.run_server()
#!/bin/bash
# This file demonstrates the KV cache event publishing
# We will launch a vLLM instance configured to publish KV cache
# events and launch a simple subscriber to log those events.
set -xe
echo "🚧🚧 Warning: The usage of KV cache events is experimental and subject to change 🚧🚧"
sleep 1
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT
# Cleanup function
cleanup() {
    # SIGINT handler: kill the Python processes and exit with status 0.
    echo "Caught Ctrl+C, cleaning up..."
    # Cleanup commands
    # `pkill -9 python` signals the same processes that
    # `pgrep python | xargs kill -9` selected, but never runs `kill` with an
    # empty argument list when nothing matches.
    # `|| true`: pkill exits non-zero when no process matches; the script
    # runs under `set -e`, so without it the handler would abort before
    # reaching the final message and `exit 0`.
    pkill -9 python || true
    pkill -f python || true
    echo "Cleanup complete. Exiting."
    exit 0
}
# Export the first address printed by `hostname -I` as VLLM_HOST_IP.
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# Wait for the vLLM server to start.
#   $1 - port on localhost to poll
# Polls /v1/completions once per second until curl succeeds.
# Returns 0 once the server answers, 1 if `timeout 1200` gives up first.
wait_for_server() {
    local port=$1
    if timeout 1200 bash -c "
        until curl -s localhost:${port}/v1/completions > /dev/null; do
            sleep 1
        done"; then
        return 0
    fi
    return 1
}
# Launch the vLLM server in the background with KV cache event publishing
# enabled (ZMQ publisher, topic "kv-events").
vllm serve "$MODEL_NAME" \
    --port 8100 \
    --max-model-len 100 \
    --enforce-eager \
    --gpu-memory-utilization 0.8 \
    --trust-remote-code \
    --kv-events-config \
    '{"enable_kv_cache_events": true, "publisher": "zmq", "topic": "kv-events"}' &

# Block until the server answers on port 8100; with `set -e` a timeout
# aborts the script here.
wait_for_server 8100

# Start the example subscriber that lives next to this script.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
python3 "$SCRIPT_DIR/kv_events_subscriber.py" &
sleep 1

# serve two example requests
output1=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 5-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')

output2=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 50-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')

# Cleanup commands
# `pkill` exits 1 when no process matches (e.g. the first call already took
# the vLLM server down with it). Ignore that so `set -e` does not abort the
# script before the request outputs below are printed.
pkill -9 -u "$USER" -f python || true
pkill -9 -u "$USER" -f vllm || true

sleep 1

echo "Cleaned up"

# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"

echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Optional, Union
import msgspec
import zmq
from msgspec.msgpack import Decoder
#
# Types copied from vllm.distributed.kv_events
#
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True,
                 gc=False):
    """Generic batch envelope: a timestamp plus a list of events.

    ``array_like=True`` makes msgspec (de)serialize the struct as a
    positional array, so the field order below is part of the wire format.
    """
    # Timestamp of the batch; printed by process_event().
    ts: float
    # Events carried by the batch; KVEventBatch narrows the element type.
    events: list[Any]
class KVCacheEvent(msgspec.Struct,
                   array_like=True,
                   omit_defaults=True,
                   gc=False,
                   tag=True):
    """Base class for all KV cache-related events

    ``tag=True`` asks msgspec to tag each encoded event with its concrete
    type, which is what lets a Union of the subclasses be decoded.
    """
class BlockStored(KVCacheEvent):
    """KV cache event for stored blocks.

    NOTE(review): the field meanings are inferred from their names only;
    confirm against vllm.distributed.kv_events, which these types mirror.
    Field order must match the publisher because the struct is array-like.
    """
    block_hashes: list[int]
    parent_block_hash: Optional[int]
    token_ids: list[int]
    block_size: int
    lora_id: Optional[int]
class BlockRemoved(KVCacheEvent):
    """KV cache event for removed blocks.

    NOTE(review): presumably lists the hashes of the blocks that were
    removed; confirm against vllm.distributed.kv_events.
    """
    block_hashes: list[int]
class AllBlocksCleared(KVCacheEvent):
    """KV cache event that carries no fields; only its type tag is sent."""
    pass
class KVEventBatch(EventBatch):
    """EventBatch whose events are restricted to the KV cache event types.

    This is the type handed to the msgpack Decoder in main().
    """
    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
def process_event(event_batch):
    """Print a batch: a header with its timestamp, then one line per event.

    Args:
        event_batch: Object exposing ``ts`` and an iterable ``events``.
    """
    header = f"Received event batch at {event_batch.ts}:"
    print(header)
    for item in event_batch.events:
        # format(item) is exactly what an f-string "{item}" placeholder does.
        print(" - " + format(item))
def main():
    """Subscribe to KV cache events over ZMQ and print every batch.

    A SUB socket connected to ``tcp://localhost:5557`` receives live
    messages on the ``kv-events`` topic. Each message is a three-frame
    multipart: topic, big-endian sequence number, msgpack payload. When the
    sequence number jumps past ``last_seq + 1``, the missing range is
    requested over a REQ socket connected to ``tcp://localhost:5558`` and
    replayed before the live message is processed.

    Runs until interrupted with Ctrl+C. Any other exception raised while
    handling a message is printed and the loop continues.
    """
    decoder = Decoder(type=KVEventBatch)
    # Sequence number of the last batch handled; -1 means "none yet".
    last_seq = -1

    context = zmq.Context()

    # Set up the main subscription socket
    sub = context.socket(zmq.SUB)
    sub.connect("tcp://localhost:5557")
    topic = "kv-events"
    sub.setsockopt_string(zmq.SUBSCRIBE, topic)

    # Initialize replay socket
    replay = context.socket(zmq.REQ)
    replay.connect("tcp://localhost:5558")
    poller = zmq.Poller()
    poller.register(replay, zmq.POLLIN)

    print("Listening for KV cache events on topic:", topic)

    while True:
        try:
            # Wait up to 50 ms for a live message so the loop stays
            # responsive to Ctrl+C and other periodic work.
            if sub.poll(50):
                _, seq_bytes, payload = sub.recv_multipart()
                seq = int.from_bytes(seq_bytes, "big")

                if last_seq >= 0 and seq > last_seq + 1:
                    missed = seq - last_seq - 1
                    print(f"Missed {missed} messages"
                          f" (last: {last_seq}, current: {seq})")

                    # Ask for a replay starting at the first missing
                    # sequence number.
                    replay.send((last_seq + 1).to_bytes(8, "big"))

                    while poller.poll(timeout=200):
                        seq_bytes, replay_payload = replay.recv_multipart()
                        if not replay_payload:
                            # End of replay marker is sent as an empty frame
                            # for the payload
                            break

                        replay_seq = int.from_bytes(seq_bytes, "big")

                        if replay_seq > last_seq:
                            event_batch = decoder.decode(replay_payload)
                            process_event(event_batch)
                            last_seq = replay_seq
                            # Stop once everything before the live message
                            # has been replayed.
                            if replay_seq >= seq - 1:
                                break

                event_batch = decoder.decode(payload)
                process_event(event_batch)
                # Record the sequence number just handled. Without this,
                # last_seq would stay at -1 forever and the gap detection
                # above could never trigger.
                last_seq = seq

            # ... do other periodic work or check for shutdown ...

        except KeyboardInterrupt:
            print("Interrupted")
            break
        except Exception as e:
            print("Error decoding message:", e)
# Run the subscriber only when this file is executed as a script.
if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""An example showing how to use vLLM to serve multimodal models """An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client. and run online serving with OpenAI client.
Launch the vLLM server with the following command: Launch the vLLM server with the following command:
(single image inference with Llava) (single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja vllm serve llava-hf/llava-1.5-7b-hf
(multi-image inference with Phi-3.5-vision-instruct) (multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox) (audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096 vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
--max-model-len 4096 --trust-remote-code
run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
""" """
import base64 import base64
import requests import requests
from openai import OpenAI from openai import OpenAI
from utils import get_first_model
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
...@@ -31,9 +37,6 @@ client = OpenAI( ...@@ -31,9 +37,6 @@ client = OpenAI(
base_url=openai_api_base, base_url=openai_api_base,
) )
models = client.models.list()
model = models.data[0].id
def encode_base64_content_from_url(content_url: str) -> str: def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format.""" """Encode a content retrieved from a remote url to base64 format."""
...@@ -46,7 +49,7 @@ def encode_base64_content_from_url(content_url: str) -> str: ...@@ -46,7 +49,7 @@ def encode_base64_content_from_url(content_url: str) -> str:
# Text-only inference # Text-only inference
def run_text_only() -> None: def run_text_only(model: str) -> None:
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
messages=[{ messages=[{
"role": "user", "role": "user",
...@@ -61,7 +64,7 @@ def run_text_only() -> None: ...@@ -61,7 +64,7 @@ def run_text_only() -> None:
# Single-image input inference # Single-image input inference
def run_single_image() -> None: def run_single_image(model: str) -> None:
## Use image url in the payload ## Use image url in the payload
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
...@@ -117,7 +120,7 @@ def run_single_image() -> None: ...@@ -117,7 +120,7 @@ def run_single_image() -> None:
# Multi-image input inference # Multi-image input inference
def run_multi_image() -> None: def run_multi_image(model: str) -> None:
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create( chat_completion_from_url = client.chat.completions.create(
...@@ -152,7 +155,7 @@ def run_multi_image() -> None: ...@@ -152,7 +155,7 @@ def run_multi_image() -> None:
# Video input inference # Video input inference
def run_video() -> None: def run_video(model: str) -> None:
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
video_base64 = encode_base64_content_from_url(video_url) video_base64 = encode_base64_content_from_url(video_url)
...@@ -208,7 +211,7 @@ def run_video() -> None: ...@@ -208,7 +211,7 @@ def run_video() -> None:
# Audio input inference # Audio input inference
def run_audio() -> None: def run_audio(model: str) -> None:
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
audio_url = AudioAsset("winning_call").url audio_url = AudioAsset("winning_call").url
...@@ -318,7 +321,8 @@ def parse_args(): ...@@ -318,7 +321,8 @@ def parse_args():
def main(args) -> None: def main(args) -> None:
chat_type = args.chat_type chat_type = args.chat_type
example_function_map[chat_type]() model = get_first_model(client)
example_function_map[chat_type](model)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -7,12 +7,12 @@ IMPORTANT: for mistral, you must use one of the provided mistral tool call ...@@ -7,12 +7,12 @@ IMPORTANT: for mistral, you must use one of the provided mistral tool call
templates, or your own - the model default doesn't work for tool calls with vLLM templates, or your own - the model default doesn't work for tool calls with vLLM
See the vLLM docs on OpenAI server & tool calling for more details. See the vLLM docs on OpenAI server & tool calling for more details.
vllm serve --model mistralai/Mistral-7B-Instruct-v0.3 \ vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
--chat-template examples/tool_chat_template_mistral.jinja \ --chat-template examples/tool_chat_template_mistral.jinja \
--enable-auto-tool-choice --tool-call-parser mistral --enable-auto-tool-choice --tool-call-parser mistral
OR OR
vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \ vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B \
--chat-template examples/tool_chat_template_hermes.jinja \ --chat-template examples/tool_chat_template_hermes.jinja \
--enable-auto-tool-choice --tool-call-parser hermes --enable-auto-tool-choice --tool-call-parser hermes
""" """
......
...@@ -112,8 +112,8 @@ def extra_backend_options_completion(client: OpenAI, model: str): ...@@ -112,8 +112,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
"alan.turing@enigma.com\n") "alan.turing@enigma.com\n")
try: try:
# The no-fallback option forces vLLM to use xgrammar, so when it fails # The guided_decoding_disable_fallback option forces vLLM to use
# you get a 400 with the reason why # xgrammar, so when it fails you get a 400 with the reason why
completion = client.chat.completions.create( completion = client.chat.completions.create(
model=model, model=model,
messages=[{ messages=[{
...@@ -123,7 +123,8 @@ def extra_backend_options_completion(client: OpenAI, model: str): ...@@ -123,7 +123,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
extra_body={ extra_body={
"guided_regex": r"\w+@\w+\.com\n", "guided_regex": r"\w+@\w+\.com\n",
"stop": ["\n"], "stop": ["\n"],
"guided_decoding_backend": "xgrammar:no-fallback" "guided_decoding_backend": "xgrammar",
"guided_decoding_disable_fallback": True,
}, },
) )
return completion.choices[0].message.content return completion.choices[0].message.content
...@@ -137,7 +138,7 @@ def main(): ...@@ -137,7 +138,7 @@ def main():
api_key="-", api_key="-",
) )
model = "Qwen/Qwen2.5-3B-Instruct" model = client.models.list().data[0].id
print("Guided Choice Completion:") print("Guided Choice Completion:")
print(guided_choice_completion(client, model)) print(guided_choice_completion(client, model))
......
...@@ -59,7 +59,7 @@ and San Francisco? ...@@ -59,7 +59,7 @@ and San Francisco?
}] }]
response = client.chat.completions.create( response = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct", model=client.models.list().data[0].id,
messages=messages, messages=messages,
response_format={ response_format={
"type": "type":
......
...@@ -4,12 +4,12 @@ An example shows how to generate structured outputs from reasoning models ...@@ -4,12 +4,12 @@ An example shows how to generate structured outputs from reasoning models
like DeepSeekR1. The thinking process will not be guided by the JSON like DeepSeekR1. The thinking process will not be guided by the JSON
schema provided by the user. Only the final output will be structured. schema provided by the user. Only the final output will be structured.
To run this example, you need to start the vLLM server with the reasoning To run this example, you need to start the vLLM server with the reasoning
parser: parser:
```bash ```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1 --reasoning-parser deepseek_r1
``` ```
This example demonstrates how to generate chat completions from reasoning models This example demonstrates how to generate chat completions from reasoning models
......
...@@ -9,7 +9,7 @@ the reasoning parser and tool calling enabled. ...@@ -9,7 +9,7 @@ the reasoning parser and tool calling enabled.
```bash ```bash
vllm serve Qwen/QwQ-32B \ vllm serve Qwen/QwQ-32B \
--enable-reasoning --reasoning-parser deepseek_r1 \ --reasoning-parser deepseek_r1 \
--enable-auto-tool-choice --tool-call-parser hermes --enable-auto-tool-choice --tool-call-parser hermes
``` ```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment