Commit 7a985548 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
......@@ -14,7 +14,7 @@ import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler import layerwise_profile
from vllm.profiler.layerwise_profile import layerwise_profile
from vllm.utils import FlexibleArgumentParser
BATCH_SIZE_DEFAULT = 1
......@@ -193,7 +193,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
batch_size = context.batch_size
prompt_len = context.prompt_len
scheduler_config = llm.llm_engine.scheduler_config
scheduler_config = llm.llm_engine.vllm_config.scheduler_config
max_model_len = llm.llm_engine.model_config.max_model_len
max_num_batched_tokens = scheduler_config.max_num_batched_tokens
max_num_seqs = scheduler_config.max_num_seqs
......
......@@ -47,8 +47,7 @@ def get_mixed_modalities_query() -> QueryResult:
"image":
ImageAsset("cherry_blossom").pil_image.convert("RGB"),
"video":
VideoAsset(name="sample_demo_1.mp4",
num_frames=16).np_ndarrays,
VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
},
},
limit_mm_per_prompt={
......@@ -66,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult:
"<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n")
asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000)
assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
"Please launch this example with "
......@@ -141,7 +140,7 @@ def main(args):
print(generated_text)
if __name__ == "__main__":
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'audio language models')
......@@ -156,5 +155,9 @@ if __name__ == "__main__":
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
import os
from urllib.request import urlopen
from vllm import LLM, SamplingParams
os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
def load_prompt() -> str:
# Test cases with various lengths can be found at:
#
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt
with urlopen(
"https://qianwen-res.oss-cn-beijing.aliyuncs.com"
"/Qwen2.5-1M/test-data/600k.txt",
timeout=5) as response:
prompt = response.read().decode('utf-8')
return prompt
# Processing the prompt.
def process_requests(llm: LLM, prompts: list[str]) -> None:
# Create a sampling params object.
sampling_params = SamplingParams(
temperature=0.7,
top_p=0.8,
top_k=20,
repetition_penalty=1.05,
detokenize=True,
max_tokens=256,
)
# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt_token_ids = output.prompt_token_ids
generated_text = output.outputs[0].text
print(f"Prompt length: {len(prompt_token_ids)}, "
f"Generated text: {generated_text!r}")
# Create an LLM.
def initialize_engine() -> LLM:
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct-1M",
max_model_len=1048576,
tensor_parallel_size=4,
enforce_eager=True,
enable_chunked_prefill=True,
max_num_batched_tokens=131072)
return llm
def main():
llm = initialize_engine()
prompt = load_prompt()
process_requests(llm, [prompt])
if __name__ == '__main__':
main()
......@@ -8,6 +8,8 @@ the argument 2 should match the `tensor_parallel_size` below.
see `tests/distributed/test_torchrun_example.py` for the unit test.
"""
import torch.distributed as dist
from vllm import LLM, SamplingParams
# Create prompts, the same across all ranks
......@@ -27,23 +29,26 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# all ranks have the same random seed, so that sampling can be
# deterministic across ranks.
llm = LLM(
model="facebook/opt-125m",
model="meta-llama/Llama-3.1-8B",
tensor_parallel_size=2,
pipeline_parallel_size=2,
distributed_executor_backend="external_launcher",
seed=0,
max_model_len=32768,
seed=1,
)
outputs = llm.generate(prompts, sampling_params)
# all ranks will have the same outputs
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
if dist.get_rank() == 0:
print("-" * 50)
"""
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}\n")
print("-" * 50)
"""
Further tips:
1. to communicate control messages across all ranks, use the cpu group,
......
......@@ -22,7 +22,8 @@ def main():
# In real workloads, `enforace_eager` should be `False`.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=64,
max_num_seqs=4)
max_num_seqs=4,
max_model_len=128)
outputs = llm.generate(prompts, sampling_params)
print("-" * 50)
for output, answer in zip(outputs, answers):
......
......@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=4096,
max_num_seqs=2,
dtype="bfloat16",
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
......@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048,
max_num_seqs=2,
mm_processor_kwargs={"crop_to_patches": True},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [
f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
......@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs(
model="Salesforce/blip2-opt-6.7b",
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
model="facebook/chameleon-7b",
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -130,7 +130,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=4096,
max_num_seqs=2,
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [
......@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
trust_remote_code=True,
dtype="bfloat16",
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
......@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048,
max_num_seqs=2,
mm_processor_kwargs={"do_pan_and_scan": True},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [("<bos><start_of_turn>user\n"
......@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True,
enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [
......@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
......@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge": 3 * 364
},
},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [(
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
......@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge": 384
},
},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [
(f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
......@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
......@@ -378,7 +378,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
model="moonshotai/Kimi-VL-A3B-Instruct",
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -398,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model="llava-hf/llava-1.5-7b-hf",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -415,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model="llava-hf/llava-v1.6-mistral-7b-hf",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -437,7 +437,7 @@ def run_llava_next_video(questions: list[str],
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -465,7 +465,7 @@ def run_llava_onevision(questions: list[str],
engine_args = EngineArgs(
model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=16384,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -488,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
stop_token_ids = [128009]
......@@ -529,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
# NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0
......@@ -584,7 +584,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=8192,
max_num_seqs=2,
tensor_parallel_size=2,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
......@@ -610,7 +610,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
......@@ -645,7 +645,7 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=4,
tensor_parallel_size=8,
gpu_memory_utilization=0.4,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
......@@ -680,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
dtype="bfloat16",
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [
......@@ -706,7 +706,38 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True,
max_model_len=4096,
tensor_parallel_size=4,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
messages = [[{
'role': 'user',
'content': f"<image>\n{question}"
}] for question in questions]
prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Ovis
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "AIDC-AI/Ovis2-1B"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
dtype="half",
limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
......@@ -733,7 +764,7 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions]
engine_args = EngineArgs(
model="google/paligemma-3b-mix-224",
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -750,7 +781,7 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions]
engine_args = EngineArgs(
model="google/paligemma2-3b-ft-docci-448",
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -787,7 +818,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"num_crops": 16},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -821,7 +852,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 16},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
......@@ -842,7 +873,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
max_model_len=6144,
max_num_seqs=2,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
......@@ -863,7 +894,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=1024,
max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
......@@ -888,7 +919,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
......@@ -923,7 +954,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
......@@ -957,7 +988,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
"max_pixels": 1280 * 28 * 28,
"fps": [1],
},
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
......@@ -990,7 +1021,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
limit_mm_per_prompt={modality: 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
......@@ -1041,6 +1072,7 @@ model_example_map = {
"llama4": run_llama4,
"molmo": run_molmo,
"NVLM_D": run_nvlm_d,
"ovis": run_ovis,
"paligemma": run_paligemma,
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,
......@@ -1080,7 +1112,7 @@ def get_multi_modal_input(args):
if args.modality == "video":
# Input video and question
video = VideoAsset(name="sample_demo_1.mp4",
video = VideoAsset(name="baby_reading",
num_frames=args.num_frames).np_ndarrays
vid_questions = ["Why is this video funny?"]
......
......@@ -436,6 +436,36 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
)
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "AIDC-AI/Ovis2-1B"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
trust_remote_code=True,
dtype="half",
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "\n".join(f"Image-{i}: <image>\n"
for i, _ in enumerate(image_urls, start=1))
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistral-community/pixtral-12b"
......@@ -685,6 +715,7 @@ model_example_map = {
"mistral3": load_mistral3,
"mllama": load_mllama,
"NVLM_D": load_nvlm_d,
"ovis": load_ovis,
"phi3_v": load_phi3v,
"phi4_mm": load_phi4mm,
"pixtral_hf": load_pixtral_hf,
......
......@@ -8,7 +8,7 @@ image:
# -- Image tag
tag: "latest"
# -- Container launch command
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"]
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
# -- Container port
containerPort: 8000
......
# Disaggregated Serving
This example contains scripts that demonstrate the disaggregated serving features of vLLM.
## Files
- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances).
- `kv_events.sh` - Demonstrates KV cache event publishing.
......@@ -4,7 +4,7 @@ This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling.
We can launch multiple vllm instances (2 for prefill and 2 for decode), and
launch this proxy demo through:
python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py \
python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \
--model $model_name \
--prefill localhost:8100 localhost:8101 \
--decode localhost:8200 localhost:8201 \
......@@ -414,7 +414,7 @@ class ProxyServer:
server.run()
if __name__ == "__main__":
def parse_args():
# Todo: allow more config
parser = argparse.ArgumentParser("vLLM disaggregated proxy server.")
parser.add_argument("--model",
......@@ -445,6 +445,10 @@ if __name__ == "__main__":
default=8000,
help="Server port number",
)
args = parser.parse_args()
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
proxy_server = ProxyServer(args=args)
proxy_server.run_server()
#!/bin/bash
# This file demonstrates the KV cache event publishing
# We will launch a vllm instances configured to publish KV cache
# events and launch a simple subscriber to log those events.
set -xe
echo "🚧🚧 Warning: The usage of KV cache events is experimental and subject to change 🚧🚧"
sleep 1
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT
# Cleanup function
cleanup() {
echo "Caught Ctrl+C, cleaning up..."
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo "Cleanup complete. Exiting."
exit 0
}
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# a function that waits vLLM server to start
wait_for_server() {
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
vllm serve $MODEL_NAME \
--port 8100 \
--max-model-len 100 \
--enforce-eager \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-events-config \
'{"enable_kv_cache_events": true, "publisher": "zmq", "topic": "kv-events"}' &
wait_for_server 8100
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
python3 "$SCRIPT_DIR/kv_events_subscriber.py" &
sleep 1
# serve two example requests
output1=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 5-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')
output2=$(curl -X POST -s http://localhost:8100/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'"$MODEL_NAME"'",
"prompt": "Explain quantum computing in simple terms a 50-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}')
# Cleanup commands
pkill -9 -u "$USER" -f python
pkill -9 -u "$USER" -f vllm
sleep 1
echo "Cleaned up"
# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"
echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Optional, Union
import msgspec
import zmq
from msgspec.msgpack import Decoder
#
# Types copied from vllm.distributed.kv_events
#
class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True,
gc=False):
ts: float
events: list[Any]
class KVCacheEvent(msgspec.Struct,
array_like=True,
omit_defaults=True,
gc=False,
tag=True):
"""Base class for all KV cache-related events"""
class BlockStored(KVCacheEvent):
block_hashes: list[int]
parent_block_hash: Optional[int]
token_ids: list[int]
block_size: int
lora_id: Optional[int]
class BlockRemoved(KVCacheEvent):
block_hashes: list[int]
class AllBlocksCleared(KVCacheEvent):
pass
class KVEventBatch(EventBatch):
events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
def process_event(event_batch):
print(f"Received event batch at {event_batch.ts}:")
for event in event_batch.events:
print(f" - {event}")
def main():
decoder = Decoder(type=KVEventBatch)
last_seq = -1
context = zmq.Context()
# Set up the main subscription socket
sub = context.socket(zmq.SUB)
sub.connect("tcp://localhost:5557")
topic = "kv-events"
sub.setsockopt_string(zmq.SUBSCRIBE, topic)
# Initialize replay socket
replay = context.socket(zmq.REQ)
replay.connect("tcp://localhost:5558")
poller = zmq.Poller()
poller.register(replay, zmq.POLLIN)
print("Listening for KV cache events on topic:", topic)
while True:
try:
if sub.poll(50):
_, seq_bytes, payload = sub.recv_multipart()
seq = int.from_bytes(seq_bytes, "big")
if last_seq >= 0 and seq > last_seq + 1:
missed = seq - last_seq - 1
print(f"Missed {missed} messages"
f" (last: {last_seq}, current: {seq})")
replay.send((last_seq + 1).to_bytes(8, "big"))
while poller.poll(timeout=200):
seq_bytes, replay_payload = replay.recv_multipart()
if not replay_payload:
# End of replay marker is sent as an empty frame
# for the payload
break
replay_seq = int.from_bytes(seq_bytes, "big")
if replay_seq > last_seq:
event_batch = decoder.decode(replay_payload)
process_event(event_batch)
last_seq = replay_seq
if replay_seq >= seq - 1:
break
event_batch = decoder.decode(payload)
process_event(event_batch)
# ... do other periodic work or check for shutdown ...
except KeyboardInterrupt:
print("Interrupted")
break
except Exception as e:
print("Error decoding message:", e)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
"""An example showing how to use vLLM to serve multimodal models
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
vllm serve llava-hf/llava-1.5-7b-hf
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
--max-model-len 4096 --trust-remote-code
run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
"""
import base64
import requests
from openai import OpenAI
from utils import get_first_model
from vllm.utils import FlexibleArgumentParser
......@@ -31,9 +37,6 @@ client = OpenAI(
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
......@@ -46,7 +49,7 @@ def encode_base64_content_from_url(content_url: str) -> str:
# Text-only inference
def run_text_only() -> None:
def run_text_only(model: str) -> None:
chat_completion = client.chat.completions.create(
messages=[{
"role": "user",
......@@ -61,7 +64,7 @@ def run_text_only() -> None:
# Single-image input inference
def run_single_image() -> None:
def run_single_image(model: str) -> None:
## Use image url in the payload
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
......@@ -117,7 +120,7 @@ def run_single_image() -> None:
# Multi-image input inference
def run_multi_image() -> None:
def run_multi_image(model: str) -> None:
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
......@@ -152,7 +155,7 @@ def run_multi_image() -> None:
# Video input inference
def run_video() -> None:
def run_video(model: str) -> None:
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
video_base64 = encode_base64_content_from_url(video_url)
......@@ -208,7 +211,7 @@ def run_video() -> None:
# Audio input inference
def run_audio() -> None:
def run_audio(model: str) -> None:
from vllm.assets.audio import AudioAsset
audio_url = AudioAsset("winning_call").url
......@@ -318,7 +321,8 @@ def parse_args():
def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
model = get_first_model(client)
example_function_map[chat_type](model)
if __name__ == "__main__":
......
......@@ -7,12 +7,12 @@ IMPORTANT: for mistral, you must use one of the provided mistral tool call
templates, or your own - the model default doesn't work for tool calls with vLLM
See the vLLM docs on OpenAI server & tool calling for more details.
vllm serve --model mistralai/Mistral-7B-Instruct-v0.3 \
vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
--chat-template examples/tool_chat_template_mistral.jinja \
--enable-auto-tool-choice --tool-call-parser mistral
OR
vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \
vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B \
--chat-template examples/tool_chat_template_hermes.jinja \
--enable-auto-tool-choice --tool-call-parser hermes
"""
......
......@@ -112,8 +112,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
"alan.turing@enigma.com\n")
try:
# The no-fallback option forces vLLM to use xgrammar, so when it fails
# you get a 400 with the reason why
# The guided_decoding_disable_fallback option forces vLLM to use
# xgrammar, so when it fails you get a 400 with the reason why
completion = client.chat.completions.create(
model=model,
messages=[{
......@@ -123,7 +123,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
extra_body={
"guided_regex": r"\w+@\w+\.com\n",
"stop": ["\n"],
"guided_decoding_backend": "xgrammar:no-fallback"
"guided_decoding_backend": "xgrammar",
"guided_decoding_disable_fallback": True,
},
)
return completion.choices[0].message.content
......@@ -137,7 +138,7 @@ def main():
api_key="-",
)
model = "Qwen/Qwen2.5-3B-Instruct"
model = client.models.list().data[0].id
print("Guided Choice Completion:")
print(guided_choice_completion(client, model))
......
......@@ -59,7 +59,7 @@ and San Francisco?
}]
response = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct",
model=client.models.list().data[0].id,
messages=messages,
response_format={
"type":
......
......@@ -4,12 +4,12 @@ An example shows how to generate structured outputs from reasoning models
like DeepSeekR1. The thinking process will not be guided by the JSON
schema provided by the user. Only the final output will be structured.
To run this example, you need to start the vLLM server with the reasoning
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1
--reasoning-parser deepseek_r1
```
This example demonstrates how to generate chat completions from reasoning models
......
......@@ -9,7 +9,7 @@ the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B \
--enable-reasoning --reasoning-parser deepseek_r1 \
--reasoning-parser deepseek_r1 \
--enable-auto-tool-choice --tool-call-parser hermes
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment