Merge tag 'v0.9.0' into v0.9.0-ori

7a985548 · zhuwenwen · 45d3785c · dc1440cf · 7a985548 · 7a985548
Commit 7a985548 authored May 22, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/openai/openai_example_batch.jsonl
+++ b/examples/offline_inference/openai/openai_example_batch.jsonl
--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -14,7 +14,7 @@ import tqdm
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.profiler import layerwise_profile
+from vllm.profiler.layerwise_profile import layerwise_profile
 from vllm.utils import FlexibleArgumentParser
 BATCH_SIZE_DEFAULT = 1
@@ -193,7 +193,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
    batch_size = context.batch_size
    prompt_len = context.prompt_len
-    scheduler_config = llm.llm_engine.scheduler_config
+    scheduler_config = llm.llm_engine.vllm_config.scheduler_config
    max_model_len = llm.llm_engine.model_config.max_model_len
    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
    max_num_seqs = scheduler_config.max_num_seqs

--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -47,8 +47,7 @@ def get_mixed_modalities_query() -> QueryResult:
                "image":
                ImageAsset("cherry_blossom").pil_image.convert("RGB"),
                "video":
-                VideoAsset(name="sample_demo_1.mp4",
+                VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
-                           num_frames=16).np_ndarrays,
            },
        },
        limit_mm_per_prompt={
@@ -66,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult:
              "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
              f"{question}<|im_end|>\n"
              f"<|im_start|>assistant\n")
-    asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
+    asset = VideoAsset(name="baby_reading", num_frames=16)
    audio = asset.get_audio(sampling_rate=16000)
    assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
                                  "Please launch this example with "
@@ -141,7 +140,7 @@ def main(args):
        print(generated_text)
-if __name__ == "__main__":
+def parse_args():
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'audio language models')
@@ -156,5 +155,9 @@ if __name__ == "__main__":
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")
-    args = parser.parse_args()
+    return parser.parse_args()
+if __name__ == "__main__":
+    args = parse_args()
    main(args)
--- a/examples/offline_inference/qwen_1m.py
+++ b/examples/offline_inference/qwen_1m.py
+# SPDX-License-Identifier: Apache-2.0
+import os
+from urllib.request import urlopen
+from vllm import LLM, SamplingParams
+os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
+os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+def load_prompt() -> str:
+    """Fetch the 600k test prompt over HTTPS and return it decoded as UTF-8."""
+    # Test cases with various lengths can be found at:
+    #
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt
+    prompt_url = ("https://qianwen-res.oss-cn-beijing.aliyuncs.com"
+                  "/Qwen2.5-1M/test-data/600k.txt")
+    with urlopen(prompt_url, timeout=5) as resp:
+        return resp.read().decode('utf-8')
+# Processing the prompt.
+def process_requests(llm: LLM, prompts: list[str]) -> None:
+    """Generate a completion for each prompt and print a one-line summary.
+
+    Args:
+        llm: Engine used to run generation.
+        prompts: Prompt strings passed to ``llm.generate``.
+    """
+    # Sampling configuration shared by every prompt in the batch.
+    params = SamplingParams(
+        temperature=0.7,
+        top_p=0.8,
+        top_k=20,
+        repetition_penalty=1.05,
+        detokenize=True,
+        max_tokens=256,
+    )
+    for result in llm.generate(prompts, params):
+        num_prompt_tokens = len(result.prompt_token_ids)
+        # Only the first candidate of each request is reported.
+        completion = result.outputs[0].text
+        print(f"Prompt length: {num_prompt_tokens}, "
+              f"Generated text: {completion!r}")
+# Create an LLM.
+def initialize_engine() -> LLM:
+    """Construct and return the LLM instance used by this example."""
+    engine_kwargs = dict(
+        model="Qwen/Qwen2.5-7B-Instruct-1M",
+        max_model_len=1048576,
+        tensor_parallel_size=4,
+        enforce_eager=True,
+        enable_chunked_prefill=True,
+        max_num_batched_tokens=131072,
+    )
+    return LLM(**engine_kwargs)
+def main():
+    """Create the engine, download the prompt, and run one generation."""
+    # The engine is built before the prompt is fetched, as in the original
+    # statement order.
+    engine = initialize_engine()
+    process_requests(engine, [load_prompt()])
+if __name__ == '__main__':
+    main()
--- a/examples/offline_inference/reproduciblity.py
+++ b/examples/offline_inference/reproduciblity.py
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@@ -8,6 +8,8 @@ the argument 2 should match the `tensor_parallel_size` below.
 see `tests/distributed/test_torchrun_example.py` for the unit test.
 """
+import torch.distributed as dist
 from vllm import LLM, SamplingParams
 # Create prompts, the same across all ranks
@@ -27,23 +29,26 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="facebook/opt-125m",
+    model="meta-llama/Llama-3.1-8B",
    tensor_parallel_size=2,
+    pipeline_parallel_size=2,
    distributed_executor_backend="external_launcher",
-    seed=0,
+    max_model_len=32768,
+    seed=1,
 )
 outputs = llm.generate(prompts, sampling_params)
 # all ranks will have the same outputs
-print("-" * 50)
+if dist.get_rank() == 0:
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}\n"
-          f"Generated text: {generated_text!r}")
    print("-" * 50)
-"""
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\n"
+              f"Generated text: {generated_text!r}\n")
+        print("-" * 50)
+    """
 Further tips:
 1. to communicate control messages across all ranks, use the cpu group,

--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -22,7 +22,8 @@ def main():
    # In real workloads, `enforce_eager` should be `False`.
    llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
              max_num_batched_tokens=64,
-              max_num_seqs=4)
+              max_num_seqs=4,
+              max_model_len=128)
    outputs = llm.generate(prompts, sampling_params)
    print("-" * 50)
    for output, answer in zip(outputs, answers):

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -130,7 +130,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [
@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [("<bos><start_of_turn>user\n"
@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [
@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
                "longest_edge": 3 * 364
            },
        },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [(
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
                "longest_edge": 384
            },
        },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [
        (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -378,7 +378,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
        model="moonshotai/Kimi-VL-A3B-Instruct",
        trust_remote_code=True,
        max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -398,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    engine_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -415,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    engine_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -437,7 +437,7 @@ def run_llava_next_video(questions: list[str],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -465,7 +465,7 @@ def run_llava_onevision(questions: list[str],
    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -488,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    stop_token_ids = [128009]
@@ -529,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
@@ -584,7 +584,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -610,7 +610,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -645,7 +645,7 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -680,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [
@@ -706,7 +706,38 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    messages = [[{
+        'role': 'user',
+        'content': f"<image>\n{question}"
+    }] for question in questions]
+    prompts = tokenizer.apply_chat_template(messages,
+                                            tokenize=False,
+                                            add_generation_prompt=True)
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+# Ovis
+def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "AIDC-AI/Ovis2-1B"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        trust_remote_code=True,
+        dtype="half",
+        limit_mm_per_prompt={modality: 1},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -733,7 +764,7 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -750,7 +781,7 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -787,7 +818,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"num_crops": 16},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -821,7 +852,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 16},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    return ModelRequestData(
@@ -842,7 +873,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        max_model_len=6144,
        max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -863,7 +894,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
@@ -888,7 +919,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    if modality == "image":
@@ -923,7 +954,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    if modality == "image":
@@ -957,7 +988,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
            "max_pixels": 1280 * 28 * 28,
            "fps": [1],
        },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    if modality == "image":
@@ -990,7 +1021,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -1041,6 +1072,7 @@ model_example_map = {
    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
+    "ovis": run_ovis,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
@@ -1080,7 +1112,7 @@ def get_multi_modal_input(args):
    if args.modality == "video":
        # Input video and question
-        video = VideoAsset(name="sample_demo_1.mp4",
+        video = VideoAsset(name="baby_reading",
                           num_frames=args.num_frames).np_ndarrays
        vid_questions = ["Why is this video funny?"]

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -436,6 +436,36 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    )
+# Ovis
+def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
+    """Build the request data for a multi-image query against Ovis2-1B.
+
+    Args:
+        question: Text appended after the image placeholders.
+        image_urls: URLs of the images; one placeholder is emitted per URL.
+
+    Returns:
+        The engine arguments, the chat-templated prompt and the fetched
+        images bundled in a ``ModelRequestData``.
+    """
+    model_name = "AIDC-AI/Ovis2-1B"
+    num_images = len(image_urls)
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        trust_remote_code=True,
+        dtype="half",
+        limit_mm_per_prompt={"image": num_images},
+    )
+    # One numbered "<image>" placeholder per input image, numbered from 1.
+    image_lines = [
+        f"Image-{idx}: <image>\n" for idx in range(1, num_images + 1)
+    ]
+    placeholders = "\n".join(image_lines)
+    chat = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(chat,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    images = [fetch_image(url) for url in image_urls]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=images,
+    )
 def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "mistral-community/pixtral-12b"
@@ -685,6 +715,7 @@ model_example_map = {
    "mistral3": load_mistral3,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
+    "ovis": load_ovis,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,

--- a/examples/online_serving/chart-helm/values.yaml
+++ b/examples/online_serving/chart-helm/values.yaml
@@ -8,7 +8,7 @@ image:
  # -- Image tag
  tag: "latest"
  # -- Container launch command
-  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"]
+  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
 # -- Container port
 containerPort: 8000

--- a/examples/online_serving/disaggregated_serving/README.md
+++ b/examples/online_serving/disaggregated_serving/README.md
+# Disaggregated Serving
+This example contains scripts that demonstrate the disaggregated serving features of vLLM.
+## Files
+- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances).
+- `kv_events.sh` - Demonstrates KV cache event publishing.
--- a/examples/online_serving/disagg_examples/disagg_proxy_demo.py
+++ b/examples/online_serving/disagg_examples/disagg_proxy_demo.py
@@ -4,7 +4,7 @@ This file provides a disaggregated prefilling proxy demo to demonstrate an
 example usage of XpYd disaggregated prefilling.
 We can launch multiple vllm instances (2 for prefill and 2 for decode), and
 launch this proxy demo through:
-  python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py  \
+  python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py  \
       --model $model_name  \
       --prefill localhost:8100 localhost:8101   \
       --decode localhost:8200 localhost:8201   \
@@ -414,7 +414,7 @@ class ProxyServer:
        server.run()
-if __name__ == "__main__":
+def parse_args():
    # Todo: allow more config
    parser = argparse.ArgumentParser("vLLM disaggregated proxy server.")
    parser.add_argument("--model",
@@ -445,6 +445,10 @@ if __name__ == "__main__":
        default=8000,
        help="Server port number",
    )
-    args = parser.parse_args()
+    return parser.parse_args()
+if __name__ == "__main__":
+    args = parse_args()
    proxy_server = ProxyServer(args=args)
    proxy_server.run_server()
--- a/examples/online_serving/disaggregated_serving/kv_events.sh
+++ b/examples/online_serving/disaggregated_serving/kv_events.sh
+#!/bin/bash
+# This file demonstrates the KV cache event publishing
+# We will launch a vLLM instance configured to publish KV cache
+# events and launch a simple subscriber to log those events.
+set -xe
+echo "🚧🚧 Warning: The usage of KV cache events is experimental and subject to change 🚧🚧"
+sleep 1
+MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
+# Trap the SIGINT signal (triggered by Ctrl+C)
+trap 'cleanup' INT
+# Cleanup function
+cleanup() {
+    # SIGINT handler: stop the background server and subscriber, then exit.
+    echo "Caught Ctrl+C, cleaning up..."
+    # Restrict the kill to processes owned by the current user so that
+    # unrelated python processes of other users are left alone.
+    # `|| true`: the script runs under `set -e`, and pkill exits non-zero
+    # when nothing matches, which would abort the handler early.
+    pkill -9 -u "$(id -u)" -f python || true
+    pkill -9 -u "$(id -u)" -f vllm || true
+    echo "Cleanup complete. Exiting."
+    exit 0
+}
+export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+# A function that waits for the vLLM server to start
+wait_for_server() {
+  # Poll the completions endpoint on the given port once per second.
+  # Returns 0 once curl succeeds, or 1 if `timeout` ends the polling loop
+  # after 1200 seconds.
+  local port=$1
+  if timeout 1200 bash -c "
+    until curl -s localhost:${port}/v1/completions > /dev/null; do
+      sleep 1
+    done"; then
+    return 0
+  fi
+  return 1
+}
+# Start the vLLM server in the background on port 8100, with KV cache
+# events enabled and published via ZMQ on the "kv-events" topic.
+vllm serve $MODEL_NAME \
+    --port 8100 \
+    --max-model-len 100 \
+    --enforce-eager \
+    --gpu-memory-utilization 0.8 \
+    --trust-remote-code \
+    --kv-events-config \
+    '{"enable_kv_cache_events": true, "publisher": "zmq", "topic": "kv-events"}' &
+# Block until the server responds; under `set -e` a non-zero return here
+# ends the script.
+wait_for_server 8100
+# Resolve the directory containing this script so the subscriber can be
+# started regardless of the caller's working directory.
+# NOTE(review): this expects kv_events_subscriber.py to sit next to this
+# script -- confirm the two files live in the same directory.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+python3 "$SCRIPT_DIR/kv_events_subscriber.py" &
+# Give the subscriber a moment to connect before sending requests.
+sleep 1
+# Send two example completion requests and capture their responses.
+output1=$(curl -X POST -s http://localhost:8100/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+"model": "'"$MODEL_NAME"'",
+"prompt": "Explain quantum computing in simple terms a 5-year-old could understand.",
+"max_tokens": 80,
+"temperature": 0
+}')
+output2=$(curl -X POST -s http://localhost:8100/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+"model": "'"$MODEL_NAME"'",
+"prompt": "Explain quantum computing in simple terms a 50-year-old could understand.",
+"max_tokens": 80,
+"temperature": 0
+}')
+# Cleanup commands
+# NOTE(review): these match on the full command line, so every process of
+# $USER whose command line contains "python" or "vllm" is killed, not only
+# the ones started by this script.
+pkill -9 -u "$USER" -f python
+pkill -9 -u "$USER" -f vllm
+sleep 1
+echo "Cleaned up"
+# Print the outputs of the curl requests
+echo ""
+echo "Output of first request: $output1"
+echo "Output of second request: $output2"
+echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
+echo ""
--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any, Optional, Union
+import msgspec
+import zmq
+from msgspec.msgpack import Decoder
+#
+# Types copied from vllm.distributed.kv_events
+#
+class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True,
+                 gc=False):
+    """Generic container for a list of events sharing one timestamp.
+
+    Copied from vllm.distributed.kv_events; the struct options and field
+    order must stay in sync with the publisher for decoding to work.
+    """
+    # Timestamp attached to the batch (units not shown here -- presumably
+    # seconds since the epoch; confirm against the publisher).
+    ts: float
+    # Events contained in the batch; narrowed to concrete types in subclasses.
+    events: list[Any]
+# tag=True asks msgspec to tag each subclass, which is what allows the
+# Union of event types in KVEventBatch to be decoded unambiguously.
+class KVCacheEvent(msgspec.Struct,
+                   array_like=True,
+                   omit_defaults=True,
+                   gc=False,
+                   tag=True):
+    """Base class for all KV cache-related events"""
+class BlockStored(KVCacheEvent):
+    """Event describing stored KV cache blocks.
+
+    Field meanings below are inferred from their names -- confirm against
+    vllm.distributed.kv_events, from which this type was copied.
+    """
+    # Hashes identifying the stored blocks.
+    block_hashes: list[int]
+    # Hash of the parent block; may be None.
+    parent_block_hash: Optional[int]
+    # Token ids associated with the stored blocks.
+    token_ids: list[int]
+    # Block size (presumably in tokens -- TODO confirm).
+    block_size: int
+    # LoRA adapter id; may be None.
+    lora_id: Optional[int]
+class BlockRemoved(KVCacheEvent):
+    """Event describing removed KV cache blocks (meaning inferred from the
+    name -- confirm against vllm.distributed.kv_events)."""
+    # Hashes identifying the removed blocks.
+    block_hashes: list[int]
+class AllBlocksCleared(KVCacheEvent):
+    """Payload-free event; per its name it signals that all blocks were
+    cleared (confirm against vllm.distributed.kv_events)."""
+    pass
+class KVEventBatch(EventBatch):
+    """EventBatch whose events are restricted to the KV cache event types.
+
+    This is the type handed to the msgpack Decoder in main().
+    """
+    # Narrows the base-class field to the three concrete event types.
+    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+def process_event(event_batch):
+    """Print the batch timestamp followed by one indented line per event."""
+    header = f"Received event batch at {event_batch.ts}:"
+    print(header)
+    for item in event_batch.events:
+        print(f"  - {item}")
+def main():
+    """Subscribe to KV cache events over ZMQ and print every batch received.
+
+    A SUB socket connected to tcp://localhost:5557 receives the live stream
+    on the "kv-events" topic; a REQ socket connected to tcp://localhost:5558
+    is used to request a replay when a gap in the sequence numbers is seen.
+    Runs until interrupted with Ctrl+C.
+    """
+    decoder = Decoder(type=KVEventBatch)
+    # Sequence number of the most recently processed batch; -1 until the
+    # first batch has been handled.
+    last_seq = -1
+    context = zmq.Context()
+    # Set up the main subscription socket
+    sub = context.socket(zmq.SUB)
+    sub.connect("tcp://localhost:5557")
+    topic = "kv-events"
+    sub.setsockopt_string(zmq.SUBSCRIBE, topic)
+    # Initialize replay socket
+    replay = context.socket(zmq.REQ)
+    replay.connect("tcp://localhost:5558")
+    poller = zmq.Poller()
+    poller.register(replay, zmq.POLLIN)
+    print("Listening for KV cache events on topic:", topic)
+    while True:
+        try:
+            if sub.poll(50):
+                # Live messages are (topic, sequence number, payload).
+                _, seq_bytes, payload = sub.recv_multipart()
+                seq = int.from_bytes(seq_bytes, "big")
+                if last_seq >= 0 and seq > last_seq + 1:
+                    missed = seq - last_seq - 1
+                    print(f"Missed {missed} messages"
+                          f" (last: {last_seq}, current: {seq})")
+                    try:
+                        # Ask for everything after the last batch we saw.
+                        replay.send((last_seq + 1).to_bytes(8, "big"))
+                        while poller.poll(timeout=200):
+                            seq_bytes, replay_payload = (
+                                replay.recv_multipart())
+                            if not replay_payload:
+                                # End of replay marker is sent as an empty
+                                # frame for the payload
+                                break
+                            replay_seq = int.from_bytes(seq_bytes, "big")
+                            if replay_seq > last_seq:
+                                event_batch = decoder.decode(replay_payload)
+                                process_event(event_batch)
+                                last_seq = replay_seq
+                                if replay_seq >= seq - 1:
+                                    break
+                    except zmq.ZMQError as e:
+                        # A REQ socket refuses a new send while an earlier
+                        # request is still unanswered (e.g. no replay server
+                        # is running). Report it and carry on with the live
+                        # batch instead of dropping it.
+                        print("Replay request failed:", e)
+                event_batch = decoder.decode(payload)
+                process_event(event_batch)
+                # Record the live sequence number. Without this, last_seq
+                # stays at -1 and the gap check above can never trigger.
+                last_seq = seq
+            # ... do other periodic work or check for shutdown ...
+        except KeyboardInterrupt:
+            print("Interrupted")
+            break
+        except Exception as e:
+            print("Error decoding message:", e)
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
 # SPDX-License-Identifier: Apache-2.0
-"""An example showing how to use vLLM to serve multimodal models 
+"""An example showing how to use vLLM to serve multimodal models
 and run online serving with OpenAI client.
 Launch the vLLM server with the following command:
 (single image inference with Llava)
-vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
+vllm serve llava-hf/llava-1.5-7b-hf
 (multi-image inference with Phi-3.5-vision-instruct)
 vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
 (audio inference with Ultravox)
-vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
+vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
+    --max-model-len 4096 --trust-remote-code
+run the script with
+python openai_chat_completion_client_for_multimodal.py --chat-type audio
 """
 import base64
 import requests
 from openai import OpenAI
+from utils import get_first_model
 from vllm.utils import FlexibleArgumentParser
@@ -31,9 +37,6 @@ client = OpenAI(
    base_url=openai_api_base,
 )
-models = client.models.list()
-model = models.data[0].id
 def encode_base64_content_from_url(content_url: str) -> str:
    """Encode a content retrieved from a remote url to base64 format."""
@@ -46,7 +49,7 @@ def encode_base64_content_from_url(content_url: str) -> str:
 # Text-only inference
-def run_text_only() -> None:
+def run_text_only(model: str) -> None:
    chat_completion = client.chat.completions.create(
        messages=[{
            "role": "user",
@@ -61,7 +64,7 @@ def run_text_only() -> None:
 # Single-image input inference
-def run_single_image() -> None:
+def run_single_image(model: str) -> None:
    ## Use image url in the payload
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
@@ -117,7 +120,7 @@ def run_single_image() -> None:
 # Multi-image input inference
-def run_multi_image() -> None:
+def run_multi_image(model: str) -> None:
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
    chat_completion_from_url = client.chat.completions.create(
@@ -152,7 +155,7 @@ def run_multi_image() -> None:
 # Video input inference
-def run_video() -> None:
+def run_video(model: str) -> None:
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
    video_base64 = encode_base64_content_from_url(video_url)
@@ -208,7 +211,7 @@ def run_video() -> None:
 # Audio input inference
-def run_audio() -> None:
+def run_audio(model: str) -> None:
    from vllm.assets.audio import AudioAsset
    audio_url = AudioAsset("winning_call").url
@@ -318,7 +321,8 @@ def parse_args():
 def main(args) -> None:
    chat_type = args.chat_type
-    example_function_map[chat_type]()
+    model = get_first_model(client)
+    example_function_map[chat_type](model)
 if __name__ == "__main__":

--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
@@ -7,12 +7,12 @@ IMPORTANT: for mistral, you must use one of the provided mistral tool call
 templates, or your own - the model default doesn't work for tool calls with vLLM
 See the vLLM docs on OpenAI server & tool calling for more details.
-vllm serve --model mistralai/Mistral-7B-Instruct-v0.3 \
+vllm serve mistralai/Mistral-7B-Instruct-v0.3 \
            --chat-template examples/tool_chat_template_mistral.jinja \
            --enable-auto-tool-choice --tool-call-parser mistral
 OR
-vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \
+vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B \
            --chat-template examples/tool_chat_template_hermes.jinja \
            --enable-auto-tool-choice --tool-call-parser hermes
 """

--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
@@ -112,8 +112,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
              "alan.turing@enigma.com\n")
    try:
-        # The no-fallback option forces vLLM to use xgrammar, so when it fails
+        # The guided_decoding_disable_fallback option forces vLLM to use
-        # you get a 400 with the reason why
+        # xgrammar, so when it fails you get a 400 with the reason why
        completion = client.chat.completions.create(
            model=model,
            messages=[{
@@ -123,7 +123,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
            extra_body={
                "guided_regex": r"\w+@\w+\.com\n",
                "stop": ["\n"],
-                "guided_decoding_backend": "xgrammar:no-fallback"
+                "guided_decoding_backend": "xgrammar",
+                "guided_decoding_disable_fallback": True,
            },
        )
        return completion.choices[0].message.content
@@ -137,7 +138,7 @@ def main():
        api_key="-",
    )
-    model = "Qwen/Qwen2.5-3B-Instruct"
+    model = client.models.list().data[0].id
    print("Guided Choice Completion:")
    print(guided_choice_completion(client, model))

--- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
@@ -59,7 +59,7 @@ and San Francisco?
    }]
    response = client.chat.completions.create(
-        model="meta-llama/Llama-3.1-8B-Instruct",
+        model=client.models.list().data[0].id,
        messages=messages,
        response_format={
            "type":

--- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
@@ -4,12 +4,12 @@ An example shows how to generate structured outputs from reasoning models
 like DeepSeekR1. The thinking process will not be guided by the JSON
 schema provided by the user. Only the final output will be structured.
-To run this example, you need to start the vLLM server with the reasoning 
+To run this example, you need to start the vLLM server with the reasoning
 parser:
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-     --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```
 This example demonstrates how to generate chat completions from reasoning models

--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@@ -9,7 +9,7 @@ the reasoning parser and tool calling enabled.
 ```bash
 vllm serve Qwen/QwQ-32B \
-     --enable-reasoning --reasoning-parser deepseek_r1 \
+     --reasoning-parser deepseek_r1 \
     --enable-auto-tool-choice --tool-call-parser hermes
 ```