Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
20 changed files
--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -88,7 +88,7 @@ class RayTrainingActor:
        # Zero out all the parameters.
        for name, p in self.model.named_parameters():
            p.data.zero_()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        # The argument for `get_device_uuid` is the index of the GPU in the
        # list of visible devices.
        from vllm.platforms import current_platform
@@ -151,7 +151,7 @@ class RayTrainingActor:
                    p.data.view(-1).view(dtype=torch.uint8), non_blocking=True
                )
                offset += get_size(p)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            s.send_pyobj(named_tensors)
            s.recv()
        s.send_pyobj(None)
@@ -159,7 +159,7 @@ class RayTrainingActor:
        s.close()
        del buffer
        gc.collect()
-        torch.cuda.empty_cache()
+        torch.accelerator.empty_cache()


 # Ray manages four GPUs.

--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
@@ -120,7 +120,7 @@ class ColocateWorkerExtension:
                process_weights_after_loading(
                    self.model_runner.model, self.model_config, self.device
                )
-                torch.cuda.synchronize()
+                torch.accelerator.synchronize()
                socket.send(b"")
                break
            if isinstance(payload, tuple):
@@ -144,13 +144,13 @@ class ColocateWorkerExtension:
                weights.append((item["name"], tensor))
            self.model_runner.model.load_weights(weights=weights)
            del weights
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            socket.send(b"")

        socket.close()
        del buffer
        gc.collect()
-        torch.cuda.empty_cache()
+        torch.accelerator.empty_cache()

    def report_device_id(self) -> str:
        from vllm.platforms import current_platform

--- a/examples/offline_inference/routed_experts_e2e.py
+++ b/examples/offline_inference/routed_experts_e2e.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+End-to-end example for routed experts capture with hybrid models.
+
+Validates that:
+1. routed_experts is returned in CompletionOutput for MoE models.
+2. Expert IDs are within valid range.
+3. Results are deterministic across runs (baseline vs reference).
+
+Usage:
+    python examples/offline_inference/routed_experts_e2e.py \
+        --model Qwen/Qwen3-30B-A3B \
+        --tp 4 \
+        --max-model-len 4096 \
+        --num-prompts 20 \
+        --max-new-tokens 50
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import logging
+import os
+import uuid
+from dataclasses import dataclass, field
+
+import numpy as np
+
+from vllm.engine.arg_utils import AsyncEngineArgs
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_MODEL = "Qwen/Qwen3-30B-A3B"
+
+TEST_PROMPTS = [
+    "Hello, my name is",
+    "The capital of France is",
+    "Explain quantum computing in simple terms:",
+    "Write a Python function that sorts a list:",
+    "The meaning of life is",
+    "In a distant galaxy, there was a",
+    "The best way to learn programming is",
+    "Once upon a time in a land far away,",
+    "The theory of relativity states that",
+    "How does photosynthesis work?",
+    "Describe the process of machine learning:",
+    "What are the benefits of exercise?",
+    "The history of artificial intelligence began",
+    "Translate the following to French: Hello world",
+    "Summarize the plot of Romeo and Juliet:",
+    "What is the difference between TCP and UDP?",
+    "The water cycle consists of",
+    "Explain how a neural network learns:",
+    "The periodic table organizes elements by",
+    "Write a haiku about the ocean:",
+]
+
+
+@dataclass
+class InferenceResult:
+    """Result from a single inference run.
+
+    ``experts_list`` and ``token_ids_list`` are filled in lockstep, one entry
+    per prompt.
+    """
+
+    # Routed-experts array taken from each prompt's completion output.
+    experts_list: list[np.ndarray] = field(default_factory=list)
+    # Generated token IDs for each prompt.
+    token_ids_list: list[list[int]] = field(default_factory=list)
+    # Expert count read from the model config; 0 until it is populated.
+    num_experts: int = 0
+
+
+# ---------------------------------------------------------------------------
+# Inference helpers
+# ---------------------------------------------------------------------------
+
+
+async def _run_async_inference(
+    engine_args: AsyncEngineArgs,
+    prompts: list[str],
+    max_new_tokens: int,
+) -> InferenceResult:
+    """Generate completions with AsyncLLM and collect routed-expert data.
+
+    Args:
+        engine_args: Engine configuration used to build the AsyncLLM engine.
+        prompts: Prompts to run; all requests are submitted concurrently.
+        max_new_tokens: Upper bound on generated tokens per prompt.
+
+    Returns:
+        An InferenceResult whose lists follow the order of ``prompts``.
+
+    Raises:
+        AssertionError: If the expert count cannot be read from the model
+            config, or a request yields no output, no routed experts, or a
+            routed-experts array of unexpected length.
+    """
+    from vllm.sampling_params import SamplingParams
+    from vllm.v1.engine.async_llm import AsyncLLM
+
+    engine = AsyncLLM.from_engine_args(engine_args)
+
+    # Shut the engine down on every exit path: a failed assertion or request
+    # must not leave the engine running.
+    try:
+        hf_config = engine.model_config.hf_text_config
+        num_experts: int = getattr(hf_config, "num_experts", 0) or getattr(
+            hf_config, "num_local_experts", 0
+        )
+        assert num_experts > 0, "Could not determine num_experts from model config"
+
+        sampling_params = SamplingParams(
+            temperature=0,
+            max_tokens=max_new_tokens,
+        )
+
+        async def _generate_one(prompt: str, idx: int):
+            """Run one request to completion and return its routing data."""
+            request_id = str(uuid.uuid4())
+            final_output = None
+            # Only the last streamed output is kept.
+            async for output in engine.generate(prompt, sampling_params, request_id):
+                final_output = output
+            assert final_output is not None
+
+            completion = final_output.outputs[0]
+            routed = completion.routed_experts
+            num_prompt_tokens = len(final_output.prompt_token_ids)
+            num_generated_tokens = len(completion.token_ids)
+            expected_len = num_prompt_tokens + num_generated_tokens - 1
+            assert routed is not None, f"Prompt {idx}: routed_experts is None"
+            assert routed.shape[0] == expected_len, (
+                f"Prompt {idx}: routed_experts length {routed.shape[0]} != "
+                f"prompt ({num_prompt_tokens}) + generated ({num_generated_tokens})"
+                f" - 1 = {expected_len}"
+            )
+            return routed, list(completion.token_ids)
+
+        # asyncio.gather returns results in the order the awaitables were
+        # passed, so the outputs already line up with ``prompts``.
+        outputs = await asyncio.gather(
+            *(_generate_one(p, i) for i, p in enumerate(prompts))
+        )
+
+        result = InferenceResult(num_experts=num_experts)
+        for routed, token_ids in outputs:
+            result.experts_list.append(routed)
+            result.token_ids_list.append(token_ids)
+        return result
+    finally:
+        engine.shutdown()
+
+
+def run_inference(
+    model: str,
+    prompts: list[str],
+    max_new_tokens: int = 50,
+    tp: int = 1,
+    max_model_len: int = 4096,
+) -> InferenceResult:
+    """Run inference with routed experts capture enabled via AsyncLLM.
+
+    Args:
+        model: Model name or path passed to the engine.
+        prompts: Prompts to generate completions for.
+        max_new_tokens: Maximum number of tokens generated per prompt.
+        tp: Tensor-parallel size.
+        max_model_len: Maximum model context length.
+
+    Returns:
+        The InferenceResult produced by the async inference run.
+    """
+    # The attention backend is pinned to FLASH_ATTN and stats logging is off.
+    engine_args = AsyncEngineArgs(
+        model=model,
+        enable_return_routed_experts=True,
+        tensor_parallel_size=tp,
+        max_model_len=max_model_len,
+        disable_log_stats=True,
+        attention_backend="FLASH_ATTN",
+    )
+
+    # Each call drives its own event loop to completion.
+    result = asyncio.run(_run_async_inference(engine_args, prompts, max_new_tokens))
+
+    from vllm.platforms import current_platform
+
+    # Release the platform's cache after the run on CUDA-like platforms.
+    if current_platform.is_cuda_alike():
+        current_platform.empty_cache()
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Validation helpers
+# ---------------------------------------------------------------------------
+
+
+def validate_expert_ids(
+    experts_list: list[np.ndarray],
+    num_experts: int,
+) -> None:
+    """Assert that every expert ID lies in the half-open range [0, num_experts).
+
+    Raises:
+        AssertionError: On the first array holding a negative ID or an ID that
+            is not below ``num_experts``.
+    """
+    for prompt_idx, ids in enumerate(experts_list):
+        non_negative = ids >= 0
+        assert non_negative.all(), (
+            f"Prompt {prompt_idx}: negative expert IDs found, min={ids.min()}"
+        )
+        below_limit = ids < num_experts
+        assert below_limit.all(), (
+            f"Prompt {prompt_idx}: expert ID out of range [0, {num_experts}), "
+            f"max={ids.max()}"
+        )
+
+
+def validate_shapes(experts_list: list[np.ndarray]) -> None:
+    """Assert each routed_experts array is at least 2-D and log its shape.
+
+    Raises:
+        AssertionError: On the first array with fewer than two dimensions.
+    """
+    for prompt_idx, arr in enumerate(experts_list):
+        rank_ok = arr.ndim >= 2
+        assert rank_ok, (
+            f"Prompt {prompt_idx}: expected at least 2D array, got shape {arr.shape}"
+        )
+        logger.info("Prompt %d: routed_experts shape = %s", prompt_idx, arr.shape)
+
+
+# ---------------------------------------------------------------------------
+# Comparison helpers
+# ---------------------------------------------------------------------------
+
+
+def compare_token_ids(
+    baseline: list[list[int]],
+    reference: list[list[int]],
+) -> float:
+    """Return the fraction of token positions that differ between two runs.
+
+    For each prompt only the common prefix counts as matching: every position
+    from the first divergence up to the longer sequence's end is a mismatch.
+
+    Raises:
+        AssertionError: If the two runs cover a different number of prompts.
+        ValueError: If there are no tokens at all to compare.
+    """
+    assert len(baseline) == len(reference), (
+        f"Length mismatch: {len(baseline)} vs {len(reference)}"
+    )
+
+    compared = 0
+    mismatched = 0
+
+    for prompt_idx, (base, ref) in enumerate(zip(baseline, reference)):
+        shorter, longer = sorted((len(base), len(ref)))
+        # Index of the first differing position, or the shorter length when
+        # one sequence is a prefix of the other.
+        prefix_len = next(
+            (pos for pos, (a, b) in enumerate(zip(base, ref)) if a != b),
+            shorter,
+        )
+
+        mismatched += longer - prefix_len
+        compared += longer
+
+        if prefix_len < shorter or shorter != longer:
+            print(
+                f"  Prompt {prompt_idx}: token_ids len={len(base)} vs {len(ref)}, "
+                f"mismatches={longer - prefix_len}/{longer}"
+            )
+
+    if compared == 0:
+        raise ValueError("No tokens to compare")
+
+    mismatch_ratio = mismatched / compared
+    print(f"Token ID mismatches: {mismatched}/{compared} ({mismatch_ratio:.4%})")
+    return mismatch_ratio
+
+
+def compare_routed_experts(
+    baseline: list[np.ndarray],
+    reference: list[np.ndarray],
+    threshold: float = 0.05,
+) -> float:
+    """Compare two runs of routed experts. Returns mismatch ratio.
+
+    For each prompt only the common prefix of tokens counts as matching:
+    every position from the first diverging token up to the longer array's
+    end is a mismatch. Two tokens match when they hold the same expert IDs,
+    ignoring the order of IDs along the last axis.
+
+    Args:
+        baseline: Per-prompt routed-experts arrays from the first run.
+        reference: Per-prompt routed-experts arrays from the second run.
+        threshold: The mismatch ratio must stay strictly below this value.
+
+    Returns:
+        Mismatched positions divided by compared positions.
+
+    Raises:
+        AssertionError: If the runs cover a different number of prompts, or
+            the mismatch ratio is not below ``threshold``.
+        ValueError: If there is nothing to compare.
+    """
+    assert len(baseline) == len(reference), (
+        f"Length mismatch: {len(baseline)} vs {len(reference)}"
+    )
+
+    total_elements = 0
+    total_mismatches = 0
+
+    for i, (base, ref) in enumerate(zip(baseline, reference)):
+        min_len = min(len(base), len(ref))
+        max_len = max(len(base), len(ref))
+        if min_len == 0:
+            continue
+
+        matches = 0
+        for a, b in zip(base[:min_len], ref[:min_len]):
+            # Compare the actual IDs rather than their sums: different expert
+            # sets can share a sum (e.g. {1, 4} and {2, 3}). Sorting the last
+            # axis keeps the check insensitive to top-k ordering.
+            a_ids = np.sort(np.atleast_1d(a), axis=-1)
+            b_ids = np.sort(np.atleast_1d(b), axis=-1)
+            if not np.array_equal(a_ids, b_ids):
+                break
+            matches += 1
+
+        total_mismatches += max_len - matches
+        total_elements += max_len
+
+        if matches < min_len or len(base) != len(ref):
+            print(
+                f"  Prompt {i}: routed_experts len={len(base)} vs {len(ref)}, "
+                f"mismatches={max_len - matches}/{max_len}"
+            )
+
+    if total_elements == 0:
+        raise ValueError("No elements to compare")
+
+    mismatch_ratio = total_mismatches / total_elements
+    print(
+        f"Routed experts mismatches: {total_mismatches}/{total_elements} "
+        f"({mismatch_ratio:.4%})"
+    )
+
+    assert mismatch_ratio < threshold, (
+        f"Too many mismatches: {total_mismatches}/{total_elements} "
+        f"({mismatch_ratio:.4%}) exceeds threshold {threshold:.4%}"
+    )
+
+    return mismatch_ratio
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def main():
+    """Parse CLI options, run a baseline pass, validate it, and optionally
+    run a second pass to compare token IDs and routed experts."""
+    os.environ.setdefault("VLLM_BATCH_INVARIANT", "1")
+
+    cli = argparse.ArgumentParser(
+        description="Test routed experts capture for MoE models"
+    )
+    cli.add_argument("--model", type=str, default=DEFAULT_MODEL)
+    for flag, default in (
+        ("--tp", 1),
+        ("--max-model-len", 4096),
+        ("--num-prompts", 20),
+        ("--max-new-tokens", 50),
+    ):
+        cli.add_argument(flag, type=int, default=default)
+    cli.add_argument(
+        "--deterministic",
+        action="store_true",
+        help="Run twice and compare results for determinism check",
+    )
+    cli.add_argument(
+        "--threshold",
+        type=float,
+        default=0.05,
+        help="Maximum allowed mismatch ratio for determinism check",
+    )
+    args = cli.parse_args()
+
+    logging.basicConfig(level=logging.INFO)
+    prompts = TEST_PROMPTS[: args.num_prompts]
+
+    for banner in (
+        f"Model: {args.model}",
+        f"TP: {args.tp}",
+        f"Prompts: {len(prompts)}",
+        f"Max new tokens: {args.max_new_tokens}",
+    ):
+        print(banner)
+    print()
+
+    # Both passes use exactly the same settings.
+    run_kwargs = {
+        "model": args.model,
+        "prompts": prompts,
+        "max_new_tokens": args.max_new_tokens,
+        "tp": args.tp,
+        "max_model_len": args.max_model_len,
+    }
+
+    print("=== Run 1 (baseline) ===")
+    baseline = run_inference(**run_kwargs)
+    print(f"num_experts (from model config): {baseline.num_experts}")
+
+    print("\n=== Validation ===")
+    validate_shapes(baseline.experts_list)
+    validate_expert_ids(baseline.experts_list, num_experts=baseline.num_experts)
+    print(f"All {len(baseline.experts_list)} results passed validation.")
+
+    for i, experts in enumerate(baseline.experts_list):
+        print(
+            f"  Prompt {i}: shape={experts.shape}, "
+            f"min={experts.min()}, max={experts.max()}"
+        )
+
+    if args.deterministic:
+        print("\n=== Run 2 (reference) ===")
+        reference = run_inference(**run_kwargs)
+
+        print("\n=== Determinism Check ===")
+        validate_expert_ids(reference.experts_list, num_experts=baseline.num_experts)
+
+        print("\n--- Token IDs ---")
+        token_mismatch = compare_token_ids(
+            baseline.token_ids_list, reference.token_ids_list
+        )
+
+        print("\n--- Routed Experts ---")
+        expert_mismatch = compare_routed_experts(
+            baseline.experts_list,
+            reference.experts_list,
+            threshold=args.threshold,
+        )
+
+        print(
+            f"\nDeterminism check passed. "
+            f"Token mismatch: {token_mismatch:.4%}, "
+            f"Expert mismatch: {expert_mismatch:.4%}"
+        )
+
+    print("\nAll tests passed!")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/run_one_batch.py
+++ b/examples/offline_inference/run_one_batch.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+from vllm import LLM, EngineArgs
+from vllm.config import ProfilerConfig
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+DEFAULT_MAX_TOKENS = 16
+
+
+def create_parser() -> FlexibleArgumentParser:
+    """Build the CLI parser: engine arguments plus batch and profiling options."""
+    cli = FlexibleArgumentParser()
+    EngineArgs.add_cli_args(cli)
+    cli.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+
+    batch_opts = cli.add_argument_group("Batch parameters")
+    for flag, kind, default in (
+        ("--batch-size", int, 1),
+        ("--prompt-size", int, 128),
+        ("--prompt-prefix", str, "Hello, my name is"),
+    ):
+        batch_opts.add_argument(flag, type=kind, default=default)
+
+    profiling_opts = cli.add_argument_group("Profiling parameters")
+    profiling_opts.add_argument(
+        "--profile",
+        choices=["none", "prefill", "decode", "both"],
+        default="none",
+    )
+    profiling_opts.add_argument(
+        "--profile-dir",
+        type=str,
+        default="",
+        help="Required when --profile is not 'none'.",
+    )
+
+    return cli
+
+
+def _build_prompt(prefix: str, prompt_size: int) -> str:
+    """Return ``prefix`` repeated and cut to exactly ``prompt_size`` characters.
+
+    A non-positive size yields an empty string; an empty prefix is treated as
+    a single space.
+    """
+    if prompt_size <= 0:
+        return ""
+    unit = prefix or " "
+    # Ceiling division: the fewest copies that reach prompt_size characters.
+    copies = -(-prompt_size // len(unit))
+    return (unit * copies)[:prompt_size]
+
+
+def _build_profiler_config(
+    profile: str, profile_dir: str, max_tokens: int
+) -> ProfilerConfig | None:
+    """Translate the ``--profile`` choice into a torch ProfilerConfig.
+
+    Returns None when profiling is disabled.
+
+    Raises:
+        ValueError: If profiling is requested without a profile directory.
+    """
+    if profile == "none":
+        return None
+    if not profile_dir:
+        raise ValueError("--profile-dir must be set when profiling is enabled.")
+
+    # (delay_iterations, max_iterations) per mode; any other mode gets (0, 0).
+    windows = {
+        "prefill": (0, 1),
+        "decode": (1, max(1, max_tokens)),
+    }
+    delay, limit = windows.get(profile, (0, 0))
+
+    return ProfilerConfig(
+        profiler="torch",
+        torch_profiler_dir=profile_dir,
+        delay_iterations=delay,
+        max_iterations=limit,
+    )
+
+
+def main(args: dict) -> None:
+    """Run one fixed-size batch through the LLM, optionally under the profiler.
+
+    Args:
+        args: Parsed CLI arguments as a dict. The batch and profiling keys are
+            popped off; everything left is passed to ``LLM`` as keyword
+            arguments.
+
+    Raises:
+        ValueError: If profiling is requested without ``--profile-dir``.
+    """
+    max_tokens = DEFAULT_MAX_TOKENS
+    batch_size = args.pop("batch_size")
+    prompt_size = args.pop("prompt_size")
+    prompt_prefix = args.pop("prompt_prefix")
+    profile = args.pop("profile")
+    profile_dir = args.pop("profile_dir")
+
+    profiler_config = _build_profiler_config(profile, profile_dir, max_tokens)
+    if profiler_config is not None:
+        args["profiler_config"] = profiler_config
+
+    llm = LLM(**args)
+
+    # Force exactly max_tokens generated tokens per request.
+    sampling_params = llm.get_default_sampling_params()
+    sampling_params.max_tokens = max_tokens
+    sampling_params.min_tokens = max_tokens
+    sampling_params.ignore_eos = True
+
+    prompt = _build_prompt(prompt_prefix, prompt_size)
+    prompts = [prompt] * batch_size
+
+    profiling = profile != "none"
+    if profiling:
+        llm.start_profile()
+    try:
+        outputs = llm.generate(prompts, sampling_params)
+    finally:
+        # Stop the profiler even if generation fails, so it is never left
+        # running.
+        if profiling:
+            llm.stop_profile()
+
+    print("-" * 50)
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {output.prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    parser = create_parser()
+    main(vars(parser.parse_args()))
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -5,14 +5,9 @@ from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams
 from vllm.benchmarks.datasets import add_dataset_parser, get_samples
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.metrics.reader import Counter, Vector

-try:
-    from vllm.utils.argparse_utils import FlexibleArgumentParser
-except ImportError:
-    from argparse import ArgumentParser as FlexibleArgumentParser
-
-
 QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",

--- a/examples/online_serving/dashboards/README.md
+++ b/examples/online_serving/dashboards/README.md
@@ -34,7 +34,7 @@ deployment methods:
 Both platforms provide equivalent monitoring capabilities:

 | Dashboard | Description |
-|-----------|-------------|
+| --------- | ----------- |
 | **Performance Statistics** | Tracks latency, throughput, and performance metrics |
 | **Query Statistics** | Monitors request volume, query performance, and KPIs |


--- a/examples/online_serving/dashboards/grafana/query_statistics.json
+++ b/examples/online_serving/dashboards/grafana/query_statistics.json
@@ -349,7 +349,7 @@
        "defaults": {
          "color": { "mode": "thresholds" },
          "mappings": [
-            { "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
+            { "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
          ],
          "thresholds": {
            "mode": "absolute",

--- a/examples/online_serving/data_parallel_pause_resume.py
+++ b/examples/online_serving/data_parallel_pause_resume.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test pause/resume with Data Parallel (DP) via HTTP API.
+
+This example demonstrates coordinated pause/resume across multiple DP ranks.
+The pause synchronizes across all DP engines via all-reduce.
+
+Prerequisites:
+    Start a vLLM server with data parallelism:
+
+    $ VLLM_SERVER_DEV_MODE=1 vllm serve facebook/opt-125m \
+        --enforce-eager \
+        --data-parallel-size 4 \
+        --tensor-parallel-size 1
+
+    Then run this script:
+
+    $ python data_parallel_pause_resume.py
+
+The test verifies pause works by:
+1. Starting a streaming generation request
+2. Pausing the server mid-generation
+3. Sleeping for PAUSE_DURATION seconds
+4. Resuming the server
+5. Verifying there was a gap in token generation matching the pause duration
+"""
+
+import argparse
+import threading
+import time
+
+import requests
+from openai import OpenAI
+
+BASE_URL = "http://localhost:8000"
+MODEL_NAME = "facebook/opt-125m"
+PAUSE_DURATION = 3.0
+
+
+def pause_generation(base_url: str, mode: str = "keep") -> None:
+    """POST to the server's ``/pause`` endpoint with ``mode`` as a query parameter.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+    """
+    reply = requests.post(f"{base_url}/pause", params={"mode": mode}, timeout=60)
+    reply.raise_for_status()
+    print("Server paused")
+
+
+def resume_generation(base_url: str) -> None:
+    """POST to the server's ``/resume`` endpoint.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+    """
+    reply = requests.post(f"{base_url}/resume", timeout=60)
+    reply.raise_for_status()
+    print("Server resumed")
+
+
+def main():
+    """Stream a completion, pause and resume the server mid-stream, and check
+    that the token timestamps show a gap matching the pause duration."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-url", default=BASE_URL)
+    parser.add_argument("--model", default=MODEL_NAME)
+    args = parser.parse_args()
+
+    client = OpenAI(
+        base_url=f"{args.base_url}/v1",
+        api_key="EMPTY",
+    )
+
+    prompt = "Write a long story about a dragon. Once upon a time"
+    token_times: list[float] = []
+    # Number of tokens received when the pause took effect; 0 = never paused.
+    pause_token_idx = 0
+    pause_triggered = threading.Event()
+    stream_done = threading.Event()
+
+    def generator_thread():
+        """Stream tokens and record timestamps."""
+        try:
+            stream = client.completions.create(
+                model=args.model,
+                prompt=prompt,
+                max_tokens=50,
+                stream=True,
+            )
+            for chunk in stream:
+                if chunk.choices[0].text:
+                    token_times.append(time.monotonic())
+                    token_count = len(token_times)
+                    print(f"Token {token_count}: {chunk.choices[0].text!r}")
+
+                    # Signal controller after some tokens
+                    if token_count >= 5 and not pause_triggered.is_set():
+                        pause_triggered.set()
+        finally:
+            # Always release the controller: if the stream fails or ends
+            # before five tokens arrive, it would otherwise wait forever and
+            # the join below would hang.
+            stream_done.set()
+            pause_triggered.set()
+
+    def controller_thread():
+        """Pause and resume the server."""
+        nonlocal pause_token_idx
+
+        # Wait for some tokens
+        pause_triggered.wait()
+        if stream_done.is_set():
+            print("\nGeneration ended before the server could be paused.")
+            return
+
+        print(f"\nPausing server (keep mode) at token {len(token_times)}...")
+        pause_generation(args.base_url, mode="keep")
+        try:
+            pause_token_idx = len(token_times)
+            print(f"Sleeping for {PAUSE_DURATION}s...")
+            time.sleep(PAUSE_DURATION)
+        finally:
+            # Never leave the server paused, even if the wait is interrupted.
+            print("Resuming server...")
+            resume_generation(args.base_url)
+        print("Resumed!\n")
+
+    # Run both threads
+    gen_thread = threading.Thread(target=generator_thread)
+    ctrl_thread = threading.Thread(target=controller_thread)
+
+    gen_thread.start()
+    ctrl_thread.start()
+
+    gen_thread.join()
+    ctrl_thread.join()
+
+    # Check gap at the pause point
+    if pause_token_idx == 0:
+        print("Test failed! Generation finished before the server was paused.")
+    elif pause_token_idx < len(token_times):
+        pause_gap = token_times[pause_token_idx] - token_times[pause_token_idx - 1]
+        print(
+            f"\nGap after pause (token {pause_token_idx} -> "
+            f"{pause_token_idx + 1}): {pause_gap:.3f}s"
+        )
+        if pause_gap >= PAUSE_DURATION * 0.9:
+            print("Test passed! Pause synchronized across DP ranks.")
+        else:
+            print(f"Test failed! Expected ~{PAUSE_DURATION}s gap, got {pause_gap:.3f}s")
+    else:
+        print("Test failed! No tokens were generated after resuming.")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/disaggregated_encoder/README.md
+++ b/examples/online_serving/disaggregated_encoder/README.md
@@ -95,7 +95,7 @@ If you enable prefill instance (`--prefill-servers-urls` not disabled), you will
 ## Proxy Instance Flags (`disagg_epd_proxy.py`)

 | Flag | Description |
-|------|-------------|
+| ---- | ----------- |
 | `--encode-servers-urls` | Comma-separated list of encoder endpoints. Every multimodal item extracted from the request is fanned out to one of these URLs in a round-robin fashion. |
 | `--prefill-servers-urls` | Comma-separated list of prefill endpoints. Set to `disable`, `none`, or `""` to skip the dedicated prefill phase and run E+PD (encoder + combined prefill/decode). |
 | `--decode-servers-urls` | Comma-separated list of decode endpoints. Non-stream and stream paths both round-robin over this list. |

--- a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
@@ -8,7 +8,7 @@ declare -a PIDS=()
 ###############################################################################
 MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
 LOG_PATH="${LOG_PATH:-./logs}"
-mkdir -p $LOG_PATH
+mkdir -p "$LOG_PATH"

 ENCODE_PORT="${ENCODE_PORT:-19534}"
 PREFILL_PORT="${PREFILL_PORT:-19535}"
@@ -84,10 +84,10 @@ trap cleanup TERM

 # clear previous cache
 echo "remove previous ec cache folder"
-rm -rf $EC_SHARED_STORAGE_PATH
+rm -rf "$EC_SHARED_STORAGE_PATH"

 echo "make ec cache folder"
-mkdir -p $EC_SHARED_STORAGE_PATH
+mkdir -p "$EC_SHARED_STORAGE_PATH"

 ###############################################################################
 # Encoder worker
@@ -100,7 +100,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
    --no-enable-prefix-caching \
    --max-num-batched-tokens 114688 \
    --max-num-seqs 128 \
-    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_producer",
@@ -124,7 +124,7 @@ vllm serve "$MODEL" \
    --enforce-eager \
    --enable-request-id-headers \
    --max-num-seqs 128 \
-    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_consumer",
@@ -152,7 +152,7 @@ vllm serve "$MODEL" \
    --enforce-eager \
    --enable-request-id-headers \
    --max-num-seqs 128 \
-    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --kv-transfer-config '{
        "kv_connector": "NixlConnector",
        "kv_role": "kv_consumer"
@@ -162,9 +162,9 @@ vllm serve "$MODEL" \
 PIDS+=($!)

 # Wait for workers
-wait_for_server $ENCODE_PORT
-wait_for_server $PREFILL_PORT
-wait_for_server $DECODE_PORT
+wait_for_server "$ENCODE_PORT"
+wait_for_server "$PREFILL_PORT"
+wait_for_server "$DECODE_PORT"

 ###############################################################################
 # Proxy
@@ -179,7 +179,7 @@ python disagg_epd_proxy.py \

 PIDS+=($!)

-wait_for_server $PROXY_PORT
+wait_for_server "$PROXY_PORT"
 echo "All services are up!"

 ###############################################################################
@@ -187,14 +187,14 @@ echo "All services are up!"
 ###############################################################################
 echo "Running benchmark (stream)..."
 vllm bench serve \
-  --model               $MODEL \
+  --model               "$MODEL" \
  --backend             openai-chat \
  --endpoint            /v1/chat/completions \
  --dataset-name        hf \
  --dataset-path        lmarena-ai/VisionArena-Chat \
  --seed                0 \
-  --num-prompts         $NUM_PROMPTS \
-  --port                $PROXY_PORT
+  --num-prompts         "$NUM_PROMPTS" \
+  --port                "$PROXY_PORT"

 PIDS+=($!)

@@ -202,10 +202,10 @@ PIDS+=($!)
 # Single request with local image
 ###############################################################################
 echo "Running single request with local image (non-stream)..."
-curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
+curl http://127.0.0.1:"${PROXY_PORT}"/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
-    "model": "'${MODEL}'",
+    "model": "'"${MODEL}"'",
    "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [

--- a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
@@ -8,7 +8,7 @@ declare -a PIDS=()
 ###############################################################################
 MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
 LOG_PATH="${LOG_PATH:-./logs}"
-mkdir -p $LOG_PATH
+mkdir -p "$LOG_PATH"

 ENCODE_PORT="${ENCODE_PORT:-19534}"
 PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
@@ -78,10 +78,10 @@ trap cleanup TERM

 # clear previous cache
 echo "remove previous ec cache folder"
-rm -rf $EC_SHARED_STORAGE_PATH
+rm -rf "$EC_SHARED_STORAGE_PATH"

 echo "make ec cache folder"
-mkdir -p $EC_SHARED_STORAGE_PATH
+mkdir -p "$EC_SHARED_STORAGE_PATH"

 ###############################################################################
 # Encoder worker
@@ -94,7 +94,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
    --no-enable-prefix-caching \
    --max-num-batched-tokens 114688 \
    --max-num-seqs 128 \
-    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_producer",
@@ -115,7 +115,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
    --enforce-eager \
    --enable-request-id-headers \
    --max-num-seqs 128 \
-    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
+    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_consumer",
@@ -128,8 +128,8 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
 PIDS+=($!)

 # Wait for workers
-wait_for_server $ENCODE_PORT
-wait_for_server $PREFILL_DECODE_PORT
+wait_for_server "$ENCODE_PORT"
+wait_for_server "$PREFILL_DECODE_PORT"

 ###############################################################################
 # Proxy
@@ -144,7 +144,7 @@ python disagg_epd_proxy.py \

 PIDS+=($!)

-wait_for_server $PROXY_PORT
+wait_for_server "$PROXY_PORT"
 echo "All services are up!"

 ###############################################################################
@@ -152,14 +152,14 @@ echo "All services are up!"
 ###############################################################################
 echo "Running benchmark (stream)..."
 vllm bench serve \
-  --model               $MODEL \
+  --model               "$MODEL" \
  --backend             openai-chat \
  --endpoint            /v1/chat/completions \
  --dataset-name        hf \
  --dataset-path        lmarena-ai/VisionArena-Chat \
  --seed                0 \
-  --num-prompts         $NUM_PROMPTS \
-  --port                $PROXY_PORT
+  --num-prompts         "$NUM_PROMPTS" \
+  --port                "$PROXY_PORT"

 PIDS+=($!)

@@ -167,10 +167,10 @@ PIDS+=($!)
 # Single request with local image
 ###############################################################################
 echo "Running single request with local image (non-stream)..."
-curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
+curl http://127.0.0.1:"${PROXY_PORT}"/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
-    "model": "'${MODEL}'",
+    "model": "'"${MODEL}"'",
    "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [

--- a/examples/online_serving/disaggregated_prefill.sh
+++ b/examples/online_serving/disaggregated_prefill.sh
@@ -54,7 +54,7 @@ wait_for_server() {
 # You can also adjust --kv-ip and --kv-port for distributed inference.

 # prefilling instance, which is the KV producer
-CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
+CUDA_VISIBLE_DEVICES=0 vllm serve "$MODEL_NAME" \
    --host 0.0.0.0 \
    --port 8100 \
    --max-model-len 100 \
@@ -64,7 +64,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' &

 # decoding instance, which is the KV consumer  
-CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
+CUDA_VISIBLE_DEVICES=1 vllm serve "$MODEL_NAME" \
    --host 0.0.0.0 \
    --port 8200 \
    --max-model-len 100 \

--- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
+++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
@@ -328,9 +328,9 @@ class Proxy:
        if instance_type == "decode" and instance in self.decode_instances:
            self.decode_instances.remove(instance)
            self.decode_cycler = itertools.cycle(self.decode_instances)
-        if instance_type == "prefill" and instance in self.decode_instances:
+        if instance_type == "prefill" and instance in self.prefill_instances:
            self.prefill_instances.remove(instance)
-            self.prefill_cycler = itertools.cycle(self.decode_instances)
+            self.prefill_cycler = itertools.cycle(self.prefill_instances)


 class RoundRobinSchedulingPolicy(SchedulingPolicy):

--- a/examples/online_serving/disaggregated_serving/kv_events.sh
+++ b/examples/online_serving/disaggregated_serving/kv_events.sh
@@ -34,7 +34,7 @@ wait_for_server() {
    done" && return 0 || return 1
 }

-vllm serve $MODEL_NAME \
+vllm serve "$MODEL_NAME" \
    --port 8100 \
    --max-model-len 100 \
    --enforce-eager \

--- a/examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh
+++ b/examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh
@@ -143,7 +143,7 @@ main() {
    IFS=',' read -ra BOOTSTRAP_PORT_ARRAY <<< "$BOOTSTRAP_PORTS"
    IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"

-    proxy_param=""
+    proxy_args=()

    # =============================================================================
    # Launch Prefill Servers (X Producers)
@@ -156,12 +156,12 @@ main() {
        local bootstrap_port=${BOOTSTRAP_PORT_ARRAY[$i]}

        echo "  Prefill server $((i+1)): GPU $gpu_id, Port $port, Bootstrap Port $bootstrap_port"
-        VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
-        --port $port \
+        VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
+        --port "$port" \
        --kv-transfer-config \
        "{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\"}" > prefill$((i+1)).log 2>&1 &
        PIDS+=($!)
-        proxy_param="${proxy_param} --prefill http://0.0.0.0:${port} $bootstrap_port"
+        proxy_args+=(--prefill "http://0.0.0.0:${port}" "$bootstrap_port")
    done

    # =============================================================================
@@ -174,12 +174,12 @@ main() {
        local port=${DECODE_PORT_ARRAY[$i]}

        echo "  Decode server $((i+1)): GPU $gpu_id, Port $port"
-        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
-        --port $port \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
+        --port "$port" \
        --kv-transfer-config \
        "{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\"}" > decode$((i+1)).log 2>&1 &
        PIDS+=($!)
-        proxy_param="${proxy_param} --decode http://0.0.0.0:${port}"
+        proxy_args+=(--decode "http://0.0.0.0:${port}")
    done

    # =============================================================================
@@ -187,7 +187,7 @@ main() {
    # =============================================================================
    echo ""
    echo "Starting proxy server on port $PROXY_PORT..."
-    python3 mooncake_connector_proxy.py $proxy_param --port $PROXY_PORT > proxy.log 2>&1 &
+    python3 mooncake_connector_proxy.py "${proxy_args[@]}" --port "$PROXY_PORT" > proxy.log 2>&1 &
    PIDS+=($!)

    # =============================================================================
@@ -196,9 +196,10 @@ main() {
    echo ""
    echo "Waiting for all servers to start..."
    for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
-        if ! wait_for_server $port; then
+        if ! wait_for_server "$port"; then
            echo "Failed to start server on port $port"
            cleanup
+            # shellcheck disable=SC2317
            exit 1
        fi
    done
@@ -209,8 +210,8 @@ main() {
    # =============================================================================
    # Run Benchmark
    # =============================================================================
-    vllm bench serve --port $PROXY_PORT --seed $(date +%s) \
-        --backend vllm --model $MODEL \
+    vllm bench serve --port "$PROXY_PORT" --seed "$(date +%s)" \
+        --backend vllm --model "$MODEL" \
        --dataset-name random --random-input-len 7500 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log


--- a/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
+++ b/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
@@ -14,6 +14,10 @@ import regex as re
 import zmq
 from quart import Quart, make_response, request

+from vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_common import (
+    MoRIIOConstants,
+)
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 prefill_instances: list[dict] = []
@@ -213,6 +217,8 @@ async def handle_request():

        dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"])

+        transfer_id = f"{MoRIIOConstants.TRANSFER_PREFIX}-{str(uuid.uuid4())}"
+
        req_data_to_prefill = copy.deepcopy(req_data)
        req_data_to_prefill["kv_transfer_params"] = {}
        req_data["kv_transfer_params"] = {}
@@ -222,6 +228,7 @@ async def handle_request():
        req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = (
            decode_instance_endpoint["tp_size"]
        )
+        req_data_to_prefill["kv_transfer_params"]["transfer_id"] = transfer_id

        send_prefill_task = asyncio.create_task(
            send_request_to_prefill(
@@ -267,6 +274,7 @@ async def handle_request():

        if selected_prefill_dp_rank is not None:
            req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank
+        req_data["kv_transfer_params"]["transfer_id"] = transfer_id

        decode_request_task = asyncio.create_task(
            start_decode_request(

--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
@@ -166,10 +166,10 @@ main() {
        local kv_port=$((21001 + i))

        echo "  Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
        --enforce-eager \
        --host 0.0.0.0 \
-        --port $port \
+        --port "$port" \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --dtype float16 \
@@ -194,10 +194,10 @@ main() {
        local kv_port=$((22001 + i))

        echo "  Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
        --enforce-eager \
        --host 0.0.0.0 \
-        --port $port \
+        --port "$port" \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --dtype float16 \
@@ -217,9 +217,10 @@ main() {
    echo ""
    echo "Waiting for all servers to start..."
    for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
-        if ! wait_for_server $port; then
+        if ! wait_for_server "$port"; then
            echo "Failed to start server on port $port"
            cleanup
+            # shellcheck disable=SC2317
            exit 1
        fi
    done
@@ -231,8 +232,8 @@ main() {
    # Run Benchmark
    # =============================================================================
    cd ../../../benchmarks/
-    vllm bench serve --port 10001 --seed $(date +%s) \
-        --model $MODEL \
+    vllm bench serve --port 10001 --seed "$(date +%s)" \
+        --model "$MODEL" \
        --dataset-name random --random-input-len 7500 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log


--- a/examples/online_serving/ec_both_encoder/ec_both_encoder.sh
+++ b/examples/online_serving/ec_both_encoder/ec_both_encoder.sh
+#!/bin/bash
+# Start one vLLM server with the EC connector role "ec_both", wait until it
+# answers HTTP requests, then run `vllm bench serve` against it.
+#
+# Each setting below keeps its value from the environment when set and
+# non-empty, and falls back to the default shown otherwise.
+set -euo pipefail
+
+# Model passed to both `vllm serve` and `vllm bench serve`.
+MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
+# Port the server listens on and the benchmark connects to.
+PORT="${PORT:-8000}"
+# Value exported as CUDA_VISIBLE_DEVICES for the server process.
+GPU="${GPU:-0}"
+# Number of prompts sent by the benchmark.
+NUM_PROMPTS="${NUM_PROMPTS:-200}"
+# Directory handed to the connector as "shared_storage_path".
+# NOTE: this directory is deleted and recreated every time the script runs.
+EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
+# Maximum number of seconds wait_for_server polls before giving up.
+TIMEOUT="${TIMEOUT:-600}"
+
+# PID of the background server; empty until the server has been launched.
+SERVER_PID=""
+
+# Stop the background vLLM server if one was started and is still running.
+# Safe to call when SERVER_PID is empty or the process has already exited;
+# in that case only the two status messages are printed.
+cleanup() {
+    echo "Stopping server..."
+    if [[ -n "$SERVER_PID" ]] && kill -0 "$SERVER_PID" 2>/dev/null; then
+        # Best effort: the server may exit between the check and the kill.
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    echo "Done."
+}
+# Clean up exactly once, when the shell exits. A signal handler that simply
+# returns lets bash resume the script after the interrupted command (with the
+# server already killed), so INT and TERM exit explicitly with the conventional
+# 128 + signal-number status and leave the actual cleanup to the EXIT trap.
+trap cleanup EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+# Poll http://localhost:$PORT/v1/models every 2 seconds until it responds.
+#
+# Returns 0 as soon as the endpoint answers successfully.
+# Returns 1 when TIMEOUT seconds have elapsed, or when the background server
+# process recorded in SERVER_PID is no longer running.
+wait_for_server() {
+    local deadline=$((SECONDS + TIMEOUT))
+    echo "Waiting for server on port $PORT..."
+    while (( SECONDS < deadline )); do
+        if curl -sf "http://localhost:${PORT}/v1/models" > /dev/null 2>&1; then
+            echo "Server ready."
+            return 0
+        fi
+        # Fail fast: if the server process has already exited there is no
+        # point in polling until the deadline.
+        if [[ -n "${SERVER_PID:-}" ]] && ! kill -0 "$SERVER_PID" 2>/dev/null; then
+            echo "ERROR: Server process exited before becoming ready"
+            return 1
+        fi
+        sleep 2
+    done
+    echo "ERROR: Server did not start within ${TIMEOUT}s"
+    return 1
+}
+
+# Start from an empty storage directory: anything already under
+# EC_SHARED_STORAGE_PATH is deleted before the server is launched.
+rm -rf "$EC_SHARED_STORAGE_PATH"
+mkdir -p "$EC_SHARED_STORAGE_PATH"
+
+###############################################################################
+# Start server with ec_both
+###############################################################################
+# The JSON argument is single-quoted; the '"$EC_SHARED_STORAGE_PATH"' sequence
+# closes the single quote, splices in the double-quoted variable, and reopens
+# it, so the path ends up inside the JSON string literal.
+# "$@" forwards any arguments given to this script to `vllm serve`, and the
+# trailing & runs the server in the background.
+CUDA_VISIBLE_DEVICES="$GPU" \
+vllm serve "$MODEL" \
+    --port "$PORT" \
+    --enforce-eager \
+    --ec-transfer-config '{
+        "ec_connector": "ECExampleConnector",
+        "ec_role": "ec_both",
+        "ec_connector_extra_config": {
+            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+        }
+    }' \
+    "$@" &
+
+# Remember the background server's PID so it can be stopped later, then block
+# until the server answers; under `set -e` a failed wait aborts the script.
+SERVER_PID=$!
+wait_for_server
+
+###############################################################################
+# Benchmark -- dataset contains duplicate images, exercises cache hits
+###############################################################################
+echo "Running benchmark ($NUM_PROMPTS prompts)..."
+vllm bench serve \
+    --model "$MODEL" \
+    --backend openai-chat \
+    --endpoint /v1/chat/completions \
+    --dataset-name hf \
+    --dataset-path lmarena-ai/VisionArena-Chat \
+    --seed 0 \
+    --num-prompts "$NUM_PROMPTS" \
+    --port "$PORT"
+
+echo "Benchmark complete."
--- a/examples/online_serving/elastic_ep/bench.sh
+++ b/examples/online_serving/elastic_ep/bench.sh
@@ -50,8 +50,8 @@ while [[ $# -gt 0 ]]; do
 done

 vllm bench serve \
-    --model $MODEL_NAME \
-    --host $HOST \
-    --port $PORT \
-    --num-prompts $NUM_PROMPTS \
-    --request-rate $REQUEST_RATE
+    --model "$MODEL_NAME" \
+    --host "$HOST" \
+    --port "$PORT" \
+    --num-prompts "$NUM_PROMPTS" \
+    --request-rate "$REQUEST_RATE"
--- a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
+++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
@@ -57,15 +57,15 @@ echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALL
 export RAY_DEDUP_LOGS=0
 export VLLM_USE_DEEP_GEMM=1

-vllm serve $MODEL_NAME \
-    --data-parallel-size $DATA_PARALLEL_SIZE \
-    --data-parallel-size-local $DATA_PARALLEL_SIZE \
+vllm serve "$MODEL_NAME" \
+    --data-parallel-size "$DATA_PARALLEL_SIZE" \
+    --data-parallel-size-local "$DATA_PARALLEL_SIZE" \
    --data-parallel-backend ray \
    --enforce-eager \
    --enable-expert-parallel \
    --enable-eplb \
-    --all2all-backend pplx \
-    --num-redundant-experts $REDUNDANT_EXPERTS \
+    --all2all-backend allgather_reducescatter \
+    --num-redundant-experts "$REDUNDANT_EXPERTS" \
    --trust-remote-code \
-    --host $HOST \
-    --port $PORT
+    --host "$HOST" \
+    --port "$PORT"