feat: sglang to 0.5.9 + updated docs (#6518)

Co-authored-by: baihuitian <baihuitian.bht@gmail.com> Co-authored-by: Cursor <cursoragent@cursor.com>

feat: sglang to 0.5.9 + updated docs (#6518)
Co-authored-by: baihuitian <baihuitian.bht@gmail.com> Co-authored-by: Cursor <cursoragent@cursor.com>
6642e23e · ishandhanani · GitHub · 1df620b4 · 1df620b4 · 1df620b4
Unverified Commit 6642e23e authored Feb 24, 2026 by ishandhanani Committed by GitHub Feb 24, 2026
11 changed files
--- a/examples/backends/sglang/slurm_jobs/scripts/sglang/bench.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/sglang/bench.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-n_prefill=$1
-n_decode=$2
-prefill_gpus=$3
-decode_gpus=$4
-total_gpus=$((prefill_gpus+decode_gpus))
-chosen_isl=$5
-chosen_osl=$6
-concurrency_list=$7
-IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
-chosen_req_rate=$8
-echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[@]}; ${chosen_req_rate}"
-head_node="localhost"
-head_port="8000"
-SERVED_MODEL_NAME="deepseek-ai/DeepSeek-R1"
-MODEL_PATH=/model/
-source /scripts/benchmark_utils.sh
-wait_for_model $head_node $head_port $n_prefill $n_decode 5 900 60
-sleep 300
-set -e
-warmup_model $head_node $head_port $SERVED_MODEL_NAME $MODEL_PATH "${chosen_isl}x${chosen_osl}x10000x10000x250"
-set +e
-profile_folder="/logs/sglang_isl_${chosen_isl}_osl_${chosen_osl}"
-mkdir -p $profile_folder
-for max_concurrency in ${chosen_concurrencies[@]}; do
-    chosen_n_requests=$((5*max_concurrency))
-    export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}.json"
-    command=(
-        python3 -m sglang.bench_serving
-        --base-url "http://${head_node}:${head_port}"
-        --model ${SERVED_MODEL_NAME} --tokenizer ${MODEL_PATH}
-        --backend sglang-oai
-        --dataset-name random --random-input ${chosen_isl} --random-output ${chosen_osl}
-        --random-range-ratio 1
-        --num-prompts ${chosen_n_requests} --request-rate ${chosen_req_rate} --max-concurrency ${max_concurrency}
-        --output-file $export_file
-    )
-    echo "Running command ${command[@]}"
-    ${command[@]}
-    echo "-----------------------------------------"
-done
--- a/examples/backends/sglang/slurm_jobs/scripts/sglang_bench_serving.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/sglang_bench_serving.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-### Benchmark configuration and setup
-# Benchmarking script setup - ISL/OSL/concurrencies/request_rate
-chosen_isl=1024
-chosen_osl=1024
-chosen_req_rate=250
-chosen_concurrencies=(2 10 20 50 100 200 500 1000 2000 2500 3000 3500 4000 4500 5000 7500 10000 12500 15000 16250 17500 18750 20000)
-# Model config setup - frontend URL, model name, and path
-head_node="localhost"
-head_port="8000"
-SERVED_MODEL_NAME="deepseek-ai/DeepSeek-R1"
-MODEL_PATH=/model/
-# This file contains `wait_for_model` and `warmup_model`
-source /scripts/benchmark_utils.sh
-### Benchmark runs
-# 1. wait for model to come alive - `wait_for_model`
-# 2. warms up the model - `warmup_model`
-# 3. benchmark model - for concurrency in concurrencies; do <benchmark script>; done
-wait_for_model $head_node $head_port 5 2400 60
-set -e
-warmup_model $head_node $head_port $SERVED_MODEL_NAME $MODEL_PATH "${chosen_isl}x${chosen_osl}x10000x10000x${chosen_req_rate}"
-set +e
-for max_concurrency in ${chosen_concurrencies[@]}; do
-    chosen_n_requests=$((5*max_concurrency))
-    command=(
-        python3 -m sglang.bench_serving
-        --base-url "http://${head_node}:${head_port}"
-        --model ${SERVED_MODEL_NAME} --tokenizer ${MODEL_PATH}
-        --backend sglang-oai
-        --dataset-name random --random-input ${chosen_isl} --random-output ${chosen_osl}
-        --random-range-ratio 1
-        --num-prompts ${chosen_n_requests} --request-rate ${chosen_req_rate} --max-concurrency ${max_concurrency}
-    )
-    echo "Running command ${command[@]}"
-    ${command[@]}
-    echo "-----------------------------------------"
-done
--- a/examples/backends/sglang/slurm_jobs/scripts/vllm/backend_request_func.py
+++ b/examples/backends/sglang/slurm_jobs/scripts/vllm/backend_request_func.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# pytest: skip-file
-import json
-import os
-import sys
-import time
-import traceback
-from dataclasses import dataclass, field
-from typing import List, Optional, Union
-import aiohttp
-import huggingface_hub.constants
-from tqdm.asyncio import tqdm
-from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
-@dataclass
-class RequestFuncInput:
-    prompt: str
-    api_url: str
-    prompt_len: int
-    output_len: int
-    model: str
-    model_name: Optional[str] = None
-    best_of: int = 1
-    logprobs: Optional[int] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict] = None
-    ignore_eos: bool = False
-@dataclass
-class RequestFuncOutput:
-    generated_text: str = ""
-    success: bool = False
-    latency: float = 0.0
-    # output_tokens: int = 0
-    output_tokens: Optional[int] = None
-    ttft: float = 0.0  # Time to first token
-    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
-    tpot: float = 0.0  # avg next-token latencies
-    prompt_len: int = 0
-    error: str = ""
-async def async_request_tgi(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith("generate_stream")
-    async with aiohttp.ClientSession(
-        trust_env=True, timeout=AIOHTTP_TIMEOUT
-    ) as session:
-        params = {
-            "best_of": request_func_input.best_of,
-            "max_new_tokens": request_func_input.output_len,
-            "do_sample": True,
-            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
-            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
-            "truncate": request_func_input.prompt_len,
-            # TGI does not accept ignore_eos flag.
-        }
-        payload = {
-            "inputs": request_func_input.prompt,
-            "parameters": params,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-                        chunk_bytes = chunk_bytes.decode("utf-8")
-                        # NOTE: Sometimes TGI returns a ping response without
-                        # any data, we should skip it.
-                        if chunk_bytes.startswith(":"):
-                            continue
-                        chunk = chunk_bytes.removeprefix("data:")
-                        data = json.loads(chunk)
-                        timestamp = time.perf_counter()
-                        # First token
-                        if ttft == 0.0:
-                            ttft = time.perf_counter() - st
-                            output.ttft = ttft
-                        # Decoding phase
-                        else:
-                            output.itl.append(timestamp - most_recent_timestamp)
-                        most_recent_timestamp = timestamp
-                    output.latency = most_recent_timestamp - st
-                    output.success = True
-                    output.generated_text = data["generated_text"]
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-        if pbar:
-            pbar.update(1)
-        return output
-async def async_request_trt_llm(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith("generate_stream")
-    async with aiohttp.ClientSession(
-        trust_env=True, timeout=AIOHTTP_TIMEOUT
-    ) as session:
-        assert request_func_input.best_of == 1
-        payload = {
-            "accumulate_tokens": True,
-            "text_input": request_func_input.prompt,
-            "temperature": 0.0,
-            "top_p": 1.0,
-            "max_tokens": request_func_input.output_len,
-            "stream": True,
-        }
-        if request_func_input.ignore_eos:
-            payload["min_length"] = request_func_input.output_len
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-                        chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
-                        data = json.loads(chunk)
-                        output.generated_text += data["text_output"]
-                        timestamp = time.perf_counter()
-                        # First token
-                        if ttft == 0.0:
-                            ttft = timestamp - st
-                            output.ttft = ttft
-                        # Decoding phase
-                        else:
-                            output.itl.append(timestamp - most_recent_timestamp)
-                        most_recent_timestamp = timestamp
-                    output.latency = most_recent_timestamp - st
-                    output.success = True
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-        if pbar:
-            pbar.update(1)
-        return output
-async def async_request_deepspeed_mii(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(
-        trust_env=True, timeout=AIOHTTP_TIMEOUT
-    ) as session:
-        assert request_func_input.best_of == 1
-        payload = {
-            "prompt": request_func_input.prompt,
-            "max_tokens": request_func_input.output_len,
-            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
-            "top_p": 1.0,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
-        # will use 0 as placeholder.
-        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
-        output.ttft = 0
-        st = time.perf_counter()
-        try:
-            async with session.post(
-                url=request_func_input.api_url, json=payload
-            ) as response:
-                if response.status == 200:
-                    parsed_resp = await response.json()
-                    output.latency = time.perf_counter() - st
-                    output.generated_text = parsed_resp["text"][0]
-                    output.success = True
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-        if pbar:
-            pbar.update(1)
-        return output
-async def async_request_openai_completions(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        ("completions", "profile")
-    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
-    async with aiohttp.ClientSession(
-        trust_env=True, timeout=AIOHTTP_TIMEOUT
-    ) as session:
-        payload = {
-            "model": request_func_input.model_name
-            if request_func_input.model_name
-            else request_func_input.model,
-            "prompt": request_func_input.prompt,
-            "temperature": 0.0,
-            "best_of": request_func_input.best_of,
-            "max_tokens": request_func_input.output_len,
-            "logprobs": request_func_input.logprobs,
-            "stream": True,
-            "stream_options": {
-                "include_usage": True,
-            },
-        }
-        if request_func_input.ignore_eos:
-            payload["ignore_eos"] = request_func_input.ignore_eos
-        if request_func_input.extra_body:
-            payload.update(request_func_input.extra_body)
-        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-        generated_text = ""
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(
-                url=api_url, json=payload, headers=headers
-            ) as response:
-                if response.status == 200:
-                    first_chunk_received = False
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
-                        if chunk != "[DONE]":
-                            data = json.loads(chunk)
-                            # NOTE: Some completion API might have a last
-                            # usage summary response without a token so we
-                            # want to check a token was generated
-                            if choices := data.get("choices"):
-                                # Note that text could be empty here
-                                # e.g. for special tokens
-                                text = choices[0].get("text")
-                                timestamp = time.perf_counter()
-                                # First token
-                                if not first_chunk_received:
-                                    first_chunk_received = True
-                                    ttft = time.perf_counter() - st
-                                    output.ttft = ttft
-                                # Decoding phase
-                                else:
-                                    output.itl.append(timestamp - most_recent_timestamp)
-                                most_recent_timestamp = timestamp
-                                generated_text += text or ""
-                            elif usage := data.get("usage"):
-                                output.output_tokens = usage.get("completion_tokens")
-                    if first_chunk_received:
-                        output.success = True
-                    else:
-                        output.success = False
-                        output.error = (
-                            "Never received a valid chunk to calculate TTFT."
-                            "This response will be marked as failed!"
-                        )
-                    output.generated_text = generated_text
-                    output.latency = most_recent_timestamp - st
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-    if pbar:
-        pbar.update(1)
-    return output
-async def async_request_dynamo_completions(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        ("completions", "profile")
-    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
-    async with aiohttp.ClientSession(
-        trust_env=True, timeout=AIOHTTP_TIMEOUT
-    ) as session:
-        payload = {
-            "model": request_func_input.model_name
-            if request_func_input.model_name
-            else request_func_input.model,
-            "prompt": request_func_input.prompt,
-            "temperature": 0.0,
-            "best_of": request_func_input.best_of,
-            "max_tokens": request_func_input.output_len,
-            "logprobs": request_func_input.logprobs,
-            "stream": True,
-            "stream_options": {
-                "include_usage": True,
-            },
-        }
-        if request_func_input.ignore_eos:
-            payload["ignore_eos"] = request_func_input.ignore_eos
-        if request_func_input.extra_body:
-            payload.update(request_func_input.extra_body)
-        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-        generated_text = ""
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(
-                url=api_url, json=payload, headers=headers
-            ) as response:
-                if response.status == 200:
-                    first_chunk_received = False
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-                        chunk = chunk_bytes.decode("utf-8")
-                        # Skip SSE event/comment lines (not data)
-                        if chunk.startswith("event:") or chunk.startswith(":"):
-                            continue
-                        chunk = chunk.removeprefix("data: ")
-                        if chunk != "[DONE]":
-                            data = json.loads(chunk)
-                            # NOTE: Some completion API might have a last
-                            # usage summary response without a token so we
-                            # want to check a token was generated
-                            if choices := data.get("choices"):
-                                # Note that text could be empty here
-                                # e.g. for special tokens
-                                text = choices[0].get("text")
-                                timestamp = time.perf_counter()
-                                # First token
-                                if not first_chunk_received:
-                                    first_chunk_received = True
-                                    ttft = time.perf_counter() - st
-                                    output.ttft = ttft
-                                # Decoding phase
-                                else:
-                                    output.itl.append(timestamp - most_recent_timestamp)
-                                most_recent_timestamp = timestamp
-                                generated_text += text or ""
-                            if usage := data.get("usage"):
-                                output.output_tokens = usage.get("completion_tokens")
-                    if first_chunk_received:
-                        output.success = True
-                    else:
-                        output.success = False
-                        output.error = (
-                            "Never received a valid chunk to calculate TTFT."
-                            "This response will be marked as failed!"
-                        )
-                    output.generated_text = generated_text
-                    output.latency = most_recent_timestamp - st
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-    if pbar:
-        pbar.update(1)
-    return output
-async def async_request_openai_chat_completions(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        "chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
-    async with aiohttp.ClientSession(
-        trust_env=True, timeout=AIOHTTP_TIMEOUT
-    ) as session:
-        content = [{"type": "text", "text": request_func_input.prompt}]
-        if request_func_input.multi_modal_content:
-            content.append(request_func_input.multi_modal_content)
-        payload = {
-            "model": request_func_input.model_name
-            if request_func_input.model_name
-            else request_func_input.model,
-            "messages": [
-                {"role": "user", "content": content},
-            ],
-            "temperature": 0.0,
-            "max_completion_tokens": request_func_input.output_len,
-            "stream": True,
-            "stream_options": {
-                "include_usage": True,
-            },
-        }
-        if request_func_input.ignore_eos:
-            payload["ignore_eos"] = request_func_input.ignore_eos
-        if request_func_input.extra_body:
-            payload.update(request_func_input.extra_body)
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-        generated_text = ""
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(
-                url=api_url, json=payload, headers=headers
-            ) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
-                        if chunk != "[DONE]":
-                            timestamp = time.perf_counter()
-                            data = json.loads(chunk)
-                            if choices := data.get("choices"):
-                                content = choices[0]["delta"].get("content")
-                                # First token
-                                if ttft == 0.0:
-                                    ttft = timestamp - st
-                                    output.ttft = ttft
-                                # Decoding phase
-                                else:
-                                    output.itl.append(timestamp - most_recent_timestamp)
-                                generated_text += content or ""
-                            elif usage := data.get("usage"):
-                                output.output_tokens = usage.get("completion_tokens")
-                            most_recent_timestamp = timestamp
-                    output.generated_text = generated_text
-                    output.success = True
-                    output.latency = most_recent_timestamp - st
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-    if pbar:
-        pbar.update(1)
-    return output
-def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
-        from modelscope import snapshot_download
-        model_path = snapshot_download(
-            model_id=pretrained_model_name_or_path,
-            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
-        )
-        return model_path
-    return pretrained_model_name_or_path
-def get_tokenizer(
-    pretrained_model_name_or_path: str,
-    tokenizer_mode: str = "auto",
-    trust_remote_code: bool = False,
-    **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    if pretrained_model_name_or_path is not None and not os.path.exists(
-        pretrained_model_name_or_path
-    ):
-        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
-    if tokenizer_mode == "slow":
-        if kwargs.get("use_fast", False):
-            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
-        kwargs["use_fast"] = False
-    if tokenizer_mode == "mistral":
-        try:
-            from vllm.transformers_utils.tokenizer import MistralTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "MistralTokenizer requires vllm package.\n"
-                "Please install it with `pip install vllm` "
-                "to use mistral tokenizer mode."
-            ) from e
-        return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
-    else:
-        return AutoTokenizer.from_pretrained(
-            pretrained_model_name_or_path,
-            trust_remote_code=trust_remote_code,
-            **kwargs,
-        )
-ASYNC_REQUEST_FUNCS = {
-    "tgi": async_request_tgi,
-    "vllm": async_request_openai_completions,
-    "dynamo": async_request_dynamo_completions,
-    "lmdeploy": async_request_openai_completions,
-    "deepspeed-mii": async_request_deepspeed_mii,
-    "openai": async_request_openai_completions,
-    "openai-chat": async_request_openai_chat_completions,
-    "tensorrt-llm": async_request_trt_llm,
-    "scalellm": async_request_openai_completions,
-    "sglang": async_request_openai_completions,
-}
--- a/examples/backends/sglang/slurm_jobs/scripts/vllm/bench.sh
+++ b/examples/backends/sglang/slurm_jobs/scripts/vllm/bench.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# Example script adapted from https://github.com/kedarpotdar-nv/bench_serving/tree/dynamo-fix.
-model_name="deepseek-ai/DeepSeek-R1"
-model_path="/model/"
-head_node="localhost"
-head_port=8000
-n_prefill=$1
-n_decode=$2
-prefill_gpus=$3
-decode_gpus=$4
-total_gpus=$((prefill_gpus+decode_gpus))
-source /scripts/benchmark_utils.sh
-work_dir="/scripts/vllm/"
-cd $work_dir
-chosen_isl=$5
-chosen_osl=$6
-concurrency_list=$7
-IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
-chosen_req_rate=$8
-echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[@]}; ${chosen_req_rate}"
-wait_for_model_timeout=3000
-wait_for_model_check_interval=5
-wait_for_model_report_interval=60
-wait_for_model $head_node $head_port $n_prefill $n_decode \
-    $wait_for_model_check_interval $wait_for_model_timeout $wait_for_model_report_interval
-set -e
-# Warmup defaults
-warmup_isl=$chosen_isl
-warmup_osl=$chosen_osl
-warmup_req_rate=250
-warmup_concurrency_list=(1 4 32 128 256)
-# Ensure all chosen concurrencies are in warmup list
-for c in "${chosen_concurrencies[@]}"; do
-    found=false
-    for w in "${warmup_concurrency_list[@]}"; do
-        if [[ "$c" == "$w" ]]; then
-            found=true
-            break
-        fi
-    done
-    if [[ "$found" == false ]]; then
-        warmup_concurrency_list+=("$c")
-    fi
-done
-# Optional: sort warmup list numerically
-IFS=$'\n' warmup_concurrency_list=($(sort -n <<<"${warmup_concurrency_list[*]}"))
-unset IFS
-echo "Final warmup list: ${warmup_concurrency_list[@]}"
-# Warmup
-for warmup_concurrency in "${warmup_concurrency_list[@]}"
-do
-    echo "Warming up model with concurrency $warmup_concurrency"
-    echo "$(date '+%Y-%m-%d %H:%M:%S')"
-    num_prompts=$((warmup_concurrency * 5))
-    set -x
-    python3 -u benchmark_serving.py \
-        --model ${model_name} --tokenizer ${model_path} \
-        --host $head_node --port $head_port \
-        --backend "dynamo" --endpoint /v1/completions \
-        --disable-tqdm \
-        --dataset-name random \
-        --num-prompts "$num_prompts" \
-        --random-input-len $warmup_isl \
-        --random-output-len $warmup_osl \
-        --random-range-ratio 0.8 \
-        --ignore-eos \
-        --request-rate ${warmup_req_rate} \
-        --percentile-metrics ttft,tpot,itl,e2el \
-        --max-concurrency "$warmup_concurrency"
-    set +x
-    echo "$(date '+%Y-%m-%d %H:%M:%S')"
-done
-set +e
-result_dir="/logs/vllm_isl_${chosen_isl}_osl_${chosen_osl}"
-mkdir -p $result_dir
-set -e
-for concurrency in "${chosen_concurrencies[@]}"
-do
-    num_prompts=$((concurrency * 5))
-    echo "Running benchmark with concurrency: $concurrency and num-prompts: $num_prompts, writing to file ${result_dir}"
-    result_filename="isl_${chosen_isl}_osl_${chosen_osl}_concurrency_${concurrency}_req_rate_${chosen_req_rate}_ctx_${prefill_gpus}_gen_${decode_gpus}_gpus_${total_gpus}.json"
-    set -x
-    echo "$(date '+%Y-%m-%d %H:%M:%S')"
-    python3 -u benchmark_serving.py \
-        --model ${model_name} --tokenizer ${model_path} \
-        --host $head_node --port $head_port \
-        --backend "dynamo" --endpoint /v1/completions \
-        --disable-tqdm \
-        --dataset-name random \
-        --num-prompts "$num_prompts" \
-        --random-input-len $chosen_isl \
-        --random-output-len $chosen_osl \
-        --random-range-ratio 0.8 \
-        --ignore-eos \
-        --request-rate ${chosen_req_rate} \
-        --percentile-metrics ttft,tpot,itl,e2el \
-        --max-concurrency "$concurrency" \
-        --save-result --result-dir $result_dir --result-filename $result_filename
-    set +x
-    echo "$(date '+%Y-%m-%d %H:%M:%S')"
-    echo "Completed benchmark with concurrency: $concurrency"
-    echo "-----------------------------------------"
-done
--- a/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
+++ b/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# pytest: skip-file
-r"""Benchmark online serving throughput.
-On the server side, run one of the following commands:
-    vLLM OpenAI API server
-    vllm serve <your_model> \
-        --swap-space 16 \
-    (TGI backend)
-    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
-On the client side, run:
-    python benchmarks/benchmark_serving.py \
-        --backend <backend> \
-        --model <your_model> \
-        --dataset-name sharegpt \
-        --dataset-path <path to dataset> \
-        --request-rate <request_rate> \ # By default <request_rate> is inf
-        --num-prompts <num_prompts> # By default <num_prompts> is 1000
-    when using tgi backend, add
-        --endpoint /generate_stream
-    to the end of the command above.
-"""
-import argparse
-import asyncio
-import base64
-import gc
-import io
-import json
-import os
-import random
-import time
-import warnings
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
-import numpy as np
-import pandas as pd
-from backend_request_func import (
-    ASYNC_REQUEST_FUNCS,
-    RequestFuncInput,
-    RequestFuncOutput,
-)
-from datasets import load_dataset
-from PIL.Image import Image
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
-    from backend_request_func import get_tokenizer
-try:
-    from vllm.utils import FlexibleArgumentParser
-except ImportError:
-    from argparse import ArgumentParser as FlexibleArgumentParser
-from benchmark_utils import convert_to_pytorch_benchmark_format
-MILLISECONDS_TO_SECONDS_CONVERSION = 1000
-@dataclass
-class BenchmarkMetrics:
-    completed: int
-    total_input: int
-    total_output: int
-    request_throughput: float
-    request_goodput: float
-    output_throughput: float
-    total_token_throughput: float
-    mean_ttft_ms: float
-    median_ttft_ms: float
-    std_ttft_ms: float
-    percentiles_ttft_ms: List[Tuple[float, float]]
-    mean_tpot_ms: float
-    median_tpot_ms: float
-    std_tpot_ms: float
-    percentiles_tpot_ms: List[Tuple[float, float]]
-    mean_itl_ms: float
-    median_itl_ms: float
-    std_itl_ms: float
-    percentiles_itl_ms: List[Tuple[float, float]]
-    # E2EL stands for end-to-end latency per request.
-    # It is the time taken on the client side from sending
-    # a request to receiving a complete response.
-    mean_e2el_ms: float
-    median_e2el_ms: float
-    std_e2el_ms: float
-    percentiles_e2el_ms: List[Tuple[float, float]]
-def sample_sharegpt_requests(
-    dataset_path: str,
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int] = None,
-) -> List[Tuple[str, int, int, None]]:
-    # Load the dataset.
-    with open(dataset_path, encoding="utf-8") as f:
-        dataset = json.load(f)
-    # Filter out the conversations with less than 2 turns.
-    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
-    dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
-        for data in dataset
-    ]
-    # Shuffle the dataset.
-    random.shuffle(dataset)
-    # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
-    for i in range(len(dataset)):
-        if len(filtered_dataset) == num_requests:
-            break
-        # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
-        prompt_token_ids = tokenizer(prompt).input_ids
-        completion = dataset[i][1]
-        completion_token_ids = tokenizer(completion).input_ids
-        prompt_len = len(prompt_token_ids)
-        output_len = (
-            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
-        )
-        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
-            # Prune too short sequences.
-            continue
-        if prompt_len > 1024 or prompt_len + output_len > 2048:
-            # Prune too long sequences.
-            continue
-        filtered_dataset.append((prompt, prompt_len, output_len, None))
-    return filtered_dataset
-def sample_burstgpt_requests(
-    dataset_path: str,
-    num_requests: int,
-    random_seed: int,
-    tokenizer: PreTrainedTokenizerBase,
-) -> List[Tuple[str, int, int, None]]:
-    df = pd.read_csv(dataset_path)
-    gpt4_df = df[df["Model"] == "GPT-4"]
-    # Remove the failed requests (i.e., response length is 0)
-    gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
-    # Randomly sample num_requests from the dataset
-    if num_requests <= len(gpt4_df):
-        gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed)
-    else:
-        gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed, replace=True)
-    # Convert the dataframe to a list of tuples
-    dataset = gpt4_df.values.tolist()
-    input_requests = []
-    for i in range(num_requests):
-        input_len = int(dataset[i][2])
-        output_len = int(dataset[i][3])
-        prompt = tokenizer.decode(
-            [(i + j) % tokenizer.vocab_size for j in range(input_len)]
-        )
-        input_requests.append((prompt, input_len, output_len, None))
-    return input_requests
-def sample_sonnet_requests(
-    dataset_path: str,
-    num_requests: int,
-    input_len: int,
-    output_len: int,
-    prefix_len: int,
-    tokenizer: PreTrainedTokenizerBase,
-) -> List[Tuple[str, str, int, int, None]]:
-    assert (
-        input_len > prefix_len
-    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
-    # Load the dataset.
-    with open(dataset_path, encoding="utf-8") as f:
-        poem_lines = f.readlines()
-    # Tokenize the poem lines.
-    poem_token_ids = tokenizer(poem_lines).input_ids
-    average_poem_len = sum(len(token_ids) for token_ids in poem_token_ids) / len(
-        poem_token_ids
-    )
-    # Base prefix for all requests.
-    base_prompt = "Pick as many lines as you can from these poem lines:\n"
-    base_message = [
-        {
-            "role": "user",
-            "content": base_prompt,
-        }
-    ]
-    base_prompt_formatted = tokenizer.apply_chat_template(
-        base_message, add_generation_prompt=True, tokenize=False
-    )
-    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
-    assert (
-        input_len > base_prompt_offset
-    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
-    num_input_lines = round((input_len - base_prompt_offset) / average_poem_len)
-    # First approximately `prefix_len` number of tokens in the
-    # prompt are fixed poem lines.
-    assert (
-        prefix_len > base_prompt_offset
-    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
-    num_prefix_lines = round((prefix_len - base_prompt_offset) / average_poem_len)
-    prefix_lines = poem_lines[:num_prefix_lines]
-    # Sample the rest of lines per request.
-    sampled_requests: List[Tuple[str, int, int]] = []
-    for _ in range(num_requests):
-        num_lines_needed = num_input_lines - num_prefix_lines
-        sampled_lines = "".join(
-            prefix_lines + random.choices(poem_lines, k=num_lines_needed)
-        )
-        prompt = f"{base_prompt}{sampled_lines}"
-        message = [
-            {
-                "role": "user",
-                "content": prompt,
-            },
-        ]
-        prompt_formatted = tokenizer.apply_chat_template(
-            message, add_generation_prompt=True, tokenize=False
-        )
-        prompt_len = len(tokenizer(prompt_formatted).input_ids)
-        sampled_requests.append(
-            (prompt, prompt_formatted, prompt_len, output_len, None)
-        )
-    return sampled_requests
-def sample_vision_arena_requests(
-    dataset,
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int] = None,
-) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
-    sampled_requests: List[Tuple[str, int, int, Dict[str, Collection[str]]]] = []
-    for data in dataset:
-        if len(sampled_requests) == num_requests:
-            break
-        prompt = data["turns"][0][0]["content"]
-        prompt_token_ids = tokenizer(prompt).input_ids
-        if fixed_output_len is None:
-            # Default max output len is set to 128
-            print("--hf-output-len is not provided. Using default value 128.")
-            fixed_output_len = 128
-        prompt_len = len(prompt_token_ids)
-        output_len = fixed_output_len
-        assert isinstance(data["images"][0], Image), (
-            "Input image format must be `PIL.Image.Image`, "
-            f"given {type(data['image'])}."
-        )
-        image: Image = data["images"][0]
-        image = image.convert("RGB")
-        image_data = io.BytesIO()
-        image.save(image_data, format="JPEG")
-        image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
-        mm_content = {
-            "type": "image_url",
-            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
-        }
-        sampled_requests.append((prompt, prompt_len, output_len, mm_content))
-    return sampled_requests
-def sample_hf_requests(
-    dataset_path: str,
-    dataset_subset: Optional[str],
-    dataset_split: str,
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    random_seed: int,
-    fixed_output_len: Optional[int] = None,
-) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
-    # Special case for vision_arena dataset
-    if dataset_path == "lmarena-ai/vision-arena-bench-v0.1" and dataset_subset is None:
-        assert dataset_split == "train"
-        dataset = load_dataset(
-            dataset_path, name=dataset_subset, split=dataset_split, streaming=True
-        )
-        dataset = dataset.shuffle(seed=random_seed)
-        return sample_vision_arena_requests(
-            dataset, num_requests, tokenizer, fixed_output_len
-        )
-    dataset = load_dataset(
-        dataset_path, name=dataset_subset, split=dataset_split, streaming=True
-    )
-    assert (
-        "conversations" in dataset.features
-    ), "HF Dataset must have 'conversations' column."
-    filter_func = lambda x: len(x["conversations"]) >= 2  # noqa: E731
-    filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
-    sampled_requests: List[Tuple[str, int, int, Dict[str, Collection[str]]]] = []
-    for data in filtered_dataset:
-        if len(sampled_requests) == num_requests:
-            break
-        # Tokenize the prompts and completions.
-        prompt = data["conversations"][0]["value"]
-        prompt_token_ids = tokenizer(prompt).input_ids
-        completion = data["conversations"][1]["value"]
-        completion_token_ids = tokenizer(completion).input_ids
-        prompt_len = len(prompt_token_ids)
-        output_len = (
-            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
-        )
-        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
-            # Prune too short sequences.
-            continue
-        if fixed_output_len is None and (
-            prompt_len > 1024 or prompt_len + output_len > 2048
-        ):
-            # Prune too long sequences.
-            continue
-        if "image" in data and isinstance(data["image"], Image):
-            image: Image = data["image"]
-            image = image.convert("RGB")
-            image_data = io.BytesIO()
-            image.save(image_data, format="JPEG")
-            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
-            mm_content = {
-                "type": "image_url",
-                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
-            }
-        elif "image" in data and isinstance(data["image"], str):
-            if data["image"].startswith("http://") or data["image"].startswith(
-                "file://"
-            ):
-                image_url = data["image"]
-            else:
-                image_url = f"file://{data['image']}"
-            mm_content = {
-                "type": "image_url",
-                "image_url": {"url": image_url},
-            }
-        else:
-            mm_content = None
-        sampled_requests.append((prompt, prompt_len, output_len, mm_content))
-    return sampled_requests
-def sample_random_requests(
-    prefix_len: int,
-    input_len: int,
-    output_len: int,
-    num_prompts: int,
-    range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase,
-    use_chat_template: bool = False,
-) -> List[Tuple[str, int, int]]:
-    prefix_token_ids = np.random.randint(
-        0, tokenizer.vocab_size, size=prefix_len
-    ).tolist()
-    if use_chat_template:
-        chat_template_dummy = tokenizer.apply_chat_template(
-            [{"role": "user", "content": "a"}],
-            add_generation_prompt=True,
-            tokenize=False,
-        )
-        tokenized_chat_template_dummy = tokenizer.encode(
-            chat_template_dummy, add_special_tokens=False
-        )
-        chat_template_len = len(tokenized_chat_template_dummy) - 1
-        input_len = input_len - chat_template_len
-    input_lens = np.random.randint(
-        int(input_len * range_ratio),
-        input_len + 1,
-        size=num_prompts,
-    )
-    output_lens = np.random.randint(
-        int(output_len * range_ratio),
-        output_len + 1,
-        size=num_prompts,
-    )
-    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
-    input_requests = []
-    for i in range(num_prompts):
-        prompt = tokenizer.decode(
-            prefix_token_ids
-            + [
-                (offsets[i] + i + j) % tokenizer.vocab_size
-                for j in range(input_lens[i])
-            ]
-        )
-        re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
-            : (prefix_len + input_lens[i])
-        ]
-        prompt = tokenizer.decode(re_encoded_sequence)
-        if use_chat_template:
-            prompt = tokenizer.apply_chat_template(
-                [{"role": "user", "content": prompt}],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
-            input_lens[i] += chat_template_len
-        input_requests.append(
-            (prompt, int(prefix_len + input_lens[i]), int(output_lens[i]), None)
-        )
-    return input_requests
-async def get_request(
-    input_requests: List[Tuple[str, int, int]],
-    request_rate: float,
-    burstiness: float = 1.0,
-) -> AsyncGenerator[Tuple[str, int, int], None]:
-    """
-    Asynchronously generates requests at a specified rate
-    with OPTIONAL burstiness.
-    Args:
-        input_requests:
-            A list of input requests, each represented as a tuple.
-        request_rate:
-            The rate at which requests are generated (requests/s).
-        burstiness (optional):
-            The burstiness factor of the request generation.
-            Only takes effect when request_rate is not inf.
-            Default value is 1, which follows a Poisson process.
-            Otherwise, the request intervals follow a gamma distribution.
-            A lower burstiness value (0 < burstiness < 1) results
-            in more bursty requests, while a higher burstiness value
-            (burstiness > 1) results in a more uniform arrival of requests.
-    """
-    input_requests = iter(input_requests)
-    # Calculate scale parameter theta to maintain the desired request_rate.
-    assert (
-        burstiness > 0
-    ), f"A positive burstiness factor is expected, but given {burstiness}."
-    theta = 1.0 / (request_rate * burstiness)
-    for request in input_requests:
-        yield request
-        if request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
-        # Sample the request interval from the gamma distribution.
-        # If burstiness is 1, it follows exponential distribution.
-        interval = np.random.gamma(shape=burstiness, scale=theta)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
-def calculate_metrics(
-    input_requests: List[Tuple[str, int, int]],
-    outputs: List[RequestFuncOutput],
-    dur_s: float,
-    tokenizer: PreTrainedTokenizerBase,
-    selected_percentile_metrics: List[str],
-    selected_percentiles: List[float],
-    goodput_config_dict: Dict[str, float],
-) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens: List[int] = []
-    total_input = 0
-    completed = 0
-    good_completed = 0
-    itls: List[float] = []
-    tpots: List[float] = []
-    all_tpots: List[float] = []
-    ttfts: List[float] = []
-    e2els: List[float] = []
-    for i in range(len(outputs)):
-        if outputs[i].success:
-            output_len = outputs[i].output_tokens
-            if output_len is None:
-                # We use the tokenizer to count the number of output tokens
-                # for some serving backends instead of looking at
-                # len(outputs[i].itl) since multiple output tokens may be
-                # bundled together
-                # Note : this may inflate the output token count slightly
-                output_len = len(
-                    tokenizer(
-                        outputs[i].generated_text, add_special_tokens=False
-                    ).input_ids
-                )
-            actual_output_lens.append(output_len)
-            total_input += input_requests[i][1]
-            tpot = 0
-            if output_len > 1:
-                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
-                tpot = latency_minus_ttft / (output_len - 1)
-                tpots.append(tpot)
-            # Note: if output_len <= 1, we regard tpot as 0 for goodput
-            all_tpots.append(tpot)
-            itls += outputs[i].itl
-            ttfts.append(outputs[i].ttft)
-            e2els.append(outputs[i].latency)
-            completed += 1
-        else:
-            actual_output_lens.append(0)
-    if goodput_config_dict:
-        valid_metrics = []
-        slo_values = []
-        if "ttft" in goodput_config_dict:
-            valid_metrics.append(ttfts)
-            slo_values.append(
-                goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
-            )
-        if "tpot" in goodput_config_dict:
-            valid_metrics.append(all_tpots)
-            slo_values.append(
-                goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
-            )
-        if "e2el" in goodput_config_dict:
-            valid_metrics.append(e2els)
-            slo_values.append(
-                goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
-            )
-        for req_metric in zip(*valid_metrics):
-            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
-            if is_good_req:
-                good_completed += 1
-    if completed == 0:
-        warnings.warn(
-            "All requests failed. This is likely due to a misconfiguration "
-            "on the benchmark arguments.",
-            stacklevel=2,
-        )
-    metrics = BenchmarkMetrics(
-        completed=completed,
-        total_input=total_input,
-        total_output=sum(actual_output_lens),
-        request_throughput=completed / dur_s,
-        request_goodput=good_completed / dur_s,
-        output_throughput=sum(actual_output_lens) / dur_s,
-        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
-        mean_ttft_ms=np.mean(ttfts or 0)
-        * 1000,  # ttfts is empty if streaming is not supported by backend
-        std_ttft_ms=np.std(ttfts or 0) * 1000,
-        median_ttft_ms=np.median(ttfts or 0) * 1000,
-        percentiles_ttft_ms=[
-            (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
-        ],
-        mean_tpot_ms=np.mean(tpots or 0) * 1000,
-        std_tpot_ms=np.std(tpots or 0) * 1000,
-        median_tpot_ms=np.median(tpots or 0) * 1000,
-        percentiles_tpot_ms=[
-            (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
-        ],
-        mean_itl_ms=np.mean(itls or 0) * 1000,
-        std_itl_ms=np.std(itls or 0) * 1000,
-        median_itl_ms=np.median(itls or 0) * 1000,
-        percentiles_itl_ms=[
-            (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
-        ],
-        mean_e2el_ms=np.mean(e2els or 0) * 1000,
-        std_e2el_ms=np.std(e2els or 0) * 1000,
-        median_e2el_ms=np.median(e2els or 0) * 1000,
-        percentiles_e2el_ms=[
-            (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
-        ],
-    )
-    return metrics, actual_output_lens
-async def benchmark(
-    backend: str,
-    api_url: str,
-    base_url: str,
-    model_id: str,
-    model_name: str,
-    tokenizer: PreTrainedTokenizerBase,
-    input_requests: List[Tuple[str, int, int]],
-    logprobs: Optional[int],
-    best_of: int,
-    request_rate: float,
-    burstiness: float,
-    disable_tqdm: bool,
-    profile: bool,
-    selected_percentile_metrics: List[str],
-    selected_percentiles: List[str],
-    ignore_eos: bool,
-    goodput_config_dict: Dict[str, float],
-    max_concurrency: Optional[int],
-    lora_modules: Optional[List[str]],
-):
-    if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS[backend]
-    else:
-        raise ValueError(f"Unknown backend: {backend}")
-    print("Starting initial single prompt test run...")
-    test_prompt, test_prompt_len, test_output_len, test_mm_content = input_requests[0]
-    if backend != "openai-chat" and test_mm_content is not None:
-        # multi-modal benchmark is only available on OpenAI Chat backend.
-        raise ValueError(
-            "Multi-modal content is only supported on 'openai-chat' backend."
-        )
-    test_input = RequestFuncInput(
-        model=model_id,
-        model_name=model_name,
-        prompt=test_prompt,
-        api_url=api_url,
-        prompt_len=test_prompt_len,
-        output_len=test_output_len,
-        logprobs=logprobs,
-        best_of=best_of,
-        multi_modal_content=test_mm_content,
-        ignore_eos=ignore_eos,
-    )
-    test_output = await request_func(request_func_input=test_input)
-    if not test_output.success:
-        raise ValueError(
-            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}"
-        )
-    else:
-        print("Initial test run completed. Starting main benchmark run...")
-    if lora_modules:
-        # For each input request, choose a LoRA module at random.
-        lora_modules = iter(
-            [random.choice(lora_modules) for _ in range(len(input_requests))]
-        )
-    if profile:
-        print("Starting profiler...")
-        profile_input = RequestFuncInput(
-            model=model_id,
-            model_name=model_name,
-            prompt=test_prompt,
-            api_url=base_url + "/start_profile",
-            prompt_len=test_prompt_len,
-            output_len=test_output_len,
-            logprobs=logprobs,
-            best_of=best_of,
-            multi_modal_content=test_mm_content,
-            ignore_eos=ignore_eos,
-        )
-        profile_output = await request_func(request_func_input=profile_input)
-        if profile_output.success:
-            print("Profiler started")
-    if burstiness == 1.0:
-        distribution = "Poisson process"
-    else:
-        distribution = "Gamma distribution"
-    print(f"Traffic request rate: {request_rate}")
-    print(f"Burstiness factor: {burstiness} ({distribution})")
-    print(f"Maximum request concurrency: {max_concurrency}")
-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    #    semaphore = (asyncio.Semaphore(max_concurrency)
-    #                 if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
-    async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
-        async with semaphore:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
-    benchmark_start_time = time.perf_counter()
-    tasks: List[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate, burstiness):
-        prompt, prompt_len, output_len, mm_content = request
-        req_model_id, req_model_name = model_id, model_name
-        if lora_modules:
-            req_lora_module = next(lora_modules)
-            req_model_id, req_model_name = req_lora_module, req_lora_module
-        request_func_input = RequestFuncInput(
-            model=req_model_id,
-            model_name=req_model_name,
-            prompt=prompt,
-            api_url=api_url,
-            prompt_len=prompt_len,
-            output_len=output_len,
-            logprobs=logprobs,
-            best_of=best_of,
-            multi_modal_content=mm_content,
-            ignore_eos=ignore_eos,
-        )
-        tasks.append(
-            asyncio.create_task(
-                limited_request_func(request_func_input=request_func_input, pbar=pbar)
-            )
-        )
-    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
-    if profile:
-        print("Stopping profiler...")
-        profile_input = RequestFuncInput(
-            model=model_id,
-            prompt=test_prompt,
-            api_url=base_url + "/stop_profile",
-            prompt_len=test_prompt_len,
-            output_len=test_output_len,
-            logprobs=logprobs,
-            best_of=best_of,
-        )
-        profile_output = await request_func(request_func_input=profile_input)
-        if profile_output.success:
-            print("Profiler stopped")
-    if pbar is not None:
-        pbar.close()
-    benchmark_duration = time.perf_counter() - benchmark_start_time
-    metrics, actual_output_lens = calculate_metrics(
-        input_requests=input_requests,
-        outputs=outputs,
-        dur_s=benchmark_duration,
-        tokenizer=tokenizer,
-        selected_percentile_metrics=selected_percentile_metrics,
-        selected_percentiles=selected_percentiles,
-        goodput_config_dict=goodput_config_dict,
-    )
-    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
-    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
-    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
-    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Request throughput (req/s):", metrics.request_throughput
-        )
-    )
-    if goodput_config_dict:
-        print(
-            "{:<40} {:<10.2f}".format(
-                "Request goodput (req/s):", metrics.request_goodput
-            )
-        )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Output token throughput (tok/s):", metrics.output_throughput
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Total Token throughput (tok/s):", metrics.total_token_throughput
-        )
-    )
-    result = {
-        "duration": benchmark_duration,
-        "completed": metrics.completed,
-        "total_input_tokens": metrics.total_input,
-        "total_output_tokens": metrics.total_output,
-        "request_throughput": metrics.request_throughput,
-        "request_goodput:": metrics.request_goodput if goodput_config_dict else None,
-        "output_throughput": metrics.output_throughput,
-        "total_token_throughput": metrics.total_token_throughput,
-        "input_lens": [output.prompt_len for output in outputs],
-        "output_lens": actual_output_lens,
-        "ttfts": [output.ttft for output in outputs],
-        "itls": [output.itl for output in outputs],
-        "generated_texts": [output.generated_text for output in outputs],
-        "errors": [output.error for output in outputs],
-    }
-    def process_one_metric(
-        # E.g., "ttft"
-        metric_attribute_name: str,
-        # E.g., "TTFT"
-        metric_name: str,
-        # E.g., "Time to First Token"
-        metric_header: str,
-    ):
-        # This function prints and adds statistics of the specified
-        # metric.
-        if metric_attribute_name not in selected_percentile_metrics:
-            return
-        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
-        print(
-            "{:<40} {:<10.2f}".format(
-                f"Mean {metric_name} (ms):",
-                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
-            )
-        )
-        print(
-            "{:<40} {:<10.2f}".format(
-                f"Median {metric_name} (ms):",
-                getattr(metrics, f"median_{metric_attribute_name}_ms"),
-            )
-        )
-        result[f"mean_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"mean_{metric_attribute_name}_ms"
-        )
-        result[f"median_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"median_{metric_attribute_name}_ms"
-        )
-        result[f"std_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"std_{metric_attribute_name}_ms"
-        )
-        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
-            p_word = str(int(p)) if int(p) == p else str(p)
-            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
-            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
-    process_one_metric("ttft", "TTFT", "Time to First Token")
-    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
-    process_one_metric("itl", "ITL", "Inter-token Latency")
-    process_one_metric("e2el", "E2EL", "End-to-end Latency")
-    print("=" * 50)
-    return result
-def check_goodput_args(args):
-    # Check and parse goodput arguments
-    goodput_config_dict = {}
-    VALID_NAMES = ["ttft", "tpot", "e2el"]
-    if args.goodput:
-        goodput_config_dict = parse_goodput(args.goodput)
-        for slo_name, slo_val in goodput_config_dict.items():
-            if slo_name not in VALID_NAMES:
-                raise ValueError(
-                    f"Invalid metric name found, {slo_name}: {slo_val}. "
-                    "The service level objective name should be one of "
-                    f"{str(VALID_NAMES)}. "
-                )
-            if slo_val < 0:
-                raise ValueError(
-                    f"Invalid value found, {slo_name}: {slo_val}. "
-                    "The service level objective value should be "
-                    "non-negative."
-                )
-    return goodput_config_dict
-def parse_goodput(slo_pairs):
-    goodput_config_dict = {}
-    try:
-        for slo_pair in slo_pairs:
-            slo_name, slo_val = slo_pair.split(":")
-            goodput_config_dict[slo_name] = float(slo_val)
-    except ValueError as err:
-        raise argparse.ArgumentTypeError(
-            "Invalid format found for service level objectives. "
-            'Specify service level objectives for goodput as "KEY:VALUE" '
-            "pairs, where the key is a metric name, and the value is a "
-            "number in milliseconds."
-        ) from err
-    return goodput_config_dict
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: Dict[str, Any], file_name: str
-) -> None:
-    metrics = [
-        "median_ttft_ms",
-        "mean_ttft_ms",
-        "std_ttft_ms",
-        "p99_ttft_ms",
-        "mean_tpot_ms",
-        "median_tpot_ms",
-        "std_tpot_ms",
-        "p99_tpot_ms",
-        "median_itl_ms",
-        "mean_itl_ms",
-        "std_itl_ms",
-        "p99_itl_ms",
-    ]
-    # These raw data might be useful, but they are rather big. They can be added
-    # later if needed
-    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
-    pt_records = convert_to_pytorch_benchmark_format(
-        args=args,
-        metrics={k: [results[k]] for k in metrics},
-        extra_info={
-            k: results[k]
-            for k in results
-            if k not in metrics and k not in ignored_metrics
-        },
-    )
-    if pt_records:
-        # Don't use json suffix here as we don't want CI to pick it up
-        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
-        with open(pt_file, "w") as f:
-            json.dump(pt_records, f)
-def main(args: argparse.Namespace):
-    print(args)
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    backend = args.backend
-    model_id = args.model
-    model_name = args.served_model_name
-    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
-    tokenizer_mode = args.tokenizer_mode
-    if args.base_url is not None:
-        api_url = f"{args.base_url}{args.endpoint}"
-        base_url = f"{args.base_url}"
-    else:
-        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
-        base_url = f"http://{args.host}:{args.port}"
-    tokenizer = get_tokenizer(
-        tokenizer_id,
-        tokenizer_mode=tokenizer_mode,
-        trust_remote_code=args.trust_remote_code,
-    )
-    if args.dataset is not None:
-        warnings.warn(
-            "The '--dataset' argument will be deprecated in the next "
-            "release. Please use '--dataset-name' and "
-            "'--dataset-path' in the future runs.",
-            stacklevel=2,
-        )
-        input_requests = sample_sharegpt_requests(
-            dataset_path=args.dataset,
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            fixed_output_len=args.sharegpt_output_len,
-        )
-    elif args.dataset_name == "sharegpt":
-        input_requests = sample_sharegpt_requests(
-            dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            fixed_output_len=args.sharegpt_output_len,
-        )
-    elif args.dataset_name == "burstgpt":
-        input_requests = sample_burstgpt_requests(
-            dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
-            random_seed=args.seed,
-            tokenizer=tokenizer,
-        )
-    elif args.dataset_name == "sonnet":
-        # Do not format the prompt, pass to message directly
-        if args.backend == "openai-chat":
-            input_requests = sample_sonnet_requests(
-                dataset_path=args.dataset_path,
-                num_requests=args.num_prompts,
-                input_len=args.sonnet_input_len,
-                output_len=args.sonnet_output_len,
-                prefix_len=args.sonnet_prefix_len,
-                tokenizer=tokenizer,
-            )
-            input_requests = [
-                (prompt, prompt_len, output_len, None)
-                for prompt, prompt_formatted, prompt_len, output_len, _ in input_requests
-            ]
-        else:
-            assert (
-                tokenizer.chat_template or tokenizer.default_chat_template
-            ), "Tokenizer/model must have chat template for sonnet dataset."
-            input_requests = sample_sonnet_requests(
-                dataset_path=args.dataset_path,
-                num_requests=args.num_prompts,
-                input_len=args.sonnet_input_len,
-                output_len=args.sonnet_output_len,
-                prefix_len=args.sonnet_prefix_len,
-                tokenizer=tokenizer,
-            )
-            input_requests = [
-                (prompt_formatted, prompt_len, output_len, None)
-                for prompt, prompt_formatted, prompt_len, output_len, _ in input_requests
-            ]
-    elif args.dataset_name == "hf":
-        input_requests = sample_hf_requests(
-            dataset_path=args.dataset_path,
-            dataset_subset=args.hf_subset,
-            dataset_split=args.hf_split,
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            random_seed=args.seed,
-            fixed_output_len=args.hf_output_len,
-        )
-    elif args.dataset_name == "random":
-        input_requests = sample_random_requests(
-            prefix_len=args.random_prefix_len,
-            input_len=args.random_input_len,
-            output_len=args.random_output_len,
-            num_prompts=args.num_prompts,
-            range_ratio=args.random_range_ratio,
-            tokenizer=tokenizer,
-            use_chat_template=args.use_chat_template,
-        )
-    else:
-        raise ValueError(f"Unknown dataset: {args.dataset_name}")
-    goodput_config_dict = check_goodput_args(args)
-    # Avoid GC processing "static" data - reduce pause times.
-    gc.collect()
-    gc.freeze()
-    benchmark_result = asyncio.run(
-        benchmark(
-            backend=backend,
-            api_url=api_url,
-            base_url=base_url,
-            model_id=model_id,
-            model_name=model_name,
-            tokenizer=tokenizer,
-            input_requests=input_requests,
-            logprobs=args.logprobs,
-            best_of=args.best_of,
-            request_rate=args.request_rate,
-            burstiness=args.burstiness,
-            disable_tqdm=args.disable_tqdm,
-            profile=args.profile,
-            selected_percentile_metrics=args.percentile_metrics.split(","),
-            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
-            ignore_eos=args.ignore_eos,
-            goodput_config_dict=goodput_config_dict,
-            max_concurrency=args.max_concurrency,
-            lora_modules=args.lora_modules,
-        )
-    )
-    # Save config and results to json
-    if args.save_result:
-        result_json: Dict[str, Any] = {}
-        # Setup
-        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
-        result_json["date"] = current_dt
-        result_json["backend"] = backend
-        result_json["model_id"] = model_id
-        result_json["tokenizer_id"] = tokenizer_id
-        result_json["best_of"] = args.best_of
-        result_json["num_prompts"] = args.num_prompts
-        # Metadata
-        if args.metadata:
-            for item in args.metadata:
-                if "=" in item:
-                    kvstring = item.split("=")
-                    result_json[kvstring[0].strip()] = kvstring[1].strip()
-                else:
-                    raise ValueError(
-                        "Invalid metadata format. Please use KEY=VALUE format."
-                    )
-        # Traffic
-        result_json["request_rate"] = (
-            args.request_rate if args.request_rate < float("inf") else "inf"
-        )
-        result_json["burstiness"] = args.burstiness
-        result_json["max_concurrency"] = args.max_concurrency
-        # Merge with benchmark result
-        result_json = {**result_json, **benchmark_result}
-        # Save to file
-        base_model_id = model_id.split("/")[-1]
-        max_concurrency_str = (
-            f"-concurrency{args.max_concurrency}"
-            if args.max_concurrency is not None
-            else ""
-        )
-        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
-        if args.result_filename:
-            file_name = args.result_filename
-        if args.result_dir:
-            file_name = os.path.join(args.result_dir, file_name)
-        with open(file_name, "w", encoding="utf-8") as outfile:
-            json.dump(result_json, outfile)
-        save_to_pytorch_benchmark_format(args, result_json, file_name)
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
-        description="Benchmark the online serving throughput."
-    )
-    parser.add_argument(
-        "--backend",
-        type=str,
-        default="vllm",
-        choices=list(ASYNC_REQUEST_FUNCS.keys()),
-    )
-    parser.add_argument(
-        "--base-url",
-        type=str,
-        default=None,
-        help="Server or API base url if not using http host and port.",
-    )
-    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
-    parser.add_argument("--host", type=str, default="127.0.0.1")
-    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument(
-        "--endpoint",
-        type=str,
-        default="/v1/completions",
-        help="API endpoint.",
-    )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default=None,
-        help="Path to the ShareGPT dataset, will be deprecated in the " "next release.",
-    )
-    parser.add_argument(
-        "--dataset-name",
-        type=str,
-        default="sharegpt",
-        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
-        help="Name of the dataset to benchmark on.",
-    )
-    parser.add_argument(
-        "--dataset-path",
-        type=str,
-        default=None,
-        help="Path to the sharegpt/sonnet dataset. "
-        "Or the huggingface dataset ID if using HF dataset.",
-    )
-    parser.add_argument(
-        "--max-concurrency",
-        type=int,
-        default=None,
-        help="Maximum number of concurrent requests. This can be used "
-        "to help simulate an environment where a higher level component "
-        "is enforcing a maximum number of concurrent requests. While the "
-        "--request-rate argument controls the rate at which requests are "
-        "initiated, this argument will control how many are actually allowed "
-        "to execute at a time. This means that when used in combination, the "
-        "actual request rate may be lower than specified with --request-rate, "
-        "if the server is not processing requests fast enough to keep up.",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        required=True,
-        help="Name of the model.",
-    )
-    parser.add_argument(
-        "--tokenizer",
-        type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
-    )
-    parser.add_argument(
-        "--best-of",
-        type=int,
-        default=1,
-        help="Generates `best_of` sequences per prompt and " "returns the best one.",
-    )
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument(
-        "--num-prompts",
-        type=int,
-        default=1000,
-        help="Number of prompts to process.",
-    )
-    parser.add_argument(
-        "--logprobs",
-        type=int,
-        default=None,
-        help=(
-            "Number of logprobs-per-token to compute & return as part of "
-            "the request. If unspecified, then either (1) if beam search "
-            "is disabled, no logprobs are computed & a single dummy "
-            "logprob is returned for each token; or (2) if beam search "
-            "is enabled 1 logprob per token is computed"
-        ),
-    )
-    parser.add_argument(
-        "--request-rate",
-        type=float,
-        default=float("inf"),
-        help="Number of requests per second. If this is inf, "
-        "then all the requests are sent at time 0. "
-        "Otherwise, we use Poisson process or gamma distribution "
-        "to synthesize the request arrival times.",
-    )
-    parser.add_argument(
-        "--burstiness",
-        type=float,
-        default=1.0,
-        help="Burstiness factor of the request generation. "
-        "Only take effect when request_rate is not inf. "
-        "Default value is 1, which follows Poisson process. "
-        "Otherwise, the request intervals follow a gamma distribution. "
-        "A lower burstiness value (0 < burstiness < 1) results in more "
-        "bursty requests. A higher burstiness value (burstiness > 1) "
-        "results in a more uniform arrival of requests.",
-    )
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument(
-        "--trust-remote-code",
-        action="store_true",
-        help="Trust remote code from huggingface",
-    )
-    parser.add_argument(
-        "--disable-tqdm",
-        action="store_true",
-        help="Specify to disable tqdm progress bar.",
-    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
-    )
-    parser.add_argument(
-        "--save-result",
-        action="store_true",
-        help="Specify to save benchmark results to a json file",
-    )
-    parser.add_argument(
-        "--metadata",
-        metavar="KEY=VALUE",
-        nargs="*",
-        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
-        "for metadata of this run to be saved in the result JSON file "
-        "for record keeping purposes.",
-    )
-    parser.add_argument(
-        "--result-dir",
-        type=str,
-        default=None,
-        help="Specify directory to save benchmark json results."
-        "If not specified, results are saved in the current directory.",
-    )
-    parser.add_argument(
-        "--result-filename",
-        type=str,
-        default=None,
-        help="Specify the filename to save benchmark json results."
-        "If not specified, results will be saved in "
-        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
-        " format.",
-    )
-    parser.add_argument(
-        "--ignore-eos",
-        action="store_true",
-        help="Set ignore_eos flag when sending the benchmark request."
-        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
-    )
-    parser.add_argument(
-        "--percentile-metrics",
-        type=str,
-        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentiles. "
-        "This argument specifies the metrics to report percentiles. "
-        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
-        'Default value is "ttft,tpot,itl".',
-    )
-    parser.add_argument(
-        "--metric-percentiles",
-        type=str,
-        default="99",
-        help="Comma-separated list of percentiles for selected metrics. "
-        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
-        'Default value is "99". '
-        'Use "--percentile-metrics" to select metrics.',
-    )
-    parser.add_argument(
-        "--goodput",
-        nargs="+",
-        required=False,
-        help='Specify service level objectives for goodput as "KEY:VALUE" '
-        "pairs, where the key is a metric name, and the value is in "
-        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
-        "separated by spaces. Allowed request level metric names are "
-        '"ttft", "tpot", "e2el". For more context on the definition of '
-        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
-        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
-    )
-    # group for dataset specific arguments
-    sonnet_group = parser.add_argument_group("sonnet dataset options")
-    sonnet_group.add_argument(
-        "--sonnet-input-len",
-        type=int,
-        default=550,
-        help="Number of input tokens per request, used only for sonnet dataset.",
-    )
-    sonnet_group.add_argument(
-        "--sonnet-output-len",
-        type=int,
-        default=150,
-        help="Number of output tokens per request, used only for sonnet dataset.",
-    )
-    sonnet_group.add_argument(
-        "--sonnet-prefix-len",
-        type=int,
-        default=200,
-        help="Number of prefix tokens per request, used only for sonnet dataset.",
-    )
-    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
-    sharegpt_group.add_argument(
-        "--sharegpt-output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the output length "
-        "from the ShareGPT dataset.",
-    )
-    random_group = parser.add_argument_group("random dataset options")
-    random_group.add_argument(
-        "--random-input-len",
-        type=int,
-        default=1024,
-        help="Number of input tokens per request, used only for random sampling.",
-    )
-    random_group.add_argument(
-        "--random-output-len",
-        type=int,
-        default=128,
-        help="Number of output tokens per request, used only for random sampling.",
-    )
-    random_group.add_argument(
-        "--random-range-ratio",
-        type=float,
-        default=1.0,
-        help="Range of sampled ratio of input/output length, "
-        "used only for random sampling.",
-    )
-    random_group.add_argument(
-        "--random-prefix-len",
-        type=int,
-        default=0,
-        help="Number of fixed prefix tokens before random "
-        " context. The length range of context in a random "
-        " request is [random-prefix-len, "
-        " random-prefix-len + random-prefix-len * random-range-ratio).",
-    )
-    random_group.add_argument(
-        "--use-chat-template",
-        action="store_true",
-        help="Use chat template to format the prompt.",
-    )
-    hf_group = parser.add_argument_group("hf dataset options")
-    hf_group.add_argument(
-        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
-    )
-    hf_group.add_argument(
-        "--hf-split", type=str, default=None, help="Split of the HF dataset."
-    )
-    hf_group.add_argument(
-        "--hf-output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the output lengths "
-        "from the sampled HF dataset.",
-    )
-    parser.add_argument(
-        "--tokenizer-mode",
-        type=str,
-        default="auto",
-        choices=["auto", "slow", "mistral", "custom"],
-        help='The tokenizer mode.\n\n* "auto" will use the '
-        'fast tokenizer if available.\n* "slow" will '
-        "always use the slow tokenizer. \n* "
-        '"mistral" will always use the `mistral_common` tokenizer. \n*'
-        '"custom" will use --tokenizer to select the preregistered tokenizer.',
-    )
-    parser.add_argument(
-        "--served-model-name",
-        type=str,
-        default=None,
-        help="The model name used in the API. "
-        "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
-    )
-    parser.add_argument(
-        "--lora-modules",
-        nargs="+",
-        default=None,
-        help="A subset of LoRA module names passed in when "
-        "launching the server. For each request, the "
-        "script chooses a LoRA module at random.",
-    )
-    args = parser.parse_args()
-    main(args)
--- a/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_utils.py
+++ b/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_utils.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# pytest: skip-file
-import argparse
-import os
-from typing import Any, Dict, List
-def convert_to_pytorch_benchmark_format(
-    args: argparse.Namespace, metrics: Dict[str, List], extra_info: Dict[str, Any]
-) -> List:
-    """
-    Save the benchmark results in the format used by PyTorch OSS benchmark with
-    on metric per record
-    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
-    """
-    records = []
-    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
-        return records
-    for name, benchmark_values in metrics.items():
-        record = {
-            "benchmark": {
-                "name": "vLLM benchmark",
-                "extra_info": {
-                    "args": vars(args),
-                },
-            },
-            "model": {
-                "name": args.model,
-            },
-            "metric": {
-                "name": name,
-                "benchmark_values": benchmark_values,
-                "extra_info": extra_info,
-            },
-        }
-        records.append(record)
-    return records
--- a/examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
+++ b/examples/backends/sglang/slurm_jobs/scripts/worker_setup.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-"""
-Worker setup script for Slurm nodes.
-This script will be running on the prefill and decode nodes, and will be called by the
-benchmark_dynamo.sh script.
-The script will:
- Setup the environment
- Generate the python3 command to run the prefill or decode worker
- Start dynamo (or sglang)
- Monitor the GPU utilization
-"""
-import argparse
-import logging
-import os
-import socket
-import subprocess
-import time
-from pathlib import Path
-import requests
-# Network configurations
-ETCD_CLIENT_PORT = 2379
-ETCD_PEER_PORT = 2380
-NATS_PORT = 4222
-DIST_INIT_PORT = 29500
-ETCD_LISTEN_ADDR = "http://0.0.0.0"
-def setup_logging(level: int = logging.INFO) -> None:
-    logging.basicConfig(
-        level=level,
-        format="%(asctime)s| %(name)s: %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
-def log_gpu_utilization(log_file: Path) -> None:
-    """
-    Log GPU utilization for all GPUs in the node.
-    Format: utilization.gpu [%] x y z
-    """
-    util_script = Path(__file__).parent / "monitor_gpu_utilization.sh"
-    util_process = run_command(
-        f"bash {util_script}",
-        background=True,
-        stdout=open(log_file, "w"),
-        stderr=subprocess.STDOUT,
-    )
-    if not util_process:
-        logging.warning("Failed to start GPU utilization monitoring")
-    else:
-        logging.info("Started GPU utilization monitoring in the background")
-def check_etcd_health(etcd_url: str) -> bool:
-    """Check if etcd is healthy"""
-    health_url = f"{etcd_url}/health"
-    try:
-        response = requests.get(health_url, timeout=5)
-        return response.status_code == 200
-    except requests.exceptions.RequestException:
-        return False
-def wait_for_etcd(etcd_url: str, max_retries: int = 1000) -> bool:
-    """Wait for etcd to be ready"""
-    logging.info(f"Waiting for etcd to be ready on {etcd_url}...")
-    for attempt in range(max_retries):
-        try:
-            if check_etcd_health(etcd_url):
-                logging.info("Etcd is ready!")
-                return True
-        except requests.exceptions.RequestException:
-            pass
-        logging.info(
-            f"Etcd not ready yet, retrying in 2 seconds... (attempt {attempt + 1}/{max_retries})"
-        )
-        time.sleep(2)
-    logging.error("Etcd failed to become ready within the timeout period")
-    return False
-def run_command(
-    cmd: str, background: bool = False, shell: bool = True, stdout=None, stderr=None
-):
-    """
-    Run a command either in background or foreground.
-    Args:
-        cmd: Command to run
-        background: If True, run in background and return Popen object. If False, wait for
-            completion and return exit code.
-        shell: Whether to run command through shell
-    Returns:
-        If background=True: subprocess.Popen
-        If background=False: int (exit code)
-    """
-    logging.info(f"Running command (background={background}, shell={shell}): {cmd}")
-    if background:
-        process = subprocess.Popen(
-            cmd,
-            shell=shell,
-            stdout=stdout if stdout else subprocess.PIPE,
-            stderr=stderr if stderr else subprocess.PIPE,
-        )  # noqa: S603
-        return process
-    else:
-        result = subprocess.run(cmd, shell=shell, check=True)  # noqa: S603
-        return result.returncode
-def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Worker setup script for Dynamo distributed training",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--leader_ip",
-        type=str,
-        required=False,
-        help="IP address of the leader node for this worker group",
-    )
-    parser.add_argument(
-        "--master_ip",
-        type=str,
-        required=True,
-        help="IP address of the master node (first prefill node) for NATS/ETCD",
-    )
-    parser.add_argument(
-        "--worker_idx",
-        type=int,
-        required=False,
-        help="Index of the worker group (0-based)",
-    )
-    parser.add_argument(
-        "--local_rank",
-        type=int,
-        required=False,
-        help="Local rank within the worker group (0 for leader)",
-    )
-    parser.add_argument(
-        "--nodes_per_worker",
-        type=int,
-        required=False,
-        help="Number of nodes per worker",
-    )
-    parser.add_argument(
-        "--worker_type",
-        choices=["decode", "prefill", "frontend", "nginx", "aggregated"],
-        required=True,
-        help="Type of worker to run",
-    )
-    parser.add_argument(
-        "--gpus_per_node",
-        type=int,
-        default=8,
-        help="Number of GPUs per node (default: 8)",
-    )
-    parser.add_argument(
-        "--gpu_utilization_log",
-        type=str,
-        default=None,
-        help="File to log GPU utilization (default: None)",
-    )
-    parser.add_argument(
-        "--gpu_type",
-        type=str,
-        default="gb200-fp8",
-        help="Type of GPU to use (script will be validated at runtime)",
-    )
-    parser.add_argument(
-        "--script-variant",
-        type=str,
-        default="default",
-        help="Script variant to use (e.g., 'default', 'optim', 'decode-optim'). Defaults to 'default'",
-    )
-    parser.add_argument(
-        "--nginx_config",
-        type=str,
-        help="Path to nginx configuration file (required for nginx worker type)",
-    )
-    parser.add_argument(
-        "--multiple-frontends-enabled",
-        action="store_true",
-        help="Whether multiple frontend architecture is enabled (affects infrastructure setup)",
-    )
-    parser.add_argument(
-        "--use_init_locations",
-        action="store_true",
-        help="Whether we add --init-expert-locations to launch commands",
-    )
-    parser.add_argument(
-        "--dump-config-path",
-        type=str,
-        default=None,
-        help="Path to dump config file (e.g., /logs/node_config.json)",
-    )
-    parser.add_argument(
-        "--run-in-ci",
-        action="store_true",
-        help="Run in CI mode - use binaries from /configs/ for nats/etcd",
-    )
-    return parser.parse_args(args)
-def _validate_args(args: argparse.Namespace) -> None:
-    """Validate command line arguments"""
-    if args.worker_type in ["prefill", "decode"]:
-        if args.worker_idx is None or args.worker_idx < 0:
-            raise ValueError(
-                "Worker index must be provided and non-negative for prefill/decode"
-            )
-    if args.worker_type in ["prefill", "decode"]:
-        if args.local_rank is None or args.local_rank < 0:
-            raise ValueError("Local rank must be non-negative")
-        if args.nodes_per_worker is None or args.nodes_per_worker < 1:
-            raise ValueError("Nodes per worker must be at least 1")
-        if args.gpus_per_node < 1:
-            raise ValueError("GPUs per node must be at least 1")
-        if args.local_rank >= args.nodes_per_worker:
-            raise ValueError(
-                f"Local rank ({args.local_rank}) must be less than nodes per worker ({args.nodes_per_worker})"
-            )
-    # Validate nginx-specific arguments
-    if args.worker_type == "nginx" and not args.nginx_config:
-        raise ValueError("--nginx_config is required for nginx worker type")
-def setup_env_vars_for_gpu_script(
-    host_ip: str,
-    local_rank: int,
-    total_gpus: int,
-    total_nodes: int,
-    port: int = DIST_INIT_PORT,
-    use_init_locations: bool = True,
-    dump_config_path: str | None = None,
-    run_in_ci: bool = False,
-):
-    """Setup environment variables required by GPU scripts (gb200-fp8.sh)"""
-    os.environ["HOST_IP_MACHINE"] = host_ip
-    os.environ["PORT"] = str(port)
-    os.environ["TOTAL_GPUS"] = str(total_gpus)
-    os.environ["RANK"] = str(local_rank)
-    os.environ["TOTAL_NODES"] = str(total_nodes)
-    os.environ["USE_INIT_LOCATIONS"] = str(use_init_locations)
-    os.environ["RUN_IN_CI"] = str(run_in_ci)
-    if dump_config_path:
-        os.environ["DUMP_CONFIG_PATH"] = dump_config_path
-    else:
-        os.environ.pop("DUMP_CONFIG_PATH", None)
-    logging.info(f"Set HOST_IP: {host_ip}")
-    logging.info(f"Set PORT: {port}")
-    logging.info(f"Set TOTAL_GPUS: {total_gpus}")
-    logging.info(f"Set RANK: {local_rank}")
-    logging.info(f"Set TOTAL_NODES: {total_nodes}")
-    logging.info(f"Set USE_INIT_LOCATIONS: {use_init_locations}")
-    logging.info(f"Set RUN_IN_CI: {run_in_ci}")
-    if dump_config_path:
-        logging.info(f"Set DUMP_CONFIG_PATH: {dump_config_path}")
-def get_gpu_command(
-    worker_type: str, gpu_type: str, script_variant: str = "default"
-) -> str:
-    """Generate command to run the appropriate GPU script.
-    Scripts are organized as: scripts/{gpu_type}/{agg,disagg}/{script_variant}.sh
-    """
-    script_base = Path(__file__).parent
-    script_name = f"{script_variant}.sh"
-    if worker_type == "aggregated":
-        # Remove any -prefill or -decode suffix if present
-        base_gpu_type = gpu_type.replace("-prefill", "").replace("-decode", "")
-        script_path = script_base / base_gpu_type / "agg" / script_name
-        if not script_path.exists():
-            raise ValueError(f"Aggregated GPU script not found: {script_path}")
-        return f"bash {script_path}"
-    else:
-        # Disaggregated mode: scripts/{gpu_type}/disagg/{script_variant}.sh {prefill|decode}
-        script_path = script_base / gpu_type / "disagg" / script_name
-        if not script_path.exists():
-            raise ValueError(f"Disaggregated GPU script not found: {script_path}")
-        mode = worker_type  # "prefill" or "decode"
-        return f"bash {script_path} {mode}"
-def setup_head_prefill_node(prefill_host_ip: str, run_in_ci: bool = False) -> None:
-    """
-    Setup NATS, etcd, ingress, and http servers on the prefill host node.
-    """
-    if run_in_ci:
-        logging.info(
-            f"Starting nats server on node {prefill_host_ip} (CI mode - using /configs/nats-server)"
-        )
-        nats_cmd = "/configs/nats-server -js"
-    else:
-        logging.info(f"Starting nats server on node {prefill_host_ip}")
-        nats_cmd = "nats-server -js"
-    nats_process = run_command(nats_cmd, background=True)
-    if not nats_process:
-        raise RuntimeError("Failed to start nats-server")
-    if run_in_ci:
-        logging.info(
-            f"Starting etcd server on node {prefill_host_ip} (CI mode - using /configs/etcd)"
-        )
-        etcd_binary = "/configs/etcd"
-    else:
-        logging.info(f"Starting etcd server on node {prefill_host_ip}")
-        etcd_binary = "etcd"
-    etcd_cmd = (
-        f"{etcd_binary} --listen-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} "
-        f"--advertise-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} "
-        f"--listen-peer-urls {ETCD_LISTEN_ADDR}:{ETCD_PEER_PORT} "
-        f"--initial-cluster default=http://{prefill_host_ip}:{ETCD_PEER_PORT}"
-    )
-    etcd_process = run_command(etcd_cmd, background=True)
-    if not etcd_process:
-        raise RuntimeError("Failed to start etcd")
-def setup_nginx_worker(master_ip: str, nginx_config: str) -> int:
-    """Setup nginx load balancer"""
-    logging.info("Setting up nginx load balancer")
-    if not nginx_config or not os.path.exists(nginx_config):
-        raise ValueError(f"Nginx config file not found: {nginx_config}")
-    nginx_cmd = f"apt-get update && apt-get install -y nginx && nginx -c {nginx_config} && sleep 86400"
-    return run_command(nginx_cmd)
-def setup_frontend_worker(
-    worker_idx: int, master_ip: str, run_in_ci: bool = False
-) -> int:
-    """Setup a frontend worker"""
-    logging.info(f"Setting up frontend worker {worker_idx}")
-    # First frontend (worker_idx 0) also sets up NATS/ETCD
-    if worker_idx == 0:
-        setup_head_prefill_node(master_ip, run_in_ci)
-    else:
-        logging.info(f"Setting up additional frontend worker {worker_idx}")
-        if not wait_for_etcd(f"http://{master_ip}:{ETCD_CLIENT_PORT}"):
-            raise RuntimeError("Failed to connect to etcd")
-    # All frontends run the ingress server
-    frontend_cmd = "python3 -m dynamo.frontend --http-port=8000"
-    if run_in_ci:
-        frontend_cmd = "python3 -m pip install /configs/ai_dynamo_runtime-0.7.0-cp310-abi3-manylinux_2_28_aarch64.whl && python3 -m pip install /configs/ai_dynamo-0.7.0-py3-none-any.whl && python3 -m dynamo.frontend --http-port=8000"
-    return run_command(frontend_cmd)
-def setup_prefill_worker(
-    worker_idx: int,
-    local_rank: int,
-    leader_ip: str,
-    master_ip: str,
-    nodes_per_worker: int,
-    gpus_per_node: int,
-    gpu_type: str,
-    multiple_frontends_enabled: bool = False,
-    use_init_locations: bool = True,
-    dump_config_path: str | None = None,
-    script_variant: str = "default",
-    run_in_ci: bool = False,
-) -> int:
-    """
-    Setup the prefill worker.
-    """
-    total_gpus = nodes_per_worker * gpus_per_node
-    # Only setup infrastructure in traditional mode (not multiple frontends)
-    if not multiple_frontends_enabled and worker_idx == 0 and local_rank == 0:
-        setup_head_prefill_node(master_ip, run_in_ci)
-    else:
-        logging.info(f"Setting up prefill worker {worker_idx}, local rank {local_rank}")
-        if not wait_for_etcd(f"http://{master_ip}:{ETCD_CLIENT_PORT}"):
-            raise RuntimeError("Failed to connect to etcd")
-    # Setup environment variables for GPU script - use leader_ip as dist-init-addr
-    setup_env_vars_for_gpu_script(
-        leader_ip,
-        local_rank,
-        total_gpus,
-        nodes_per_worker,
-        use_init_locations=use_init_locations,
-        dump_config_path=dump_config_path,
-        run_in_ci=run_in_ci,
-    )
-    # Use appropriate GPU script instead of generating command directly
-    cmd_to_run = get_gpu_command("prefill", gpu_type, script_variant)
-    return run_command(cmd_to_run)
-def setup_decode_worker(
-    worker_idx: int,
-    local_rank: int,
-    leader_ip: str,
-    master_ip: str,
-    nodes_per_worker: int,
-    gpus_per_node: int,
-    gpu_type: str,
-    use_init_locations: bool = True,
-    dump_config_path: str | None = None,
-    script_variant: str = "default",
-    run_in_ci: bool = False,
-) -> int:
-    """
-    Setup the decode worker.
-    """
-    total_gpus = nodes_per_worker * gpus_per_node
-    logging.info(f"Setting up decode worker {worker_idx}, local rank {local_rank}")
-    if not wait_for_etcd(f"http://{master_ip}:{ETCD_CLIENT_PORT}"):
-        raise RuntimeError("Failed to connect to etcd")
-    # Setup environment variables for GPU script - use leader_ip as dist-init-addr
-    setup_env_vars_for_gpu_script(
-        leader_ip,
-        local_rank,
-        total_gpus,
-        nodes_per_worker,
-        use_init_locations=use_init_locations,
-        dump_config_path=dump_config_path,
-        run_in_ci=run_in_ci,
-    )
-    # Use appropriate GPU script instead of generating command directly
-    cmd_to_run = get_gpu_command("decode", gpu_type, script_variant)
-    return run_command(cmd_to_run)
-def setup_aggregated_worker(
-    worker_idx: int,
-    local_rank: int,
-    leader_ip: str,
-    master_ip: str,
-    nodes_per_worker: int,
-    gpus_per_node: int,
-    gpu_type: str,
-    multiple_frontends_enabled: bool = False,
-    dump_config_path: str | None = None,
-    script_variant: str = "default",
-    run_in_ci: bool = False,
-) -> int:
-    """
-    Setup the aggregated worker.
-    """
-    total_gpus = nodes_per_worker * gpus_per_node
-    # Only setup infrastructure in traditional mode (not multiple frontends) on first worker, first node
-    if not multiple_frontends_enabled and worker_idx == 0 and local_rank == 0:
-        setup_head_prefill_node(master_ip, run_in_ci)
-    else:
-        logging.info(
-            f"Setting up aggregated worker {worker_idx}, local rank {local_rank}"
-        )
-        if not wait_for_etcd(f"http://{master_ip}:{ETCD_CLIENT_PORT}"):
-            raise RuntimeError("Failed to connect to etcd")
-    # Setup environment variables for GPU script - use leader_ip as dist-init-addr
-    # Aggregated mode doesn't use init locations
-    setup_env_vars_for_gpu_script(
-        leader_ip,
-        local_rank,
-        total_gpus,
-        nodes_per_worker,
-        use_init_locations=False,
-        dump_config_path=dump_config_path,
-        run_in_ci=run_in_ci,
-    )
-    # Use appropriate aggregated GPU script
-    cmd_to_run = get_gpu_command("aggregated", gpu_type, script_variant)
-    return run_command(cmd_to_run)
-def setup_env(master_ip: str):
-    nats_server = f"nats://{master_ip}:{NATS_PORT}"
-    etcd_endpoints = f"http://{master_ip}:{ETCD_CLIENT_PORT}"
-    os.environ["NATS_SERVER"] = nats_server
-    os.environ["ETCD_ENDPOINTS"] = etcd_endpoints
-    logging.info(f"set NATS_SERVER: {nats_server}")
-    logging.info(f"set ETCD_ENDPOINTS: {etcd_endpoints}")
-def main(input_args: list[str] | None = None):
-    setup_logging()
-    args = _parse_command_line_args(input_args)
-    _validate_args(args)
-    if args.gpu_utilization_log:
-        log_gpu_utilization(args.gpu_utilization_log)
-    logging.info(f"{args.worker_type.capitalize()} worker setup started")
-    logging.info(f"Hostname: {socket.gethostname()}")
-    logging.info(f"Worker type: {args.worker_type}")
-    logging.info(f"Worker index: {args.worker_idx}")
-    logging.info(f"Local rank: {args.local_rank}")
-    logging.info(f"Leader IP: {args.leader_ip}")
-    logging.info(f"Master IP: {args.master_ip}")
-    logging.info(f"Nodes per worker: {args.nodes_per_worker}")
-    logging.info(f"Run in CI mode?: {args.run_in_ci}")
-    logging.info(f"Use init locations?: {args.use_init_locations}")
-    setup_env(args.master_ip)
-    if args.worker_type == "nginx":
-        if not args.nginx_config:
-            raise ValueError("--nginx_config is required for nginx worker type")
-        setup_nginx_worker(args.master_ip, args.nginx_config)
-    elif args.worker_type == "frontend":
-        setup_frontend_worker(args.worker_idx, args.master_ip, args.run_in_ci)
-    elif args.worker_type == "prefill":
-        setup_prefill_worker(
-            args.worker_idx,
-            args.local_rank,
-            args.leader_ip,
-            args.master_ip,
-            args.nodes_per_worker,
-            args.gpus_per_node,
-            args.gpu_type,
-            args.multiple_frontends_enabled,
-            args.use_init_locations,
-            args.dump_config_path,
-            args.script_variant,
-            args.run_in_ci,
-        )
-    elif args.worker_type == "decode":
-        setup_decode_worker(
-            args.worker_idx,
-            args.local_rank,
-            args.leader_ip,
-            args.master_ip,
-            args.nodes_per_worker,
-            args.gpus_per_node,
-            args.gpu_type,
-            args.use_init_locations,
-            args.dump_config_path,
-            args.script_variant,
-            args.run_in_ci,
-        )
-    elif args.worker_type == "aggregated":
-        setup_aggregated_worker(
-            args.worker_idx,
-            args.local_rank,
-            args.leader_ip,
-            args.master_ip,
-            args.nodes_per_worker,
-            args.gpus_per_node,
-            args.gpu_type,
-            args.multiple_frontends_enabled,
-            args.dump_config_path,
-            args.script_variant,
-            args.run_in_ci,
-        )
-    logging.info(f"{args.worker_type.capitalize()} worker setup complete")
-if __name__ == "__main__":
-    main()
--- a/examples/backends/sglang/slurm_jobs/submit_disagg.sh
+++ b/examples/backends/sglang/slurm_jobs/submit_disagg.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-usage() {
-    cat << 'USAGE'
-This script aims to provide a one-liner call to the submit_job_script.py,
-so that the deployment process can be further simplified.
-To use this script, fill in the following script and run it under your `slurm_jobs` directory:
-======== begin script area ========
-export SLURM_ACCOUNT=
-export SLURM_PARTITION=
-export TIME_LIMIT=
-# Add path to your DSR1-FP8 model directory here
-export MODEL_PATH=
-# This path should contain the deepep.json and optionally init expert locations.
-# Please refer to the README for more detail.
-export CONFIG_DIR=
-# Add path to your container image here, either as a link or as a cached file
-export CONTAINER_IMAGE=
-bash submit_disagg.sh \
-$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \
-$ADDITIONAL_FRONTENDS \
-$ISL $OSL $CONCURRENCIES $REQUEST_RATE
-======== end script area ========
-USAGE
-}
-check_env() {
-    local name="$1"
-    if [[ -z "${!name:-}" ]]; then
-        echo "Error: ${name} not specified" >&2
-        usage >&2
-        exit 1
-    fi
-}
-check_env SLURM_ACCOUNT
-check_env SLURM_PARTITION
-check_env TIME_LIMIT
-check_env MODEL_PATH
-check_env CONFIG_DIR
-check_env CONTAINER_IMAGE
-GPUS_PER_NODE=4
-: "${NETWORK_INTERFACE:=enP6p9s0np0}"
-# COMMAND_LINE ARGS
-PREFILL_NODES=$1
-PREFILL_WORKERS=$2
-DECODE_NODES=$3
-DECODE_WORKERS=$4
-N_ADDITIONAL_FRONTENDS=$5
-ISL=$6
-OSL=$7
-CONCURRENCIES=$8
-REQUEST_RATE=$9
-GPU_TYPE=${10}
-SCRIPT_VARIANT=${11}
-RETRIES=1 # defaults to retry the job 1 time to avoid transient errors
-# Should not need retries
-profiler_args="type=vllm; isl=${ISL}; osl=${OSL}; concurrencies=${CONCURRENCIES}; req-rate=${REQUEST_RATE}"
-USE_INIT_LOCATIONS=()
-if [[ $PREFILL_NODES -eq 6 ]] && [[ $PREFILL_WORKERS -eq 3 ]] && [[ $DECODE_NODES -eq 12 ]] && [[ $DECODE_WORKERS -eq 1 ]]; then
-    USE_INIT_LOCATIONS=(--use-init-location)
-fi
-SCRIPT_VARIANT_ARGS=()
-if [[ -n "$SCRIPT_VARIANT" ]]; then
-    SCRIPT_VARIANT_ARGS=(--script-variant "$SCRIPT_VARIANT")
-fi
-command=(
-    python3 submit_job_script.py
-    --account $SLURM_ACCOUNT --partition $SLURM_PARTITION --time-limit $TIME_LIMIT
-    --model-dir $MODEL_PATH --config-dir $CONFIG_DIR
-    --container-image $CONTAINER_IMAGE
-    --gpus-per-node $GPUS_PER_NODE --network-interface $NETWORK_INTERFACE
-    --prefill-nodes $PREFILL_NODES --prefill-workers $PREFILL_WORKERS
-    --decode-nodes $DECODE_NODES --decode-workers $DECODE_WORKERS
-    --enable-multiple-frontends --num-additional-frontends $N_ADDITIONAL_FRONTENDS ${USE_INIT_LOCATIONS[@]}
-    --profiler "${profiler_args}"
-    --retries $RETRIES
-    --gpu-type $GPU_TYPE
-    --run-in-ci
-    ${SCRIPT_VARIANT_ARGS[@]}
-)
-"${command[@]}"
--- a/examples/backends/sglang/slurm_jobs/submit_job_script.py
+++ b/examples/backends/sglang/slurm_jobs/submit_job_script.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Script to generate SLURM job scripts from Jinja2 templates.
-"""
-import argparse
-import logging
-import os
-import pathlib
-import subprocess
-import tempfile
-from datetime import datetime
-from jinja2 import Template
-def print_welcome_message(job_ids: list[str], log_dir_name: str):
-    """Print a clean welcome message with job information."""
-    _ = f"{', '.join(job_ids)}"
-    print(
-        f"""
-🚀 Welcome! We hope you enjoy your time on our GB200 NVL72.
-Your logs for this submitted job will be available in logs/{log_dir_name}
-You can access them by running:
-    cd logs/{log_dir_name}
-You can view all of the prefill/decode worker logs by running:
-    tail -f *_decode_*.err *_prefill_*.err
-To kick off the benchmark we suggest opening up a new terminal, SSH-ing
-into the login node, and running the srun command that is found at the
-bottom of the log.out. You can find it by running:
-    cat log.out
-Enjoy :)
- NVIDIA
-"""
-    )
-def setup_logging(level: int = logging.INFO) -> None:
-    logging.basicConfig(
-        level=level,
-        format="%(asctime)s| %(name)s: %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
-def generate_job_script(template_path, output_path, **kwargs):
-    """Generate a job script from template with given parameters."""
-    with open(template_path, "r") as f:
-        template = Template(f.read())
-    rendered_script = template.render(**kwargs)
-    with open(output_path, "w") as f:
-        f.write(rendered_script)
-    return output_path, rendered_script
-def submit_job(job_script_path, extra_slurm_args=[]):
-    """
-    Submit the job script to SLURM and extract the job ID from the output.
-    Returns:
-        The job ID of the submitted job.
-    """
-    try:
-        command = (
-            ["sbatch"]
-            + ["--" + x for x in extra_slurm_args]
-            + [
-                job_script_path,
-            ]
-        )
-        result = subprocess.run(command, capture_output=True, text=True, check=True)
-        output_lines = result.stdout.strip().split("\n")
-        # sbatch typically outputs: "Submitted batch job JOBID"
-        job_id = output_lines[-1].split()[-1]
-        logging.info(f"Job submitted successfully with ID: {job_id}")
-        return job_id
-    except subprocess.CalledProcessError as e:
-        logging.error(f"Error submitting job: {e}")
-        logging.error(f"stderr: {e.stderr}")
-        raise
-    except (IndexError, ValueError):
-        logging.error(f"Error parsing job ID from sbatch output: {result.stdout}")
-        raise
-def _get_available_gpu_types() -> list[str]:
-    """Discover available GPU types by scanning scripts directory structure.
-    Looks for scripts in: scripts/{gpu_type}/{agg,disagg}/*.sh
-    """
-    script_dir = pathlib.Path(__file__).parent / "scripts"
-    gpu_types = set()
-    # Scan for GPU type directories (directories that contain agg/ or disagg/)
-    for gpu_dir in script_dir.iterdir():
-        if not gpu_dir.is_dir():
-            continue
-        # Check if this directory has agg/ or disagg/ subdirectories
-        has_agg = (gpu_dir / "agg").is_dir()
-        has_disagg = (gpu_dir / "disagg").is_dir()
-        if has_agg or has_disagg:
-            gpu_types.add(gpu_dir.name)
-    return sorted(list(gpu_types))
-def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Generate and submit SLURM job scripts"
-    )
-    # Get available GPU types dynamically
-    available_gpu_types = _get_available_gpu_types()
-    # Template parameters
-    parser.add_argument("--job-name", default="dynamo_setup", help="SLURM job name")
-    parser.add_argument("--account", required=True, help="SLURM account")
-    parser.add_argument("--model-dir", required=True, help="Model directory path")
-    parser.add_argument("--config-dir", required=True, help="Config directory path")
-    parser.add_argument("--container-image", required=True, help="Container image")
-    parser.add_argument(
-        "--time-limit", default="04:00:00", help="Time limit (HH:MM:SS)"
-    )
-    parser.add_argument(
-        "--prefill-nodes", type=int, default=None, help="Number of prefill nodes"
-    )
-    parser.add_argument(
-        "--decode-nodes", type=int, default=None, help="Number of decode nodes"
-    )
-    parser.add_argument(
-        "--prefill-workers", type=int, default=None, help="Number of prefill workers"
-    )
-    parser.add_argument(
-        "--decode-workers", type=int, default=None, help="Number of decode workers"
-    )
-    parser.add_argument(
-        "--agg-nodes", type=int, default=None, help="Number of aggregated worker nodes"
-    )
-    parser.add_argument(
-        "--agg-workers", type=int, default=None, help="Number of aggregated workers"
-    )
-    parser.add_argument(
-        "--gpus-per-node", type=int, default=8, help="Number of GPUs per node"
-    )
-    parser.add_argument(
-        "--network-interface", default="eth3", help="Network interface to use"
-    )
-    parser.add_argument(
-        "--gpu-type",
-        choices=available_gpu_types,
-        default=available_gpu_types[0] if available_gpu_types else None,
-        help=f"GPU type to use. Available types: {', '.join(available_gpu_types)}",
-    )
-    parser.add_argument(
-        "--script-variant",
-        type=str,
-        default="default",
-        help="Script variant to use (e.g., 'default', 'optim', 'decode-optim'). Defaults to 'default.sh'",
-    )
-    parser.add_argument(
-        "--partition",
-        default="batch",
-        help="SLURM partition to use",
-    )
-    parser.add_argument(
-        "--enable-multiple-frontends",
-        action="store_true",
-        help="Enable multiple frontend architecture with nginx load balancer",
-    )
-    parser.add_argument(
-        "--num-additional-frontends",
-        type=int,
-        default=0,
-        help="Number of additional frontend nodes (beyond the first frontend on node 1)",
-    )
-    parser.add_argument(
-        "--use-init-location",
-        action="store_true",
-        help="Whether we use '--init-expert-locations' json files",
-    )
-    parser.add_argument(
-        "--profiler",
-        type=str,
-        help="Profiler configurations. Example: "
-        + '"type=vllm; isl=8192; osl=1024; concurrencies=16x2048x4096x8192; req-rate=inf"',
-    )
-    parser.add_argument(
-        "--extra-slurm-args",
-        action="append",
-        default=[],
-        help="Extra slurm arguments, remove the '--' prefix. Example: --extra-slurm-args dependency=afterok:<x>",
-    )
-    parser.add_argument(
-        "--retries",
-        type=int,
-        default=0,
-        help="Tries to launch the job multiple times to catch transient errors",
-    )
-    parser.add_argument(
-        "--disable-config-dump",
-        action="store_false",
-        dest="enable_config_dump",
-        default=True,
-        help="Disable dumping config to file on each node (default: config dump is enabled)",
-    )
-    parser.add_argument(
-        "--run-in-ci",
-        action="store_true",
-        help="Run in CI mode - use binaries from /configs/ for nats/etcd and install dynamo wheel",
-    )
-    return parser.parse_args(args)
-def _validate_args(args: argparse.Namespace) -> None:
-    """Validate arguments and ensure aggregated and disaggregated args are mutually exclusive."""
-    has_disagg_args = any(
-        [
-            args.prefill_nodes is not None,
-            args.decode_nodes is not None,
-            args.prefill_workers is not None,
-            args.decode_workers is not None,
-        ]
-    )
-    has_agg_args = any(
-        [
-            args.agg_nodes is not None,
-            args.agg_workers is not None,
-        ]
-    )
-    if has_disagg_args and has_agg_args:
-        raise ValueError(
-            "Cannot specify both aggregated (--agg-nodes, --agg-workers) and "
-            "disaggregated (--prefill-nodes, --decode-nodes, --prefill-workers, --decode-workers) arguments"
-        )
-    if has_disagg_args:
-        # Validate disaggregated args
-        if args.prefill_nodes is None or args.decode_nodes is None:
-            raise ValueError(
-                "Disaggregated mode requires both --prefill-nodes and --decode-nodes"
-            )
-        if args.prefill_workers is None or args.decode_workers is None:
-            raise ValueError(
-                "Disaggregated mode requires both --prefill-workers and --decode-workers"
-            )
-        if args.prefill_nodes % args.prefill_workers != 0:
-            raise ValueError(
-                f"Prefill nodes ({args.prefill_nodes}) must be divisible by prefill workers ({args.prefill_workers})"
-            )
-        if args.decode_nodes % args.decode_workers != 0:
-            raise ValueError(
-                f"Decode nodes ({args.decode_nodes}) must be divisible by decode workers ({args.decode_workers})"
-            )
-        # Validate GPU script exists for disaggregated mode
-        script_dir = pathlib.Path(__file__).parent / "scripts"
-        disagg_dir = script_dir / args.gpu_type / "disagg"
-        # Use script variant (defaults to "default")
-        script_name = f"{args.script_variant}.sh"
-        gpu_script = disagg_dir / script_name
-        if not gpu_script.exists():
-            raise ValueError(
-                f"Disaggregated GPU script not found: {gpu_script}. Available GPU types: {', '.join(_get_available_gpu_types())}"
-            )
-    if has_agg_args:
-        # Validate aggregated args
-        if args.agg_nodes is None or args.agg_workers is None:
-            raise ValueError(
-                "Aggregated mode requires both --agg-nodes and --agg-workers"
-            )
-        if args.agg_nodes % args.agg_workers != 0:
-            raise ValueError(
-                f"Aggregated nodes ({args.agg_nodes}) must be divisible by aggregated workers ({args.agg_workers})"
-            )
-        # Validate aggregated GPU script exists
-        script_dir = pathlib.Path(__file__).parent / "scripts"
-        # Remove any -prefill or -decode suffix if present
-        base_gpu_type = args.gpu_type.replace("-prefill", "").replace("-decode", "")
-        agg_dir = script_dir / base_gpu_type / "agg"
-        # Use script variant (defaults to "default")
-        script_name = f"{args.script_variant}.sh"
-        agg_gpu_script = agg_dir / script_name
-        if not agg_gpu_script.exists():
-            raise ValueError(
-                f"Aggregated GPU script not found: {agg_gpu_script}. Available GPU types: {', '.join(_get_available_gpu_types())}"
-            )
-    if not has_disagg_args and not has_agg_args:
-        raise ValueError(
-            "Must specify either aggregated (--agg-nodes, --agg-workers) or "
-            "disaggregated (--prefill-nodes, --decode-nodes, --prefill-workers, --decode-workers) arguments"
-        )
-def main(input_args: list[str] | None = None):
-    setup_logging()
-    args = _parse_command_line_args(input_args)
-    # Validate arguments
-    _validate_args(args)
-    # Determine mode and set defaults
-    is_aggregated = args.agg_nodes is not None
-    if is_aggregated:
-        agg_nodes = args.agg_nodes
-        agg_workers = args.agg_workers
-        prefill_nodes = 0
-        decode_nodes = 0
-        prefill_workers = 0
-        decode_workers = 0
-        total_nodes = agg_nodes
-    else:
-        prefill_nodes = args.prefill_nodes
-        decode_nodes = args.decode_nodes
-        prefill_workers = args.prefill_workers
-        decode_workers = args.decode_workers
-        agg_nodes = 0
-        agg_workers = 0
-        total_nodes = prefill_nodes + decode_nodes
-    # Validation for multiple frontends
-    if args.enable_multiple_frontends:
-        if args.num_additional_frontends < 0:
-            raise ValueError("Number of additional frontends cannot be negative")
-    # parse profiler configs
-    profiler_config = {}
-    if args.profiler:
-        for key_val_pair in args.profiler.split("; "):
-            key, val = key_val_pair.split("=")
-            profiler_config[key] = val
-    # validate profiler configs
-    if profiler_config == {} or profiler_config["type"] == "manual":
-        parsable_config = ""
-        profiler_config["type"] = "manual"
-    elif profiler_config["type"] in ["sglang", "vllm", "gap"]:
-        parsable_config = ""
-        need_keys = ["isl", "osl", "concurrencies"]
-        assert all([key in profiler_config for key in need_keys])
-        assert profiler_config["isl"].isnumeric()
-        parsable_config = f"{parsable_config} {profiler_config['isl']}"
-        assert profiler_config["osl"].isnumeric()
-        parsable_config = f"{parsable_config} {profiler_config['osl']}"
-        assert all([x.isnumeric() for x in profiler_config["concurrencies"].split("x")])
-        parsable_config = f"{parsable_config} {profiler_config['concurrencies']}"
-        if profiler_config["type"] in ["sglang", "vllm"]:
-            assert "req-rate" in profiler_config
-            assert (
-                profiler_config["req-rate"] == "inf"
-                or profiler_config["req-rate"].isnumeric()
-            )
-            parsable_config = f"{parsable_config} {profiler_config['req-rate']}"
-    else:
-        assert False, profiler_config["type"]
-    # Generate timestamp for log directory naming
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    # Select template based on mode
-    if is_aggregated:
-        template_path = "job_script_template_agg.j2"
-    else:
-        template_path = "job_script_template_disagg.j2"
-    template_vars = {
-        "job_name": args.job_name,
-        "total_nodes": total_nodes,
-        "account": args.account,
-        "time_limit": args.time_limit,
-        "prefill_nodes": prefill_nodes,
-        "decode_nodes": decode_nodes,
-        "prefill_workers": prefill_workers,
-        "decode_workers": decode_workers,
-        "agg_nodes": agg_nodes,
-        "agg_workers": agg_workers,
-        "is_aggregated": is_aggregated,
-        "model_dir": args.model_dir,
-        "config_dir": args.config_dir,
-        "container_image": args.container_image,
-        "gpus_per_node": args.gpus_per_node,
-        "network_interface": args.network_interface,
-        "gpu_type": args.gpu_type,
-        "script_variant": args.script_variant,
-        "partition": args.partition,
-        "enable_multiple_frontends": args.enable_multiple_frontends,
-        "num_additional_frontends": args.num_additional_frontends,
-        "use_init_location": args.use_init_location,
-        "do_profile": profiler_config["type"] != "manual",
-        "profiler_type": profiler_config["type"],
-        "profiler_arg": parsable_config,
-        "timestamp": timestamp,
-        "enable_config_dump": args.enable_config_dump,
-        "run_in_ci": args.run_in_ci,
-    }
-    # Create temporary file for sbatch script
-    temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False)
-    temp_path = temp_file.name
-    temp_file.close()
-    try:
-        _, rendered_script = generate_job_script(
-            template_path, temp_path, **template_vars
-        )
-        submitted_job_ids = []
-        job_id = submit_job(temp_path, args.extra_slurm_args)
-        submitted_job_ids.append(job_id)
-        # Create log directory with new naming format IMMEDIATELY after submission
-        # SLURM will write log.out/log.err to this directory when job starts
-        if is_aggregated:
-            log_dir_name = f"{job_id}_{agg_workers}A_{timestamp}"
-        else:
-            log_dir_name = f"{job_id}_{prefill_workers}P_{decode_workers}D_{timestamp}"
-        log_dir_path = os.path.join("logs", log_dir_name)
-        os.makedirs(log_dir_path, exist_ok=True)
-        # Save rendered sbatch script
-        sbatch_script_path = os.path.join(log_dir_path, "sbatch_script.sh")
-        with open(sbatch_script_path, "w") as f:
-            f.write(rendered_script)
-        logging.info(f"Saved rendered sbatch script to {sbatch_script_path}")
-        # retries logic
-        if args.retries > 0:
-            extra_slurm_args_without_dependencies = [
-                x for x in args.extra_slurm_args if "dependency" not in x
-            ]
-            for _ in range(args.retries):
-                dependencies = ",".join(
-                    [f"afternotok:{job}" for job in submitted_job_ids]
-                )
-                slurm_args = extra_slurm_args_without_dependencies + [
-                    f"dependency={dependencies}"
-                ]
-                job_id = submit_job(temp_path, slurm_args)
-                submitted_job_ids.append(job_id)
-                # Save script for retry job as well
-                if is_aggregated:
-                    retry_log_dir_name = f"{job_id}_{agg_workers}A_{timestamp}"
-                else:
-                    retry_log_dir_name = (
-                        f"{job_id}_{prefill_workers}P_{decode_workers}D_{timestamp}"
-                    )
-                retry_log_dir_path = os.path.join("logs", retry_log_dir_name)
-                os.makedirs(retry_log_dir_path, exist_ok=True)
-                retry_sbatch_script_path = os.path.join(
-                    retry_log_dir_path, "sbatch_script.sh"
-                )
-                with open(retry_sbatch_script_path, "w") as f:
-                    f.write(rendered_script)
-                logging.info(
-                    f"Saved rendered sbatch script to {retry_sbatch_script_path}"
-                )
-        print_welcome_message(submitted_job_ids, log_dir_name)
-    finally:
-        # Clean up temporary file
-        try:
-            os.unlink(temp_path)
-        except OSError:
-            pass
-if __name__ == "__main__":
-    main()
--- a/lib/llm/src/kv_router/prefill_router.rs
+++ b/lib/llm/src/kv_router/prefill_router.rs
@@ -5,7 +5,6 @@ use std::sync::{Arc, OnceLock};
 use anyhow::Result;
 use futures::StreamExt;
-use rand::Rng;
 use tokio::sync::{OwnedSemaphorePermit, oneshot};
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
@@ -321,9 +320,9 @@ impl PrefillRouter {
        let host = endpoint.bootstrap_host?;
        let port = endpoint.bootstrap_port?;
-        let bootstrap_room: u64 = rand::rng().random();
+        let bootstrap_room: u64 = rand::random_range(0..=i64::MAX as u64);
-        tracing::info!(
+        tracing::debug!(
            worker_id = worker_id,
            dp_rank = dp_rank,
            bootstrap_host = %host,

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,7 @@ vllm = [
 sglang = [
    "uvloop",
-    "sglang[diffusion]==0.5.8",
+    "sglang[diffusion]==0.5.9",
    "nixl[cu12]<=0.9.0",
    "cupy-cuda12x>=13.0.0",
 ]