test: add dryrun mode for sla planner (#2557)

Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

test: add dryrun mode for sla planner (#2557)
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1b3a1073 · Hongkuan Zhou · GitHub · 626d7e18 · 1b3a1073 · 1b3a1073
Unverified Commit 1b3a1073 authored Aug 20, 2025 by Hongkuan Zhou Committed by GitHub Aug 20, 2025
20 changed files
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -28,6 +28,6 @@ pip install -e .
 Currently, this will install lightweight tools for:
 - Analyzing prefix-structured data (`datagen analyze`)
 - Synthesizing structured data customizable for testing purposes (`datagen synthesize`)
-Detailed information are provided in the `data_generator` directory.
+Detailed information is provided in the `prefix_data_generator` directory.
 The benchmarking scripts for the core dynamo components are to come soon (e.g. routing, disagg, Planner).
\ No newline at end of file
--- a/benchmarks/data_generator/README.md
+++ b/benchmarks/data_generator/README.md
--- a/benchmarks/data_generator/__init__.py
+++ b/benchmarks/data_generator/__init__.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from data_generator.cli import main as cli_main
+from prefix_data_generator.cli import main as cli_main
 def main():

--- a/benchmarks/data_generator/cli.py
+++ b/benchmarks/data_generator/cli.py
@@ -36,13 +36,13 @@ def main():
    if args.command == "analyze":
        # Import and run the analyzer main
-        from data_generator import prefix_analyzer
+        from prefix_data_generator import prefix_analyzer
        sys.argv = [sys.argv[0]] + remaining
        prefix_analyzer.main()
    elif args.command == "synthesize":
        # Import and run the synthesizer main
-        from data_generator import synthesizer
+        from prefix_data_generator import synthesizer
        sys.argv = [sys.argv[0]] + remaining
        synthesizer.main()

--- a/benchmarks/data_generator/example.py
+++ b/benchmarks/data_generator/example.py
@@ -17,8 +17,8 @@ import json
 import tempfile
 import requests
-from data_generator.hasher import hashes_to_texts
+from prefix_data_generator.hasher import hashes_to_texts
-from data_generator.synthesizer import Synthesizer
+from prefix_data_generator.synthesizer import Synthesizer
 # download the mooncake trace file
 mooncake_trace_permalink = "https://raw.githubusercontent.com/kvcache-ai/Mooncake/f09c501b2a5d73e4d60cdeb612d7d0d54e1ec228/mooncake_trace.jsonl"

--- a/benchmarks/data_generator/graph_utils.py
+++ b/benchmarks/data_generator/graph_utils.py
@@ -15,8 +15,8 @@
 import networkx as nx
 import numpy as np
-from data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
+from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
-from data_generator.sampler import get_cdf
+from prefix_data_generator.sampler import get_cdf
 def _verify_tree(G: nx.DiGraph) -> None:

--- a/benchmarks/data_generator/hasher.py
+++ b/benchmarks/data_generator/hasher.py
--- a/benchmarks/data_generator/logging_utils.py
+++ b/benchmarks/data_generator/logging_utils.py
--- a/benchmarks/data_generator/prefix_analyzer.py
+++ b/benchmarks/data_generator/prefix_analyzer.py
@@ -16,7 +16,7 @@
 import json
 from collections import Counter
-from data_generator.logging_utils import calculate_and_print_statistics
+from prefix_data_generator.logging_utils import calculate_and_print_statistics
 class PrefixAnalyzer:

--- a/benchmarks/data_generator/protocols.py
+++ b/benchmarks/data_generator/protocols.py
--- a/benchmarks/data_generator/sampler.py
+++ b/benchmarks/data_generator/sampler.py
--- a/benchmarks/data_generator/synthesizer.py
+++ b/benchmarks/data_generator/synthesizer.py
@@ -20,15 +20,15 @@ from typing import Any, Optional
 import networkx as nx
 import numpy as np
 import pandas as pd
-from data_generator.graph_utils import (
+from prefix_data_generator.graph_utils import (
    _mark_visited,
    _merge_chains,
    _precompute_transition_cdfs,
    _remove_leaves,
    _verify_tree,
 )
-from data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
+from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
-from data_generator.sampler import EmpiricalSampler, sample_from_cdf
+from prefix_data_generator.sampler import EmpiricalSampler, sample_from_cdf
 class Synthesizer:
@@ -334,7 +334,7 @@ def main():
    import argparse
    from pathlib import Path
-    from data_generator.logging_utils import calculate_and_print_statistics
+    from prefix_data_generator.logging_utils import calculate_and_print_statistics
    parser = argparse.ArgumentParser(description="Synthesize Mooncake-Esque dataset")
    parser.add_argument(

--- a/benchmarks/data_generator/tests/test_hasher.py
+++ b/benchmarks/data_generator/tests/test_hasher.py
@@ -17,7 +17,7 @@ import math
 import random
 import pytest
-from data_generator.hasher import hashes_to_texts, texts_to_hashes
+from prefix_data_generator.hasher import hashes_to_texts, texts_to_hashes
 from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers
 from transformers import AutoTokenizer, PreTrainedTokenizerFast

--- a/benchmarks/data_generator/tests/test_sampler.py
+++ b/benchmarks/data_generator/tests/test_sampler.py
@@ -16,7 +16,7 @@
 from collections import Counter
 import numpy as np
-from data_generator.sampler import EmpiricalSampler
+from prefix_data_generator.sampler import EmpiricalSampler
 def test_empirical_sampler_distribution():

--- a/benchmarks/data_generator/tests/test_synthesizer.py
+++ b/benchmarks/data_generator/tests/test_synthesizer.py
@@ -19,7 +19,7 @@ import random
 import tempfile
 import unittest
-from data_generator.synthesizer import Synthesizer
+from prefix_data_generator.synthesizer import Synthesizer
 # Helper function to create and dump data

--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -49,7 +49,7 @@ dependencies = [
 ]
 [project.scripts]
-datagen = "data_generator.cli:main"
+datagen = "prefix_data_generator.cli:main"
 [project.urls]
 Repository = "https://github.com/ai-dynamo/dynamo.git"
@@ -59,10 +59,10 @@ requires = ["setuptools>=42", "wheel"]
 build-backend = "setuptools.build_meta"
 [tool.setuptools]
-packages = ["data_generator"]
+packages = ["prefix_data_generator"]
 [tool.setuptools.package-data]
-data_generator = ["**/*.py"]
+prefix_data_generator = ["**/*.py"]
 [tool.mypy]
 explicit_package_bases = true

--- a/benchmarks/sin_load_generator/README.md
+++ b/benchmarks/sin_load_generator/README.md
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+-->
+# Sinusoidal Load Generator
+`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf).
+## Usage
+```bash
+cd benchmarks/sin_load_generator
+python sin_synth.py [OPTIONS]
+```
+### Basic Options
+- `--block-size INT` (default: 512)
+  - Block size for hashing. Since there is no prefix caching, the block size does not need to be the same as the engine's KV block size.
+- `--total-blocks INT` (default: 10000)
+  - ISL prompt blocks are randomly sampled from this range. Use a larger number to reduce the chance of duplicated prompts.
+- `--output-file STR` (default: auto-generated)
+  - Output file name (in jsonl format)
+  - If not specified, the script will generate a filename based on parameters
+- `--time-duration INT` (default: 100)
+  - Total time duration of the dataset in seconds
+- `--process-interval INT` (default: 1)
+  - Sampling interval used to generate the dataset
+  - Smaller interval leads to more precise changes in request rate and isl/osl ratio but longer generation time.
+### Request Rate Parameters
+The request rate follows a sinusoidal pattern:
+```
+request_rate(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * π / period * t - π / 2)
+```
+Note the phase shift of `-π/2` is to make the request rate start from the minimum at `t = 0`.
+- `--request-rate-min FLOAT` (default: 5)
+  - Minimum request rate in requests per second
+- `--request-rate-max FLOAT` (default: 10)
+  - Maximum request rate in requests per second
+- `--request-rate-period FLOAT` (default: 10)
+  - Period of the sinusoidal request rate in seconds
+### Input/Output Sequence Length Parameters
+The script will generate load with requests sampled from two preset ISL/OSL combinations.
+The ISL/OSL ratio defines the fraction of requests that follow the first preset ISL/OSL pattern (`isl1`/`osl1`). An ISL/OSL ratio of 0 means all requests follow the second preset ISL/OSL pattern, while an ISL/OSL ratio of 1 means all requests follow the first preset ISL/OSL pattern.
+The ISL/OSL ratio follows a sinusoidal pattern:
+```
+isl-osl-ratio(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * π / period * t - π / 2)
+```
+Similarly, the phase shift of `-π/2` is to make the ISL/OSL ratio start from the minimum at `t = 0`.
+- `--isl1 INT` (default: 100)
+  - Input sequence length of the first preset
+- `--osl1 INT` (default: 2000)
+  - Output sequence length of the first preset
+- `--isl2 INT` (default: 5000)
+  - Input sequence length of the second preset
+- `--osl2 INT` (default: 100)
+  - Output sequence length of the second preset
+- `--isl-osl-ratio-min FLOAT` (default: 0.2)
+  - Minimum value of the ISL/OSL ratio, i.e. the minimum probability that a request uses the first preset
+- `--isl-osl-ratio-max FLOAT` (default: 0.8)
+  - Maximum value of the ISL/OSL ratio, i.e. the maximum probability that a request uses the first preset
+- `--isl-osl-ratio-period FLOAT` (default: 10)
+  - Period of the sinusoidal ISL/OSL ratio in seconds
+### Examples
+#### Varying Request Rate with Fixed ISL/OSL Ratio
+```bash
+python sin_synth.py \
+  --time-duration 60 \
+  --request-rate-min 2 \
+  --request-rate-max 8 \
+  --request-rate-period 20 \
+  --isl1 3000 \
+  --osl1 150 \
+  --isl2 3000 \
+  --osl2 150 \
+  --output-file dataset.jsonl
+```
+This generates a 60-second dataset with request rates varying between 2 and 8 requests/second over a 20-second period, with 3000 ISL and 150 OSL. Because both presets are identical, the ISL/OSL ratio has no effect on the generated requests.
+#### Varying ISL/OSL Ratio with Fixed Request Rate
+```bash
+python sin_synth.py \
+  --time-duration 60 \
+  --request-rate-min 5 \
+  --request-rate-max 5 \
+  --isl1 3000 \
+  --osl1 150 \
+  --isl2 500 \
+  --osl2 2000 \
+  --isl-osl-ratio-min 0.2 \
+  --isl-osl-ratio-max 0.8 \
+  --isl-osl-ratio-period 20 \
+  --output-file dataset.jsonl
+```
+This generates a 60-second dataset with request rate fixed at 5 requests/second, with the fraction of requests using I3000O150 (rather than I500O2000) varying between 0.2 and 0.8 over a 20-second period.
\ No newline at end of file
--- a/docs/guides/planner_benchmark/sin_synth.py
+++ b/docs/guides/planner_benchmark/sin_synth.py
@@ -31,7 +31,7 @@ def main(args):
    def get_isl_osl(t):
        isl_osl_ratio = (args.isl_osl_ratio_min + args.isl_osl_ratio_max) / 2 + (
            args.isl_osl_ratio_max - args.isl_osl_ratio_min
-        ) / 2 * np.sin(2 * np.pi / args.isl_osl_ratio_period * t)
+        ) / 2 * np.sin(2 * np.pi / args.isl_osl_ratio_period * t - np.pi / 2)
        logger.info(f"isl_osl_ratio at {t:.2f}: {isl_osl_ratio:.2f}")
        if np.random.uniform(0, 1) < isl_osl_ratio:
            return (args.isl1, args.osl1)
@@ -43,7 +43,7 @@ def main(args):
        t_e = min(t + args.process_interval, args.time_duration)
        request_rate = (args.request_rate_min + args.request_rate_max) / 2 + (
            args.request_rate_max - args.request_rate_min
-        ) / 2 * np.sin(2 * np.pi / args.request_rate_period * t)
+        ) / 2 * np.sin(2 * np.pi / args.request_rate_period * t - np.pi / 2)
        logger.info(f"request_rate at {t:.2f}: {request_rate:.2f}")
        num_requests = np.random.poisson(request_rate * (t_e - t))
        for req_idx in range(num_requests):
@@ -100,7 +100,8 @@ if __name__ == "__main__":
    # request rate parameters
    # for the process interval at [t, t + process_interval), the number of requests to generate is sampled
    # from a Poisson distribution with the following parameters:
-    # request_rate(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t)
+    # request_rate(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t - pi / 2)
+    # the phase shift is -pi / 2 to make the request rate start from the minimum at t = 0
    # num_requests[t, t + process_interval) ~ Poisson(request_rate(t) * process_interval)
    # requests are uniformly distributed in the interval [t, t + process_interval)
    parser.add_argument(
@@ -125,7 +126,7 @@ if __name__ == "__main__":
    # isl/osl parameters
    # isl/osl is randomly sampled from two candidates following the isl-osl-ratio.
    # at time t, the isl-osl-ratio is calculated as:
-    # isl-osl-ratio(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t)
+    # isl-osl-ratio(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t - pi / 2)
    # Then, we sample [isl1/osl1, isl2/osl2] from the distribution [isl-osl-ratio(t), 1 - isl-osl-ratio(t)]
    parser.add_argument(
        "--isl1", type=int, default=100, help="Minimum input sequence length"

--- a/components/planner/src/dynamo/planner/planner_sla.py
+++ b/components/planner/src/dynamo/planner/planner_sla.py
@@ -13,13 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import argparse
 import asyncio
 import logging
 from pydantic import BaseModel
 from dynamo.planner.defaults import SLAPlannerDefaults
+from dynamo.planner.utils.argparse import create_sla_planner_parser
 from dynamo.planner.utils.planner_core import start_sla_planner
 from dynamo.runtime import DistributedRuntime, dynamo_worker
@@ -52,101 +52,6 @@ async def init_planner(runtime: DistributedRuntime, args):
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="SLA Planner")
+    parser = create_sla_planner_parser()
-    parser.add_argument(
-        "--environment",
-        default=SLAPlannerDefaults.environment,
-        choices=["kubernetes"],
-        help="Environment type",
-    )
-    parser.add_argument(
-        "--backend",
-        default=SLAPlannerDefaults.backend,
-        choices=["vllm", "sglang"],
-        help="Backend type",
-    )
-    parser.add_argument(
-        "--no-operation",
-        action="store_true",
-        default=SLAPlannerDefaults.no_operation,
-        help="Enable no-operation mode",
-    )
-    parser.add_argument(
-        "--log-dir", default=SLAPlannerDefaults.log_dir, help="Log directory path"
-    )
-    parser.add_argument(
-        "--adjustment-interval",
-        type=int,
-        default=SLAPlannerDefaults.adjustment_interval,
-        help="Adjustment interval in seconds",
-    )
-    parser.add_argument(
-        "--max-gpu-budget",
-        type=int,
-        default=SLAPlannerDefaults.max_gpu_budget,
-        help="Maximum GPU budget",
-    )
-    parser.add_argument(
-        "--min-endpoint",
-        type=int,
-        default=SLAPlannerDefaults.min_endpoint,
-        help="Minimum number of endpoints",
-    )
-    parser.add_argument(
-        "--decode-engine-num-gpu",
-        type=int,
-        default=SLAPlannerDefaults.decode_engine_num_gpu,
-        help="Number of GPUs for decode engine",
-    )
-    parser.add_argument(
-        "--prefill-engine-num-gpu",
-        type=int,
-        default=SLAPlannerDefaults.prefill_engine_num_gpu,
-        help="Number of GPUs for prefill engine",
-    )
-    parser.add_argument(
-        "--profile-results-dir",
-        default=SLAPlannerDefaults.profile_results_dir,
-        help="Profile results directory",
-    )
-    parser.add_argument(
-        "--isl", type=int, default=SLAPlannerDefaults.isl, help="Input sequence length"
-    )
-    parser.add_argument(
-        "--osl", type=int, default=SLAPlannerDefaults.osl, help="Output sequence length"
-    )
-    parser.add_argument(
-        "--ttft",
-        type=float,
-        default=SLAPlannerDefaults.ttft,
-        help="Time to first token",
-    )
-    parser.add_argument(
-        "--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency"
-    )
-    parser.add_argument(
-        "--load-predictor",
-        default=SLAPlannerDefaults.load_predictor,
-        help="Load predictor type",
-    )
-    parser.add_argument(
-        "--load-prediction-window-size",
-        type=int,
-        default=SLAPlannerDefaults.load_prediction_window_size,
-        help="Load prediction window size",
-    )
-    parser.add_argument(
-        "--prometheus-port",
-        type=int,
-        default=SLAPlannerDefaults.prometheus_port,
-        help="Prometheus port",
-    )
-    parser.add_argument(
-        "--no-correction",
-        action="store_true",
-        default=SLAPlannerDefaults.no_correction,
-        help="Disable correction factor",
-    )
    args = parser.parse_args()
    asyncio.run(init_planner(args))
--- a/components/planner/src/dynamo/planner/utils/argparse.py
+++ b/components/planner/src/dynamo/planner/utils/argparse.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from dynamo.planner.defaults import SLAPlannerDefaults
+def create_sla_planner_parser() -> argparse.ArgumentParser:
+    """Create and configure the argument parser for SLA Planner.
+
+    Every option's default is read from ``SLAPlannerDefaults``, so parsing an
+    empty argument list yields that default configuration. The parser is
+    returned without being run; the caller is responsible for calling
+    ``parse_args()``.
+
+    Returns:
+        argparse.ArgumentParser: Configured argument parser for SLA Planner
+    """
+    parser = argparse.ArgumentParser(description="SLA Planner")
+    # argparse rejects any value that is not listed in ``choices``.
+    parser.add_argument(
+        "--environment",
+        default=SLAPlannerDefaults.environment,
+        choices=["kubernetes"],
+        help="Environment type",
+    )
+    parser.add_argument(
+        "--backend",
+        default=SLAPlannerDefaults.backend,
+        choices=["vllm", "sglang"],
+        help="Backend type",
+    )
+    # store_true flag: passing it sets the value to True; otherwise the
+    # SLAPlannerDefaults value is used.
+    parser.add_argument(
+        "--no-operation",
+        action="store_true",
+        default=SLAPlannerDefaults.no_operation,
+        help="Enable no-operation mode",
+    )
+    parser.add_argument(
+        "--log-dir", default=SLAPlannerDefaults.log_dir, help="Log directory path"
+    )
+    parser.add_argument(
+        "--adjustment-interval",
+        type=int,
+        default=SLAPlannerDefaults.adjustment_interval,
+        help="Adjustment interval in seconds",
+    )
+    parser.add_argument(
+        "--max-gpu-budget",
+        type=int,
+        default=SLAPlannerDefaults.max_gpu_budget,
+        help="Maximum GPU budget",
+    )
+    parser.add_argument(
+        "--min-endpoint",
+        type=int,
+        default=SLAPlannerDefaults.min_endpoint,
+        help="Minimum number of endpoints",
+    )
+    parser.add_argument(
+        "--decode-engine-num-gpu",
+        type=int,
+        default=SLAPlannerDefaults.decode_engine_num_gpu,
+        help="Number of GPUs for decode engine",
+    )
+    parser.add_argument(
+        "--prefill-engine-num-gpu",
+        type=int,
+        default=SLAPlannerDefaults.prefill_engine_num_gpu,
+        help="Number of GPUs for prefill engine",
+    )
+    parser.add_argument(
+        "--profile-results-dir",
+        default=SLAPlannerDefaults.profile_results_dir,
+        help="Profile results directory",
+    )
+    parser.add_argument(
+        "--isl", type=int, default=SLAPlannerDefaults.isl, help="Input sequence length"
+    )
+    parser.add_argument(
+        "--osl", type=int, default=SLAPlannerDefaults.osl, help="Output sequence length"
+    )
+    parser.add_argument(
+        "--ttft",
+        type=float,
+        default=SLAPlannerDefaults.ttft,
+        help="Time to first token",
+    )
+    parser.add_argument(
+        "--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency"
+    )
+    parser.add_argument(
+        "--load-predictor",
+        default=SLAPlannerDefaults.load_predictor,
+        help="Load predictor type",
+    )
+    parser.add_argument(
+        "--load-prediction-window-size",
+        type=int,
+        default=SLAPlannerDefaults.load_prediction_window_size,
+        help="Load prediction window size",
+    )
+    parser.add_argument(
+        "--prometheus-port",
+        type=int,
+        default=SLAPlannerDefaults.prometheus_port,
+        help="Prometheus port",
+    )
+    # store_true flag: passing it sets the value to True; otherwise the
+    # SLAPlannerDefaults value is used.
+    parser.add_argument(
+        "--no-correction",
+        action="store_true",
+        default=SLAPlannerDefaults.no_correction,
+        help="Disable correction factor",
+    )
+    return parser