Unverified Commit 1b3a1073 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

test: add dryrun mode for sla planner (#2557)


Signed-off-by: default avatarHongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: default avatarcoderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent 626d7e18
......@@ -28,6 +28,6 @@ pip install -e .
Currently, this will install lightweight tools for:
- Analyzing prefix-structured data (`datagen analyze`)
- Synthesizing structured data customizable for testing purposes (`datagen synthesize`)
Detailed information are provided in the `data_generator` directory.
Detailed information is provided in the `prefix_data_generator` directory.
The benchmarking scripts for the core dynamo components are to come soon (e.g. routing, disagg, Planner).
\ No newline at end of file
......@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from data_generator.cli import main as cli_main
from prefix_data_generator.cli import main as cli_main
def main():
......
......@@ -36,13 +36,13 @@ def main():
if args.command == "analyze":
# Import and run the analyzer main
from data_generator import prefix_analyzer
from prefix_data_generator import prefix_analyzer
sys.argv = [sys.argv[0]] + remaining
prefix_analyzer.main()
elif args.command == "synthesize":
# Import and run the synthesizer main
from data_generator import synthesizer
from prefix_data_generator import synthesizer
sys.argv = [sys.argv[0]] + remaining
synthesizer.main()
......
......@@ -17,8 +17,8 @@ import json
import tempfile
import requests
from data_generator.hasher import hashes_to_texts
from data_generator.synthesizer import Synthesizer
from prefix_data_generator.hasher import hashes_to_texts
from prefix_data_generator.synthesizer import Synthesizer
# download the mooncake trace file
mooncake_trace_permalink = "https://raw.githubusercontent.com/kvcache-ai/Mooncake/f09c501b2a5d73e4d60cdeb612d7d0d54e1ec228/mooncake_trace.jsonl"
......
......@@ -15,8 +15,8 @@
import networkx as nx
import numpy as np
from data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
from data_generator.sampler import get_cdf
from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
from prefix_data_generator.sampler import get_cdf
def _verify_tree(G: nx.DiGraph) -> None:
......
......@@ -16,7 +16,7 @@
import json
from collections import Counter
from data_generator.logging_utils import calculate_and_print_statistics
from prefix_data_generator.logging_utils import calculate_and_print_statistics
class PrefixAnalyzer:
......
......@@ -20,15 +20,15 @@ from typing import Any, Optional
import networkx as nx
import numpy as np
import pandas as pd
from data_generator.graph_utils import (
from prefix_data_generator.graph_utils import (
_mark_visited,
_merge_chains,
_precompute_transition_cdfs,
_remove_leaves,
_verify_tree,
)
from data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
from data_generator.sampler import EmpiricalSampler, sample_from_cdf
from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
from prefix_data_generator.sampler import EmpiricalSampler, sample_from_cdf
class Synthesizer:
......@@ -334,7 +334,7 @@ def main():
import argparse
from pathlib import Path
from data_generator.logging_utils import calculate_and_print_statistics
from prefix_data_generator.logging_utils import calculate_and_print_statistics
parser = argparse.ArgumentParser(description="Synthesize Mooncake-Esque dataset")
parser.add_argument(
......
......@@ -17,7 +17,7 @@ import math
import random
import pytest
from data_generator.hasher import hashes_to_texts, texts_to_hashes
from prefix_data_generator.hasher import hashes_to_texts, texts_to_hashes
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers
from transformers import AutoTokenizer, PreTrainedTokenizerFast
......
......@@ -16,7 +16,7 @@
from collections import Counter
import numpy as np
from data_generator.sampler import EmpiricalSampler
from prefix_data_generator.sampler import EmpiricalSampler
def test_empirical_sampler_distribution():
......
......@@ -19,7 +19,7 @@ import random
import tempfile
import unittest
from data_generator.synthesizer import Synthesizer
from prefix_data_generator.synthesizer import Synthesizer
# Helper function to create and dump data
......
......@@ -49,7 +49,7 @@ dependencies = [
]
[project.scripts]
datagen = "data_generator.cli:main"
datagen = "prefix_data_generator.cli:main"
[project.urls]
Repository = "https://github.com/ai-dynamo/dynamo.git"
......@@ -59,10 +59,10 @@ requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"
[tool.setuptools]
packages = ["data_generator"]
packages = ["prefix_data_generator"]
[tool.setuptools.package-data]
data_generator = ["**/*.py"]
prefix_data_generator = ["**/*.py"]
[tool.mypy]
explicit_package_bases = true
......
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->
# Sinusoidal Load Generator
`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf).
## Usage
```bash
cd benchmarks/sin_load_generator
python sin_synth.py [OPTIONS]
```
### Basic Options
- `--block-size INT` (default: 512)
- Block size for hashing. Since there is no prefix caching, the block size does not need to be the same as the engine's KV block size.
- `--total-blocks INT` (default: 10000)
- ISL prompt blocks are randomly sampled from this range. Use a larger number to reduce the chance of duplicated prompts.
- `--output-file STR` (default: auto-generated)
- Output file name (in jsonl format)
- If not specified, the script will generate a filename based on parameters
- `--time-duration INT` (default: 100)
- Total time duration of the dataset in seconds
- `--process-interval INT` (default: 1)
- Sampling interval used to generate the dataset
- Smaller interval leads to more precise changes in request rate and isl/osl ratio but longer generation time.
### Request Rate Parameters
The request rate follows a sinusoidal pattern:
```
request_rate(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * π / period * t - π / 2)
```
Note the phase shift of `-π/2` is to make the request rate start from the minimum at `t = 0`.
- `--request-rate-min FLOAT` (default: 5)
- Minimum request rate in requests per second
- `--request-rate-max FLOAT` (default: 10)
- Maximum request rate in requests per second
- `--request-rate-period FLOAT` (default: 10)
- Period of the sinusoidal request rate in seconds
### Input/Output Sequence Length Parameters
The script will generate load with requests sampled from two preset ISL/OSL combinations.
The ISL/OSL ratio is the fraction of requests that follow the first preset ISL/OSL pattern (`isl1`/`osl1`); the remaining requests follow the second preset pattern (`isl2`/`osl2`). An ISL/OSL ratio of 0 means all requests follow the second preset pattern, while an ISL/OSL ratio of 1 means all requests follow the first preset pattern.
The ISL/OSL ratio follows a sinusoidal pattern:
```
isl-osl-ratio(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * π / period * t - π / 2)
```
Similarly, the phase shift of `-π/2` is to make the ISL/OSL ratio start from the minimum at `t = 0`.
- `--isl1 INT` (default: 100)
- Input sequence length of the first preset
- `--osl1 INT` (default: 2000)
- Output sequence length of the first preset
- `--isl2 INT` (default: 5000)
- Input sequence length of the second preset
- `--osl2 INT` (default: 100)
- Output sequence length of the second preset
- `--isl-osl-ratio-min FLOAT` (default: 0.2)
- Minimum ISL/OSL ratio, i.e. the smallest fraction of requests sampled from the first preset
- `--isl-osl-ratio-max FLOAT` (default: 0.8)
- Maximum ISL/OSL ratio, i.e. the largest fraction of requests sampled from the first preset
- `--isl-osl-ratio-period FLOAT` (default: 10)
- Period of the sinusoidal ISL/OSL ratio in seconds
### Examples
#### Varying Request Rate with Fixed ISL/OSL Ratio
```bash
python sin_synth.py \
--time-duration 60 \
--request-rate-min 2 \
--request-rate-max 8 \
--request-rate-period 20 \
--isl1 3000 \
--osl1 150 \
--isl2 3000 \
--osl2 150 \
--output-file dataset.jsonl
```
This generates a 60-second dataset with request rates varying between 2 and 8 requests/second over a 20-second period. Because both presets are identical (ISL 3000, OSL 150), every request uses that combination regardless of the ISL/OSL ratio.
#### Varying ISL/OSL Ratio with Fixed Request Rate
```bash
python sin_synth.py \
--time-duration 60 \
--request-rate-min 5 \
--request-rate-max 5 \
--isl1 3000 \
--osl1 150 \
--isl2 500 \
--osl2 2000 \
--isl-osl-ratio-min 0.2 \
--isl-osl-ratio-max 0.8 \
--isl-osl-ratio-period 20 \
--output-file dataset.jsonl
```
This generates a 60-second dataset with the request rate fixed at 5 requests/second and the ISL/OSL ratio varying between 0.2 and 0.8 over a 20-second period, mixing the I3000O150 and I500O2000 presets.
\ No newline at end of file
......@@ -31,7 +31,7 @@ def main(args):
def get_isl_osl(t):
isl_osl_ratio = (args.isl_osl_ratio_min + args.isl_osl_ratio_max) / 2 + (
args.isl_osl_ratio_max - args.isl_osl_ratio_min
) / 2 * np.sin(2 * np.pi / args.isl_osl_ratio_period * t)
) / 2 * np.sin(2 * np.pi / args.isl_osl_ratio_period * t - np.pi / 2)
logger.info(f"isl_osl_ratio at {t:.2f}: {isl_osl_ratio:.2f}")
if np.random.uniform(0, 1) < isl_osl_ratio:
return (args.isl1, args.osl1)
......@@ -43,7 +43,7 @@ def main(args):
t_e = min(t + args.process_interval, args.time_duration)
request_rate = (args.request_rate_min + args.request_rate_max) / 2 + (
args.request_rate_max - args.request_rate_min
) / 2 * np.sin(2 * np.pi / args.request_rate_period * t)
) / 2 * np.sin(2 * np.pi / args.request_rate_period * t - np.pi / 2)
logger.info(f"request_rate at {t:.2f}: {request_rate:.2f}")
num_requests = np.random.poisson(request_rate * (t_e - t))
for req_idx in range(num_requests):
......@@ -100,7 +100,8 @@ if __name__ == "__main__":
# request rate parameters
# for the process interval at [t, t + process_interval), the number of requests to generate is sampled
# from a Poisson distribution with the following parameters:
# request_rate(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t)
# request_rate(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t - pi / 2)
# the phase shift of -pi / 2 makes the request rate start from the minimum at t = 0
# num_requests[t, t + process_interval) ~ Poisson(request_rate(t) * process_interval)
# requests are uniformly distributed in the interval [t, t + process_interval)
parser.add_argument(
......@@ -125,7 +126,7 @@ if __name__ == "__main__":
# isl/osl parameters
# isl/osl is randomly sampled from two candidates following the isl-osl-ratio.
# at time t, the isl-osl-ratio is calculated as:
# isl-osl-ratio(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t)
# isl-osl-ratio(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t - pi / 2)
# Then, we sample [isl1/osl1, isl2/osl2] from the distribution [isl-osl-ratio(t), 1 - isl-osl-ratio(t)]
parser.add_argument(
"--isl1", type=int, default=100, help="Minimum input sequence length"
......
......@@ -13,13 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import logging
from pydantic import BaseModel
from dynamo.planner.defaults import SLAPlannerDefaults
from dynamo.planner.utils.argparse import create_sla_planner_parser
from dynamo.planner.utils.planner_core import start_sla_planner
from dynamo.runtime import DistributedRuntime, dynamo_worker
......@@ -52,101 +52,6 @@ async def init_planner(runtime: DistributedRuntime, args):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="SLA Planner")
parser.add_argument(
"--environment",
default=SLAPlannerDefaults.environment,
choices=["kubernetes"],
help="Environment type",
)
parser.add_argument(
"--backend",
default=SLAPlannerDefaults.backend,
choices=["vllm", "sglang"],
help="Backend type",
)
parser.add_argument(
"--no-operation",
action="store_true",
default=SLAPlannerDefaults.no_operation,
help="Enable no-operation mode",
)
parser.add_argument(
"--log-dir", default=SLAPlannerDefaults.log_dir, help="Log directory path"
)
parser.add_argument(
"--adjustment-interval",
type=int,
default=SLAPlannerDefaults.adjustment_interval,
help="Adjustment interval in seconds",
)
parser.add_argument(
"--max-gpu-budget",
type=int,
default=SLAPlannerDefaults.max_gpu_budget,
help="Maximum GPU budget",
)
parser.add_argument(
"--min-endpoint",
type=int,
default=SLAPlannerDefaults.min_endpoint,
help="Minimum number of endpoints",
)
parser.add_argument(
"--decode-engine-num-gpu",
type=int,
default=SLAPlannerDefaults.decode_engine_num_gpu,
help="Number of GPUs for decode engine",
)
parser.add_argument(
"--prefill-engine-num-gpu",
type=int,
default=SLAPlannerDefaults.prefill_engine_num_gpu,
help="Number of GPUs for prefill engine",
)
parser.add_argument(
"--profile-results-dir",
default=SLAPlannerDefaults.profile_results_dir,
help="Profile results directory",
)
parser.add_argument(
"--isl", type=int, default=SLAPlannerDefaults.isl, help="Input sequence length"
)
parser.add_argument(
"--osl", type=int, default=SLAPlannerDefaults.osl, help="Output sequence length"
)
parser.add_argument(
"--ttft",
type=float,
default=SLAPlannerDefaults.ttft,
help="Time to first token",
)
parser.add_argument(
"--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency"
)
parser.add_argument(
"--load-predictor",
default=SLAPlannerDefaults.load_predictor,
help="Load predictor type",
)
parser.add_argument(
"--load-prediction-window-size",
type=int,
default=SLAPlannerDefaults.load_prediction_window_size,
help="Load prediction window size",
)
parser.add_argument(
"--prometheus-port",
type=int,
default=SLAPlannerDefaults.prometheus_port,
help="Prometheus port",
)
parser.add_argument(
"--no-correction",
action="store_true",
default=SLAPlannerDefaults.no_correction,
help="Disable correction factor",
)
parser = create_sla_planner_parser()
args = parser.parse_args()
asyncio.run(init_planner(args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from dynamo.planner.defaults import SLAPlannerDefaults
def create_sla_planner_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the SLA Planner.

    Every option takes its default from the attribute of the same name on
    ``SLAPlannerDefaults``. Options are registered in the order listed below,
    which is also the order in which they appear in ``--help``.

    Returns:
        argparse.ArgumentParser: Parser with all SLA Planner options registered.
    """
    defaults = SLAPlannerDefaults

    # Each entry is (flag, keyword arguments forwarded to ``add_argument``).
    option_table = [
        (
            "--environment",
            {
                "default": defaults.environment,
                "choices": ["kubernetes"],
                "help": "Environment type",
            },
        ),
        (
            "--backend",
            {
                "default": defaults.backend,
                "choices": ["vllm", "sglang"],
                "help": "Backend type",
            },
        ),
        (
            "--no-operation",
            {
                "action": "store_true",
                "default": defaults.no_operation,
                "help": "Enable no-operation mode",
            },
        ),
        (
            "--log-dir",
            {"default": defaults.log_dir, "help": "Log directory path"},
        ),
        (
            "--adjustment-interval",
            {
                "type": int,
                "default": defaults.adjustment_interval,
                "help": "Adjustment interval in seconds",
            },
        ),
        (
            "--max-gpu-budget",
            {
                "type": int,
                "default": defaults.max_gpu_budget,
                "help": "Maximum GPU budget",
            },
        ),
        (
            "--min-endpoint",
            {
                "type": int,
                "default": defaults.min_endpoint,
                "help": "Minimum number of endpoints",
            },
        ),
        (
            "--decode-engine-num-gpu",
            {
                "type": int,
                "default": defaults.decode_engine_num_gpu,
                "help": "Number of GPUs for decode engine",
            },
        ),
        (
            "--prefill-engine-num-gpu",
            {
                "type": int,
                "default": defaults.prefill_engine_num_gpu,
                "help": "Number of GPUs for prefill engine",
            },
        ),
        (
            "--profile-results-dir",
            {
                "default": defaults.profile_results_dir,
                "help": "Profile results directory",
            },
        ),
        (
            "--isl",
            {"type": int, "default": defaults.isl, "help": "Input sequence length"},
        ),
        (
            "--osl",
            {"type": int, "default": defaults.osl, "help": "Output sequence length"},
        ),
        (
            "--ttft",
            {"type": float, "default": defaults.ttft, "help": "Time to first token"},
        ),
        (
            "--itl",
            {"type": float, "default": defaults.itl, "help": "Inter-token latency"},
        ),
        (
            "--load-predictor",
            {"default": defaults.load_predictor, "help": "Load predictor type"},
        ),
        (
            "--load-prediction-window-size",
            {
                "type": int,
                "default": defaults.load_prediction_window_size,
                "help": "Load prediction window size",
            },
        ),
        (
            "--prometheus-port",
            {
                "type": int,
                "default": defaults.prometheus_port,
                "help": "Prometheus port",
            },
        ),
        (
            "--no-correction",
            {
                "action": "store_true",
                "default": defaults.no_correction,
                "help": "Disable correction factor",
            },
        ),
    ]

    sla_parser = argparse.ArgumentParser(description="SLA Planner")
    for flag, options in option_table:
        sla_parser.add_argument(flag, **options)
    return sla_parser
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment