Unverified Commit 1b3a1073 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

test: add dryrun mode for sla planner (#2557)


Signed-off-by: default avatarHongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: default avatarcoderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent 626d7e18
...@@ -28,6 +28,6 @@ pip install -e . ...@@ -28,6 +28,6 @@ pip install -e .
Currently, this will install lightweight tools for: Currently, this will install lightweight tools for:
- Analyzing prefix-structured data (`datagen analyze`) - Analyzing prefix-structured data (`datagen analyze`)
- Synthesizing structured data customizable for testing purposes (`datagen synthesize`) - Synthesizing structured data customizable for testing purposes (`datagen synthesize`)
Detailed information is provided in the `data_generator` directory. Detailed information is provided in the `prefix_data_generator` directory.
The benchmarking scripts for the core dynamo components are to come soon (e.g. routing, disagg, Planner). The benchmarking scripts for the core dynamo components are to come soon (e.g. routing, disagg, Planner).
\ No newline at end of file
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from data_generator.cli import main as cli_main from prefix_data_generator.cli import main as cli_main
def main(): def main():
......
...@@ -36,13 +36,13 @@ def main(): ...@@ -36,13 +36,13 @@ def main():
if args.command == "analyze": if args.command == "analyze":
# Import and run the analyzer main # Import and run the analyzer main
from data_generator import prefix_analyzer from prefix_data_generator import prefix_analyzer
sys.argv = [sys.argv[0]] + remaining sys.argv = [sys.argv[0]] + remaining
prefix_analyzer.main() prefix_analyzer.main()
elif args.command == "synthesize": elif args.command == "synthesize":
# Import and run the synthesizer main # Import and run the synthesizer main
from data_generator import synthesizer from prefix_data_generator import synthesizer
sys.argv = [sys.argv[0]] + remaining sys.argv = [sys.argv[0]] + remaining
synthesizer.main() synthesizer.main()
......
...@@ -17,8 +17,8 @@ import json ...@@ -17,8 +17,8 @@ import json
import tempfile import tempfile
import requests import requests
from data_generator.hasher import hashes_to_texts from prefix_data_generator.hasher import hashes_to_texts
from data_generator.synthesizer import Synthesizer from prefix_data_generator.synthesizer import Synthesizer
# download the mooncake trace file # download the mooncake trace file
mooncake_trace_permalink = "https://raw.githubusercontent.com/kvcache-ai/Mooncake/f09c501b2a5d73e4d60cdeb612d7d0d54e1ec228/mooncake_trace.jsonl" mooncake_trace_permalink = "https://raw.githubusercontent.com/kvcache-ai/Mooncake/f09c501b2a5d73e4d60cdeb612d7d0d54e1ec228/mooncake_trace.jsonl"
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
import networkx as nx import networkx as nx
import numpy as np import numpy as np
from data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
from data_generator.sampler import get_cdf from prefix_data_generator.sampler import get_cdf
def _verify_tree(G: nx.DiGraph) -> None: def _verify_tree(G: nx.DiGraph) -> None:
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
import json import json
from collections import Counter from collections import Counter
from data_generator.logging_utils import calculate_and_print_statistics from prefix_data_generator.logging_utils import calculate_and_print_statistics
class PrefixAnalyzer: class PrefixAnalyzer:
......
...@@ -20,15 +20,15 @@ from typing import Any, Optional ...@@ -20,15 +20,15 @@ from typing import Any, Optional
import networkx as nx import networkx as nx
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from data_generator.graph_utils import ( from prefix_data_generator.graph_utils import (
_mark_visited, _mark_visited,
_merge_chains, _merge_chains,
_precompute_transition_cdfs, _precompute_transition_cdfs,
_remove_leaves, _remove_leaves,
_verify_tree, _verify_tree,
) )
from data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
from data_generator.sampler import EmpiricalSampler, sample_from_cdf from prefix_data_generator.sampler import EmpiricalSampler, sample_from_cdf
class Synthesizer: class Synthesizer:
...@@ -334,7 +334,7 @@ def main(): ...@@ -334,7 +334,7 @@ def main():
import argparse import argparse
from pathlib import Path from pathlib import Path
from data_generator.logging_utils import calculate_and_print_statistics from prefix_data_generator.logging_utils import calculate_and_print_statistics
parser = argparse.ArgumentParser(description="Synthesize Mooncake-Esque dataset") parser = argparse.ArgumentParser(description="Synthesize Mooncake-Esque dataset")
parser.add_argument( parser.add_argument(
......
...@@ -17,7 +17,7 @@ import math ...@@ -17,7 +17,7 @@ import math
import random import random
import pytest import pytest
from data_generator.hasher import hashes_to_texts, texts_to_hashes from prefix_data_generator.hasher import hashes_to_texts, texts_to_hashes
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers
from transformers import AutoTokenizer, PreTrainedTokenizerFast from transformers import AutoTokenizer, PreTrainedTokenizerFast
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
from collections import Counter from collections import Counter
import numpy as np import numpy as np
from data_generator.sampler import EmpiricalSampler from prefix_data_generator.sampler import EmpiricalSampler
def test_empirical_sampler_distribution(): def test_empirical_sampler_distribution():
......
...@@ -19,7 +19,7 @@ import random ...@@ -19,7 +19,7 @@ import random
import tempfile import tempfile
import unittest import unittest
from data_generator.synthesizer import Synthesizer from prefix_data_generator.synthesizer import Synthesizer
# Helper function to create and dump data # Helper function to create and dump data
......
...@@ -49,7 +49,7 @@ dependencies = [ ...@@ -49,7 +49,7 @@ dependencies = [
] ]
[project.scripts] [project.scripts]
datagen = "data_generator.cli:main" datagen = "prefix_data_generator.cli:main"
[project.urls] [project.urls]
Repository = "https://github.com/ai-dynamo/dynamo.git" Repository = "https://github.com/ai-dynamo/dynamo.git"
...@@ -59,10 +59,10 @@ requires = ["setuptools>=42", "wheel"] ...@@ -59,10 +59,10 @@ requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
[tool.setuptools] [tool.setuptools]
packages = ["data_generator"] packages = ["prefix_data_generator"]
[tool.setuptools.package-data] [tool.setuptools.package-data]
data_generator = ["**/*.py"] prefix_data_generator = ["**/*.py"]
[tool.mypy] [tool.mypy]
explicit_package_bases = true explicit_package_bases = true
......
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->
# Sinusoidal Load Generator
`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf).
## Usage
```bash
cd benchmarks/sin_load_generator
python sin_synth.py [OPTIONS]
```
### Basic Options
- `--block-size INT` (default: 512)
- Block size for hashing. Since there is no prefix caching, the block size does not need to be the same as the engine's KV block size.
- `--total-blocks INT` (default: 10000)
- ISL prompt blocks are randomly sampled from this range. Use a larger number to reduce the chance of duplicated prompts.
- `--output-file STR` (default: auto-generated)
- Output file name (in jsonl format)
- If not specified, the script will generate a filename based on parameters
- `--time-duration INT` (default: 100)
- Total time duration of the dataset in seconds
- `--process-interval INT` (default: 1)
- Sampling interval used to generate the dataset
- Smaller interval leads to more precise changes in request rate and isl/osl ratio but longer generation time.
### Request Rate Parameters
The request rate follows a sinusoidal pattern:
```
request_rate(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * π / period * t - π / 2)
```
Note the phase shift of `-π/2` is to make the request rate start from the minimum at `t = 0`.
- `--request-rate-min FLOAT` (default: 5)
- Minimum request rate in requests per second
- `--request-rate-max FLOAT` (default: 10)
- Maximum request rate in requests per second
- `--request-rate-period FLOAT` (default: 10)
- Period of the sinusoidal request rate in seconds
### Input/Output Sequence Length Parameters
The script will generate load with requests sampled from two preset ISL/OSL combinations.
The ISL/OSL ratio is the probability that a request follows the first preset ISL/OSL pattern (`isl1`/`osl1`); otherwise it follows the second preset (`isl2`/`osl2`). An ISL/OSL ratio of 0 means all requests follow the second preset ISL/OSL pattern, while an ISL/OSL ratio of 1 means all requests follow the first preset ISL/OSL pattern.
The ISL/OSL ratio follows a sinusoidal pattern:
```
isl-osl-ratio(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * π / period * t - π / 2)
```
Similarly, the phase shift of `-π/2` is to make the ISL/OSL ratio start from the minimum at `t = 0`.
- `--isl1 INT` (default: 100)
- Input sequence length of the first preset
- `--osl1 INT` (default: 2000)
- Output sequence length of the first preset
- `--isl2 INT` (default: 5000)
- Input sequence length of the second preset
- `--osl2 INT` (default: 100)
- Output sequence length of the second preset
- `--isl-osl-ratio-min FLOAT` (default: 0.2)
- Minimum value of the ISL/OSL ratio, i.e. the lowest probability that a request follows the first preset (`isl1`/`osl1`)
- `--isl-osl-ratio-max FLOAT` (default: 0.8)
- Maximum value of the ISL/OSL ratio, i.e. the highest probability that a request follows the first preset (`isl1`/`osl1`)
- `--isl-osl-ratio-period FLOAT` (default: 10)
- Period of the sinusoidal ISL/OSL ratio in seconds
### Examples
#### Varying Request Rate with Fixed ISL/OSL Ratio
```bash
python sin_synth.py \
--time-duration 60 \
--request-rate-min 2 \
--request-rate-max 8 \
--request-rate-period 20 \
--isl1 3000 \
--osl1 150 \
--isl2 3000 \
--osl2 150 \
--output-file dataset.jsonl
```
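This generates a 60-second dataset with request rates varying between 2 and 8 requests/second over a 20-second period. Because both presets are identical (ISL 3000, OSL 150), every request has 3000 ISL and 150 OSL regardless of the ISL/OSL ratio, which is left at its defaults here.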
#### Varying ISL/OSL Ratio with Fixed Request Rate
```bash
python sin_synth.py \
--time-duration 60 \
--request-rate-min 5 \
--request-rate-max 5 \
--isl1 3000 \
--osl1 150 \
--isl2 500 \
--osl2 2000 \
--isl-osl-ratio-min 0.2 \
--isl-osl-ratio-max 0.8 \
--isl-osl-ratio-period 20 \
--output-file dataset.jsonl
```
This generates a 60-second dataset with the request rate fixed at 5 requests/second and the ISL/OSL ratio varying between 0.2 and 0.8 over a 20-second period, mixing requests from the I3000O150 and I500O2000 presets.
\ No newline at end of file
...@@ -31,7 +31,7 @@ def main(args): ...@@ -31,7 +31,7 @@ def main(args):
def get_isl_osl(t): def get_isl_osl(t):
isl_osl_ratio = (args.isl_osl_ratio_min + args.isl_osl_ratio_max) / 2 + ( isl_osl_ratio = (args.isl_osl_ratio_min + args.isl_osl_ratio_max) / 2 + (
args.isl_osl_ratio_max - args.isl_osl_ratio_min args.isl_osl_ratio_max - args.isl_osl_ratio_min
) / 2 * np.sin(2 * np.pi / args.isl_osl_ratio_period * t) ) / 2 * np.sin(2 * np.pi / args.isl_osl_ratio_period * t - np.pi / 2)
logger.info(f"isl_osl_ratio at {t:.2f}: {isl_osl_ratio:.2f}") logger.info(f"isl_osl_ratio at {t:.2f}: {isl_osl_ratio:.2f}")
if np.random.uniform(0, 1) < isl_osl_ratio: if np.random.uniform(0, 1) < isl_osl_ratio:
return (args.isl1, args.osl1) return (args.isl1, args.osl1)
...@@ -43,7 +43,7 @@ def main(args): ...@@ -43,7 +43,7 @@ def main(args):
t_e = min(t + args.process_interval, args.time_duration) t_e = min(t + args.process_interval, args.time_duration)
request_rate = (args.request_rate_min + args.request_rate_max) / 2 + ( request_rate = (args.request_rate_min + args.request_rate_max) / 2 + (
args.request_rate_max - args.request_rate_min args.request_rate_max - args.request_rate_min
) / 2 * np.sin(2 * np.pi / args.request_rate_period * t) ) / 2 * np.sin(2 * np.pi / args.request_rate_period * t - np.pi / 2)
logger.info(f"request_rate at {t:.2f}: {request_rate:.2f}") logger.info(f"request_rate at {t:.2f}: {request_rate:.2f}")
num_requests = np.random.poisson(request_rate * (t_e - t)) num_requests = np.random.poisson(request_rate * (t_e - t))
for req_idx in range(num_requests): for req_idx in range(num_requests):
...@@ -100,7 +100,8 @@ if __name__ == "__main__": ...@@ -100,7 +100,8 @@ if __name__ == "__main__":
# request rate parameters # request rate parameters
# for the process interval at [t, t + process_interval), the number of requests to generate is sampled # for the process interval at [t, t + process_interval), the number of requests to generate is sampled
# from a Poisson distribution with the following parameters: # from a Poisson distribution with the following parameters:
# request_rate(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t) # request_rate(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t - pi / 2)
# the phase shift of -pi / 2 makes the request rate start from the minimum at t = 0
# num_requests[t, t + process_interval) ~ Poisson(request_rate(t) * process_interval) # num_requests[t, t + process_interval) ~ Poisson(request_rate(t) * process_interval)
# requests are uniformly distributed in the interval [t, t + process_interval) # requests are uniformly distributed in the interval [t, t + process_interval)
parser.add_argument( parser.add_argument(
...@@ -125,7 +126,7 @@ if __name__ == "__main__": ...@@ -125,7 +126,7 @@ if __name__ == "__main__":
# isl/osl parameters # isl/osl parameters
# isl/osl is randomly sampled from two candidates following the isl-osl-ratio. # isl/osl is randomly sampled from two candidates following the isl-osl-ratio.
# at time t, the isl-osl-ratio is calculated as: # at time t, the isl-osl-ratio is calculated as:
# isl-osl-ratio(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t) # isl-osl-ratio(t) = (min + max) / 2 + (max - min) / 2 * sin(2 * pi / period * t - pi / 2)
# Then, we sample [isl1/osl1, isl2/osl2] from the distribution [isl-osl-ratio(t), 1 - isl-osl-ratio(t)] # Then, we sample [isl1/osl1, isl2/osl2] from the distribution [isl-osl-ratio(t), 1 - isl-osl-ratio(t)]
parser.add_argument( parser.add_argument(
"--isl1", type=int, default=100, help="Minimum input sequence length" "--isl1", type=int, default=100, help="Minimum input sequence length"
......
...@@ -13,13 +13,13 @@ ...@@ -13,13 +13,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import argparse
import asyncio import asyncio
import logging import logging
from pydantic import BaseModel from pydantic import BaseModel
from dynamo.planner.defaults import SLAPlannerDefaults from dynamo.planner.defaults import SLAPlannerDefaults
from dynamo.planner.utils.argparse import create_sla_planner_parser
from dynamo.planner.utils.planner_core import start_sla_planner from dynamo.planner.utils.planner_core import start_sla_planner
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
...@@ -52,101 +52,6 @@ async def init_planner(runtime: DistributedRuntime, args): ...@@ -52,101 +52,6 @@ async def init_planner(runtime: DistributedRuntime, args):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="SLA Planner") parser = create_sla_planner_parser()
parser.add_argument(
"--environment",
default=SLAPlannerDefaults.environment,
choices=["kubernetes"],
help="Environment type",
)
parser.add_argument(
"--backend",
default=SLAPlannerDefaults.backend,
choices=["vllm", "sglang"],
help="Backend type",
)
parser.add_argument(
"--no-operation",
action="store_true",
default=SLAPlannerDefaults.no_operation,
help="Enable no-operation mode",
)
parser.add_argument(
"--log-dir", default=SLAPlannerDefaults.log_dir, help="Log directory path"
)
parser.add_argument(
"--adjustment-interval",
type=int,
default=SLAPlannerDefaults.adjustment_interval,
help="Adjustment interval in seconds",
)
parser.add_argument(
"--max-gpu-budget",
type=int,
default=SLAPlannerDefaults.max_gpu_budget,
help="Maximum GPU budget",
)
parser.add_argument(
"--min-endpoint",
type=int,
default=SLAPlannerDefaults.min_endpoint,
help="Minimum number of endpoints",
)
parser.add_argument(
"--decode-engine-num-gpu",
type=int,
default=SLAPlannerDefaults.decode_engine_num_gpu,
help="Number of GPUs for decode engine",
)
parser.add_argument(
"--prefill-engine-num-gpu",
type=int,
default=SLAPlannerDefaults.prefill_engine_num_gpu,
help="Number of GPUs for prefill engine",
)
parser.add_argument(
"--profile-results-dir",
default=SLAPlannerDefaults.profile_results_dir,
help="Profile results directory",
)
parser.add_argument(
"--isl", type=int, default=SLAPlannerDefaults.isl, help="Input sequence length"
)
parser.add_argument(
"--osl", type=int, default=SLAPlannerDefaults.osl, help="Output sequence length"
)
parser.add_argument(
"--ttft",
type=float,
default=SLAPlannerDefaults.ttft,
help="Time to first token",
)
parser.add_argument(
"--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency"
)
parser.add_argument(
"--load-predictor",
default=SLAPlannerDefaults.load_predictor,
help="Load predictor type",
)
parser.add_argument(
"--load-prediction-window-size",
type=int,
default=SLAPlannerDefaults.load_prediction_window_size,
help="Load prediction window size",
)
parser.add_argument(
"--prometheus-port",
type=int,
default=SLAPlannerDefaults.prometheus_port,
help="Prometheus port",
)
parser.add_argument(
"--no-correction",
action="store_true",
default=SLAPlannerDefaults.no_correction,
help="Disable correction factor",
)
args = parser.parse_args() args = parser.parse_args()
asyncio.run(init_planner(args)) asyncio.run(init_planner(args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from dynamo.planner.defaults import SLAPlannerDefaults
def create_sla_planner_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the SLA Planner.

    Each option takes its default from the attribute of
    ``SLAPlannerDefaults`` with the matching name, so the defaults are
    maintained in one place.

    Returns:
        argparse.ArgumentParser: Parser with every SLA Planner option
        registered.
    """
    defaults = SLAPlannerDefaults

    # One (flag, add_argument keyword arguments) entry per option. The list
    # order is the registration order, which is also the order in --help.
    option_specs = [
        (
            "--environment",
            dict(
                default=defaults.environment,
                choices=["kubernetes"],
                help="Environment type",
            ),
        ),
        (
            "--backend",
            dict(
                default=defaults.backend,
                choices=["vllm", "sglang"],
                help="Backend type",
            ),
        ),
        (
            "--no-operation",
            dict(
                action="store_true",
                default=defaults.no_operation,
                help="Enable no-operation mode",
            ),
        ),
        (
            "--log-dir",
            dict(default=defaults.log_dir, help="Log directory path"),
        ),
        (
            "--adjustment-interval",
            dict(
                type=int,
                default=defaults.adjustment_interval,
                help="Adjustment interval in seconds",
            ),
        ),
        (
            "--max-gpu-budget",
            dict(
                type=int,
                default=defaults.max_gpu_budget,
                help="Maximum GPU budget",
            ),
        ),
        (
            "--min-endpoint",
            dict(
                type=int,
                default=defaults.min_endpoint,
                help="Minimum number of endpoints",
            ),
        ),
        (
            "--decode-engine-num-gpu",
            dict(
                type=int,
                default=defaults.decode_engine_num_gpu,
                help="Number of GPUs for decode engine",
            ),
        ),
        (
            "--prefill-engine-num-gpu",
            dict(
                type=int,
                default=defaults.prefill_engine_num_gpu,
                help="Number of GPUs for prefill engine",
            ),
        ),
        (
            "--profile-results-dir",
            dict(
                default=defaults.profile_results_dir,
                help="Profile results directory",
            ),
        ),
        (
            "--isl",
            dict(type=int, default=defaults.isl, help="Input sequence length"),
        ),
        (
            "--osl",
            dict(type=int, default=defaults.osl, help="Output sequence length"),
        ),
        (
            "--ttft",
            dict(type=float, default=defaults.ttft, help="Time to first token"),
        ),
        (
            "--itl",
            dict(type=float, default=defaults.itl, help="Inter-token latency"),
        ),
        (
            "--load-predictor",
            dict(default=defaults.load_predictor, help="Load predictor type"),
        ),
        (
            "--load-prediction-window-size",
            dict(
                type=int,
                default=defaults.load_prediction_window_size,
                help="Load prediction window size",
            ),
        ),
        (
            "--prometheus-port",
            dict(
                type=int,
                default=defaults.prometheus_port,
                help="Prometheus port",
            ),
        ),
        (
            "--no-correction",
            dict(
                action="store_true",
                default=defaults.no_correction,
                help="Disable correction factor",
            ),
        ),
    ]

    planner_parser = argparse.ArgumentParser(description="SLA Planner")
    for flag, settings in option_specs:
        planner_parser.add_argument(flag, **settings)
    return planner_parser
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment