Unverified Commit 7a384793 authored by Yan Ru Pei's avatar Yan Ru Pei Committed by GitHub
Browse files

chore: make loading planner profile data one-stage for mockers (#4590)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 9c5018e5
...@@ -47,70 +47,22 @@ python -m dynamo.frontend --http-port 8000 ...@@ -47,70 +47,22 @@ python -m dynamo.frontend --http-port 8000
## Performance modeling with planner profile data ## Performance modeling with planner profile data
By default, the mocker uses hardcoded polynomial formulas to estimate prefill and decode timing. For more realistic simulations, you can load performance data from actual profiling results. By default, the mocker uses hardcoded polynomial formulas to estimate prefill and decode timing. For more realistic simulations, you can load performance data from actual profiling results using `--planner-profile-data`:
### Using profiled performance data
Add the `--planner-profile-data` flag to load an NPZ file containing interpolation grids from the planner profiler:
```bash ```bash
python -m dynamo.mocker \
--model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
--planner-profile-data /path/to/profiling_results/perf_data.npz \
--speedup-ratio 1.0
```
The NPZ file should contain the following arrays:
- `prefill_isl`: 1D array of input sequence lengths
- `prefill_ttft_ms`: 1D array of time-to-first-token values (ms)
- `decode_active_kv_tokens`: 1D array of active KV token counts
- `decode_context_length`: 1D array of context lengths
- `decode_itl`: 2D array of inter-token latencies (ms)
### Generating performance data from profiler results
#### Option 1: Use existing pre-swept results
The repository includes pre-swept profiling results for common models and hardware configurations. For example, to use Llama-3.1-8B-Instruct-FP8 on H200 SXM:
```bash
# Convert existing pre-swept results to mocker-compatible NPZ format
python components/src/dynamo/mocker/utils/planner_profiler_perf_data_converter.py \
--profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D \
--output_dir mocker_perf_data \
--resolution 100
# Use the generated NPZ with mocker
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path nvidia/Llama-3.1-8B-Instruct-FP8 \ --model-path nvidia/Llama-3.1-8B-Instruct-FP8 \
--planner-profile-data mocker_perf_data/perf_data.npz --planner-profile-data tests/planner/profiling_results/H200_TP1P_TP1D \
--speedup-ratio 1.0
``` ```
#### Option 2: Generate from custom profiler runs The profile results directory should contain `selected_prefill_interpolation/` and `selected_decode_interpolation/` subdirectories with `raw_data.npz` files. This works seamlessly in Kubernetes where profile data is mounted via ConfigMap or PersistentVolume.
To convert your own profiler results into the NPZ format suitable for the mocker, you'll need to run the profiler (see [SLA-driven profiling documentation](../../../../docs/benchmarks/sla_driven_profiling.md) for details). Note that this is generally run in a Kubernetes environment. To generate profiling data for your own model/hardware configuration, run the profiler (see [SLA-driven profiling documentation](../../../../docs/benchmarks/sla_driven_profiling.md) for details):
```bash ```bash
# Run the profiler
python benchmarks/profiler/profile_sla.py \ python benchmarks/profiler/profile_sla.py \
--profile-config your_profile_config.yaml --profile-config your_profile_config.yaml
# Convert profiler results to mocker-compatible NPZ format
python components/src/dynamo/mocker/utils/planner_profiler_perf_data_converter.py \
--profile_results_dir profiling_results/selected_prefill_interpolation \
--output_dir profiling_results \
--resolution 100
# This creates profiling_results/perf_data.npz
``` ```
The converter script combines prefill and decode interpolation data into a single NPZ file with the appropriate array structure. Then use the resulting profile results directory directly with `--planner-profile-data`.
\ No newline at end of file
### How it works
When you provide `--planner-profile-data`:
1. The mocker loads the NPZ file during initialization
2. Prefill timing uses 1D linear interpolation on the ISL grid
3. Decode timing uses 2D bilinear interpolation on (active_kv_tokens, context_length)
Without `--planner-profile-data`, the mocker falls back to the default polynomial formulas for backward compatibility.
\ No newline at end of file
...@@ -9,6 +9,11 @@ import tempfile ...@@ -9,6 +9,11 @@ import tempfile
from pathlib import Path from pathlib import Path
from . import __version__ from . import __version__
from .utils.planner_profiler_perf_data_converter import (
convert_profile_results_to_npz,
is_mocker_format_npz,
is_profile_results_dir,
)
DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo") DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo")
DEFAULT_ENDPOINT = f"dyn://{DYN_NAMESPACE}.backend.generate" DEFAULT_ENDPOINT = f"dyn://{DYN_NAMESPACE}.backend.generate"
...@@ -17,6 +22,71 @@ DEFAULT_PREFILL_ENDPOINT = f"dyn://{DYN_NAMESPACE}.prefill.generate" ...@@ -17,6 +22,71 @@ DEFAULT_PREFILL_ENDPOINT = f"dyn://{DYN_NAMESPACE}.prefill.generate"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class ProfileDataResult:
"""Result of processing --planner-profile-data argument. Cleans up tmpdir on deletion."""
def __init__(
self, npz_path: Path | None, tmpdir: tempfile.TemporaryDirectory | None
):
self.npz_path = npz_path
self._tmpdir = tmpdir
def __del__(self):
if self._tmpdir is not None:
try:
self._tmpdir.cleanup()
logger.debug("Cleaned up profile data temporary directory")
except Exception:
pass # Best effort cleanup
def resolve_planner_profile_data(
    planner_profile_data: Path | None,
) -> ProfileDataResult:
    """
    Resolve --planner-profile-data to an NPZ file path.

    Handles backward compatibility by accepting either:
    1. A mocker-format NPZ file (returned as-is)
    2. A profiler-style results directory (converted to mocker-format NPZ
       inside a freshly created temporary directory)

    Args:
        planner_profile_data: Path from --planner-profile-data argument, or
            None when the flag was not given.

    Returns:
        ProfileDataResult with npz_path and optional tmpdir for cleanup. When
        the input is None, both fields are None.

    Raises:
        FileNotFoundError: If path doesn't contain valid profile data in any
            supported format.
        Exception: Any error raised by the conversion is propagated unchanged
            after the temporary directory has been removed.
    """
    if planner_profile_data is None:
        return ProfileDataResult(npz_path=None, tmpdir=None)

    # Case 1: Already a mocker-format NPZ file
    if is_mocker_format_npz(planner_profile_data):
        logger.info(f"Using mocker-format NPZ file: {planner_profile_data}")
        return ProfileDataResult(npz_path=planner_profile_data, tmpdir=None)

    # Case 2: Profiler-style results directory - needs conversion
    if is_profile_results_dir(planner_profile_data):
        logger.info(
            f"Detected profiler-style results directory at {planner_profile_data}, converting to NPZ..."
        )
        tmpdir = tempfile.TemporaryDirectory(prefix="mocker_perf_data_")
        try:
            npz_path = Path(tmpdir.name) / "perf_data.npz"
            convert_profile_results_to_npz(planner_profile_data, npz_path)
        except BaseException:
            # Nobody owns the directory yet, so release it here instead of
            # leaving it to garbage collection, then let the error surface.
            tmpdir.cleanup()
            raise
        return ProfileDataResult(npz_path=npz_path, tmpdir=tmpdir)

    # Case 3: Invalid path - neither mocker-format NPZ nor profiler-style directory
    raise FileNotFoundError(
        f"Path '{planner_profile_data}' is neither a mocker-format NPZ file nor a valid profiler results directory.\n"
        f"Expected either:\n"
        f"  - A .npz file with keys: prefill_isl, prefill_ttft_ms, decode_active_kv_tokens, decode_context_length, decode_itl\n"
        f"  - A directory containing selected_prefill_interpolation/raw_data.npz and selected_decode_interpolation/raw_data.npz\n"
        f"  - A directory containing prefill_raw_data.json and decode_raw_data.json"
    )
def create_temp_engine_args_file(args) -> Path: def create_temp_engine_args_file(args) -> Path:
""" """
Create a temporary JSON file with MockEngineArgs from CLI arguments. Create a temporary JSON file with MockEngineArgs from CLI arguments.
...@@ -182,7 +252,8 @@ def parse_args(): ...@@ -182,7 +252,8 @@ def parse_args():
"--planner-profile-data", "--planner-profile-data",
type=Path, type=Path,
default=None, default=None,
help="Path to JSON configmap or NPZ file containing performance profiling data from planner_profiler_perf_data_converter.py (default: None, uses hardcoded polynomials)", help="Path to profile results directory containing selected_prefill_interpolation/ and "
"selected_decode_interpolation/ subdirectories (default: None, uses hardcoded polynomials)",
) )
parser.add_argument( parser.add_argument(
"--num-workers", "--num-workers",
......
...@@ -16,7 +16,7 @@ from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input ...@@ -16,7 +16,7 @@ from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from .args import create_temp_engine_args_file, parse_args from .args import create_temp_engine_args_file, parse_args, resolve_planner_profile_data
configure_dynamo_logging() configure_dynamo_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -30,6 +30,10 @@ async def worker(): ...@@ -30,6 +30,10 @@ async def worker():
""" """
args = parse_args() args = parse_args()
# Resolve planner-profile-data: convert profile results dir to NPZ if needed
profile_data_result = resolve_planner_profile_data(args.planner_profile_data)
args.planner_profile_data = profile_data_result.npz_path
# Handle extra_engine_args: either use provided file or create from CLI args # Handle extra_engine_args: either use provided file or create from CLI args
if args.extra_engine_args: if args.extra_engine_args:
# User provided explicit JSON file # User provided explicit JSON file
...@@ -54,6 +58,8 @@ async def worker(): ...@@ -54,6 +58,8 @@ async def worker():
except Exception as e: except Exception as e:
logger.warning(f"Failed to clean up temporary file: {e}") logger.warning(f"Failed to clean up temporary file: {e}")
del profile_data_result # Triggers tmpdir cleanup via __del__
async def launch_workers(args, extra_engine_args_path): async def launch_workers(args, extra_engine_args_path):
"""Launch mocker worker(s) with isolated DistributedRuntime instances. """Launch mocker worker(s) with isolated DistributedRuntime instances.
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
This script converts planner profiler's results for mocker to use. This module converts planner profiler's results for mocker to use.
Example prefill query: Example prefill query:
input: input:
...@@ -28,9 +28,7 @@ This ignores the fact that active tokens' up/down projection is usually combine ...@@ -28,9 +28,7 @@ This ignores the fact that active tokens' up/down projection is usually combine
and might lead to slightly higher latency. and might lead to slightly higher latency.
""" """
import argparse
import logging import logging
import os
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
...@@ -39,40 +37,39 @@ from dynamo.planner.utils.perf_interpolation import ( ...@@ -39,40 +37,39 @@ from dynamo.planner.utils.perf_interpolation import (
DecodeInterpolator, DecodeInterpolator,
PrefillInterpolator, PrefillInterpolator,
) )
from dynamo.runtime.logging import configure_dynamo_logging
configure_dynamo_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--profile_results_dir", type=str, required=True)
parser.add_argument("--resolution", type=int, default=100)
parser.add_argument("--output_dir", type=str, default="")
args = parser.parse_args()
# Convert to absolute paths to handle relative directories properly def convert_profile_results_to_npz(
args.profile_results_dir = str(Path(args.profile_results_dir).resolve()) profile_results_dir: str | Path,
output_path: str | Path,
resolution: int = 100,
) -> Path:
"""
Convert planner profiler results directory to mocker-compatible NPZ format.
if not args.output_dir: Args:
args.output_dir = args.profile_results_dir profile_results_dir: Path to directory containing selected_prefill_interpolation
else: and selected_decode_interpolation subdirectories with raw_data.npz files.
args.output_dir = str(Path(args.output_dir).resolve()) output_path: Full path where the output perf_data.npz will be written.
resolution: Resolution for the interpolation grid (default: 100).
# Create output directory if it doesn't exist Returns:
Path(args.output_dir).mkdir(parents=True, exist_ok=True) Path to the generated NPZ file.
"""
profile_results_dir = str(Path(profile_results_dir).resolve())
output_path = Path(output_path)
logger.info( logger.info(f"Converting profile results from {profile_results_dir}...")
f"Converting profile results from {args.profile_results_dir} to {args.output_dir}..."
)
# first convert prefill # Convert prefill data
prefill_interpolator = PrefillInterpolator(args.profile_results_dir) prefill_interpolator = PrefillInterpolator(profile_results_dir)
prefill_x = np.linspace( prefill_x = np.linspace(
prefill_interpolator.ttft_interpolator.x.min(), prefill_interpolator.ttft_interpolator.x.min(),
prefill_interpolator.ttft_interpolator.x.max(), prefill_interpolator.ttft_interpolator.x.max(),
args.resolution, resolution,
) )
prefill_y = prefill_interpolator.ttft_interpolator(prefill_x) prefill_y = prefill_interpolator.ttft_interpolator(prefill_x)
...@@ -81,10 +78,8 @@ if __name__ == "__main__": ...@@ -81,10 +78,8 @@ if __name__ == "__main__":
"prefill_ttft_ms": prefill_y.tolist(), "prefill_ttft_ms": prefill_y.tolist(),
} }
# then convert decode # Convert decode data
decode_interpolator = DecodeInterpolator( decode_interpolator = DecodeInterpolator(profile_results_dir, resolution=resolution)
args.profile_results_dir, resolution=args.resolution
)
decode_active_kv_tokens = decode_interpolator.xi * decode_interpolator.max_kv_tokens decode_active_kv_tokens = decode_interpolator.xi * decode_interpolator.max_kv_tokens
decode_context_length = decode_interpolator.yi decode_context_length = decode_interpolator.yi
...@@ -94,6 +89,94 @@ if __name__ == "__main__": ...@@ -94,6 +89,94 @@ if __name__ == "__main__":
result["decode_context_length"] = decode_context_length.tolist() result["decode_context_length"] = decode_context_length.tolist()
result["decode_itl"] = decode_itl.tolist() result["decode_itl"] = decode_itl.tolist()
np.savez(os.path.join(args.output_dir, "perf_data.npz"), **result) np.savez(output_path, **result)
logger.info(f"Wrote perf data to {output_path}")
return output_path
def is_profile_results_dir(path: Path) -> bool:
    """
    Tell whether ``path`` looks like a profiler-style results directory.

    The directory qualifies when it holds prefill data AND decode data, where
    each of the two may be present in either of these forms:
    - prefill: selected_prefill_interpolation/raw_data.npz or prefill_raw_data.json
    - decode: selected_decode_interpolation/raw_data.npz or decode_raw_data.json

    Args:
        path: Filesystem location to inspect.

    Returns:
        True when ``path`` is a directory containing both kinds of data,
        False otherwise (including when ``path`` is not a directory).
    """
    if not path.is_dir():
        return False

    prefill_candidates = (
        path / "selected_prefill_interpolation" / "raw_data.npz",
        path / "prefill_raw_data.json",
    )
    decode_candidates = (
        path / "selected_decode_interpolation" / "raw_data.npz",
        path / "decode_raw_data.json",
    )

    # Both halves are required; only existence is checked, not content.
    if not any(candidate.exists() for candidate in prefill_candidates):
        return False
    return any(candidate.exists() for candidate in decode_candidates)
def is_mocker_format_npz(path: Path) -> bool:
    """
    Tell whether ``path`` is an NPZ file in the mocker's expected layout.

    Such a file carries at least these arrays:
    - prefill_isl, prefill_ttft_ms
    - decode_active_kv_tokens, decode_context_length, decode_itl

    Args:
        path: Filesystem location to inspect.

    Returns:
        True when ``path`` is an existing ``.npz`` file that loads and exposes
        every required key; False in all other cases, including any failure
        while opening or reading the file.
    """
    if not (path.is_file() and path.suffix == ".npz"):
        return False

    expected = frozenset(
        (
            "prefill_isl",
            "prefill_ttft_ms",
            "decode_active_kv_tokens",
            "decode_context_length",
            "decode_itl",
        )
    )
    try:
        with np.load(path) as archive:
            present = set(archive.keys())
    except Exception:
        # Anything that prevents reading the archive means "not usable".
        return False
    return expected <= present
# Command-line entry point: convert a profiler results directory into
# <output_dir>/perf_data.npz using convert_profile_results_to_npz().
if __name__ == "__main__":
    # Imported inside the script branch rather than at module top level;
    # presumably so that importing this module as a library does not
    # reconfigure logging - confirm.
    import argparse

    from dynamo.runtime.logging import configure_dynamo_logging

    configure_dynamo_logging()

    parser = argparse.ArgumentParser()
    parser.add_argument("--profile_results_dir", type=str, required=True)
    parser.add_argument("--resolution", type=int, default=100)
    parser.add_argument("--output_dir", type=str, default="")
    args = parser.parse_args()

    # With no --output_dir the NPZ is written into the profile results
    # directory itself.
    if not args.output_dir:
        output_dir = Path(args.profile_results_dir).resolve()
    else:
        output_dir = Path(args.output_dir).resolve()

    # NOTE(review): nesting of the mkdir call could not be confirmed; it is
    # assumed to apply to both branches (a no-op when the directory exists).
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "perf_data.npz"

    convert_profile_results_to_npz(
        args.profile_results_dir, output_path, args.resolution
    )
...@@ -65,11 +65,10 @@ class PrefillInterpolator: ...@@ -65,11 +65,10 @@ class PrefillInterpolator:
self.prefill_ttft = np.array(data["prefill_ttft"]) # type: ignore[index] self.prefill_ttft = np.array(data["prefill_ttft"]) # type: ignore[index]
self.prefill_thpt_per_gpu = np.array(data["prefill_thpt_per_gpu"]) # type: ignore[index] self.prefill_thpt_per_gpu = np.array(data["prefill_thpt_per_gpu"]) # type: ignore[index]
except FileNotFoundError: except FileNotFoundError:
logger.error( raise FileNotFoundError(
f"Prefill interpolation files not found: {prefill_npz_fn} and {json_fn}\n" f"Prefill interpolation files not found: {prefill_npz_fn} and {json_fn}\n"
f"{MISSING_PROFILING_DATA_ERROR_MESSAGE}" f"{MISSING_PROFILING_DATA_ERROR_MESSAGE}"
) )
exit(1)
elif raw_data: elif raw_data:
self.prefill_isl = raw_data["prefill_isl"] self.prefill_isl = raw_data["prefill_isl"]
...@@ -133,11 +132,10 @@ class DecodeInterpolator: ...@@ -133,11 +132,10 @@ class DecodeInterpolator:
self.z_thpt_per_gpu = np.array(data["z_thpt_per_gpu"]) # type: ignore[index] self.z_thpt_per_gpu = np.array(data["z_thpt_per_gpu"]) # type: ignore[index]
self.max_kv_tokens = int(data["max_kv_tokens"]) # type: ignore[index] self.max_kv_tokens = int(data["max_kv_tokens"]) # type: ignore[index]
except FileNotFoundError: except FileNotFoundError:
logger.error( raise FileNotFoundError(
f"Decode interpolation files not found: {decode_npz_fn} and {json_fn}\n" f"Decode interpolation files not found: {decode_npz_fn} and {json_fn}\n"
f"{MISSING_PROFILING_DATA_ERROR_MESSAGE}" f"{MISSING_PROFILING_DATA_ERROR_MESSAGE}"
) )
exit(1)
elif raw_data: elif raw_data:
self.x_kv_usage = raw_data["x_kv_usage"] self.x_kv_usage = raw_data["x_kv_usage"]
self.y_context_length = raw_data["y_context_length"] self.y_context_length = raw_data["y_context_length"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment