Unverified Commit 4ea21079 authored by Yan Ru Pei's avatar Yan Ru Pei Committed by GitHub
Browse files

feat(replay): add agg/disagg offline replay optimization [DYN-2566] (#7774)


Signed-off-by: default avatarPeaBrane <yanrpei@gmail.com>
parent b55277c9
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from . import aic, bench, engine_args, evaluate, scoring, search
from .aic import _enumerate_dense_tp_candidates, _load_aiconfigurator_modules
from .bench import compare_agg_and_disagg_with_replay, compare_aic_and_replay_disagg
from .engine_args import (
_build_agg_candidate_engine_args,
_build_candidate_engine_args,
_build_router_config,
)
from .models import (
DenseAggReplayState,
DenseReplayOptimizationResult,
DenseReplayState,
SyntheticReplayWorkload,
TraceReplayWorkload,
)
from .scoring import _pick_best_record
from .search import (
_iter_agg_tp_states_with_max_workers,
_iter_agg_worker_states,
_iter_budget_edge_worker_states,
_iter_tp_states_with_equal_workers,
optimize_dense_agg_with_replay,
optimize_dense_disagg_with_replay,
)
__all__ = [
"_build_agg_candidate_engine_args",
"_build_candidate_engine_args",
"_build_router_config",
"_enumerate_dense_tp_candidates",
"_iter_agg_tp_states_with_max_workers",
"_iter_agg_worker_states",
"_iter_budget_edge_worker_states",
"_iter_tp_states_with_equal_workers",
"_load_aiconfigurator_modules",
"_pick_best_record",
"compare_agg_and_disagg_with_replay",
"compare_aic_and_replay_disagg",
"DenseAggReplayState",
"DenseReplayOptimizationResult",
"DenseReplayState",
"SyntheticReplayWorkload",
"TraceReplayWorkload",
"aic",
"bench",
"engine_args",
"evaluate",
"optimize_dense_agg_with_replay",
"optimize_dense_disagg_with_replay",
"scoring",
"search",
]
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import importlib
from typing import Any
def _load_aiconfigurator_modules() -> tuple[Any, Any, Any]:
try:
common = importlib.import_module("aiconfigurator.sdk.common")
task = importlib.import_module("aiconfigurator.sdk.task")
utils = importlib.import_module("aiconfigurator.sdk.utils")
except ModuleNotFoundError as exc:
raise RuntimeError(
"aiconfigurator is required to enumerate dense TP candidates for replay optimization"
) from exc
return common, task, utils
def _enumerate_dense_tp_candidates(
backend: str, system: str
) -> tuple[list[int], list[int]]:
common, task, utils = _load_aiconfigurator_modules()
backend_enum = getattr(common.BackendName, backend)
prefill_cfg, decode_cfg = task.build_disagg_parallel_lists(
backend_name=backend,
prefill_system=system,
decode_system=system,
is_moe=False,
should_enable_pp=False,
)
prefill_parallel = utils.enumerate_parallel_config(
num_gpu_list=prefill_cfg["num_gpu_per_worker"],
tp_list=prefill_cfg["tp_list"],
pp_list=prefill_cfg["pp_list"],
dp_list=prefill_cfg["dp_list"],
moe_tp_list=prefill_cfg["moe_tp_list"],
moe_ep_list=prefill_cfg["moe_ep_list"],
is_moe=False,
backend=backend_enum,
)
decode_parallel = utils.enumerate_parallel_config(
num_gpu_list=decode_cfg["num_gpu_per_worker"],
tp_list=decode_cfg["tp_list"],
pp_list=decode_cfg["pp_list"],
dp_list=decode_cfg["dp_list"],
moe_tp_list=decode_cfg["moe_tp_list"],
moe_ep_list=decode_cfg["moe_ep_list"],
is_moe=False,
backend=backend_enum,
)
def extract_tp(parallel_configs: list[list[int]]) -> list[int]:
return sorted(
{
tp
for tp, pp, dp, moe_tp, moe_ep in parallel_configs
if pp == 1 and dp == 1 and moe_tp == 1 and moe_ep == 1
}
)
return extract_tp(prefill_parallel), extract_tp(decode_parallel)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from collections.abc import Mapping
from typing import Any
import pandas as pd
from aiconfigurator.sdk.task import TaskConfig, TaskRunner
from dynamo.llm import MockEngineArgs
from .models import SyntheticReplayWorkload, TraceReplayWorkload
from .scoring import _pick_best_record
from .search import optimize_dense_agg_with_replay, optimize_dense_disagg_with_replay
def compare_aic_and_replay_disagg(
*,
model: str,
backend: str,
system: str,
isl: int,
osl: int,
max_total_gpus: int,
replay_request_count: int,
replay_concurrency: int,
base_prefill_engine_args: MockEngineArgs,
base_decode_engine_args: MockEngineArgs,
constraints: Mapping[str, float] | None = None,
max_parallel_evals: int = 1,
) -> dict[str, Any]:
ttft_constraint = None if constraints is None else constraints.get("mean_ttft_ms")
tpot_constraint = None if constraints is None else constraints.get("mean_tpot_ms")
request_latency_constraint = (
None if constraints is None else constraints.get("mean_e2e_latency_ms")
)
aic_task = TaskConfig(
serving_mode="disagg",
model_path=model,
system_name=system,
backend_name=backend,
total_gpus=max_total_gpus,
isl=isl,
osl=osl,
ttft=None if ttft_constraint is None else float(ttft_constraint),
tpot=None if tpot_constraint is None else float(tpot_constraint),
request_latency=(
None
if request_latency_constraint is None
else float(request_latency_constraint)
),
)
aic_result = TaskRunner().run(aic_task)
aic_df = aic_result.get("pareto_df", pd.DataFrame())
replay_result = optimize_dense_disagg_with_replay(
model=model,
backend=backend,
system=system,
workload=SyntheticReplayWorkload(
isl=isl,
osl=osl,
request_count=replay_request_count,
replay_concurrency=replay_concurrency,
),
base_prefill_engine_args=base_prefill_engine_args,
base_decode_engine_args=base_decode_engine_args,
max_total_gpus=max_total_gpus,
constraints=constraints,
router_mode="round_robin",
max_parallel_evals=max_parallel_evals,
)
aic_best = None
if not aic_df.empty:
row = aic_df.iloc[0]
aic_best = {
"prefill_tp": int(row.get("(p)tp", 0)),
"decode_tp": int(row.get("(d)tp", 0)),
"prefill_workers": int(row.get("(p)workers", 0)),
"decode_workers": int(row.get("(d)workers", 0)),
"total_gpus_used": int(row.get("num_total_gpus", 0)),
"ttft": float(row.get("ttft", 0.0)),
"tpot": float(row.get("tpot", 0.0)),
"request_latency": float(row.get("request_latency", 0.0)),
"tokens_per_s": float(row.get("tokens/s", 0.0)),
"tokens_per_s_per_gpu": float(row.get("tokens/s/gpu", 0.0)),
}
replay_best = None
if replay_result.best_feasible is not None:
replay_best_record = replay_result.best_feasible
replay_best = {
"prefill_tp": int(replay_best_record["prefill_tp"]),
"decode_tp": int(replay_best_record["decode_tp"]),
"prefill_workers": int(replay_best_record["prefill_workers"]),
"decode_workers": int(replay_best_record["decode_workers"]),
"total_gpus_used": int(replay_best_record["total_gpus_used"]),
"mean_ttft_ms": float(replay_best_record.get("mean_ttft_ms", 0.0)),
"mean_tpot_ms": float(replay_best_record.get("mean_tpot_ms", 0.0)),
"mean_e2e_latency_ms": float(
replay_best_record.get("mean_e2e_latency_ms", 0.0)
),
"output_throughput_tok_s": float(
replay_best_record.get("output_throughput_tok_s", 0.0)
),
"score": float(replay_best_record.get("score", 0.0)),
}
return {
"aic_pareto_df": aic_df,
"aic_best": aic_best,
"replay_result": replay_result,
"replay_best": replay_best,
}
def compare_agg_and_disagg_with_replay(
*,
model: str,
backend: str,
system: str,
workload: SyntheticReplayWorkload | TraceReplayWorkload,
base_engine_args: MockEngineArgs,
base_prefill_engine_args: MockEngineArgs,
base_decode_engine_args: MockEngineArgs,
max_total_gpus: int,
constraints: Mapping[str, float] | None = None,
router_mode: str = "kv_router",
overlap_score_weights: tuple[float, ...] | list[float] | None = None,
max_parallel_evals: int = 1,
) -> dict[str, Any]:
agg_result = optimize_dense_agg_with_replay(
model=model,
backend=backend,
system=system,
workload=workload,
base_engine_args=base_engine_args,
max_total_gpus=max_total_gpus,
constraints=constraints,
router_mode=router_mode,
overlap_score_weights=overlap_score_weights,
max_parallel_evals=max_parallel_evals,
)
disagg_result = optimize_dense_disagg_with_replay(
model=model,
backend=backend,
system=system,
workload=workload,
base_prefill_engine_args=base_prefill_engine_args,
base_decode_engine_args=base_decode_engine_args,
max_total_gpus=max_total_gpus,
constraints=constraints,
router_mode=router_mode,
overlap_score_weights=overlap_score_weights,
max_parallel_evals=max_parallel_evals,
)
agg_best = agg_result.best_feasible
disagg_best = disagg_result.best_feasible
if agg_best is None and disagg_best is None:
candidates = [
result.best_infeasible
for result in (agg_result, disagg_result)
if result.best_infeasible is not None
]
chosen_best = None if not candidates else _pick_best_record(candidates)
elif agg_best is None:
chosen_best = disagg_best
elif disagg_best is None:
chosen_best = agg_best
else:
chosen_best = _pick_best_record([agg_best, disagg_best])
chosen_mode = None
if chosen_best is not None:
chosen_mode = (
"agg" if "tp" in chosen_best and "workers" in chosen_best else "disagg"
)
return {
"agg_result": agg_result,
"disagg_result": disagg_result,
"chosen_mode": chosen_mode,
"chosen_best": chosen_best,
}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import os
AIC_BACKEND_VERSIONS = {
"vllm": "0.12.0",
"sglang": "0.5.6.post2",
}
DEFAULT_OVERLAP_SCORE_WEIGHTS = (0.0, 0.25, 0.5, 1.0, 2.0, 4.0)
DEFAULT_MAX_PARALLEL_EVALS = min(4, os.cpu_count() or 1)
DEFAULT_SEARCH_ROUNDS = 3
SUPPORTED_CONSTRAINTS = frozenset(
{
"mean_ttft_ms",
"p95_ttft_ms",
"mean_tpot_ms",
"p95_tpot_ms",
"mean_e2e_latency_ms",
"p95_e2e_latency_ms",
"max_total_gpus",
}
)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from typing import Literal
from dynamo.llm import KvRouterConfig, MockEngineArgs
from .constants import AIC_BACKEND_VERSIONS
def _build_candidate_engine_args(
*,
base_args: MockEngineArgs,
tp_size: int,
worker_type: Literal["prefill", "decode", "aggregated"],
backend: str,
system: str,
model: str,
) -> MockEngineArgs:
args = base_args.copy()
args.worker_type = worker_type
args.enable_prefix_caching = worker_type != "decode"
# Keep the base KV block capacity fixed across TP for now.
#
# TP does not have a simple, backend-agnostic relationship with
# effective KV capacity. In particular, MLA-style attention and other
# specialized cache layouts break the usual KV-head-sharding intuition.
# A future version should derive a TP-aware capacity estimate from the
# AIC SDK instead of applying a generic heuristic here.
args.num_gpu_blocks = base_args.num_gpu_blocks
args.aic_backend = backend
args.aic_system = system
args.aic_backend_version = AIC_BACKEND_VERSIONS[backend]
args.aic_tp_size = tp_size
args.aic_model_path = model
return args
def _build_agg_candidate_engine_args(
*,
base_args: MockEngineArgs,
tp_size: int,
backend: str,
system: str,
model: str,
) -> MockEngineArgs:
return _build_candidate_engine_args(
base_args=base_args,
tp_size=tp_size,
worker_type="aggregated",
backend=backend,
system=system,
model=model,
)
def _build_router_config(
base_router_config: KvRouterConfig | None,
overlap_score_weight: float,
) -> KvRouterConfig:
if base_router_config is None:
return KvRouterConfig(overlap_score_weight=overlap_score_weight)
router_config = base_router_config.copy()
router_config.overlap_score_weight = overlap_score_weight
return router_config
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Replay evaluation helpers for the budget-focused dense search heuristic.
The search in `search.py` assumes we prefer to consume the available GPU budget
and therefore ranks visited states by raw output throughput, subject to replay
constraints, rather than by throughput normalized per GPU.
"""
from __future__ import annotations
from collections.abc import Mapping, Sequence
from concurrent.futures import Executor
from dataclasses import asdict
from pathlib import Path
from typing import Any
from dynamo.llm import KvRouterConfig, MockEngineArgs
from dynamo.replay import run_synthetic_trace_replay, run_trace_replay
from .engine_args import (
_build_agg_candidate_engine_args,
_build_candidate_engine_args,
_build_router_config,
)
from .models import (
DenseAggReplayState,
DenseReplayState,
SyntheticReplayWorkload,
TraceReplayWorkload,
)
from .scoring import _violation_penalty
def _run_replay_for_state(
*,
state: DenseReplayState,
workload: SyntheticReplayWorkload | TraceReplayWorkload,
prefill_engine_args: MockEngineArgs,
decode_engine_args: MockEngineArgs,
router_config: KvRouterConfig | None,
) -> dict[str, Any]:
if isinstance(workload, SyntheticReplayWorkload):
return run_synthetic_trace_replay(
workload.isl,
workload.osl,
workload.request_count,
prefill_engine_args=prefill_engine_args,
decode_engine_args=decode_engine_args,
router_config=router_config,
num_prefill_workers=state.prefill_workers,
num_decode_workers=state.decode_workers,
replay_concurrency=workload.replay_concurrency,
replay_mode="offline",
router_mode=state.router_mode,
arrival_interval_ms=workload.arrival_interval_ms,
turns_per_session=workload.turns_per_session,
shared_prefix_ratio=workload.shared_prefix_ratio,
num_prefix_groups=workload.num_prefix_groups,
inter_turn_delay_ms=workload.inter_turn_delay_ms,
)
return run_trace_replay(
Path(workload.trace_file),
prefill_engine_args=prefill_engine_args,
decode_engine_args=decode_engine_args,
router_config=router_config,
num_prefill_workers=state.prefill_workers,
num_decode_workers=state.decode_workers,
replay_mode="offline",
router_mode=state.router_mode,
arrival_speedup_ratio=workload.arrival_speedup_ratio,
)
def _run_agg_replay_for_state(
*,
state: DenseAggReplayState,
workload: SyntheticReplayWorkload | TraceReplayWorkload,
engine_args: MockEngineArgs,
router_config: KvRouterConfig | None,
) -> dict[str, Any]:
if isinstance(workload, SyntheticReplayWorkload):
return run_synthetic_trace_replay(
workload.isl,
workload.osl,
workload.request_count,
extra_engine_args=engine_args,
router_config=router_config,
num_workers=state.workers,
replay_concurrency=workload.replay_concurrency,
replay_mode="offline",
router_mode=state.router_mode,
arrival_interval_ms=workload.arrival_interval_ms,
turns_per_session=workload.turns_per_session,
shared_prefix_ratio=workload.shared_prefix_ratio,
num_prefix_groups=workload.num_prefix_groups,
inter_turn_delay_ms=workload.inter_turn_delay_ms,
)
return run_trace_replay(
Path(workload.trace_file),
extra_engine_args=engine_args,
router_config=router_config,
num_workers=state.workers,
replay_mode="offline",
router_mode=state.router_mode,
arrival_speedup_ratio=workload.arrival_speedup_ratio,
)
def _evaluate_state(
*,
state: DenseReplayState,
workload: SyntheticReplayWorkload | TraceReplayWorkload,
base_prefill_engine_args: MockEngineArgs,
base_decode_engine_args: MockEngineArgs,
base_router_config: KvRouterConfig | None,
model: str,
backend: str,
system: str,
constraints: Mapping[str, float],
cache: dict[DenseReplayState, dict[str, Any]],
) -> dict[str, Any]:
cached = cache.get(state)
if cached is not None:
return cached
prefill_args = _build_candidate_engine_args(
base_args=base_prefill_engine_args,
tp_size=state.prefill_tp,
worker_type="prefill",
backend=backend,
system=system,
model=model,
)
decode_args = _build_candidate_engine_args(
base_args=base_decode_engine_args,
tp_size=state.decode_tp,
worker_type="decode",
backend=backend,
system=system,
model=model,
)
router_config = None
if state.router_mode == "kv_router":
router_config = _build_router_config(
base_router_config, state.overlap_score_weight
)
report = _run_replay_for_state(
state=state,
workload=workload,
prefill_engine_args=prefill_args,
decode_engine_args=decode_args,
router_config=router_config,
)
total_gpus_used = state.total_gpus_used
throughput = float(report["output_throughput_tok_s"])
score = throughput
penalty = _violation_penalty(report, constraints, total_gpus_used)
feasible = penalty == 0.0
record = {
**report,
**asdict(state),
"total_gpus_used": total_gpus_used,
"output_throughput_tok_s": throughput,
"score": score,
"feasible": feasible,
"violation_penalty": penalty,
}
cache[state] = record
return record
def _evaluate_agg_state(
*,
state: DenseAggReplayState,
workload: SyntheticReplayWorkload | TraceReplayWorkload,
base_engine_args: MockEngineArgs,
base_router_config: KvRouterConfig | None,
model: str,
backend: str,
system: str,
constraints: Mapping[str, float],
cache: dict[DenseAggReplayState, dict[str, Any]],
) -> dict[str, Any]:
cached = cache.get(state)
if cached is not None:
return cached
engine_args = _build_agg_candidate_engine_args(
base_args=base_engine_args,
tp_size=state.tp,
backend=backend,
system=system,
model=model,
)
router_config = None
if state.router_mode == "kv_router":
router_config = _build_router_config(
base_router_config, state.overlap_score_weight
)
report = _run_agg_replay_for_state(
state=state,
workload=workload,
engine_args=engine_args,
router_config=router_config,
)
total_gpus_used = state.total_gpus_used
throughput = float(report["output_throughput_tok_s"])
score = throughput
penalty = _violation_penalty(report, constraints, total_gpus_used)
feasible = penalty == 0.0
record = {
**report,
**asdict(state),
"total_gpus_used": total_gpus_used,
"output_throughput_tok_s": throughput,
"score": score,
"feasible": feasible,
"violation_penalty": penalty,
}
cache[state] = record
return record
def _evaluate_state_from_json_payloads(payload: Mapping[str, Any]) -> dict[str, Any]:
return _evaluate_state(
state=payload["state"],
workload=payload["workload"],
base_prefill_engine_args=MockEngineArgs.from_json(
payload["base_prefill_engine_args_json"]
),
base_decode_engine_args=MockEngineArgs.from_json(
payload["base_decode_engine_args_json"]
),
base_router_config=(
KvRouterConfig.from_json(payload["base_router_config_json"])
if payload["base_router_config_json"] is not None
else None
),
model=payload["model"],
backend=payload["backend"],
system=payload["system"],
constraints=payload["constraints"],
cache={},
)
def _evaluate_agg_state_from_json_payloads(
payload: Mapping[str, Any]
) -> dict[str, Any]:
return _evaluate_agg_state(
state=payload["state"],
workload=payload["workload"],
base_engine_args=MockEngineArgs.from_json(payload["base_engine_args_json"]),
base_router_config=(
KvRouterConfig.from_json(payload["base_router_config_json"])
if payload["base_router_config_json"] is not None
else None
),
model=payload["model"],
backend=payload["backend"],
system=payload["system"],
constraints=payload["constraints"],
cache={},
)
def _evaluate_states(
*,
states: Sequence[DenseReplayState],
workload: SyntheticReplayWorkload | TraceReplayWorkload,
base_prefill_engine_args: MockEngineArgs,
base_decode_engine_args: MockEngineArgs,
base_router_config: KvRouterConfig | None,
model: str,
backend: str,
system: str,
constraints: Mapping[str, float],
cache: dict[DenseReplayState, dict[str, Any]],
max_parallel_evals: int,
executor: Executor | None = None,
) -> list[dict[str, Any]]:
records: list[dict[str, Any] | None] = [None] * len(states)
uncached_indices: list[int] = []
uncached_states: list[DenseReplayState] = []
for index, state in enumerate(states):
cached = cache.get(state)
if cached is not None:
records[index] = cached
continue
uncached_indices.append(index)
uncached_states.append(state)
if not uncached_states:
return [record for record in records if record is not None]
if max_parallel_evals <= 1 or len(uncached_states) == 1 or executor is None:
for index, state in zip(uncached_indices, uncached_states, strict=True):
records[index] = _evaluate_state(
state=state,
workload=workload,
base_prefill_engine_args=base_prefill_engine_args,
base_decode_engine_args=base_decode_engine_args,
base_router_config=base_router_config,
model=model,
backend=backend,
system=system,
constraints=constraints,
cache=cache,
)
return [record for record in records if record is not None]
base_prefill_engine_args_json = base_prefill_engine_args.dump_json()
base_decode_engine_args_json = base_decode_engine_args.dump_json()
base_router_config_json = (
None if base_router_config is None else base_router_config.dump_json()
)
payloads = [
{
"state": state,
"workload": workload,
"base_prefill_engine_args_json": base_prefill_engine_args_json,
"base_decode_engine_args_json": base_decode_engine_args_json,
"base_router_config_json": base_router_config_json,
"model": model,
"backend": backend,
"system": system,
"constraints": constraints,
}
for state in uncached_states
]
future_records = list(executor.map(_evaluate_state_from_json_payloads, payloads))
for index, state, record in zip(
uncached_indices,
uncached_states,
future_records,
strict=True,
):
cache[state] = record
records[index] = record
return [record for record in records if record is not None]
def _evaluate_agg_states(
*,
states: Sequence[DenseAggReplayState],
workload: SyntheticReplayWorkload | TraceReplayWorkload,
base_engine_args: MockEngineArgs,
base_router_config: KvRouterConfig | None,
model: str,
backend: str,
system: str,
constraints: Mapping[str, float],
cache: dict[DenseAggReplayState, dict[str, Any]],
max_parallel_evals: int,
executor: Executor | None = None,
) -> list[dict[str, Any]]:
records: list[dict[str, Any] | None] = [None] * len(states)
uncached_indices: list[int] = []
uncached_states: list[DenseAggReplayState] = []
for index, state in enumerate(states):
cached = cache.get(state)
if cached is not None:
records[index] = cached
continue
uncached_indices.append(index)
uncached_states.append(state)
if not uncached_states:
return [record for record in records if record is not None]
if max_parallel_evals <= 1 or len(uncached_states) == 1 or executor is None:
for index, state in zip(uncached_indices, uncached_states, strict=True):
records[index] = _evaluate_agg_state(
state=state,
workload=workload,
base_engine_args=base_engine_args,
base_router_config=base_router_config,
model=model,
backend=backend,
system=system,
constraints=constraints,
cache=cache,
)
return [record for record in records if record is not None]
base_engine_args_json = base_engine_args.dump_json()
base_router_config_json = (
None if base_router_config is None else base_router_config.dump_json()
)
payloads = [
{
"state": state,
"workload": workload,
"base_engine_args_json": base_engine_args_json,
"base_router_config_json": base_router_config_json,
"model": model,
"backend": backend,
"system": system,
"constraints": constraints,
}
for state in uncached_states
]
future_records = list(
executor.map(_evaluate_agg_state_from_json_payloads, payloads)
)
for index, state, record in zip(
uncached_indices,
uncached_states,
future_records,
strict=True,
):
cache[state] = record
records[index] = record
return [record for record in records if record is not None]
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import os
from dataclasses import dataclass
from typing import Any
import pandas as pd
@dataclass(frozen=True)
class SyntheticReplayWorkload:
isl: int
osl: int
request_count: int
replay_concurrency: int
arrival_interval_ms: float = 0.0
turns_per_session: int = 1
shared_prefix_ratio: float = 0.0
num_prefix_groups: int = 0
inter_turn_delay_ms: float = 0.0
@dataclass(frozen=True)
class TraceReplayWorkload:
trace_file: str | os.PathLike[str]
arrival_speedup_ratio: float = 1.0
@dataclass(frozen=True)
class DenseReplayState:
prefill_tp: int
decode_tp: int
prefill_workers: int
decode_workers: int
overlap_score_weight: float
router_mode: str = "kv_router"
@property
def total_gpus_used(self) -> int:
return (
self.prefill_tp * self.prefill_workers
+ self.decode_tp * self.decode_workers
)
@dataclass(frozen=True)
class DenseAggReplayState:
tp: int
workers: int
router_mode: str
overlap_score_weight: float
@property
def total_gpus_used(self) -> int:
return self.tp * self.workers
@dataclass(frozen=True)
class DenseReplayOptimizationResult:
best_feasible: dict[str, Any] | None
best_infeasible: dict[str, Any] | None
evaluated_df: pd.DataFrame
feasible_df: pd.DataFrame
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import math
from collections.abc import Mapping, Sequence
from typing import Any
def _metric_value(report: Mapping[str, Any], key: str, total_gpus_used: int) -> float:
if key == "max_total_gpus":
return float(total_gpus_used)
value = report.get(key)
if value is None:
return math.inf
return float(value)
def _violation_penalty(
report: Mapping[str, Any],
constraints: Mapping[str, float],
total_gpus_used: int,
) -> float:
penalty = 0.0
for key, bound in constraints.items():
if bound <= 0:
continue
metric = _metric_value(report, key, total_gpus_used)
penalty += max(metric / bound - 1.0, 0.0)
return penalty
def _rank_record(record: Mapping[str, Any]) -> tuple[float, float, float]:
return (
float(record["score"]),
float(record["output_throughput_tok_s"]),
-float(record.get("mean_e2e_latency_ms", math.inf)),
)
def _pick_best_record(records: Sequence[dict[str, Any]]) -> dict[str, Any]:
feasible_records = [record for record in records if record["feasible"]]
if feasible_records:
return max(
feasible_records,
key=lambda record: (
*_rank_record(record),
-float(record["total_gpus_used"]),
),
)
return min(
records,
key=lambda record: (
float(record["violation_penalty"]),
-float(record["output_throughput_tok_s"]),
float(record.get("mean_e2e_latency_ms", math.inf)),
),
)
This diff is collapsed.
......@@ -7,7 +7,7 @@ use std::path::PathBuf;
use std::pin::Pin;
use std::sync::Arc;
use pyo3::{exceptions::PyException, prelude::*};
use pyo3::{exceptions::PyException, exceptions::PyValueError, prelude::*};
use pyo3_async_runtimes::TaskLocals;
use dynamo_kv_router::config::KvRouterConfig as RsKvRouterConfig;
......@@ -113,6 +113,45 @@ impl KvRouterConfig {
.map(|inner| KvRouterConfig { inner })
.map_err(|e| PyException::new_err(format!("Failed to parse KvRouterConfig JSON: {e}")))
}
fn dump_json(&self) -> PyResult<String> {
serde_json::to_string(&self.inner)
.map_err(|e| PyException::new_err(format!("Failed to serialize KvRouterConfig: {e}")))
}
fn copy(&self) -> Self {
self.clone()
}
#[getter]
fn overlap_score_weight(&self) -> f64 {
self.inner.overlap_score_weight
}
#[setter]
fn set_overlap_score_weight(&mut self, value: f64) -> PyResult<()> {
if value < 0.0 {
return Err(PyValueError::new_err(
"overlap_score_weight must be non-negative",
));
}
self.inner.overlap_score_weight = value;
Ok(())
}
#[pyo3(signature = (overlap_score_weight=None))]
fn with_overrides(&self, overlap_score_weight: Option<f64>) -> PyResult<Self> {
let mut inner = self.inner.clone();
if let Some(weight) = overlap_score_weight {
if weight < 0.0 {
return Err(PyValueError::new_err(
"overlap_score_weight must be non-negative",
));
}
inner.overlap_score_weight = weight;
}
Ok(Self { inner })
}
}
#[pyclass]
......
......@@ -282,12 +282,15 @@ impl MockEngineArgs {
"preemption_mode": preemption_mode,
"router_queue_policy": router_queue_policy,
"sglang": self.inner.sglang,
"has_perf_model": true,
});
serde_json::to_string_pretty(&payload)
.map_err(|e| PyException::new_err(format!("Failed to serialize MockEngineArgs: {e}")))
}
fn copy(&self) -> Self {
self.clone()
}
#[getter]
fn block_size(&self) -> usize {
self.inner.block_size
......@@ -308,6 +311,16 @@ impl MockEngineArgs {
self.inner.max_num_batched_tokens
}
#[getter]
fn enable_prefix_caching(&self) -> bool {
self.inner.enable_prefix_caching
}
#[setter]
fn set_enable_prefix_caching(&mut self, value: bool) {
self.inner.enable_prefix_caching = value;
}
#[getter]
fn enable_local_indexer(&self) -> bool {
self.inner.enable_local_indexer
......@@ -323,6 +336,76 @@ impl MockEngineArgs {
self.inner.bootstrap_port
}
#[getter]
fn aic_backend(&self) -> Option<String> {
self.inner.aic_backend.clone()
}
#[setter]
fn set_aic_backend(&mut self, value: Option<String>) {
self.inner.aic_backend = value;
}
#[getter]
fn aic_system(&self) -> Option<String> {
self.inner.aic_system.clone()
}
#[setter]
fn set_aic_system(&mut self, value: Option<String>) {
self.inner.aic_system = value;
}
#[getter]
fn aic_backend_version(&self) -> Option<String> {
self.inner.aic_backend_version.clone()
}
#[setter]
fn set_aic_backend_version(&mut self, value: Option<String>) {
self.inner.aic_backend_version = value;
}
#[getter]
fn aic_tp_size(&self) -> Option<usize> {
self.inner.aic_tp_size
}
#[setter]
fn set_aic_tp_size(&mut self, value: Option<usize>) {
self.inner.aic_tp_size = value;
}
#[getter]
fn aic_model_path(&self) -> Option<String> {
self.inner.aic_model_path.clone()
}
#[setter]
fn set_aic_model_path(&mut self, value: Option<String>) {
self.inner.aic_model_path = value;
}
#[getter]
fn worker_type(&self) -> &'static str {
match self.inner.worker_type {
RsWorkerType::Aggregated => "aggregated",
RsWorkerType::Prefill => "prefill",
RsWorkerType::Decode => "decode",
}
}
#[setter]
fn set_worker_type(&mut self, value: &str) -> PyResult<()> {
self.inner.worker_type = parse_worker_type(value)?;
Ok(())
}
#[setter]
fn set_num_gpu_blocks(&mut self, value: usize) {
self.inner.num_gpu_blocks = value;
}
fn is_prefill(&self) -> bool {
self.inner.is_prefill()
}
......@@ -331,13 +414,22 @@ impl MockEngineArgs {
self.inner.is_decode()
}
#[pyo3(signature = (bootstrap_port=None, zmq_kv_events_port=None, zmq_replay_port=None, kv_bytes_per_token=None))]
#[allow(clippy::too_many_arguments)]
#[pyo3(signature = (bootstrap_port=None, zmq_kv_events_port=None, zmq_replay_port=None, kv_bytes_per_token=None, num_gpu_blocks=None, aic_backend=None, aic_system=None, aic_backend_version=None, aic_tp_size=None, aic_model_path=None, enable_prefix_caching=None, worker_type=None))]
fn with_overrides(
&self,
bootstrap_port: Option<u16>,
zmq_kv_events_port: Option<u16>,
zmq_replay_port: Option<u16>,
kv_bytes_per_token: Option<usize>,
num_gpu_blocks: Option<usize>,
aic_backend: Option<String>,
aic_system: Option<String>,
aic_backend_version: Option<String>,
aic_tp_size: Option<usize>,
aic_model_path: Option<String>,
enable_prefix_caching: Option<bool>,
worker_type: Option<String>,
) -> PyResult<Self> {
let mut inner = self.inner.clone();
if let Some(port) = bootstrap_port {
......@@ -352,6 +444,30 @@ impl MockEngineArgs {
if let Some(bytes_per_token) = kv_bytes_per_token {
inner.kv_bytes_per_token = Some(bytes_per_token);
}
if let Some(blocks) = num_gpu_blocks {
inner.num_gpu_blocks = blocks;
}
if let Some(backend) = aic_backend {
inner.aic_backend = Some(backend);
}
if let Some(system) = aic_system {
inner.aic_system = Some(system);
}
if let Some(version) = aic_backend_version {
inner.aic_backend_version = Some(version);
}
if let Some(tp_size) = aic_tp_size {
inner.aic_tp_size = Some(tp_size);
}
if let Some(model_path) = aic_model_path {
inner.aic_model_path = Some(model_path);
}
if let Some(enable_prefix_caching) = enable_prefix_caching {
inner.enable_prefix_caching = enable_prefix_caching;
}
if let Some(worker_type) = worker_type {
inner.worker_type = parse_worker_type(&worker_type)?;
}
inner.normalized().map(|inner| Self { inner }).map_err(|e| {
PyException::new_err(format!("Failed to normalize MockEngineArgs overrides: {e}"))
})
......
......@@ -1221,6 +1221,21 @@ class KvRouterConfig:
def from_json(config_json: str) -> "KvRouterConfig":
...
def dump_json(self) -> str: ...
def copy(self) -> "KvRouterConfig": ...
@property
def overlap_score_weight(self) -> float: ...
@overlap_score_weight.setter
def overlap_score_weight(self, value: float) -> None: ...
def with_overrides(
self,
overlap_score_weight: Optional[float] = None,
) -> "KvRouterConfig": ...
class ReasoningConfig:
def __init__(
self,
......@@ -1280,6 +1295,8 @@ class MockEngineArgs:
def from_json(config_json: str) -> "MockEngineArgs":
...
def copy(self) -> "MockEngineArgs": ...
def dump_json(self) -> str: ...
@property
......@@ -1288,12 +1305,21 @@ class MockEngineArgs:
@property
def num_gpu_blocks(self) -> int: ...
@num_gpu_blocks.setter
def num_gpu_blocks(self, value: int) -> None: ...
@property
def max_num_seqs(self) -> Optional[int]: ...
@property
def max_num_batched_tokens(self) -> Optional[int]: ...
@property
def enable_prefix_caching(self) -> bool: ...
@enable_prefix_caching.setter
def enable_prefix_caching(self, value: bool) -> None: ...
@property
def enable_local_indexer(self) -> bool: ...
......@@ -1303,6 +1329,42 @@ class MockEngineArgs:
@property
def bootstrap_port(self) -> Optional[int]: ...
@property
def aic_backend(self) -> Optional[str]: ...
@aic_backend.setter
def aic_backend(self, value: Optional[str]) -> None: ...
@property
def aic_system(self) -> Optional[str]: ...
@aic_system.setter
def aic_system(self, value: Optional[str]) -> None: ...
@property
def aic_backend_version(self) -> Optional[str]: ...
@aic_backend_version.setter
def aic_backend_version(self, value: Optional[str]) -> None: ...
@property
def aic_tp_size(self) -> Optional[int]: ...
@aic_tp_size.setter
def aic_tp_size(self, value: Optional[int]) -> None: ...
@property
def aic_model_path(self) -> Optional[str]: ...
@aic_model_path.setter
def aic_model_path(self, value: Optional[str]) -> None: ...
@property
def worker_type(self) -> str: ...
@worker_type.setter
def worker_type(self, value: str) -> None: ...
def is_prefill(self) -> bool: ...
def is_decode(self) -> bool: ...
......@@ -1313,6 +1375,14 @@ class MockEngineArgs:
zmq_kv_events_port: Optional[int] = None,
zmq_replay_port: Optional[int] = None,
kv_bytes_per_token: Optional[int] = None,
num_gpu_blocks: Optional[int] = None,
aic_backend: Optional[str] = None,
aic_system: Optional[str] = None,
aic_backend_version: Optional[str] = None,
aic_tp_size: Optional[int] = None,
aic_model_path: Optional[str] = None,
enable_prefix_caching: Optional[bool] = None,
worker_type: Optional[str] = None,
) -> "MockEngineArgs": ...
async def register_model(
......
......@@ -484,6 +484,7 @@ impl MockEngineArgs {
"decode_speedup_ratio",
"dp_size",
"startup_time",
"worker_type",
"is_prefill",
"is_decode",
"planner_profile_data",
......@@ -502,6 +503,7 @@ impl MockEngineArgs {
"preemption_mode",
"router_queue_policy",
"sglang",
"has_perf_model",
]
.iter()
.cloned()
......@@ -551,16 +553,20 @@ impl MockEngineArgs {
builder = builder.block_size(num as usize);
}
if let Some(value) = extra_args.get("max_num_seqs")
&& let Some(num) = value.as_u64()
{
builder = builder.max_num_seqs(Some(num as usize));
if let Some(value) = extra_args.get("max_num_seqs") {
if value.is_null() {
builder = builder.max_num_seqs(None);
} else if let Some(num) = value.as_u64() {
builder = builder.max_num_seqs(Some(num as usize));
}
}
if let Some(value) = extra_args.get("max_num_batched_tokens")
&& let Some(num) = value.as_u64()
{
builder = builder.max_num_batched_tokens(Some(num as usize));
if let Some(value) = extra_args.get("max_num_batched_tokens") {
if value.is_null() {
builder = builder.max_num_batched_tokens(None);
} else if let Some(num) = value.as_u64() {
builder = builder.max_num_batched_tokens(Some(num as usize));
}
}
if let Some(value) = extra_args.get("enable_prefix_caching")
......@@ -623,7 +629,9 @@ impl MockEngineArgs {
builder = builder.kv_transfer_bandwidth(Some(num));
}
if let Some(value) = extra_args.get("reasoning") {
if let Some(value) = extra_args.get("reasoning")
&& !value.is_null()
{
let cfg: ReasoningConfig = serde_json::from_value(value.clone())
.map_err(|e| anyhow::anyhow!("Failed to parse reasoning config: {}", e))?;
builder = builder.reasoning(Some(cfg));
......@@ -664,31 +672,51 @@ impl MockEngineArgs {
builder = builder.router_queue_policy(Some(policy));
}
if let Some(value) = extra_args.get("sglang") {
if let Some(value) = extra_args.get("sglang")
&& !value.is_null()
{
let cfg: SglangArgs = serde_json::from_value(value.clone())
.map_err(|e| anyhow::anyhow!("Failed to parse sglang config: {}", e))?;
builder = builder.sglang(Some(cfg));
}
// Parse worker type from is_prefill and is_decode flags
let is_prefill = extra_args
.get("is_prefill")
.and_then(|v| v.as_bool())
.unwrap_or(false);
let is_decode = extra_args
.get("is_decode")
.and_then(|v| v.as_bool())
.unwrap_or(false);
// Determine worker type based on flags
let worker_type = match (is_prefill, is_decode) {
(false, false) => WorkerType::Aggregated,
(true, false) => WorkerType::Prefill,
(false, true) => WorkerType::Decode,
(true, true) => panic!(
"Invalid worker configuration: is_prefill and is_decode cannot both be true. \
Worker must be either Aggregated (both false), Prefill (is_prefill=true), or Decode (is_decode=true)."
),
let worker_type = if let Some(value) = extra_args.get("worker_type") {
match value.as_str() {
Some("aggregated") => WorkerType::Aggregated,
Some("prefill") => WorkerType::Prefill,
Some("decode") => WorkerType::Decode,
Some(other) => {
return Err(anyhow::anyhow!(
"Invalid worker_type '{}'. Must be 'aggregated', 'prefill', or 'decode'.",
other
));
}
None => {
return Err(anyhow::anyhow!(
"Invalid worker_type: expected string value."
));
}
}
} else {
let is_prefill = extra_args
.get("is_prefill")
.and_then(|v| v.as_bool())
.unwrap_or(false);
let is_decode = extra_args
.get("is_decode")
.and_then(|v| v.as_bool())
.unwrap_or(false);
match (is_prefill, is_decode) {
(false, false) => WorkerType::Aggregated,
(true, false) => WorkerType::Prefill,
(false, true) => WorkerType::Decode,
(true, true) => {
return Err(anyhow::anyhow!(
"Invalid worker configuration: is_prefill and is_decode cannot both be true."
));
}
}
};
builder = builder.worker_type(worker_type);
......@@ -756,6 +784,58 @@ mod tests {
use super::*;
use serde_json::json;
#[test]
fn test_mock_engine_args_json_round_trip_preserves_worker_type_and_nulls() {
let args = MockEngineArgs::builder()
.worker_type(WorkerType::Decode)
.max_num_seqs(None)
.max_num_batched_tokens(None)
.reasoning(None)
.sglang(None)
.build()
.unwrap()
.normalized()
.unwrap();
let payload = serde_json::json!({
"engine_type": "vllm",
"num_gpu_blocks": args.num_gpu_blocks,
"block_size": args.block_size,
"max_num_seqs": args.max_num_seqs,
"max_num_batched_tokens": args.max_num_batched_tokens,
"enable_prefix_caching": args.enable_prefix_caching,
"enable_chunked_prefill": args.enable_chunked_prefill,
"speedup_ratio": args.speedup_ratio,
"decode_speedup_ratio": args.decode_speedup_ratio,
"dp_size": args.dp_size,
"startup_time": args.startup_time,
"worker_type": "decode",
"planner_profile_data": args.planner_profile_data,
"aic_backend": args.aic_backend,
"aic_system": args.aic_system,
"aic_backend_version": args.aic_backend_version,
"aic_tp_size": args.aic_tp_size,
"aic_model_path": args.aic_model_path,
"enable_local_indexer": args.enable_local_indexer,
"bootstrap_port": args.bootstrap_port,
"kv_bytes_per_token": args.kv_bytes_per_token,
"kv_transfer_bandwidth": args.kv_transfer_bandwidth,
"reasoning": args.reasoning,
"zmq_kv_events_port": args.zmq_kv_events_port,
"zmq_replay_port": args.zmq_replay_port,
"preemption_mode": "lifo",
"router_queue_policy": args.router_queue_policy.map(|policy| policy.to_string()),
"sglang": args.sglang,
"has_perf_model": true,
});
let restored = MockEngineArgs::from_json_str(&payload.to_string()).unwrap();
assert_eq!(restored.worker_type, WorkerType::Decode);
assert_eq!(restored.max_num_seqs, None);
assert_eq!(restored.max_num_batched_tokens, None);
}
#[test]
fn test_unique_block_default_uniqueness() {
// Create 10 default UniqueBlock instances
......
......@@ -10,7 +10,7 @@ from types import SimpleNamespace
import numpy as np
import pytest
from dynamo.llm import EngineType, EntrypointArgs
from dynamo.llm import EngineType, EntrypointArgs, MockEngineArgs
MODULE_PATH = (
Path(__file__).resolve().parents[2] / "components/src/dynamo/mocker/config.py"
......@@ -230,5 +230,26 @@ def test_build_mocker_engine_args_preserves_cli_mapped_fields(tmp_path):
"clip_max_new_tokens": 1024,
"schedule_conservativeness": 0.8,
},
}
assert "has_perf_model" not in payload
def test_mock_engine_args_from_json_ignores_legacy_has_perf_model_field():
payload = {
"engine_type": "vllm",
"num_gpu_blocks": 2048,
"block_size": 128,
"max_num_seqs": None,
"max_num_batched_tokens": None,
"worker_type": "decode",
"has_perf_model": True,
}
engine_args = MockEngineArgs.from_json(json.dumps(payload))
assert engine_args.num_gpu_blocks == 2048
assert engine_args.block_size == 128
assert engine_args.max_num_seqs is None
assert engine_args.max_num_batched_tokens is None
assert engine_args.worker_type == "decode"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment