Unverified Commit 6243bcbe authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

feat: support MoE model in SLA Planner Sglang (#3185)


Signed-off-by: default avatarhongkuanz <hongkuanz@nvidia.com>
Signed-off-by: default avatarHongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: default avatarhhzhang16 <54051230+hhzhang16@users.noreply.github.com>
parent 8f338a63
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: batch/v1
kind: Job
metadata:
name: profile-sla
namespace: ${NAMESPACE}
spec:
template:
spec:
serviceAccountName: dynamo-sa
containers:
- name: profile-sla
image: ${DOCKER_IMAGE}
resources:
requests:
cpu: "32"
memory: "50Gi"
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
- name: NATS_SERVER
value: nats://${NAMESPACE}-nats:4222
- name: ETCD_ENDPOINTS
value: ${NAMESPACE}-etcd:2379
workingDir: /sgl-workspace/dynamo
command: ["python", "-m", "benchmarks.profiler.profile_sla"]
args:
- --config
- /sgl-workspace/dynamo/recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml
- --output-dir
- /data/profiling_results
- --namespace
- ${NAMESPACE}
- --backend
- sglang
- --is-moe-model
- --min-num-gpus-per-engine
- "8"
- --max-num-gpus-per-engine
- "16"
- --isl
- "3000"
- --osl
- "150"
- --ttft
- "200"
- --itl
- "20"
volumeMounts:
- name: output-volume
mountPath: /data
restartPolicy: Never
volumes:
- name: output-volume
persistentVolumeClaim:
claimName: dynamo-pvc
backoffLimit: 0
...@@ -22,6 +22,7 @@ if __name__ == "__main__": ...@@ -22,6 +22,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="profile a given endpoint's performance for prefill or decode" description="profile a given endpoint's performance for prefill or decode"
) )
# TODO: use kebab case
parser.add_argument( parser.add_argument(
"--mode", "--mode",
type=str, type=str,
...@@ -79,6 +80,12 @@ if __name__ == "__main__": ...@@ -79,6 +80,12 @@ if __name__ == "__main__":
default=8, default=8,
help="interpolation granularity for the results", help="interpolation granularity for the results",
) )
parser.add_argument(
"--attention_dp_size",
type=int,
default=1,
help="attention dp size of the endpoint for MoE models",
)
args = parser.parse_args() args = parser.parse_args()
os.makedirs(args.work_dir, exist_ok=True) os.makedirs(args.work_dir, exist_ok=True)
...@@ -105,6 +112,7 @@ if __name__ == "__main__": ...@@ -105,6 +112,7 @@ if __name__ == "__main__":
args.max_kv_tokens, args.max_kv_tokens,
args.max_context_length, args.max_context_length,
args.interpolation_granularity, args.interpolation_granularity,
args.attention_dp_size,
) )
else: else:
raise ValueError(f"Invalid mode: {args.mode}") raise ValueError(f"Invalid mode: {args.mode}")
This diff is collapsed.
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import json import json
import logging import logging
import math
import re import re
import shlex import shlex
from typing import Literal, Optional, Protocol from typing import Literal, Optional, Protocol
...@@ -79,6 +80,10 @@ class Config(BaseModel): ...@@ -79,6 +80,10 @@ class Config(BaseModel):
model_config = {"extra": "allow"} model_config = {"extra": "allow"}
class MultinodeConfig(BaseModel):
    """Multinode settings attached to a worker service that spans several nodes."""

    # Number of nodes the service spans; set_multinode_config writes
    # ceil(gpu_count / num_gpus_per_node) here.
    nodeCount: int
def break_arguments(args: list[str] | None) -> list[str]: def break_arguments(args: list[str] | None) -> list[str]:
ans: list[str] = [] ans: list[str] = []
if args is None: if args is None:
...@@ -159,15 +164,114 @@ def parse_override_engine_args(args: list[str]) -> tuple[dict, list[str]]: ...@@ -159,15 +164,114 @@ def parse_override_engine_args(args: list[str]) -> tuple[dict, list[str]]:
return override_dict, args return override_dict, args
def set_multinode_config(worker_service, gpu_count: int, num_gpus_per_node: int):
    """Keep ``worker_service.multinode`` in sync with the requested GPU count.

    Args:
        worker_service: Service object whose ``multinode`` attribute is updated
            in place. The attribute may be absent, ``None``, a plain dict, or
            an object exposing ``nodeCount``.
        gpu_count: Total number of GPUs the engine needs.
        num_gpus_per_node: Number of GPUs available on one node.
    """
    existing = getattr(worker_service, "multinode", None)

    # Everything fits on one node: drop any multinode section that is present.
    if gpu_count <= num_gpus_per_node:
        if existing is not None:
            worker_service.multinode = None
        return

    # Otherwise span as many nodes as needed to host all GPUs.
    required_nodes = math.ceil(gpu_count / num_gpus_per_node)
    if existing is None:
        worker_service.multinode = MultinodeConfig(nodeCount=required_nodes)
    elif isinstance(existing, dict):
        # A multinode section loaded from YAML may still be a raw dict.
        existing["nodeCount"] = required_nodes
    else:
        existing.nodeCount = required_nodes
# TODO: make it work for all frameworks
def get_worker_service_from_config(config: dict):
    """Return the SGLang decode worker service described by ``config``.

    The config is validated into a ``Config`` on every call and the decode
    worker entry of ``spec.services`` is returned.

    NOTE(review): the returned service belongs to the ``Config`` built inside
    this call; presumably edits to it are not visible through a ``Config``
    the caller validated separately from the same dict -- confirm at call sites.
    """
    validated = Config.model_validate(config)
    decode_service_name = WORKER_COMPONENT_NAMES["sglang"].decode_worker_k8s_name
    return validated.spec.services[decode_service_name]
# TODO: make it work for all frameworks
def setup_worker_service_resources(
    worker_service, gpu_count: int, num_gpus_per_node: Optional[int] = None
):
    """Write the GPU request (and limit, if present) onto ``worker_service``.

    Args:
        worker_service: Service object updated in place.
        gpu_count: Total number of GPUs the engine needs.
        num_gpus_per_node: When given, the multinode section is updated as well
            and the per-pod GPU value is capped at this number.
    """
    gpus_per_pod = gpu_count
    if num_gpus_per_node is not None:
        set_multinode_config(worker_service, gpu_count, num_gpus_per_node)
        gpus_per_pod = min(gpu_count, num_gpus_per_node)

    # Create the resources / requests containers on demand.
    if worker_service.resources is None:
        worker_service.resources = ServiceResources()
    resources = worker_service.resources
    if resources.requests is None:
        resources.requests = {}

    gpu_str = str(gpus_per_pod)
    resources.requests["gpu"] = gpu_str

    # Limits are only touched when the service already declares them.
    if resources.limits is not None:
        resources.limits["gpu"] = gpu_str
# TODO: make it work for all frameworks
def validate_and_get_worker_args(worker_service):
    """Return the main container's arguments of ``worker_service`` as a token list.

    Raises:
        ValueError: If the service has no ``extraPodSpec`` or no
            ``mainContainer`` to read arguments from.
    """
    pod_spec = worker_service.extraPodSpec
    if pod_spec and pod_spec.mainContainer:
        return break_arguments(pod_spec.mainContainer.args)

    raise ValueError(
        f"Missing extraPodSpec or mainContainer in SGLang decode worker service '{WORKER_COMPONENT_NAMES['sglang'].decode_worker_k8s_name}'"
    )
def set_argument_value(args: list, arg_name: str, value: str):
    """Set the value that follows ``arg_name`` in ``args``, adding the flag if absent.

    Args:
        args: Argument tokens; modified in place when the flag is present.
        arg_name: Flag to look for, e.g. ``"--tp"``.
        value: Value to store right after the flag.

    Returns:
        The updated argument list.
    """
    try:
        idx = args.index(arg_name)
    except ValueError:
        # Flag not present at all: add both the flag and its value.
        return append_argument(args, [arg_name, value])

    if idx + 1 < len(args):
        args[idx + 1] = value
    else:
        # The flag is the last token and has no value slot yet; indexing
        # idx + 1 here would raise IndexError, so append the value instead.
        args.append(value)
    return args
class ConfigModifierProtocol(Protocol): class ConfigModifierProtocol(Protocol):
@classmethod @classmethod
def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict: def convert_config(
cls,
config: dict,
target: Literal["prefill", "decode"],
is_moe_model: bool = False,
) -> dict:
... ...
@classmethod @classmethod
def set_config_tp_size(cls, config: dict, tp_size: int) -> dict: def set_config_tp_size(cls, config: dict, tp_size: int) -> dict:
... ...
@classmethod
def set_config_tep_size(
cls, config: dict, tep_size: int, num_gpus_per_node: int
) -> dict:
...
@classmethod
def set_config_dep_size(
cls, config: dict, dep_size: int, num_gpus_per_node: int
) -> dict:
...
@classmethod @classmethod
def get_model_name(cls, config: dict) -> str: def get_model_name(cls, config: dict) -> str:
... ...
...@@ -177,13 +281,25 @@ class ConfigModifierProtocol(Protocol): ...@@ -177,13 +281,25 @@ class ConfigModifierProtocol(Protocol):
... ...
@classmethod @classmethod
def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int: def get_kv_cache_size_from_dynamo_log(
cls, dynamo_log_fn: str, attention_dp_size: int = 1
) -> int:
... ...
class VllmV1ConfigModifier: class VllmV1ConfigModifier:
@classmethod @classmethod
def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict: def convert_config(
cls,
config: dict,
target: Literal["prefill", "decode"],
is_moe_model: bool = False,
) -> dict:
if is_moe_model:
raise NotImplementedError(
"MoE model support is not implemented for VLLM backend"
)
cfg = Config.model_validate(config) cfg = Config.model_validate(config)
# set metadata name # set metadata name
...@@ -308,6 +424,18 @@ class VllmV1ConfigModifier: ...@@ -308,6 +424,18 @@ class VllmV1ConfigModifier:
return cfg.model_dump() return cfg.model_dump()
@classmethod
def set_config_tep_size(cls, config: dict, tep_size: int, num_gpus_per_node: int):
raise NotImplementedError(
"TEP (Tensor Expert Parallelism) is not implemented for VLLM backend"
)
@classmethod
def set_config_dep_size(cls, config: dict, dep_size: int, num_gpus_per_node: int):
raise NotImplementedError(
"DEP (Data Expert Parallelism) is not implemented for VLLM backend"
)
@classmethod @classmethod
def get_model_name(cls, config: dict) -> str: def get_model_name(cls, config: dict) -> str:
cfg = Config.model_validate(config) cfg = Config.model_validate(config)
...@@ -365,8 +493,9 @@ class VllmV1ConfigModifier: ...@@ -365,8 +493,9 @@ class VllmV1ConfigModifier:
return DYNAMO_RUN_DEFAULT_PORT return DYNAMO_RUN_DEFAULT_PORT
@classmethod @classmethod
def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int: def get_kv_cache_size_from_dynamo_log(
# TODO cls, dynamo_log_fn: str, attention_dp_size: int = 1
) -> int:
try: try:
with open(dynamo_log_fn, "r") as f: with open(dynamo_log_fn, "r") as f:
for line in f: for line in f:
...@@ -390,7 +519,12 @@ class VllmV1ConfigModifier: ...@@ -390,7 +519,12 @@ class VllmV1ConfigModifier:
class SGLangConfigModifier: class SGLangConfigModifier:
@classmethod @classmethod
def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict: def convert_config(
cls,
config: dict,
target: Literal["prefill", "decode"],
is_moe_model: bool = False,
) -> dict:
cfg = Config.model_validate(config) cfg = Config.model_validate(config)
# set metadata name # set metadata name
...@@ -425,9 +559,10 @@ class SGLangConfigModifier: ...@@ -425,9 +559,10 @@ class SGLangConfigModifier:
args = break_arguments(args) args = break_arguments(args)
# remove `--disaggregation-mode` and `--disaggregation-transfer-backend` # remove disagg flags
args = remove_valued_arguments(args, "--disaggregation-mode") args = remove_valued_arguments(args, "--disaggregation-mode")
args = remove_valued_arguments(args, "--disaggregation-transfer-backend") args = remove_valued_arguments(args, "--disaggregation-transfer-backend")
args = remove_valued_arguments(args, "--disaggregation-bootstrap-port")
# disable prefix caching # disable prefix caching
if "--disable-radix-cache" not in args: if "--disable-radix-cache" not in args:
...@@ -455,14 +590,25 @@ class SGLangConfigModifier: ...@@ -455,14 +590,25 @@ class SGLangConfigModifier:
args = break_arguments(args) args = break_arguments(args)
# remove `--disaggregation-mode` and `--disaggregation-transfer-backend` # remove disagg flags
args = remove_valued_arguments(args, "--disaggregation-mode") args = remove_valued_arguments(args, "--disaggregation-mode")
args = remove_valued_arguments(args, "--disaggregation-transfer-backend") args = remove_valued_arguments(args, "--disaggregation-transfer-backend")
args = remove_valued_arguments(args, "--disaggregation-bootstrap-port")
# enable prefix caching # enable prefix caching
if "--disable-radix-cache" in args: if "--disable-radix-cache" in args:
args.remove("--disable-radix-cache") args.remove("--disable-radix-cache")
if is_moe_model:
# need to use round_robin dp attention routing for MoE models to ensure kv reuse can skip prefill
if "--load-balance-method" in args:
idx = args.index("--load-balance-method")
args[idx + 1] = "round_robin"
else:
args = append_argument(
args, ["--load-balance-method", "round_robin"]
)
worker_service.extraPodSpec.mainContainer.args = join_arguments(args) worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
# set num workers to 1 # set num workers to 1
...@@ -471,49 +617,77 @@ class SGLangConfigModifier: ...@@ -471,49 +617,77 @@ class SGLangConfigModifier:
] ]
decode_worker_config["replicas"] = 1 decode_worker_config["replicas"] = 1
return config return cfg.model_dump()
@classmethod @classmethod
def set_config_tp_size(cls, config: dict, tp_size: int): def set_config_tp_size(cls, config: dict, tp_size: int):
cfg = Config.model_validate(config) cfg = Config.model_validate(config)
worker_service = get_worker_service_from_config(config)
worker_service = cfg.spec.services[ # Set up resources
WORKER_COMPONENT_NAMES["sglang"].decode_worker_k8s_name setup_worker_service_resources(worker_service, tp_size)
]
# Ensure resources exists # Get and validate args
if worker_service.resources is None: args = validate_and_get_worker_args(worker_service)
worker_service.resources = ServiceResources()
# Ensure requests exists # Set --tp argument
if worker_service.resources.requests is None: args = set_argument_value(args, "--tp", str(tp_size))
worker_service.resources.requests = {}
worker_service.resources.requests["gpu"] = str(tp_size) worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
return cfg.model_dump()
# Update limits if they exist @classmethod
if worker_service.resources.limits is not None: def set_config_tep_size(cls, config: dict, tep_size: int, num_gpus_per_node: int):
worker_service.resources.limits["gpu"] = str(tp_size) cfg = Config.model_validate(config)
worker_service = get_worker_service_from_config(config)
if ( # Set up resources with multinode configuration
not worker_service.extraPodSpec setup_worker_service_resources(worker_service, tep_size, num_gpus_per_node)
or not worker_service.extraPodSpec.mainContainer
):
raise ValueError(
f"Missing extraPodSpec or mainContainer in SGLang decode worker service '{WORKER_COMPONENT_NAMES['sglang'].decode_worker_k8s_name}'"
)
args = worker_service.extraPodSpec.mainContainer.args
args = break_arguments(args) # Get and validate args
args = validate_and_get_worker_args(worker_service)
try: # 1. Set --tp=tep_size, if not present add it
idx = args.index("--tp") args = set_argument_value(args, "--tp", str(tep_size))
args[idx + 1] = str(tp_size)
except ValueError: # 2. Set --ep-size=tep_size, if not present add it
args = append_argument(args, ["--tp", str(tp_size)]) args = set_argument_value(args, "--ep-size", str(tep_size))
# 3. Remove --dp if present
args = remove_valued_arguments(args, "--dp")
# 4. Remove --enable-dp-attention if present
if "--enable-dp-attention" in args:
args.remove("--enable-dp-attention")
worker_service.extraPodSpec.mainContainer.args = join_arguments(args) worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
return cfg.model_dump()
@classmethod
def set_config_dep_size(cls, config: dict, dep_size: int, num_gpus_per_node: int):
    """Configure the SGLang decode worker for a DEP engine of ``dep_size`` GPUs.

    Sets ``--tp``, ``--dp`` and ``--ep-size`` to ``dep_size``, makes sure
    ``--enable-dp-attention`` is present, and updates the GPU resources and
    multinode section of the decode worker.

    Args:
        config: Deployment config as a dict.
        dep_size: Number of GPUs used by the engine.
        num_gpus_per_node: Number of GPUs available on one node.

    Returns:
        The modified config as a dict.
    """
    cfg = Config.model_validate(config)
    # Take the worker from `cfg` itself. Looking it up through a second
    # validation of `config` yields a service that belongs to a different
    # Config object, so the edits below would be missing from
    # cfg.model_dump().
    worker_service = cfg.spec.services[
        WORKER_COMPONENT_NAMES["sglang"].decode_worker_k8s_name
    ]

    # Set up resources with multinode configuration
    setup_worker_service_resources(worker_service, dep_size, num_gpus_per_node)

    # Get and validate args
    args = validate_and_get_worker_args(worker_service)

    dep_str = str(dep_size)

    # 1. Set --tp=dep_size
    args = set_argument_value(args, "--tp", dep_str)

    # 2. Set --dp=dep_size (data parallelism across experts)
    args = set_argument_value(args, "--dp", dep_str)

    # 3. Enable --enable-dp-attention (passed as a list, like the other
    #    append_argument call sites)
    if "--enable-dp-attention" not in args:
        args = append_argument(args, ["--enable-dp-attention"])

    # 4. Set --ep-size=dep_size (expert parallelism size)
    args = set_argument_value(args, "--ep-size", dep_str)

    worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
    return cfg.model_dump()
@classmethod @classmethod
...@@ -573,8 +747,9 @@ class SGLangConfigModifier: ...@@ -573,8 +747,9 @@ class SGLangConfigModifier:
return DYNAMO_RUN_DEFAULT_PORT return DYNAMO_RUN_DEFAULT_PORT
@classmethod @classmethod
def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int: def get_kv_cache_size_from_dynamo_log(
# TODO cls, dynamo_log_fn: str, attention_dp_size: int = 1
) -> int:
try: try:
with open(dynamo_log_fn, "r") as f: with open(dynamo_log_fn, "r") as f:
for line in f: for line in f:
...@@ -582,7 +757,7 @@ class SGLangConfigModifier: ...@@ -582,7 +757,7 @@ class SGLangConfigModifier:
# Extract the number after "#tokens:" # Extract the number after "#tokens:"
match = re.search(r"#tokens:\s*(\d+)", line) match = re.search(r"#tokens:\s*(\d+)", line)
if match: if match:
return int(match.group(1)) return int(match.group(1)) * attention_dp_size
except Exception as e: except Exception as e:
logger.warning(f"Failed to parse KV cache size from log file. Error: {e}") logger.warning(f"Failed to parse KV cache size from log file. Error: {e}")
return 0 return 0
...@@ -590,7 +765,17 @@ class SGLangConfigModifier: ...@@ -590,7 +765,17 @@ class SGLangConfigModifier:
class TrtllmConfigModifier: class TrtllmConfigModifier:
@classmethod @classmethod
def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict: def convert_config(
cls,
config: dict,
target: Literal["prefill", "decode"],
is_moe_model: bool = False,
) -> dict:
if is_moe_model:
raise NotImplementedError(
"MoE model support is not implemented for TrtLLM backend"
)
cfg = Config.model_validate(config) cfg = Config.model_validate(config)
# set metadata name # set metadata name
...@@ -752,6 +937,18 @@ class TrtllmConfigModifier: ...@@ -752,6 +937,18 @@ class TrtllmConfigModifier:
return cfg.model_dump() return cfg.model_dump()
@classmethod
def set_config_tep_size(cls, config: dict, tep_size: int, num_gpus_per_node: int):
raise NotImplementedError(
"TEP (Tensor Expert Parallelism) is not implemented for TrtLLM backend"
)
@classmethod
def set_config_dep_size(cls, config: dict, dep_size: int, num_gpus_per_node: int):
raise NotImplementedError(
"DEP (Data Expert Parallelism) is not implemented for TrtLLM backend"
)
@classmethod @classmethod
def get_model_name(cls, config: dict) -> str: def get_model_name(cls, config: dict) -> str:
cfg = Config.model_validate(config) cfg = Config.model_validate(config)
...@@ -810,7 +1007,9 @@ class TrtllmConfigModifier: ...@@ -810,7 +1007,9 @@ class TrtllmConfigModifier:
return DYNAMO_RUN_DEFAULT_PORT return DYNAMO_RUN_DEFAULT_PORT
@classmethod @classmethod
def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int: def get_kv_cache_size_from_dynamo_log(
cls, dynamo_log_fn: str, attention_dp_size: int = 1
) -> int:
# TRT-LLM log parsing for KV cache size # TRT-LLM log parsing for KV cache size
# Format: [TensorRT-LLM][INFO] [MemUsageChange] Allocated XX GiB for max tokens in paged KV cache (XXXXXX). # Format: [TensorRT-LLM][INFO] [MemUsageChange] Allocated XX GiB for max tokens in paged KV cache (XXXXXX).
try: try:
......
...@@ -13,22 +13,9 @@ ...@@ -13,22 +13,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
DECODE_NUM_REQUESTS_RANGE = [
1,
5,
10,
25,
50,
100,
150,
200,
250,
300,
350,
400,
450,
500,
]
DEFAULT_MODEL_NAME = "Qwen/Qwen3-0.6B" DEFAULT_MODEL_NAME = "Qwen/Qwen3-0.6B"
DYNAMO_RUN_DEFAULT_PORT = 8000 DYNAMO_RUN_DEFAULT_PORT = 8000
# set a decode maximum concurrency due to limits of profiling tools
# for MoE models with attn-dp, we might hit this limit
DECODE_MAX_CONCURRENCY = 2000
...@@ -32,13 +32,13 @@ logger.addHandler(console_handler) ...@@ -32,13 +32,13 @@ logger.addHandler(console_handler)
def plot_prefill_performance( def plot_prefill_performance(
prefill_tp_size, prefill_ttft, prefill_thpt_per_gpu, target_ttft, output_dir prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu, target_ttft, output_dir
): ):
""" """
Plot prefill performance as a 2D scatter plot with TP size annotations. Plot prefill performance as a 2D scatter plot with GPU count annotations.
Args: Args:
prefill_tp_size: list of TP sizes prefill_num_gpu: list of GPU counts
prefill_ttft: list of time to first token values prefill_ttft: list of time to first token values
prefill_thpt_per_gpu: list of throughput per GPU values prefill_thpt_per_gpu: list of throughput per GPU values
target_ttft: target TTFT value for the vertical line target_ttft: target TTFT value for the vertical line
...@@ -46,9 +46,9 @@ def plot_prefill_performance( ...@@ -46,9 +46,9 @@ def plot_prefill_performance(
""" """
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
plt.scatter(prefill_ttft, prefill_thpt_per_gpu, s=100) plt.scatter(prefill_ttft, prefill_thpt_per_gpu, s=100)
for i, tp in enumerate(prefill_tp_size): for i, num_gpu in enumerate(prefill_num_gpu):
plt.annotate( plt.annotate(
f"TP{tp}", f"{num_gpu} GPU(s)",
(prefill_ttft[i], prefill_thpt_per_gpu[i]), (prefill_ttft[i], prefill_thpt_per_gpu[i]),
xytext=(10, 0), xytext=(10, 0),
textcoords="offset points", textcoords="offset points",
...@@ -73,17 +73,17 @@ def plot_prefill_performance( ...@@ -73,17 +73,17 @@ def plot_prefill_performance(
def plot_decode_performance(decode_results, target_itl, output_dir): def plot_decode_performance(decode_results, target_itl, output_dir):
""" """
Plot decode performance with multiple TP size lines. Plot decode performance with multiple GPU count lines.
Args: Args:
decode_results: list of tuples (tp_size, itl_list, thpt_per_gpu_list) decode_results: list of tuples (num_gpu, itl_list, thpt_per_gpu_list)
target_itl: target ITL value for the vertical line target_itl: target ITL value for the vertical line
output_dir: directory to save the plot output_dir: directory to save the plot
""" """
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
for tp_size, itl_list, thpt_per_gpu_list in decode_results: for num_gpu, itl_list, thpt_per_gpu_list in decode_results:
plt.plot(itl_list, thpt_per_gpu_list, label=f"TP{tp_size}") plt.plot(itl_list, thpt_per_gpu_list, label=f"{num_gpu} GPU(s)")
plt.axvline( plt.axvline(
x=target_itl, color="r", linestyle="--", label=f"Target ITL: {target_itl} ms" x=target_itl, color="r", linestyle="--", label=f"Target ITL: {target_itl} ms"
......
...@@ -6,6 +6,7 @@ from typing import Callable, Optional, Tuple ...@@ -6,6 +6,7 @@ from typing import Callable, Optional, Tuple
import numpy as np import numpy as np
from benchmarks.profiler.utils.defaults import DECODE_MAX_CONCURRENCY
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_decode from benchmarks.profiler.utils.genai_perf import benchmark_decode
from benchmarks.profiler.utils.plot import plot_decode_3d_surface from benchmarks.profiler.utils.plot import plot_decode_3d_surface
...@@ -21,6 +22,21 @@ console_handler.setFormatter(formatter) ...@@ -21,6 +22,21 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
def get_num_request_range(attn_dp_size, engine_max_concurrency, granularity):
    """Return the concurrency levels to sweep when profiling decode.

    Every returned value is a multiple of ``attn_dp_size`` and does not exceed
    ``min(engine_max_concurrency, DECODE_MAX_CONCURRENCY)``.

    Args:
        attn_dp_size: Attention data-parallel size of the engine (1 for
            engines without attention DP).
        engine_max_concurrency: Maximum concurrency the engine can hold.
        granularity: Desired number of sample points.

    Returns:
        An increasing list of request counts. It has ``granularity`` entries
        when enough per-rank concurrency is available, fewer otherwise, and is
        empty if not even one request per DP rank fits.
    """
    # for MoE models with attn-dp, we want the num_request to be a multiple of attn_dp_size
    # so that we can make sure the request is sent to the same dp rank as the warmup request
    # this is guaranteed because the dp scheduler is scheduling round-robin
    max_concurrency = min(engine_max_concurrency, DECODE_MAX_CONCURRENCY)
    conc_per_dp = max_concurrency // attn_dp_size

    if conc_per_dp < granularity:
        # Too few levels to interpolate: sweep every per-rank concurrency.
        return list(range(attn_dp_size, conc_per_dp * attn_dp_size + 1, attn_dp_size))

    if granularity == 1:
        # A single sample point; the spacing formula below would divide by zero.
        return [attn_dp_size]

    # Spread `granularity` per-rank concurrencies evenly over [1, conc_per_dp]
    # and scale by attn_dp_size exactly once, so the largest value is
    # conc_per_dp * attn_dp_size and stays within max_concurrency. Integer
    # arithmetic keeps the last point exact.
    spread = conc_per_dp - 1
    return [
        (1 + (i * spread) // (granularity - 1)) * attn_dp_size
        for i in range(granularity)
    ]
def _profile_decode_helper( def _profile_decode_helper(
work_dir, work_dir,
num_gpus, num_gpus,
...@@ -30,6 +46,7 @@ def _profile_decode_helper( ...@@ -30,6 +46,7 @@ def _profile_decode_helper(
get_itl_and_thpt_per_gpu: Callable[ get_itl_and_thpt_per_gpu: Callable[
[int, int, int], Tuple[Optional[float], Optional[float]] [int, int, int], Tuple[Optional[float], Optional[float]]
], ],
attention_dp_size,
): ):
"""interpolate ITL - Active_KV_Cache - Decode_Context_Length""" """interpolate ITL - Active_KV_Cache - Decode_Context_Length"""
x_kv_usage = [] x_kv_usage = []
...@@ -51,18 +68,9 @@ def _profile_decode_helper( ...@@ -51,18 +68,9 @@ def _profile_decode_helper(
f" isl {isl} + osl {osl}, skipping." f" isl {isl} + osl {osl}, skipping."
) )
break break
elif max_concurrency < interpolation_granularity:
logger.warning(
f"max_concurrency {max_concurrency} is too small for"
f" interpolation granularity {interpolation_granularity}."
f" max_kv_tokens {max_kv_tokens}, isl {isl}, osl {osl}"
)
sweep_num_request = range(1, max_concurrency + 1)
else: else:
sweep_num_request = range( sweep_num_request = get_num_request_range(
1, attention_dp_size, max_concurrency, interpolation_granularity
max_concurrency,
max_concurrency // interpolation_granularity,
) )
for num_request in sweep_num_request: for num_request in sweep_num_request:
itl, thpt_per_gpu = get_itl_and_thpt_per_gpu(isl, osl, num_request) itl, thpt_per_gpu = get_itl_and_thpt_per_gpu(isl, osl, num_request)
...@@ -102,6 +110,7 @@ def profile_decode( ...@@ -102,6 +110,7 @@ def profile_decode(
max_kv_tokens, max_kv_tokens,
max_context_length, max_context_length,
interpolation_granularity, interpolation_granularity,
attention_dp_size,
): ):
def get_itl_and_thpt_per_gpu(isl, osl, num_request): def get_itl_and_thpt_per_gpu(isl, osl, num_request):
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
...@@ -127,6 +136,7 @@ def profile_decode( ...@@ -127,6 +136,7 @@ def profile_decode(
max_context_length, max_context_length,
interpolation_granularity, interpolation_granularity,
get_itl_and_thpt_per_gpu, get_itl_and_thpt_per_gpu,
attention_dp_size,
) )
...@@ -137,6 +147,7 @@ def profile_decode_aiconfigurator( ...@@ -137,6 +147,7 @@ def profile_decode_aiconfigurator(
max_context_length, max_context_length,
interpolation_granularity, interpolation_granularity,
ai_configurator_perf_estimator: AIConfiguratorPerfEstimator, ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
attention_dp_size,
**model_config_kwargs, **model_config_kwargs,
): ):
def get_itl_and_thpt_per_gpu(isl, osl, num_request): def get_itl_and_thpt_per_gpu(isl, osl, num_request):
...@@ -156,4 +167,5 @@ def profile_decode_aiconfigurator( ...@@ -156,4 +167,5 @@ def profile_decode_aiconfigurator(
max_context_length, max_context_length,
interpolation_granularity, interpolation_granularity,
get_itl_and_thpt_per_gpu, get_itl_and_thpt_per_gpu,
attention_dp_size,
) )
...@@ -14,7 +14,7 @@ spec: ...@@ -14,7 +14,7 @@ spec:
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/sglang-runtime:my-tag image: my-registry/sglang-runtime:my-tag
SGLangDecodeWorker: decode:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
dynamoNamespace: sglang-agg dynamoNamespace: sglang-agg
componentType: worker componentType: worker
......
...@@ -17,7 +17,7 @@ spec: ...@@ -17,7 +17,7 @@ spec:
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/sglang-runtime:my-tag image: my-registry/sglang-runtime:my-tag
SGLangDecodeWorker: decode:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
dynamoNamespace: sglang-agg dynamoNamespace: sglang-agg
componentType: worker componentType: worker
......
...@@ -17,7 +17,7 @@ spec: ...@@ -17,7 +17,7 @@ spec:
envs: envs:
- name: DYN_ROUTER_MODE - name: DYN_ROUTER_MODE
value: kv value: kv
SGLangDecodeWorker: decode:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
dynamoNamespace: sglang-agg-router dynamoNamespace: sglang-agg-router
componentType: worker componentType: worker
......
...@@ -14,7 +14,7 @@ spec: ...@@ -14,7 +14,7 @@ spec:
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/sglang-runtime:my-tag image: my-registry/sglang-runtime:my-tag
SGLangDecodeWorker: decode:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
dynamoNamespace: sglang-disagg dynamoNamespace: sglang-disagg
componentType: worker componentType: worker
...@@ -41,7 +41,7 @@ spec: ...@@ -41,7 +41,7 @@ spec:
--disaggregation-mode decode --disaggregation-mode decode
--disaggregation-transfer-backend nixl --disaggregation-transfer-backend nixl
SGLangPrefillWorker: prefill:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
dynamoNamespace: sglang-disagg dynamoNamespace: sglang-disagg
componentType: worker componentType: worker
......
...@@ -98,7 +98,7 @@ spec: ...@@ -98,7 +98,7 @@ spec:
- -c - -c
args: args:
- "python3 -m dynamo.planner.prometheus" - "python3 -m dynamo.planner.prometheus"
SGLangDecodeWorker: decode:
dynamoNamespace: dynamo dynamoNamespace: dynamo
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
...@@ -129,7 +129,7 @@ spec: ...@@ -129,7 +129,7 @@ spec:
- decode - decode
- --disaggregation-transfer-backend - --disaggregation-transfer-backend
- nixl - nixl
SGLangPrefillWorker: prefill:
dynamoNamespace: dynamo dynamoNamespace: dynamo
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
......
...@@ -93,10 +93,14 @@ class VllmComponentName: ...@@ -93,10 +93,14 @@ class VllmComponentName:
class SGLangComponentName: class SGLangComponentName:
prefill_worker_k8s_name = "SGLangPrefillWorker" prefill_worker_k8s_name = (
"prefill" # use short name to stay within k8s limits with grove
)
prefill_worker_component_name = "prefill" prefill_worker_component_name = "prefill"
prefill_worker_endpoint = "generate" prefill_worker_endpoint = "generate"
decode_worker_k8s_name = "SGLangDecodeWorker" decode_worker_k8s_name = (
"decode" # use short name to stay within k8s limits with grove
)
decode_worker_component_name = "backend" decode_worker_component_name = "backend"
decode_worker_endpoint = "generate" decode_worker_endpoint = "generate"
......
...@@ -86,12 +86,6 @@ def create_sla_planner_parser() -> argparse.ArgumentParser: ...@@ -86,12 +86,6 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
default=SLAPlannerDefaults.profile_results_dir, default=SLAPlannerDefaults.profile_results_dir,
help="Profile results directory", help="Profile results directory",
) )
parser.add_argument(
"--isl", type=int, default=SLAPlannerDefaults.isl, help="Input sequence length"
)
parser.add_argument(
"--osl", type=int, default=SLAPlannerDefaults.osl, help="Output sequence length"
)
parser.add_argument( parser.add_argument(
"--ttft", "--ttft",
type=float, type=float,
......
...@@ -58,7 +58,7 @@ ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH ...@@ -58,7 +58,7 @@ ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH
# Dynamo # Dynamo
WORKDIR /sgl-workspace WORKDIR /sgl-workspace
RUN git clone https://github.com/ai-dynamo/dynamo.git COPY . /sgl-workspace/dynamo
ENV RUSTUP_HOME=/usr/local/rustup \ ENV RUSTUP_HOME=/usr/local/rustup \
CARGO_HOME=/usr/local/cargo \ CARGO_HOME=/usr/local/cargo \
...@@ -87,6 +87,10 @@ RUN cd dynamo/lib/bindings/python && \ ...@@ -87,6 +87,10 @@ RUN cd dynamo/lib/bindings/python && \
RUN pip install --break-system-packages sglang-router==0.1.9 RUN pip install --break-system-packages sglang-router==0.1.9
# Install dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
pip install --break-system-packages --requirement /tmp/requirements.txt
RUN wget --tries=3 --waitretry=5 \ RUN wget --tries=3 --waitretry=5 \
https://github.com/nats-io/nats-server/releases/download/v2.10.28/\ https://github.com/nats-io/nats-server/releases/download/v2.10.28/\
nats-server-v2.10.28-${ARCH}.deb && \ nats-server-v2.10.28-${ARCH}.deb && \
......
...@@ -21,6 +21,7 @@ import socket ...@@ -21,6 +21,7 @@ import socket
import subprocess import subprocess
import sys import sys
import time import time
import uuid
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
...@@ -114,9 +115,9 @@ class DynamoDeploymentClient: ...@@ -114,9 +115,9 @@ class DynamoDeploymentClient:
service_name: Service name for connecting to the service, defaults to {deployment_name}-frontend service_name: Service name for connecting to the service, defaults to {deployment_name}-frontend
""" """
self.namespace = namespace self.namespace = namespace
self.deployment_name = deployment_name self.deployment_name = f"{deployment_name}-{str(uuid.uuid4())[:4]}"
self.model_name = model_name self.model_name = model_name
self.service_name = service_name or f"{deployment_name}-frontend" self.service_name = service_name or f"{self.deployment_name}-frontend"
self.components: List[str] = [] # Will store component names from CR self.components: List[str] = [] # Will store component names from CR
self.deployment_spec: Optional[ self.deployment_spec: Optional[
Dict[str, Any] Dict[str, Any]
...@@ -247,13 +248,6 @@ class DynamoDeploymentClient: ...@@ -247,13 +248,6 @@ class DynamoDeploymentClient:
self.deployment_spec["metadata"]["name"] = self.deployment_name self.deployment_spec["metadata"]["name"] = self.deployment_name
self.deployment_spec["metadata"]["namespace"] = self.namespace self.deployment_spec["metadata"]["namespace"] = self.namespace
# Disable grove as it will cause the deployment to not report ready
if "annotations" not in self.deployment_spec["metadata"]:
self.deployment_spec["metadata"]["annotations"] = {}
self.deployment_spec["metadata"]["annotations"][
"nvidia.com/enable-grove"
] = "false"
try: try:
await self.custom_api.create_namespaced_custom_object( await self.custom_api.create_namespaced_custom_object(
group="nvidia.com", group="nvidia.com",
......
...@@ -13,7 +13,7 @@ Support matrix: ...@@ -13,7 +13,7 @@ Support matrix:
| vLLM | Dense | ✅ | | vLLM | Dense | ✅ |
| vLLM | MoE | 🚧 | | vLLM | MoE | 🚧 |
| SGLang | Dense | ✅ | | SGLang | Dense | ✅ |
| SGLang | MoE | 🚧 | | SGLang | MoE | ✅ |
| TensorRT-LLM | Dense | ✅ | | TensorRT-LLM | Dense | ✅ |
| TensorRT-LLM | MoE | 🚧 | | TensorRT-LLM | MoE | 🚧 |
...@@ -63,9 +63,15 @@ After finding the best TP size for prefill and decode, the script will then inte ...@@ -63,9 +63,15 @@ After finding the best TP size for prefill and decode, the script will then inte
In prefill engine, prefills are usually done with batch size=1 and only the ISL (excluding prefix cache hit) affects the iteration time. The script profiles the selected prefill TP configuration across different ISLs and records the TTFT and prefill throughput per GPU under those ISLs. In prefill engine, prefills are usually done with batch size=1 and only the ISL (excluding prefix cache hit) affects the iteration time. The script profiles the selected prefill TP configuration across different ISLs and records the TTFT and prefill throughput per GPU under those ISLs.
For dense models, the script profiles different TP sizes.
For MoE models, the script only profiles different TEP sizes, since DEP is generally not the optimal prefill configuration.
### Decode Interpolation Data ### Decode Interpolation Data
In decode engine, decode requests are added inflight and iteration time (or ITL) depends on both the context length and the real-time load of the engine. We capture the real-time load of the engine with active kv usage and average context length. The active kv usage determines the complexity of the memory-bounded attention kernel while the active kv usage divided by the average context length determines the complexity of the computation-bound MLP kernel. For example, the below figure shows the ITL of DS-Distilled Llama 8b model on H100 TP4. The ITL grows near-linearly with active kv usage under a fixed context length. And the slope increases as the context length decreases. In decode engine, decode requests are added inflight and iteration time (or ITL) depends on both the context length and the real-time load of the engine. We capture the real-time load of the engine with active kv usage and average context length. The active kv usage determines the complexity of the memory-bounded attention kernel while the active kv usage divided by the average context length determines the complexity of the computation-bound MLP kernel. For example, the below figure shows the ITL of DS-Distilled Llama 8b model on H100 TP4. The ITL grows near-linearly with active kv usage under a fixed context length. And the slope increases as the context length decreases.
For dense models, the script profiles different TP sizes.
For MoE models, the script profiles different DEP sizes. TEP decode engines for low latency will be supported in the future.
![images](../../docs/images/itl_interpolation.png) ![images](../../docs/images/itl_interpolation.png)
The script profiles the selected decode TP configuration across different active kv blocks and average context length. The script profiles the selected decode TP configuration across different active kv blocks and average context length.
...@@ -96,7 +102,7 @@ Set up your Kubernetes namespace for profiling (one-time per namespace). First e ...@@ -96,7 +102,7 @@ Set up your Kubernetes namespace for profiling (one-time per namespace). First e
pip install -r deploy/utils/requirements.txt pip install -r deploy/utils/requirements.txt
``` ```
### Step 1: Inject your DGD configuration **Step 1: Inject your DGD configuration**
Use the injector utility to place your DGD manifest into the PVC. The profiling job will read the path you specify. Use the injector utility to place your DGD manifest into the PVC. The profiling job will read the path you specify.
...@@ -113,11 +119,14 @@ Use the injector utility to place your DGD manifest into the PVC. The profiling ...@@ -113,11 +119,14 @@ Use the injector utility to place your DGD manifest into the PVC. The profiling
> **Note**: All paths must start with `/data/` for security reasons. If you forget this prefix, the script will show a helpful error message with the correct path. > **Note**: All paths must start with `/data/` for security reasons. If you forget this prefix, the script will show a helpful error message with the correct path.
> **Important**: For profiling, disagg configs should be run with Grove disabled by adding the annotation `nvidia.com/enable-grove: "false"` to avoid alpha Grove status issues.
**Step 2: Set SLA target** **Step 2: Set SLA target**
Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL. Also, set the backend type to `vllm` or `sglang`. The backend type must match the dynamo deployment in the `DGD_CONFIG_FILE`. For dense models, edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL. Also, set the backend type to match the dynamo deployment in the `DGD_CONFIG_FILE`.
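For MoE models, edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_moe_job.yaml` to set the target ISL, OSL, TTFT, and ITL, along with the minimum and maximum number of GPUs per engine, which bound the TEP and DEP sizes that are profiled.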
> [!NOTE]
> If the model is too large to be downloaded every time, you can create a multi-attach PVC to cache the model. Refer to [recipes](../../recipes/README.md) for more details.
```yaml ```yaml
spec: spec:
...@@ -145,7 +154,7 @@ spec: ...@@ -145,7 +154,7 @@ spec:
export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1 # or any existing image tag (TODO: update to 0.5.0 upon release as profiling with 0.4.1 is broken) export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1 # or any existing image tag (TODO: update to 0.5.0 upon release as profiling with 0.4.1 is broken)
``` ```
3. **Set the config path for the profiling job:** 2. **Set the config path for the profiling job:**
```bash ```bash
export DGD_CONFIG_FILE=/data/configs/disagg.yaml # should be the same path you set for --dest in Step 1 export DGD_CONFIG_FILE=/data/configs/disagg.yaml # should be the same path you set for --dest in Step 1
``` ```
...@@ -153,7 +162,11 @@ spec: ...@@ -153,7 +162,11 @@ spec:
**Step 4: Run profiling (required)** **Step 4: Run profiling (required)**
```bash ```bash
# for dense models
envsubst < benchmarks/profiler/deploy/profile_sla_job.yaml | kubectl apply -f - envsubst < benchmarks/profiler/deploy/profile_sla_job.yaml | kubectl apply -f -
# for MoE models
envsubst < benchmarks/profiler/deploy/profile_sla_moe_job.yaml | kubectl apply -f -
``` ```
**Step 5: Wait for profiling to complete** **Step 5: Wait for profiling to complete**
......
...@@ -26,27 +26,30 @@ class TestProfileSlaAiconfigurator: ...@@ -26,27 +26,30 @@ class TestProfileSlaAiconfigurator:
@pytest.fixture @pytest.fixture
def trtllm_args(self): def trtllm_args(self):
class Args: class Args:
backend = "trtllm" def __init__(self):
config = "components/backends/trtllm/deploy/disagg.yaml" self.backend = "trtllm"
output_dir = "/tmp/test_profiling_results" self.config = "components/backends/trtllm/deploy/disagg.yaml"
namespace = "test-namespace" self.output_dir = "/tmp/test_profiling_results"
min_num_gpus_per_engine = 1 self.namespace = "test-namespace"
max_num_gpus_per_engine = 8 self.min_num_gpus_per_engine = 1
skip_existing_results = False self.max_num_gpus_per_engine = 8
force_rerun = False self.skip_existing_results = False
isl = 3000 self.force_rerun = False
osl = 500 self.isl = 3000
ttft = 50 self.osl = 500
itl = 10 self.ttft = 50
max_context_length = 16384 self.itl = 10
prefill_interpolation_granularity = 16 self.max_context_length = 16384
decode_interpolation_granularity = 6 self.prefill_interpolation_granularity = 16
service_name = "" self.decode_interpolation_granularity = 6
dry_run = False self.service_name = ""
use_ai_configurator = True self.is_moe_model = False
aic_system = "h200_sxm" self.dry_run = False
aic_model_name = "QWEN3_32B" self.use_ai_configurator = True
backend_version = "0.20.0" self.aic_system = "h200_sxm"
self.aic_model_name = "QWEN3_32B"
self.backend_version = "0.20.0"
self.num_gpus_per_node = 8
return Args() return Args()
......
...@@ -28,27 +28,30 @@ class TestProfileSLADryRun: ...@@ -28,27 +28,30 @@ class TestProfileSLADryRun:
"""Create arguments for vllm backend dry-run test.""" """Create arguments for vllm backend dry-run test."""
class Args: class Args:
backend = "vllm" def __init__(self):
config = "components/backends/vllm/deploy/disagg.yaml" self.backend = "vllm"
output_dir = "/tmp/test_profiling_results" self.config = "components/backends/vllm/deploy/disagg.yaml"
namespace = "test-namespace" self.output_dir = "/tmp/test_profiling_results"
min_num_gpus_per_engine = 1 self.namespace = "test-namespace"
max_num_gpus_per_engine = 8 self.min_num_gpus_per_engine = 1
skip_existing_results = False self.max_num_gpus_per_engine = 8
force_rerun = False self.skip_existing_results = False
isl = 3000 self.force_rerun = False
osl = 500 self.isl = 3000
ttft = 50 self.osl = 500
itl = 10 self.ttft = 50
max_context_length = 16384 self.itl = 10
prefill_interpolation_granularity = 16 self.max_context_length = 16384
decode_interpolation_granularity = 6 self.prefill_interpolation_granularity = 16
service_name = "" self.decode_interpolation_granularity = 6
dry_run = True self.service_name = ""
use_ai_configurator = False self.is_moe_model = False
aic_system = None self.dry_run = True
aic_model_name = None self.use_ai_configurator = False
backend_version = None self.aic_system = None
self.aic_model_name = None
self.backend_version = None
self.num_gpus_per_node = 8
return Args() return Args()
...@@ -57,27 +60,30 @@ class TestProfileSLADryRun: ...@@ -57,27 +60,30 @@ class TestProfileSLADryRun:
"""Create arguments for sglang backend dry-run test.""" """Create arguments for sglang backend dry-run test."""
class Args: class Args:
backend = "sglang" def __init__(self):
config = "components/backends/sglang/deploy/disagg.yaml" self.backend = "sglang"
output_dir = "/tmp/test_profiling_results" self.config = "components/backends/sglang/deploy/disagg.yaml"
namespace = "test-namespace" self.output_dir = "/tmp/test_profiling_results"
min_num_gpus_per_engine = 1 self.namespace = "test-namespace"
max_num_gpus_per_engine = 8 self.min_num_gpus_per_engine = 1
skip_existing_results = False self.max_num_gpus_per_engine = 8
force_rerun = False self.skip_existing_results = False
isl = 3000 self.force_rerun = False
osl = 500 self.isl = 3000
ttft = 50 self.osl = 500
itl = 10 self.ttft = 50
max_context_length = 16384 self.itl = 10
prefill_interpolation_granularity = 16 self.max_context_length = 16384
decode_interpolation_granularity = 6 self.prefill_interpolation_granularity = 16
service_name = "" self.decode_interpolation_granularity = 6
dry_run = True self.service_name = ""
use_ai_configurator = False self.is_moe_model = False
aic_system = None self.dry_run = True
aic_model_name = None self.use_ai_configurator = False
backend_version = None self.aic_system = None
self.aic_model_name = None
self.backend_version = None
self.num_gpus_per_node = 8
return Args() return Args()
...@@ -100,27 +106,30 @@ class TestProfileSLADryRun: ...@@ -100,27 +106,30 @@ class TestProfileSLADryRun:
"""Create arguments for trtllm backend dry-run test.""" """Create arguments for trtllm backend dry-run test."""
class Args: class Args:
backend = "trtllm" def __init__(self):
config = "components/backends/trtllm/deploy/disagg.yaml" self.backend = "trtllm"
output_dir = "/tmp/test_profiling_results" self.config = "components/backends/trtllm/deploy/disagg.yaml"
namespace = "test-namespace" self.output_dir = "/tmp/test_profiling_results"
min_num_gpus_per_engine = 1 self.namespace = "test-namespace"
max_num_gpus_per_engine = 8 self.min_num_gpus_per_engine = 1
skip_existing_results = False self.max_num_gpus_per_engine = 8
force_rerun = False self.skip_existing_results = False
isl = 3000 self.force_rerun = False
osl = 500 self.isl = 3000
ttft = 50 self.osl = 500
itl = 10 self.ttft = 50
max_context_length = 16384 self.itl = 10
prefill_interpolation_granularity = 16 self.max_context_length = 16384
decode_interpolation_granularity = 6 self.prefill_interpolation_granularity = 16
service_name = "" self.decode_interpolation_granularity = 6
dry_run = True self.service_name = ""
use_ai_configurator = False self.is_moe_model = False
aic_system = None self.dry_run = True
aic_model_name = None self.use_ai_configurator = False
backend_version = None self.aic_system = None
self.aic_model_name = None
self.backend_version = None
self.num_gpus_per_node = 8
return Args() return Args()
...@@ -130,3 +139,44 @@ class TestProfileSLADryRun: ...@@ -130,3 +139,44 @@ class TestProfileSLADryRun:
"""Test that profile_sla dry-run works for trtllm backend with disagg.yaml config.""" """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config."""
# Run the profile in dry-run mode - should complete without errors # Run the profile in dry-run mode - should complete without errors
await run_profile(trtllm_args) await run_profile(trtllm_args)
@pytest.fixture
def sglang_moe_args(self):
    """Create arguments for sglang backend MoE dry-run test."""

    # Plain attribute holder passed to run_profile in place of parsed CLI args.
    class Args:
        def __init__(self):
            self.backend = "sglang"
            self.config = (
                "recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml"
            )
            self.output_dir = "/tmp/test_profiling_results"
            self.namespace = "test-namespace"
            self.min_num_gpus_per_engine = 8
            self.max_num_gpus_per_engine = 32
            self.skip_existing_results = False
            self.force_rerun = False
            self.isl = 3000
            self.osl = 500
            self.ttft = 50
            self.itl = 10
            self.max_context_length = 16384
            self.prefill_interpolation_granularity = 16
            self.decode_interpolation_granularity = 6
            self.service_name = ""
            # This fixture is the MoE variant: the flag is enabled here.
            self.is_moe_model = True
            self.dry_run = True
            self.use_ai_configurator = False
            self.aic_system = None
            self.aic_model_name = None
            self.backend_version = None
            self.num_gpus_per_node = 8

    return Args()
@pytest.mark.pre_merge
@pytest.mark.asyncio
async def test_sglang_moe_dryrun(self, sglang_moe_args):
    """Test that profile_sla dry-run works for sglang backend with MoE config."""
    # Run the profile in dry-run mode - should complete without errors.
    # Nothing is asserted on the result; the test passes if no exception is raised.
    await run_profile(sglang_moe_args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment