Unverified Commit cfe74445 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

fix: correctly parse subcomponent in sla-planner + deploy/doc update (#3363)


Signed-off-by: default avatarhongkuanz <hongkuanz@nvidia.com>
parent 92b8fe07
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Kubernetes Job that runs the SLA profiler (benchmarks.profiler.profile_sla)
# with the AI configurator enabled. The ${...} placeholders are expected to be
# substituted (e.g. with envsubst) before the manifest is applied.
apiVersion: batch/v1
kind: Job
metadata:
  name: profile-sla
  namespace: ${NAMESPACE}
spec:
  template:
    spec:
      serviceAccountName: dynamo-sa
      containers:
        - name: profile-sla
          image: ${DOCKER_IMAGE}
          resources:
            requests:
              cpu: "16"
              memory: "10Gi"
          env:
            # Token is read from the pre-created `hf-token-secret` secret.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: HF_TOKEN
            - name: NATS_SERVER
              value: nats://${NAMESPACE}-nats:4222
            - name: ETCD_ENDPOINTS
              value: ${NAMESPACE}-etcd:2379
          command: ["python", "-m", "benchmarks.profiler.profile_sla"]
          args:
            - --config
            - ${DGD_CONFIG_FILE}
            - --output-dir
            - /data/profiling_results
            - --namespace
            - ${NAMESPACE}
            - --backend
            - vllm
            - --min-num-gpus-per-engine
            - "1"
            - --max-num-gpus-per-engine
            - "8"
            - --isl
            - "3000"
            - --osl
            - "150"
            - --ttft
            - "500"
            - --itl
            - "30"
            - --use-ai-configurator
            - --aic-system
            - h200_sxm
            - --aic-model-name
            - QWEN3_32B
            - --backend-version
            # Quoted so the version can never be re-typed by a YAML parser.
            - "0.20.0"
          volumeMounts:
            # Profiling results are written below /data (see --output-dir).
            - name: output-volume
              mountPath: /data
      # Pod is never restarted in place; together with backoffLimit: 0 the
      # Job makes a single attempt.
      restartPolicy: Never
      volumes:
        - name: output-volume
          persistentVolumeClaim:
            claimName: dynamo-pvc
  backoffLimit: 0
...@@ -25,25 +25,6 @@ spec: ...@@ -25,25 +25,6 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: planner componentType: planner
replicas: 1 replicas: 1
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
volumeMounts: volumeMounts:
- name: dynamo-pvc - name: dynamo-pvc
mountPoint: /data mountPoint: /data
......
...@@ -42,25 +42,6 @@ spec: ...@@ -42,25 +42,6 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: planner componentType: planner
replicas: 1 replicas: 1
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
volumeMounts: volumeMounts:
- name: dynamo-pvc # Must be pre-created before deployment and SLA profiler must have been run - name: dynamo-pvc # Must be pre-created before deployment and SLA profiler must have been run
mountPoint: /data mountPoint: /data
...@@ -107,12 +88,6 @@ spec: ...@@ -107,12 +88,6 @@ spec:
extraPodSpec: extraPodSpec:
terminationGracePeriodSeconds: 600 terminationGracePeriodSeconds: 600
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
command: command:
...@@ -142,12 +117,6 @@ spec: ...@@ -142,12 +117,6 @@ spec:
extraPodSpec: extraPodSpec:
terminationGracePeriodSeconds: 600 terminationGracePeriodSeconds: 600
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: my-registry/trtllm-runtime:my-tag image: my-registry/trtllm-runtime:my-tag
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
command: command:
......
...@@ -22,28 +22,8 @@ spec: ...@@ -22,28 +22,8 @@ spec:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
Planner: Planner:
dynamoNamespace: vllm-disagg-planner dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: planner componentType: planner
replicas: 1 replicas: 1
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
volumeMounts: volumeMounts:
- name: dynamo-pvc - name: dynamo-pvc
mountPoint: /data mountPoint: /data
...@@ -65,18 +45,12 @@ spec: ...@@ -65,18 +45,12 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
subComponentType: decode subComponentType: decode
replicas: 2 replicas: 1
resources: resources:
limits: limits:
gpu: "1" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
...@@ -91,18 +65,12 @@ spec: ...@@ -91,18 +65,12 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
subComponentType: prefill subComponentType: prefill
replicas: 2 replicas: 1
resources: resources:
limits: limits:
gpu: "1" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
......
...@@ -7,12 +7,10 @@ __all__ = [ ...@@ -7,12 +7,10 @@ __all__ = [
"VirtualConnector", "VirtualConnector",
"LoadPlannerDefaults", "LoadPlannerDefaults",
"SLAPlannerDefaults", "SLAPlannerDefaults",
"ServiceConfig",
"TargetReplica", "TargetReplica",
"SubComponentType", "SubComponentType",
] ]
# Import the classes # Import the classes
from dynamo.planner.config import ServiceConfig
from dynamo.planner.defaults import ( from dynamo.planner.defaults import (
LoadPlannerDefaults, LoadPlannerDefaults,
SLAPlannerDefaults, SLAPlannerDefaults,
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import os
logger = logging.getLogger(__name__)
class ServiceConfig(dict):
    """Per-service configuration store backed by a plain ``dict``.

    The mapping is keyed by service name; each value is that service's own
    mapping of option name to option value. A process-wide instance is lazily
    built from the ``DYNAMO_SERVICE_CONFIG`` environment variable (a JSON
    object) by :meth:`get_instance`.
    """

    _instance = None
    COMMON_CONFIG_SERVICE = "Common"
    COMMON_CONFIG_KEY = "common-configs"
    # Key under a service's config that must never be turned into CLI args
    # or returned from get_parsed_config().
    _SERVICE_ARGS_KEY = "ServiceArgs"

    # vLLM arguments that default to True and need explicit false handling
    # Based on https://github.com/vllm-project/vllm/blob/v0.9.1/vllm/config.py
    _VLLM_TRUE_DEFAULTS = {
        "enable_prefix_caching": "no-enable-prefix-caching",
        "use_tqdm_on_load": "no-use-tqdm-on-load",
        "multi_step_stream_outputs": "no-multi-step-stream-outputs",
    }

    @classmethod
    def get_instance(cls):
        """Return the shared instance, loading it from the environment once."""
        if cls._instance is None:
            cls._instance = cls._load_from_env()
        return cls._instance

    @classmethod
    def _load_from_env(cls):
        """Build a config from the ``DYNAMO_SERVICE_CONFIG`` environment variable.

        Returns:
            A new instance holding the decoded JSON object, or an empty
            instance when the variable is unset, empty, not valid JSON, or
            valid JSON that is not an object.
        """
        configs = {}
        env_config = os.environ.get("DYNAMO_SERVICE_CONFIG")
        if env_config:
            try:
                parsed = json.loads(env_config)
            except json.JSONDecodeError as exc:
                # Best effort: keep running with an empty config, but make the
                # failure visible through the module logger instead of stdout.
                logger.error("Failed to parse DYNAMO_SERVICE_CONFIG: %s", exc)
            else:
                if isinstance(parsed, dict):
                    configs = parsed
                else:
                    # A JSON list/number/string cannot seed a dict subclass.
                    logger.error(
                        "Failed to parse DYNAMO_SERVICE_CONFIG: expected a JSON "
                        "object, got %s",
                        type(parsed).__name__,
                    )
        return cls(configs)  # Initialize dict subclass with configs

    def require(self, service_name, key):
        """Return ``self[service_name][key]``.

        Raises:
            ValueError: if the service or the key is missing.
        """
        if service_name not in self or key not in self[service_name]:
            raise ValueError(f"{service_name}.{key} must be specified in configuration")
        return self[service_name][key]

    def _service_config_without_service_args(self, service_name):
        """Return a shallow copy of a service's config with ``ServiceArgs`` removed."""
        service_config = self[service_name].copy()
        service_config.pop(self._SERVICE_ARGS_KEY, None)
        return service_config

    @classmethod
    def get_parsed_config(cls, service_name):
        """Get parsed config for a service with common configs applied, returned as dict.

        ``ServiceArgs`` and the ``common-configs`` subscription list are left
        out of the result. A key from the ``Common`` service is copied in only
        when the service subscribes to it and does not define it itself.
        Returns ``{}`` for an unknown service.
        """
        instance = cls.get_instance()
        if service_name not in instance:
            return {}

        service_config = instance._service_config_without_service_args(service_name)

        # Apply common configs if they exist
        common = instance.get(cls.COMMON_CONFIG_SERVICE)
        common_config_keys = service_config.get(cls.COMMON_CONFIG_KEY)
        if common is not None and common_config_keys is not None:
            for key in common_config_keys:
                if key in common and key not in service_config:
                    service_config[key] = common[key]

        # Remove the common-configs key itself from the final config
        service_config.pop(cls.COMMON_CONFIG_KEY, None)
        return service_config

    def as_args(self, service_name, prefix=""):
        """Extract configs as CLI args for a service, with optional prefix filtering.

        Every component will additionally have the args in the `Common` configs
        applied if it has subscribed to that config key, i.e. the given key is provided in
        the component's `common-configs` setting, and that key has not been overridden by the
        component's config.

        Args:
            service_name: name of the service whose config is converted.
            prefix: when non-empty, only keys starting with it are emitted and
                the prefix is stripped from the generated flag name.

        Returns:
            A flat list of CLI tokens; ``[]`` for an unknown service.
        """
        if service_name not in self:
            return []

        args: list[str] = []

        def add_to_args(args: list[str], key: str, value):
            if prefix and not key.startswith(prefix):
                return

            if key.endswith(self.COMMON_CONFIG_KEY):
                return

            # Strip prefix if needed
            arg_key = key[len(prefix) :] if prefix and key.startswith(prefix) else key

            # Normalize key for comparison (replace hyphens with underscores)
            normalized_key = arg_key.replace("-", "_")

            # Convert to CLI format
            if isinstance(value, bool):
                if value:
                    # Always output true values as flags
                    args.append(f"--{arg_key}")
                elif normalized_key in self._VLLM_TRUE_DEFAULTS:
                    # False for a vLLM arg that defaults to True: use its negative flag
                    args.append(f"--{self._VLLM_TRUE_DEFAULTS[normalized_key]}")
                # For other false values, omit entirely (standard action="store_true" behavior)
            elif isinstance(value, dict):
                args.extend([f"--{arg_key}", json.dumps(value)])
            else:
                args.extend([f"--{arg_key}", str(value)])

        # We never want args to be generated from the ServiceArgs
        service_config = self._service_config_without_service_args(service_name)

        common = self.get(self.COMMON_CONFIG_SERVICE)
        common_config_keys = service_config.get(self.COMMON_CONFIG_KEY)
        if common is not None and common_config_keys is not None:
            for key in common_config_keys:
                if key in common and key not in service_config:
                    add_to_args(args, key, common[key])

        for key, value in service_config.items():
            add_to_args(args, key, value)

        logger.info(f"Running {service_name} with {args=}")

        return args
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import logging import logging
import os import os
import shlex
from enum import Enum from enum import Enum
from typing import Optional from typing import Optional
...@@ -143,6 +144,21 @@ class SubComponentType(str, Enum): ...@@ -143,6 +144,21 @@ class SubComponentType(str, Enum):
DECODE = "decode" DECODE = "decode"
def break_arguments(args: list[str] | None) -> list[str]:
ans: list[str] = []
if args is None:
return ans
if isinstance(args, str):
# Use shlex.split to properly handle quoted arguments and JSON values
ans = shlex.split(args)
else:
for arg in args:
if arg is not None:
# Use shlex.split to properly handle quoted arguments
ans.extend(shlex.split(arg))
return ans
class Service(BaseModel): class Service(BaseModel):
name: str name: str
service: dict service: dict
...@@ -150,6 +166,24 @@ class Service(BaseModel): ...@@ -150,6 +166,24 @@ class Service(BaseModel):
def number_replicas(self) -> int: def number_replicas(self) -> int:
return self.service.get("replicas", 0) return self.service.get("replicas", 0)
def get_model_name(self) -> Optional[str]:
    """Return the model name found in the main container's arguments.

    Reads ``extraPodSpec.mainContainer.args`` from ``self.service`` and
    tokenizes it with ``break_arguments``. ``--served-model-name`` takes
    precedence over ``--model``. Both ``--flag value`` and ``--flag=value``
    spellings are recognized.

    Returns:
        The model name, or ``None`` when neither flag carries a value.
    """
    # `or {}` also covers keys that are present but explicitly null.
    pod_spec = self.service.get("extraPodSpec") or {}
    main_container = pod_spec.get("mainContainer") or {}
    tokens = break_arguments(main_container.get("args"))

    for flag in ("--served-model-name", "--model"):
        inline_prefix = flag + "="
        for position, token in enumerate(tokens):
            if token == flag:
                if position + 1 < len(tokens):
                    return tokens[position + 1]
                # First occurrence is the last token and has no value:
                # fall through to the next flag.
                break
            if token.startswith(inline_prefix) and len(token) > len(inline_prefix):
                return token[len(inline_prefix) :]

    return None
# TODO: still supporting framework component names for backwards compatibility # TODO: still supporting framework component names for backwards compatibility
# Should be deprecated in favor of service subComponentType # Should be deprecated in favor of service subComponentType
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
import logging import logging
import os import os
import shlex
from typing import Optional from typing import Optional
from pydantic import BaseModel from pydantic import BaseModel
...@@ -40,47 +39,6 @@ configure_dynamo_logging() ...@@ -40,47 +39,6 @@ configure_dynamo_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class Service(BaseModel):
    """A named service entry together with its raw spec mapping."""

    # Service name.
    name: str
    # Raw service spec; read below for `replicas` and
    # `extraPodSpec.mainContainer.args`.
    service: dict

    def number_replicas(self) -> int:
        """Return the ``replicas`` value of the spec, or 0 when it is absent."""
        return self.service.get("replicas", 0)

    def get_model_name(self) -> Optional[str]:
        """Return the model name found in the main container's arguments.

        Reads ``extraPodSpec.mainContainer.args`` from ``self.service`` and
        tokenizes it with ``break_arguments``. ``--served-model-name`` takes
        precedence over ``--model``. Both ``--flag value`` and
        ``--flag=value`` spellings are recognized.

        Returns:
            The model name, or ``None`` when neither flag carries a value.
        """
        # `or {}` also covers keys that are present but explicitly null.
        pod_spec = self.service.get("extraPodSpec") or {}
        main_container = pod_spec.get("mainContainer") or {}
        tokens = break_arguments(main_container.get("args"))

        for flag in ("--served-model-name", "--model"):
            inline_prefix = flag + "="
            for position, token in enumerate(tokens):
                if token == flag:
                    if position + 1 < len(tokens):
                        return tokens[position + 1]
                    # First occurrence is the last token and has no value:
                    # fall through to the next flag.
                    break
                if token.startswith(inline_prefix) and len(token) > len(inline_prefix):
                    return token[len(inline_prefix) :]

        return None
def break_arguments(args: list[str] | None) -> list[str]:
ans: list[str] = []
if args is None:
return ans
if isinstance(args, str):
# Use shlex.split to properly handle quoted arguments and JSON values
ans = shlex.split(args)
else:
for arg in args:
if arg is not None:
# Use shlex.split to properly handle quoted arguments
ans.extend(shlex.split(arg))
return ans
class TargetReplica(BaseModel): class TargetReplica(BaseModel):
sub_component_type: SubComponentType sub_component_type: SubComponentType
component_name: Optional[str] = None component_name: Optional[str] = None
...@@ -205,11 +163,11 @@ class KubernetesConnector(PlannerConnector): ...@@ -205,11 +163,11 @@ class KubernetesConnector(PlannerConnector):
# TODO: benchmarks/profiler/utils/config.py already contains DGD config parsing # TODO: benchmarks/profiler/utils/config.py already contains DGD config parsing
# and model name logic, should consolidate # and model name logic, should consolidate
prefill_service = self.get_service_from_sub_component_type_or_name( prefill_service = get_service_from_sub_component_type_or_name(
deployment, deployment,
SubComponentType.PREFILL, SubComponentType.PREFILL,
) )
decode_service = self.get_service_from_sub_component_type_or_name( decode_service = get_service_from_sub_component_type_or_name(
deployment, deployment,
SubComponentType.DECODE, SubComponentType.DECODE,
) )
......
...@@ -67,6 +67,9 @@ class PrometheusAPIClient: ...@@ -67,6 +67,9 @@ class PrometheusAPIClient:
result = self.prom.custom_query(query=query) result = self.prom.custom_query(query=query)
if not result: if not result:
# No data available yet (no requests made) - return 0 silently # No data available yet (no requests made) - return 0 silently
logger.warning(
f"No prometheus metric data available for {full_metric_name}, use 0 instead"
)
return 0 return 0
metrics_containers = parse_frontend_metric_containers(result) metrics_containers = parse_frontend_metric_containers(result)
...@@ -79,6 +82,9 @@ class PrometheusAPIClient: ...@@ -79,6 +82,9 @@ class PrometheusAPIClient:
values.append(container.value[1]) values.append(container.value[1])
if not values: if not values:
logger.warning(
f"No prometheus metric data available for {full_metric_name} with model {model_name} and dynamo namespace {self.dynamo_namespace}, use 0 instead"
)
return 0 return 0
return sum(values) / len(values) return sum(values) / len(values)
......
...@@ -1202,7 +1202,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1202,7 +1202,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}, },
VolumeMounts: []v1alpha1.VolumeMount{ VolumeMounts: []v1alpha1.VolumeMount{
{ {
Name: "planner-pvc", Name: "dynamo-pvc",
MountPoint: "/planner", MountPoint: "/planner",
}, },
}, },
...@@ -1426,10 +1426,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1426,10 +1426,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
Name: "planner-pvc", Name: "dynamo-pvc",
VolumeSource: corev1.VolumeSource{ VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "planner-pvc", ClaimName: "dynamo-pvc",
}, },
}, },
}, },
...@@ -1540,7 +1540,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1540,7 +1540,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}, },
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{ {
Name: "planner-pvc", Name: "dynamo-pvc",
MountPath: "/planner", MountPath: "/planner",
}, },
{ {
...@@ -1725,7 +1725,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1725,7 +1725,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}, },
VolumeMounts: []v1alpha1.VolumeMount{ VolumeMounts: []v1alpha1.VolumeMount{
{ {
Name: "planner-pvc", Name: "dynamo-pvc",
MountPoint: "/planner", MountPoint: "/planner",
}, },
}, },
...@@ -2219,10 +2219,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2219,10 +2219,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways, RestartPolicy: corev1.RestartPolicyAlways,
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
Name: "planner-pvc", Name: "dynamo-pvc",
VolumeSource: corev1.VolumeSource{ VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "planner-pvc", ClaimName: "dynamo-pvc",
}, },
}, },
}, },
...@@ -2321,7 +2321,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2321,7 +2321,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}, },
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{ {
Name: "planner-pvc", Name: "dynamo-pvc",
MountPath: "/planner", MountPath: "/planner",
}, },
{ {
...@@ -2529,7 +2529,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2529,7 +2529,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}, },
VolumeMounts: []v1alpha1.VolumeMount{ VolumeMounts: []v1alpha1.VolumeMount{
{ {
Name: "planner-pvc", Name: "dynamo-pvc",
MountPoint: "/planner", MountPoint: "/planner",
}, },
}, },
...@@ -3008,10 +3008,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3008,10 +3008,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
ServiceAccountName: commonconsts.PlannerServiceAccountName, ServiceAccountName: commonconsts.PlannerServiceAccountName,
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
Name: "planner-pvc", Name: "dynamo-pvc",
VolumeSource: corev1.VolumeSource{ VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "planner-pvc", ClaimName: "dynamo-pvc",
}, },
}, },
}, },
...@@ -3118,7 +3118,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3118,7 +3118,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}, },
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{ {
Name: "planner-pvc", Name: "dynamo-pvc",
MountPath: "/planner", MountPath: "/planner",
}, },
{ {
......
...@@ -7,7 +7,7 @@ metadata: ...@@ -7,7 +7,7 @@ metadata:
namespace: ${NAMESPACE} namespace: ${NAMESPACE}
spec: spec:
accessModes: accessModes:
- ReadWriteOnce - ReadWriteMany
resources: resources:
requests: requests:
storage: 50Gi storage: 50Gi
...@@ -167,6 +167,9 @@ envsubst < benchmarks/profiler/deploy/profile_sla_job.yaml | kubectl apply -f - ...@@ -167,6 +167,9 @@ envsubst < benchmarks/profiler/deploy/profile_sla_job.yaml | kubectl apply -f -
# for MoE models # for MoE models
envsubst < benchmarks/profiler/deploy/profile_sla_moe_job.yaml | kubectl apply -f - envsubst < benchmarks/profiler/deploy/profile_sla_moe_job.yaml | kubectl apply -f -
# using aiconfigurator instead of real sweeping (see below for more details)
envsubst < benchmarks/profiler/deploy/profile_sla_aic_job.yaml | kubectl apply -f -
``` ```
**Step 5: Wait for profiling to complete** **Step 5: Wait for profiling to complete**
......
...@@ -9,7 +9,7 @@ Quick deployment guide for the disaggregated planner with automatic scaling. ...@@ -9,7 +9,7 @@ Quick deployment guide for the disaggregated planner with automatic scaling.
**Components:** **Components:**
- **Frontend**: Serves requests and exposes `/metrics` - **Frontend**: Serves requests and exposes `/metrics`
- **Prometheus**: Scrapes frontend metrics every adjustment interval - **Prometheus**: Scrapes frontend metrics every 5s (by default, can be updated in the podmonitor manifest)
- **Planner**: Queries Prometheus and adjusts worker scaling every adjustment interval - **Planner**: Queries Prometheus and adjusts worker scaling every adjustment interval
- **Workers**: prefill and backend workers handle inference - **Workers**: prefill and backend workers handle inference
...@@ -19,7 +19,7 @@ The adjustment interval can be defined in the planner manifest as an argument. T ...@@ -19,7 +19,7 @@ The adjustment interval can be defined in the planner manifest as an argument. T
flowchart LR flowchart LR
Frontend --"/metrics"--> Prometheus Frontend --"/metrics"--> Prometheus
Planner --"query API"--> Prometheus Planner --"query API"--> Prometheus
Planner --"scaling decisions"--> Workers["prefill<br/>backend"] Planner --"scaling decisions"--> Workers
Frontend -.->|"requests"| Workers Frontend -.->|"requests"| Workers
``` ```
...@@ -27,7 +27,7 @@ flowchart LR ...@@ -27,7 +27,7 @@ flowchart LR
- Kubernetes cluster with GPU nodes - Kubernetes cluster with GPU nodes
- [Pre-Deployment Profiling](/docs/benchmarks/pre_deployment_profiling.md) completed and its results saved to `dynamo-pvc` PVC. - [Pre-Deployment Profiling](/docs/benchmarks/pre_deployment_profiling.md) completed and its results saved to `dynamo-pvc` PVC.
- Prefill and decode worker uses the best parallelization mapping suggested by the pre-deployment profiling script. - Prefill and decode worker uses the best parallelization mapping suggested by the pre-deployment profiling script.
- [kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running. - [kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running. By default, the prometheus server is deployed in the `monitoring` namespace. If it is deployed to a different namespace, set `dynamo-operator.dynamo.metrics.prometheusEndpoint="http://prometheus-kube-prometheus-prometheus.<namespace>.svc.cluster.local:9090"`.
> [!NOTE] > [!NOTE]
> **Important**: The profiling that occurs before Planner deployment requires additional Kubernetes manifests (ServiceAccount, Role, RoleBinding, PVC) that are not included in standard Dynamo deployments. Apply these manifests in the same namespace as `$NAMESPACE`. For a complete setup, start with the [Quick Start guide](/deploy/utils/README.md#quick-start), which provides a fully encapsulated deployment including all required manifests. > **Important**: The profiling that occurs before Planner deployment requires additional Kubernetes manifests (ServiceAccount, Role, RoleBinding, PVC) that are not included in standard Dynamo deployments. Apply these manifests in the same namespace as `$NAMESPACE`. For a complete setup, start with the [Quick Start guide](/deploy/utils/README.md#quick-start), which provides a fully encapsulated deployment including all required manifests.
...@@ -42,8 +42,10 @@ We use vllm as the backend engine in this guide. SLA planner also supports SGLan ...@@ -42,8 +42,10 @@ We use vllm as the backend engine in this guide. SLA planner also supports SGLan
```bash ```bash
# Apply the disaggregated planner deployment # Apply the disaggregated planner deployment
kubectl apply -f components/backends/vllm/deploy/disagg_planner.yaml -n $NAMESPACE # for vllm kubectl apply -f components/backends/vllm/deploy/disagg_planner.yaml -n $NAMESPACE # for vllm
# kubectl apply -f components/backends/sglang/deploy/disagg_planner.yaml -n $NAMESPACE # for sglang
# kubectl apply -f components/backends/trtllm/deploy/disagg_planner.yaml -n $NAMESPACE # for trtllm kubectl apply -f components/backends/sglang/deploy/disagg_planner.yaml -n $NAMESPACE # for sglang
kubectl apply -f components/backends/trtllm/deploy/disagg_planner.yaml -n $NAMESPACE # for trtllm
# Check deployment status # Check deployment status
kubectl get pods -n $NAMESPACE kubectl get pods -n $NAMESPACE
...@@ -60,13 +62,11 @@ vllm-disagg-planner-prefill-* 1/1 Running ...@@ -60,13 +62,11 @@ vllm-disagg-planner-prefill-* 1/1 Running
## 2. Test the System ## 2. Test the System
**Important:** Streaming requests (`"stream": true`) are required for the planner to collect latency metrics and make scaling decisions. Non-streaming requests will produce successful inference outputs but won't provide the necessary telemetry for automatic scaling.
```bash ```bash
# Port forward to frontend # Port forward to frontend
kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000 kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000
# Send a streaming request (required for full metrics) # Send a request
curl -N http://localhost:8000/v1/chat/completions \ curl -N http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
...@@ -98,8 +98,8 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10 ...@@ -98,8 +98,8 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
### Metrics Requirements ### Metrics Requirements
- **Basic metrics** (request count): Available with any request type - **Basic metrics** (request count): Available with any request type
- **Latency metrics** (TTFT/ITL): Only available with `"stream": true` requests - **Latency metrics** (TTFT/ITL): Available for both streaming and non-streaming requests
- **Scaling decisions**: Require sufficient request volume and streaming requests - **Scaling decisions**: Require sufficient request volume
## 4. Troubleshooting ## 4. Troubleshooting
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment