Unverified Commit dea0b201 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

chore: add pre-deployment profiling results on H200 cluster to unblock planner testing (#2495)


Co-authored-by: default avatarHannah Zhang <hannahz@nvidia.com>
parent 46595bee
...@@ -11,8 +11,6 @@ spec: ...@@ -11,8 +11,6 @@ spec:
envs: envs:
- name: DYNAMO_SERVICE_CONFIG - name: DYNAMO_SERVICE_CONFIG
value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}' value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
- name: DYNAMO_PORT
value: "8000"
- name: DYNAMO_NAMESPACE - name: DYNAMO_NAMESPACE
value: "vllm-disagg-planner" value: "vllm-disagg-planner"
services: services:
......
...@@ -70,7 +70,7 @@ def _get_default_prometheus_endpoint(port: str, namespace: str): ...@@ -70,7 +70,7 @@ def _get_default_prometheus_endpoint(port: str, namespace: str):
class SLAPlannerDefaults(BasePlannerDefaults): class SLAPlannerDefaults(BasePlannerDefaults):
port = os.environ.get("DYNAMO_PORT", "8000") port = os.environ.get("PROMETHEUS_PORT", "9090")
namespace = os.environ.get("DYNAMO_NAMESPACE", "vllm-disagg-planner") namespace = os.environ.get("DYNAMO_NAMESPACE", "vllm-disagg-planner")
prometheus_endpoint = _get_default_prometheus_endpoint(port, namespace) prometheus_endpoint = _get_default_prometheus_endpoint(port, namespace)
profile_results_dir = "profiling_results" profile_results_dir = "profiling_results"
......
...@@ -115,4 +115,4 @@ kubectl apply -f disagg_planner.yaml -n {$NAMESPACE} ...@@ -115,4 +115,4 @@ kubectl apply -f disagg_planner.yaml -n {$NAMESPACE}
``` ```
> [!NOTE] > [!NOTE]
> The SLA planner requires a frontend that reports metrics at `/metrics` HTTP endpoint with number of requests, ISL, OSL, TTFT, ITL in the correct format. The dynamo frontend provides these metrics automatically. > The SLA planner requires a frontend that reports metrics at `/metrics` HTTP endpoint with number of requests, ISL, OSL, TTFT, ITL in the correct format. The dynamo frontend provides these metrics automatically.
\ No newline at end of file
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->
# SLA Planner Load Test
This directory contains comprehensive testing tools for validating the SLA planner's scaling behavior.
The SLA planner monitors metrics every 60 seconds (default adjustment interval) and scales
prefill/decode workers based on TTFT, ITL, and request patterns.
## Pre-Requisite: Pre-Deployment Profiling Data
You have two options to obtain the pre-deployment profiling data:
### Option A: Use Test Configuration (Quickstart)
Use the pre-configured test deployment with sample profiling data. We provide the results and the deployment configuration for the following model and hardware configurations:
- `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 with max context length 16384, TP1 Prefill, and TP1 Decode. At ISL/OSL 3000/150, it achieves 40k tokens/s/gpu prefill with 80ms TTFT and 10k tokens/s/gpu decode with 10ms ITL. See `profiling_results/H200_TP1P_TP1D/`.
### Option B: Use Your Own Profiling Results
1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/architecture/pre_deployment_profiling.md) for detailed instructions.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# DynamoGraphDeployment custom resource (API group nvidia.com, version v1alpha1)
# named "vllm-disagg".
# NOTE(review): indentation is missing in this copy, so the nesting of the keys
# below cannot be read from the text — restore it from the original manifest
# before applying.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-disagg
spec:
# Services in this deployment; this file lists Frontend, VllmDecodeWorker and
# VllmPrefillWorker.
services:
# Frontend service: one replica that starts the HTTP frontend with
# `python3 -m dynamo.frontend --http-port 8000` (see args at the end of this block).
# NOTE(review): indentation was lost in this copy; key nesting must be restored
# from the original manifest.
Frontend:
dynamoNamespace: vllm-disagg
componentType: main
replicas: 1
# Liveness: HTTP GET /health on port 8000 (the port passed to --http-port),
# first check after 20s, then every 5s with a 5s timeout; 3 consecutive
# failures mark the container as failed.
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
# Readiness: exec probe instead of httpGet so the response body can be checked.
# `jq -e` makes the exit status reflect whether `.status == "healthy"` is true,
# so the probe succeeds only when the health endpoint reports "healthy".
# Requires both curl and jq to be present in the container image.
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
# CPU values are quoted strings; requests are 16 CPU / 10Gi, limits 128 CPU / 100Gi.
resources:
requests:
cpu: "16"
memory: "10Gi"
limits:
cpu: "128"
memory: "100Gi"
extraPodSpec:
mainContainer:
# NOTE(review): the tag looks like a personal dev build in an internal
# registry path — confirm it is pullable in the target cluster.
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0813-03
workingDir: /workspace/components/backends/vllm
# The command line is passed as a single string to `/bin/sh -c`.
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.frontend --http-port 8000"
# Decode worker: one replica running `python3 -m dynamo.vllm` for the model
# nvidia/Llama-3.1-8B-Instruct-FP8, with 1 GPU requested and 1 GPU as the limit.
# NOTE(review): indentation was lost in this copy; key nesting must be restored
# from the original manifest.
VllmDecodeWorker:
dynamoNamespace: vllm-disagg
# presumably supplies the Hugging Face token used to download the model —
# confirm the secret exists in the target namespace.
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
# Liveness: HTTP GET /live on port 9090 every 5s; failureThreshold is 1, so a
# single failed check counts as a liveness failure.
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
# Readiness: HTTP GET /health on port 9090 every 10s, up to 60 failures.
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "16"
memory: "20Gi"
gpu: "1"
limits:
cpu: "128"
memory: "100Gi"
gpu: "1"
# DYN_SYSTEM_PORT is 9090, the same port all three probes in this block target.
# presumably these variables turn on the status server that serves /live and
# /health and tie its health to the "generate" endpoint — TODO confirm.
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
# Startup: HTTP GET /health on port 9090 every 10s for up to 60 failures,
# i.e. roughly 600s allowed for startup. Kubernetes holds off liveness and
# readiness checks until the startup probe succeeds.
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
# NOTE(review): the tag looks like a personal dev build in an internal
# registry path — confirm it is pullable in the target cluster.
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0813-03
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
# `2>&1 | tee /tmp/vllm.log` merges stderr into stdout and copies the stream to
# /tmp/vllm.log while still writing it to the container's stdout.
# NOTE(review): with a plain `sh -c` pipeline the shell's exit status is tee's,
# not python's — confirm that masking the worker's exit code is acceptable.
args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 2>&1 | tee /tmp/vllm.log"
# Prefill worker: one replica running `python3 -m dynamo.vllm` with
# `--is-prefill-worker` for the model nvidia/Llama-3.1-8B-Instruct-FP8, with
# 1 GPU requested and 1 GPU as the limit. Apart from that flag, the settings
# match the VllmDecodeWorker entry in this file.
# NOTE(review): indentation was lost in this copy; key nesting must be restored
# from the original manifest.
VllmPrefillWorker:
dynamoNamespace: vllm-disagg
# presumably supplies the Hugging Face token used to download the model —
# confirm the secret exists in the target namespace.
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
# Liveness: HTTP GET /live on port 9090 every 5s; failureThreshold is 1, so a
# single failed check counts as a liveness failure.
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
# Readiness: HTTP GET /health on port 9090 every 10s, up to 60 failures.
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "16"
memory: "20Gi"
gpu: "1"
limits:
cpu: "128"
memory: "100Gi"
gpu: "1"
# DYN_SYSTEM_PORT is 9090, the same port all three probes in this block target.
# presumably these variables turn on the status server that serves /live and
# /health and tie its health to the "generate" endpoint — TODO confirm.
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
# Startup: HTTP GET /health on port 9090 every 10s for up to 60 failures,
# i.e. roughly 600s allowed for startup. Kubernetes holds off liveness and
# readiness checks until the startup probe succeeds.
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
# NOTE(review): the tag looks like a personal dev build in an internal
# registry path — confirm it is pullable in the target cluster.
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0813-03
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
# `2>&1 | tee /tmp/vllm.log` merges stderr into stdout and copies the stream to
# /tmp/vllm.log while still writing it to the container's stdout.
# NOTE(review): with a plain `sh -c` pipeline the shell's exit status is tee's,
# not python's — confirm that masking the worker's exit code is acceptable.
args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment