chore: add instructions to modify SLA to profile_sla doc; update component name (#2167)

157714aa · Hongkuan Zhou · GitHub · 8248a116 · 157714aa · 157714aa
Unverified Commit 157714aa authored Jul 29, 2025 by Hongkuan Zhou Committed by GitHub Jul 29, 2025
6 changed files
--- a/benchmarks/profiler/deploy/profile_sla_job.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_job.yaml
@@ -14,11 +14,8 @@ spec:
        image: ${DOCKER_IMAGE}
        resources:
          requests:
-            cpu: "1"
+            cpu: "16"
-            memory: "2Gi"
+            memory: "10Gi"
-          limits:
-            cpu: "2"
-            memory: "4Gi"
        env:
          - name: HUGGING_FACE_HUB_TOKEN
            valueFrom:
@@ -37,6 +34,18 @@ spec:
          - /workspace/profiling_results
          - --namespace
          - ${NAMESPACE}
+          - --min-num-gpus-per-engine
+          - "1"
+          - --max-num-gpus-per-engine
+          - "8"
+          - --isl
+          - "3000"
+          - --osl
+          - "150"
+          - --ttft
+          - "200"
+          - --itl
+          - "20"
        volumeMounts:
          - name: output-volume
            mountPath: /workspace/profiling_results

--- a/benchmarks/profiler/utils/config.py
+++ b/benchmarks/profiler/utils/config.py
@@ -89,16 +89,16 @@ class VllmV1ConfigModifier:
        if target == "prefill":
            # convert prefill worker into decode worker
            config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
            ] = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
            ]
            del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
            ]
            args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
            ]["extraPodSpec"]["mainContainer"]["args"]
            args = break_arguments(args)
@@ -112,18 +112,18 @@ class VllmV1ConfigModifier:
            if "--no-enable-prefix-caching" not in args:
                args = append_argument(args, "--no-enable-prefix-caching")
-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
+            config["spec"]["services"][
-                "extraPodSpec"
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
-            ]["mainContainer"]["args"] = join_arguments(args)
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)
        elif target == "decode":
            # delete prefill worker
            del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
            ]
            args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
            ]["extraPodSpec"]["mainContainer"]["args"]
            args = break_arguments(args)
@@ -134,13 +134,13 @@ class VllmV1ConfigModifier:
            if "--no-enable-prefix-caching" in args:
                args.remove("--no-enable-prefix-caching")
-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
+            config["spec"]["services"][
-                "extraPodSpec"
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
-            ]["mainContainer"]["args"] = join_arguments(args)
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)
        # set num workers to 1
        decode_worker_config = config["spec"]["services"][
-            WORKER_COMPONENT_NAMES["vllm"].decode_worker
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
        ]
        decode_worker_config["replicas"] = 1
@@ -150,16 +150,16 @@ class VllmV1ConfigModifier:
    def set_config_tp_size(cls, config: dict, tp_size: int):
        config = deepcopy(config)
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
+        config["spec"]["services"][
-            "resources"
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
-        ]["requests"]["gpu"] = str(tp_size)
+        ]["resources"]["requests"]["gpu"] = str(tp_size)
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
+        config["spec"]["services"][
-            "resources"
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
-        ]["limits"]["gpu"] = str(tp_size)
+        ]["resources"]["limits"]["gpu"] = str(tp_size)
-        args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
+        args = config["spec"]["services"][
-            "extraPodSpec"
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
-        ]["mainContainer"]["args"]
+        ]["extraPodSpec"]["mainContainer"]["args"]
        args = break_arguments(args)
@@ -169,15 +169,15 @@ class VllmV1ConfigModifier:
        except ValueError:
            args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
+        config["spec"]["services"][
-            "extraPodSpec"
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
-        ]["mainContainer"]["args"] = join_arguments(args)
+        ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)
        return config
    @classmethod
    def get_model_name(cls, config: dict) -> str:
-        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker
+        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
        args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
            "args"
        ]

--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -141,7 +141,7 @@ spec:
            - -c
          args:
            - "python3 -m dynamo.planner.prometheus"
-    backend:
+    VllmDecodeWorker:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
@@ -191,7 +191,7 @@ spec:
            - -c
          args:
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
-    prefill:
+    VllmPrefillWorker:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker

--- a/components/planner/src/dynamo/planner/defaults.py
+++ b/components/planner/src/dynamo/planner/defaults.py
@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):
 class VllmComponentName:
-    prefill_worker = "prefill"
+    prefill_worker_k8s_name = "VllmPrefillWorker"
+    prefill_worker_component_name = "prefill"
    prefill_worker_endpoint = "generate"
-    decode_worker = "backend"
+    decode_worker_k8s_name = "VllmDecodeWorker"
+    decode_worker_component_name = "backend"
    decode_worker_endpoint = "generate"

--- a/components/planner/src/dynamo/planner/utils/planner_core.py
+++ b/components/planner/src/dynamo/planner/utils/planner_core.py
@@ -106,7 +106,11 @@ class Planner:
            if self.prefill_client is None:
                self.prefill_client = (
                    await self.runtime.namespace(self.namespace)
-                    .component(WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker)
+                    .component(
+                        WORKER_COMPONENT_NAMES[
+                            self.args.backend
+                        ].prefill_worker_component_name
+                    )
                    .endpoint(
                        WORKER_COMPONENT_NAMES[
                            self.args.backend
@@ -127,7 +131,11 @@ class Planner:
            if self.workers_client is None:
                self.workers_client = (
                    await self.runtime.namespace(self.namespace)
-                    .component(WORKER_COMPONENT_NAMES[self.args.backend].decode_worker)
+                    .component(
+                        WORKER_COMPONENT_NAMES[
+                            self.args.backend
+                        ].decode_worker_component_name
+                    )
                    .endpoint(
                        WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint
                    )
@@ -300,8 +308,12 @@ class Planner:
        if not self.args.no_operation:
            target_replicas = {
-                WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p,
+                WORKER_COMPONENT_NAMES[
-                WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d,
+                    self.args.backend
+                ].prefill_worker_k8s_name: next_num_p,
+                WORKER_COMPONENT_NAMES[
+                    self.args.backend
+                ].decode_worker_k8s_name: next_num_d,
            }
            await self.connector.set_component_replicas(target_replicas, blocking=False)

--- a/docs/architecture/pre_deployment_profiling.md
+++ b/docs/architecture/pre_deployment_profiling.md
@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
 # in the project's root folder
 ./container/build.sh --framework VLLM
 # Tag and push to your container registry
+export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own Dynamo image
+# NOTE: DGD_CONFIG_FILE points to the config file's path inside DOCKER_IMAGE
+# Modify this yaml to profile different models
+export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 ```
 Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed.
-**Step 2: Run profiling (required)**
+**Step 2: Set SLA target**
+Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL (input sequence length), OSL (output sequence length), TTFT (time to first token), and ITL (inter-token latency).
+```yaml
+spec:
+  template:
+    spec:
+      containers:
+        - name: profile-sla
+          args:
+            - --isl
+            - "3000" # average ISL is 3000 tokens
+            - --osl
+            - "150" # average OSL is 150 tokens
+            - --ttft
+            - "200" # target TTFT is 200ms
+            - --itl
+            - "20" # target ITL is 20ms
+```
+**Step 3: Run profiling (required)**
 ```bash
 cd $DYNAMO_HOME/benchmarks/profiler/deploy
 envsubst < profiling_pvc.yaml | kubectl apply -f -
 envsubst < profile_sla_sa.yaml | kubectl apply -f -
 envsubst < profile_sla_rbac.yaml | kubectl apply -f -
 envsubst < profile_sla_binding.yaml | kubectl apply -f -
-export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own image
-# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
-export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 envsubst < profile_sla_job.yaml | kubectl apply -f -
 ```
-**Step 3: Wait for profiling to complete**
+**Step 4: Wait for profiling to complete**
 ```bash
 kubectl get jobs -n $NAMESPACE
 kubectl logs job/profile-sla -n $NAMESPACE
@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r
 1. **Create a temporary pod to access the PVC:**
   ```bash
-   kubectl run temp-access --image=alpine:latest --rm -it --restart=Never \
+   kubectl run temp-access --image=alpine:latest --restart=Never \
-     --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["sh"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
+     --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
     -n $NAMESPACE
   ```
 2. **Inside the temporary pod, navigate to the results directory:**
   ```bash
+   kubectl exec -it temp-access -n $NAMESPACE -- sh
   cd /workspace/profiling_results
   ls -la
   ```