chore: sglang k8s health/live, update doc (#2272)

26dc6281 · Hongkuan Zhou · GitHub · 6fed066b · 26dc6281 · 26dc6281
Unverified Commit 26dc6281 authored Aug 04, 2025 by Hongkuan Zhou Committed by GitHub Aug 04, 2025
4 changed files
--- a/components/backends/sglang/README.md
+++ b/components/backends/sglang/README.md
@@ -88,14 +88,14 @@ docker pull nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.3.2
 ### Aggregated Serving

 ```bash
-cd $DYNAMO_ROOT/components/backends/sglang
+cd $DYNAMO_HOME/components/backends/sglang
 ./launch/agg.sh
 ```

 ### Aggregated Serving with KV Routing

 ```bash
-cd $DYNAMO_ROOT/components/backends/sglang
+cd $DYNAMO_HOME/components/backends/sglang
 ./launch/agg_router.sh
 ```

@@ -119,7 +119,7 @@ Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead
 > Disaggregated serving in SGLang currently requires each worker to have the same tensor parallel size [unless you are using an MLA based model](https://github.com/sgl-project/sglang/pull/5922)

 ```bash
-cd $DYNAMO_ROOT/components/backends/sglang
+cd $DYNAMO_HOME/components/backends/sglang
 ./launch/disagg.sh
 ```

@@ -129,12 +129,32 @@ You can use this configuration to test out disaggregated serving with dp attenti

 ```bash
 # note this will require 4 GPUs
-cd $DYNAMO_ROOT/components/backends/sglang
+cd $DYNAMO_HOME/components/backends/sglang
 ./launch/disagg_dp_attn.sh
 ```

 When using MoE models, you can also use our implementation of the native SGLang endpoints to record expert distribution data. The `disagg_dp_attn.sh` script automatically sets up the SGLang HTTP server, sets the environment variable that controls the expert distribution recording directory, and sets the expert distribution recording mode to `stat`. You can learn more about expert parallelism load balancing [here](docs/expert-distribution-eplb.md).

+### Testing the Deployment
+
+Send a test request to verify your deployment:
+
+```bash
+curl localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "messages": [
+    {
+        "role": "user",
+        "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"
+    }
+    ],
+    "stream": false,
+    "max_tokens": 30
+  }'
+```
+
 ## Request Migration

 You can enable [request migration](../../../docs/architecture/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:

--- a/components/backends/sglang/deploy/agg.yaml
+++ b/components/backends/sglang/deploy/agg.yaml
@@ -21,7 +21,7 @@ spec:
          command:
            - /bin/sh
            - -c
-            - "exit 0"
+            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
@@ -31,11 +31,11 @@ spec:
      replicas: 1
      resources:
        requests:
-          cpu: "5"
+          cpu: "10"
          memory: "10Gi"
        limits:
-          cpu: "5"
-          memory: "10Gi"
+          cpu: "32"
+          memory: "40Gi"
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
@@ -46,24 +46,20 @@ spec:
    SGLangDecodeWorker:
      envFromSecret: hf-token-secret
      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
+        httpGet:
+          path: /live
+          port: 9090
+        periodSeconds: 5
        timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 1
      readinessProbe:
        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 60
-        periodSeconds: 60
+        httpGet:
+          path: /health
+          port: 9090
+        periodSeconds: 10
        timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 60
      dynamoNamespace: sglang-agg
      componentType: worker
      replicas: 1
@@ -73,11 +69,24 @@ spec:
          memory: "20Gi"
          gpu: "1"
        limits:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "32"
+          memory: "80Gi"
          gpu: "1"
+      envs:
+        - name: DYN_SYSTEM_ENABLED
+          value: "true"
+        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
+          value: "[\"generate\"]"
+        - name: DYN_SYSTEM_PORT
+          value: "9090"
      extraPodSpec:
        mainContainer:
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          args:

--- a/components/backends/sglang/deploy/agg_router.yaml
+++ b/components/backends/sglang/deploy/agg_router.yaml
@@ -21,7 +21,7 @@ spec:
          command:
            - /bin/sh
            - -c
-            - "exit 0"
+            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
@@ -31,11 +31,11 @@ spec:
      replicas: 1
      resources:
        requests:
-          cpu: "5"
+          cpu: "10"
          memory: "10Gi"
        limits:
-          cpu: "5"
-          memory: "10Gi"
+          cpu: "32"
+          memory: "40Gi"
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
@@ -46,24 +46,20 @@ spec:
    SGLangDecodeWorker:
      envFromSecret: hf-token-secret
      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
+        httpGet:
+          path: /live
+          port: 9090
+        periodSeconds: 5
        timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 1
      readinessProbe:
        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 60
-        periodSeconds: 60
+        httpGet:
+          path: /health
+          port: 9090
+        periodSeconds: 10
        timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 60
      dynamoNamespace: sglang-agg-router
      componentType: worker
      replicas: 1
@@ -73,11 +69,24 @@ spec:
          memory: "20Gi"
          gpu: "1"
        limits:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "32"
+          memory: "80Gi"
          gpu: "1"
+      envs:
+        - name: DYN_SYSTEM_ENABLED
+          value: "true"
+        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
+          value: "[\"generate\"]"
+        - name: DYN_SYSTEM_PORT
+          value: "9090"
      extraPodSpec:
        mainContainer:
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          args:

--- a/components/backends/sglang/deploy/disagg.yaml
+++ b/components/backends/sglang/deploy/disagg.yaml
@@ -21,7 +21,7 @@ spec:
          command:
            - /bin/sh
            - -c
-            - "exit 0"
+            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
@@ -31,14 +31,14 @@ spec:
      replicas: 1
      resources:
        requests:
-          cpu: "5"
+          cpu: "10"
          memory: "10Gi"
        limits:
-          cpu: "5"
-          memory: "10Gi"
+          cpu: "32"
+          memory: "40Gi"
      extraPodSpec:
        mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0804
          workingDir: /workspace/components/backends/sglang
          command: ["sh", "-c"]
          args:
@@ -46,24 +46,20 @@ spec:
    SGLangDecodeWorker:
      envFromSecret: hf-token-secret
      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
+        httpGet:
+          path: /live
+          port: 9090
+        periodSeconds: 5
        timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 1
      readinessProbe:
        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 60
-        periodSeconds: 60
+        httpGet:
+          path: /health
+          port: 9090
+        periodSeconds: 10
        timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 60
      dynamoNamespace: sglang-disagg
      componentType: worker
      replicas: 1
@@ -73,12 +69,25 @@ spec:
          memory: "20Gi"
          gpu: "1"
        limits:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "32"
+          memory: "80Gi"
          gpu: "1"
+      envs:
+        - name: DYN_SYSTEM_ENABLED
+          value: "true"
+        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
+          value: "[\"generate\"]"
+        - name: DYN_SYSTEM_PORT
+          value: "9090"
      extraPodSpec:
        mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
+          image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0804
          workingDir: /workspace/components/backends/sglang
          args:
            - "python3"
@@ -101,24 +110,20 @@ spec:
    SGLangPrefillWorker:
      envFromSecret: hf-token-secret
      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
+        httpGet:
+          path: /live
+          port: 9090
+        periodSeconds: 5
        timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 1
      readinessProbe:
        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 60
-        periodSeconds: 60
+        httpGet:
+          path: /health
+          port: 9090
+        periodSeconds: 10
        timeoutSeconds: 30
-        failureThreshold: 10
+        failureThreshold: 60
      dynamoNamespace: sglang-disagg
      componentType: worker
      replicas: 1
@@ -128,12 +133,25 @@ spec:
          memory: "20Gi"
          gpu: "1"
        limits:
-          cpu: "10"
-          memory: "20Gi"
+          cpu: "32"
+          memory: "80Gi"
          gpu: "1"
+      envs:
+        - name: DYN_SYSTEM_ENABLED
+          value: "true"
+        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
+          value: "[\"generate\"]"
+        - name: DYN_SYSTEM_PORT
+          value: "9090"
      extraPodSpec:
        mainContainer:
-          image: my-registry/sglang-runtime:my-tag
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
+          image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0804
          workingDir: /workspace/components/backends/sglang
          args:
            - "python3"