chore: add dsr1 k8s yaml (#3101)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

chore: add dsr1 k8s yaml (#3101)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
00061061 · Hongkuan Zhou · GitHub · c34f945b · 00061061 · 00061061
Unverified Commit 00061061 authored Sep 18, 2025 by Hongkuan Zhou Committed by GitHub Sep 18, 2025
6 changed files
--- a/recipes/README.md
+++ b/recipes/README.md
@@ -6,7 +6,7 @@
 | llama-3-70b   | vllm    | disagg-multi-node   |     ✓      |     ✓     |
 | llama-3-70b   | vllm    | disagg-single-node  |     ✓      |     ✓     |
 | oss-gpt       | trtllm  | aggregated          |     ✓      |     ✓     |
-| DeepSeek-R1   | sglang  | disaggregated       |     🚧     |    🚧     |
+| DeepSeek-R1   | sglang  | disaggregated       |     ✓      |    🚧     |


 ## Prerequisites

--- a/recipes/deepseek-r1/model_cache/model-cache.yaml
+++ b/recipes/deepseek-r1/model_cache/model-cache.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: v1
+kind: PersistentVolumeClaim  # shared claim that the DeepSeek-R1 recipes mount as their model cache
+metadata:
+  name: model-cache  # must match pvc.name in the sglang-wideep deployment recipes
+spec:
+  accessModes:
+    - ReadWriteMany  # the claim is mounted by both the prefill and the decode workers
+  resources:
+    requests:
+      storage: 1000Gi
+  storageClassName: "your-storage-class-name"  # placeholder: replace with a class that supports ReadWriteMany
\ No newline at end of file
--- a/recipes/deepseek-r1/model_cache/model-download.yaml
+++ b/recipes/deepseek-r1/model_cache/model-download.yaml
--- a/recipes/deepseek-r1/sglang-wideep/README.md
+++ b/recipes/deepseek-r1/sglang-wideep/README.md
+# Container
+
+Build the container from the Dockerfile at `container/Dockerfile.sglang-wideep`, or use the build script:
+
+```bash
+./container/build.sh --framework sglang-wideep
+```
+
+Dynamo commits after `1b3eed4b6a0e735d4ecec6681f4c0b89f2112167` (Sep 18, 2025) are required.
+
+# Hardware
+
+The two deployment recipes target 8xH200 and 16xH200. They should also work on other GPU SKUs; change the TEP and DEP sizes (`--tp`, `--dp`, `--ep-size`) accordingly to match the GPU capacity.
+
+If you see NCCL errors when sending requests to the engines, they are usually caused by an out-of-memory (OOM) error. Try reducing `--mem-fraction-static` in both the prefill and decode engines.
+
--- a/recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml
+++ b/recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # disaggregated DeepSeek-R1 on SGLang: one Frontend, one decode, one prefill service
+metadata:
+  name: sgl-dsr1-16gpu
+spec:
+  services:  # every service below uses dynamoNamespace sgl-dsr1-16gpu
+    Frontend:
+      dynamoNamespace: sgl-dsr1-16gpu  # same namespace as the decode and prefill workers
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        mainContainer:  # the image below is a dev tag; the recipe README describes building your own
+          startupProbe:  # HTTP GET /health on port 8000
+            httpGet:
+              path: /health
+              port: 8000
+            periodSeconds: 10
+            timeoutSeconds: 1800  # NOTE(review): per-probe timeout far exceeds periodSeconds; confirm intended
+            failureThreshold: 60  # 60 failures x 10 s period: roughly 600 s startup budget
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
+    decode:
+      dynamoNamespace: sgl-dsr1-16gpu
+      componentType: worker
+      replicas: 1
+      multinode:
+        nodeCount: 2  # 2 nodes x 8 GPUs = 16 = --tp 16 below (assumes the gpu limit is per node; confirm)
+      resources:
+        limits:
+          gpu: "8"
+      pvc:
+        create: false  # presumably reuses the existing claim from model_cache/model-cache.yaml; confirm
+        name: model-cache
+        mountPoint: /root/.cache/huggingface
+      sharedMemory:
+        size: 80Gi
+      extraPodSpec:
+        mainContainer:  # the image below is a dev tag; the recipe README describes building your own
+          startupProbe:  # HTTP GET /health on port 9090
+            httpGet:
+              path: /health
+              port: 9090
+            periodSeconds: 10
+            timeoutSeconds: 1800  # NOTE(review): per-probe timeout far exceeds periodSeconds; confirm intended
+            failureThreshold: 60  # 60 failures x 10 s period: roughly 600 s startup budget
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
+          workingDir: /workspace/components/backends/sglang
+          command:  # /bin/sh -c runs the single command string supplied in args
+            - /bin/sh
+            - -c
+          args:  # the folded scalar (>-) joins the lines below into one command line; exec replaces the shell
+            - >-
+              exec python3 -m dynamo.sglang
+              --model-path deepseek-ai/DeepSeek-R1
+              --served-model-name deepseek-ai/DeepSeek-R1
+              --tp 16
+              --dp 16
+              --enable-dp-attention
+              --ep-size 16
+              --trust-remote-code
+              --skip-tokenizer-init
+              --disaggregation-mode decode
+              --disaggregation-transfer-backend nixl
+              --disaggregation-bootstrap-port 30001
+              --mem-fraction-static 0.8
+    prefill:
+      dynamoNamespace: sgl-dsr1-16gpu
+      componentType: worker
+      replicas: 1
+      multinode:
+        nodeCount: 2  # 2 nodes x 8 GPUs = 16 = --tp 16 below (assumes the gpu limit is per node; confirm)
+      resources:
+        limits:
+          gpu: "8"
+      pvc:
+        create: false  # presumably reuses the existing claim from model_cache/model-cache.yaml; confirm
+        name: model-cache
+        mountPoint: /root/.cache/huggingface
+      sharedMemory:
+        size: 80Gi
+      extraPodSpec:
+        mainContainer:  # the image below is a dev tag; the recipe README describes building your own
+          startupProbe:  # HTTP GET /health on port 9090
+            httpGet:
+              path: /health
+              port: 9090
+            periodSeconds: 10
+            timeoutSeconds: 1800  # NOTE(review): per-probe timeout far exceeds periodSeconds; confirm intended
+            failureThreshold: 60  # 60 failures x 10 s period: roughly 600 s startup budget
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
+          workingDir: /workspace/components/backends/sglang
+          command:  # /bin/sh -c runs the single command string supplied in args
+            - /bin/sh
+            - -c
+          args:  # one folded command line; passes no --dp / --enable-dp-attention, unlike the decode worker
+            - >-
+              exec python3 -m dynamo.sglang
+              --model-path deepseek-ai/DeepSeek-R1
+              --served-model-name deepseek-ai/DeepSeek-R1
+              --tp 16
+              --ep-size 16
+              --trust-remote-code
+              --skip-tokenizer-init
+              --disaggregation-mode prefill
+              --disaggregation-transfer-backend nixl
+              --disaggregation-bootstrap-port 30001
+              --mem-fraction-static 0.8
\ No newline at end of file
--- a/recipes/deepseek-r1/sglang-wideep/tep8p-dep8d-disagg.yaml
+++ b/recipes/deepseek-r1/sglang-wideep/tep8p-dep8d-disagg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # disaggregated DeepSeek-R1 on SGLang: one Frontend, one decode, one prefill service
+metadata:
+  name: sgl-dsr1-8gpu
+spec:
+  services:  # every service below uses dynamoNamespace sgl-dsr1-8gpu
+    Frontend:
+      dynamoNamespace: sgl-dsr1-8gpu  # same namespace as the decode and prefill workers
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        mainContainer:  # the image below is a dev tag; the recipe README describes building your own
+          startupProbe:  # HTTP GET /health on port 8000
+            httpGet:
+              path: /health
+              port: 8000
+            periodSeconds: 10
+            timeoutSeconds: 1800  # NOTE(review): per-probe timeout far exceeds periodSeconds; confirm intended
+            failureThreshold: 60  # 60 failures x 10 s period: roughly 600 s startup budget
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
+    decode:
+      dynamoNamespace: sgl-dsr1-8gpu
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "8"  # matches --tp 8 / --dp 8 / --ep-size 8 in args below
+      pvc:
+        create: false  # presumably reuses the existing claim from model_cache/model-cache.yaml; confirm
+        name: model-cache
+        mountPoint: /root/.cache/huggingface
+      sharedMemory:
+        size: 80Gi
+      extraPodSpec:
+        mainContainer:  # the image below is a dev tag; the recipe README describes building your own
+          startupProbe:  # HTTP GET /health on port 9090
+            httpGet:
+              path: /health
+              port: 9090
+            periodSeconds: 10
+            timeoutSeconds: 1800  # NOTE(review): per-probe timeout far exceeds periodSeconds; confirm intended
+            failureThreshold: 60  # 60 failures x 10 s period: roughly 600 s startup budget
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
+          workingDir: /workspace/components/backends/sglang
+          command:  # /bin/sh -c runs the single command string supplied in args
+            - /bin/sh
+            - -c
+          args:  # NOTE(review): no --mem-fraction-static here, unlike tep16p-dep16d-disagg.yaml (0.8); confirm
+            - >-
+              exec python3 -m dynamo.sglang
+              --model-path deepseek-ai/DeepSeek-R1
+              --served-model-name deepseek-ai/DeepSeek-R1
+              --tp 8
+              --dp 8
+              --enable-dp-attention
+              --ep-size 8
+              --trust-remote-code
+              --skip-tokenizer-init
+              --disaggregation-mode decode
+              --disaggregation-transfer-backend nixl
+              --disaggregation-bootstrap-port 30001
+    prefill:
+      dynamoNamespace: sgl-dsr1-8gpu
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "8"  # matches --tp 8 / --ep-size 8 in args below
+      pvc:
+        create: false  # presumably reuses the existing claim from model_cache/model-cache.yaml; confirm
+        name: model-cache
+        mountPoint: /root/.cache/huggingface
+      sharedMemory:
+        size: 80Gi
+      extraPodSpec:
+        mainContainer:  # the image below is a dev tag; the recipe README describes building your own
+          startupProbe:  # HTTP GET /health on port 9090
+            httpGet:
+              path: /health
+              port: 9090
+            periodSeconds: 10
+            timeoutSeconds: 1800  # NOTE(review): per-probe timeout far exceeds periodSeconds; confirm intended
+            failureThreshold: 60  # 60 failures x 10 s period: roughly 600 s startup budget
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
+          workingDir: /workspace/components/backends/sglang
+          command:  # /bin/sh -c runs the single command string supplied in args
+            - /bin/sh
+            - -c
+          args:  # NOTE(review): no --mem-fraction-static here, unlike tep16p-dep16d-disagg.yaml (0.8); confirm
+            - >-
+              exec python3 -m dynamo.sglang
+              --model-path deepseek-ai/DeepSeek-R1
+              --served-model-name deepseek-ai/DeepSeek-R1
+              --tp 8
+              --ep-size 8
+              --trust-remote-code
+              --skip-tokenizer-init
+              --disaggregation-mode prefill
+              --disaggregation-transfer-backend nixl
+              --disaggregation-bootstrap-port 30001
\ No newline at end of file