Unverified Commit 00061061 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

chore: add dsr1 k8s yaml (#3101)


Signed-off-by: default avatarhongkuanz <hongkuanz@nvidia.com>
parent c34f945b
......@@ -6,7 +6,7 @@
| llama-3-70b | vllm | disagg-multi-node | ✓ | ✓ |
| llama-3-70b | vllm | disagg-single-node | ✓ | ✓ |
| oss-gpt | trtllm | aggregated | ✓ | ✓ |
| DeepSeek-R1 | sglang | disaggregated | 🚧 | 🚧 |
| DeepSeek-R1 | sglang | disaggregated | | 🚧 |
## Prerequisites
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Shared volume claim for model weights.
# The claim is named `model-cache`; the deployment manifests in this recipe
# reference a PVC by that name (`pvc.name: model-cache`, `create: false`),
# so it must exist before they are applied.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-cache
spec:
  accessModes:
    # ReadWriteMany lets pods on several nodes mount the claim at once.
    - ReadWriteMany
  resources:
    requests:
      storage: 1000Gi
  # Placeholder: replace with a storage class available in your cluster.
  # NOTE(review): the class must support ReadWriteMany - confirm.
  storageClassName: "your-storage-class-name"
\ No newline at end of file
# Container
Use the Dockerfile in `container/Dockerfile.sglang-wideep` to build the container, or run the build script:
```bash
./container/build.sh --framework sglang-wideep
```
Dynamo commits after `1b3eed4b6a0e735d4ecec6681f4c0b89f2112167` (Sep 18, 2025) are required.
# Hardware
The two deployment recipes target 8xH200 and 16xH200. They should also work on other GPU SKUs; change the TDP and DEP sizes (the `--tp`, `--dp`, and `--ep-size` flags) accordingly to match the GPU capacity.
If you see NCCL errors when sending requests to the engines, the usual cause is an out-of-memory (OOM) error. Try reducing `--mem-fraction-static` in both the prefill and decode engines.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Disaggregated DeepSeek-R1 serving with sglang: one frontend plus separate
# decode and prefill workers, each worker spanning 2 nodes.
#
# NOTE(review): every startupProbe below sets timeoutSeconds (1800) far above
# periodSeconds (10). Under standard Kubernetes probe semantics the startup
# budget would be roughly periodSeconds * failureThreshold (~600s), not 1800s.
# Confirm the intended budget for loading the model.
# NOTE(review): the image tag looks like a developer build; confirm it is
# pullable from the target cluster or swap in your own build.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sgl-dsr1-16gpu
spec:
  services:
    # HTTP entry point; health is probed on port 8000.
    Frontend:
      dynamoNamespace: sgl-dsr1-16gpu
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          startupProbe:
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 10
            timeoutSeconds: 1800
            failureThreshold: 60
          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
    # Decode engine: --tp 16 / --dp 16 / --ep-size 16 with DP attention.
    decode:
      dynamoNamespace: sgl-dsr1-16gpu
      componentType: worker
      replicas: 1
      # 2 nodes with a GPU limit of "8" - presumably 8 per node, giving the
      # 16 GPUs implied by --tp 16; confirm against the operator docs.
      multinode:
        nodeCount: 2
      resources:
        limits:
          gpu: "8"
      # Mounts the existing `model-cache` claim (create: false) at the
      # Hugging Face cache path.
      pvc:
        create: false
        name: model-cache
        mountPoint: /root/.cache/huggingface
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            timeoutSeconds: 1800
            failureThreshold: 60
          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
          workingDir: /workspace/components/backends/sglang
          command:
            - /bin/sh
            - -c
          # The folded scalar (>-) joins the lines below into a single shell
          # command; `exec` replaces the shell with the python process.
          args:
            - >-
              exec python3 -m dynamo.sglang
              --model-path deepseek-ai/DeepSeek-R1
              --served-model-name deepseek-ai/DeepSeek-R1
              --tp 16
              --dp 16
              --enable-dp-attention
              --ep-size 16
              --trust-remote-code
              --skip-tokenizer-init
              --disaggregation-mode decode
              --disaggregation-transfer-backend nixl
              --disaggregation-bootstrap-port 30001
              --mem-fraction-static 0.8
    # Prefill engine: --tp 16 / --ep-size 16, no --dp / DP attention flags.
    prefill:
      dynamoNamespace: sgl-dsr1-16gpu
      componentType: worker
      replicas: 1
      multinode:
        nodeCount: 2
      resources:
        limits:
          gpu: "8"
      pvc:
        create: false
        name: model-cache
        mountPoint: /root/.cache/huggingface
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            timeoutSeconds: 1800
            failureThreshold: 60
          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
          workingDir: /workspace/components/backends/sglang
          command:
            - /bin/sh
            - -c
          # Same single-command folded scalar as the decode worker, with
          # --disaggregation-mode prefill.
          args:
            - >-
              exec python3 -m dynamo.sglang
              --model-path deepseek-ai/DeepSeek-R1
              --served-model-name deepseek-ai/DeepSeek-R1
              --tp 16
              --ep-size 16
              --trust-remote-code
              --skip-tokenizer-init
              --disaggregation-mode prefill
              --disaggregation-transfer-backend nixl
              --disaggregation-bootstrap-port 30001
              --mem-fraction-static 0.8
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Disaggregated DeepSeek-R1 serving with sglang: one frontend plus separate
# decode and prefill workers, each limited to 8 GPUs (no multinode section).
#
# NOTE(review): every startupProbe below sets timeoutSeconds (1800) far above
# periodSeconds (10). Under standard Kubernetes probe semantics the startup
# budget would be roughly periodSeconds * failureThreshold (~600s), not 1800s.
# Confirm the intended budget for loading the model.
# NOTE(review): neither engine passes --mem-fraction-static, so presumably the
# engine default applies; add the flag explicitly if memory needs tuning.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sgl-dsr1-8gpu
spec:
  services:
    # HTTP entry point; health is probed on port 8000.
    Frontend:
      dynamoNamespace: sgl-dsr1-8gpu
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          startupProbe:
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 10
            timeoutSeconds: 1800
            failureThreshold: 60
          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
    # Decode engine: --tp 8 / --dp 8 / --ep-size 8 with DP attention.
    decode:
      dynamoNamespace: sgl-dsr1-8gpu
      componentType: worker
      replicas: 1
      resources:
        limits:
          gpu: "8"
      # Mounts the existing `model-cache` claim (create: false) at the
      # Hugging Face cache path.
      pvc:
        create: false
        name: model-cache
        mountPoint: /root/.cache/huggingface
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            timeoutSeconds: 1800
            failureThreshold: 60
          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
          workingDir: /workspace/components/backends/sglang
          command:
            - /bin/sh
            - -c
          # The folded scalar (>-) joins the lines below into a single shell
          # command; `exec` replaces the shell with the python process.
          args:
            - >-
              exec python3 -m dynamo.sglang
              --model-path deepseek-ai/DeepSeek-R1
              --served-model-name deepseek-ai/DeepSeek-R1
              --tp 8
              --dp 8
              --enable-dp-attention
              --ep-size 8
              --trust-remote-code
              --skip-tokenizer-init
              --disaggregation-mode decode
              --disaggregation-transfer-backend nixl
              --disaggregation-bootstrap-port 30001
    # Prefill engine: --tp 8 / --ep-size 8, no --dp / DP attention flags.
    prefill:
      dynamoNamespace: sgl-dsr1-8gpu
      componentType: worker
      replicas: 1
      resources:
        limits:
          gpu: "8"
      pvc:
        create: false
        name: model-cache
        mountPoint: /root/.cache/huggingface
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            timeoutSeconds: 1800
            failureThreshold: 60
          image: nvcr.io/nvidian/dynamo-dev/dynamo-sglang-wideep-runtime:hzhou-0917-01
          workingDir: /workspace/components/backends/sglang
          command:
            - /bin/sh
            - -c
          # Same single-command folded scalar as the decode worker, with
          # --disaggregation-mode prefill.
          args:
            - >-
              exec python3 -m dynamo.sglang
              --model-path deepseek-ai/DeepSeek-R1
              --served-model-name deepseek-ai/DeepSeek-R1
              --tp 8
              --ep-size 8
              --trust-remote-code
              --skip-tokenizer-init
              --disaggregation-mode prefill
              --disaggregation-transfer-backend nixl
              --disaggregation-bootstrap-port 30001
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment