Unverified Commit cfc6178a authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat: add sglang disagg deployment examples (#2137)

parent f8096590
...@@ -42,7 +42,7 @@ spec: ...@@ -42,7 +42,7 @@ spec:
workingDir: /workspace/components/backends/sglang workingDir: /workspace/components/backends/sglang
command: ["sh", "-c"] command: ["sh", "-c"]
args: args:
- "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && python3 -m dynamo.frontend" - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg && python3 -m dynamo.frontend --http-port=8000"
SGLangDecodeWorker: SGLangDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe: livenessProbe:
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
---
# Aggregated SGLang deployment: one frontend started with `--router-mode kv`
# and one SGLang worker, both in the `sglang-agg-router` Dynamo namespace.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-agg-router
spec:
  services:
    # HTTP entry point; serves on port 8000 (see `--http-port` in args below).
    Frontend:
      # Liveness is checked over HTTP against /health on the frontend port.
      livenessProbe:
        httpGet:
          path: /health
          port: 8000
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # NOTE(review): this exec probe runs `exit 0`, so it always succeeds and
      # does not reflect whether the frontend can actually serve traffic.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      dynamoNamespace: sglang-agg-router
      componentType: main
      replicas: 1
      # Requests equal limits for both CPU and memory.
      resources:
        requests:
          cpu: "5"
          memory: "10Gi"
        limits:
          cpu: "5"
          memory: "10Gi"
      extraPodSpec:
        mainContainer:
          # Placeholder image reference; replace with a real registry/tag.
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          command: ["sh", "-c"]
          # Runs clear_namespace for this namespace first; the frontend only
          # starts if that step succeeds (`&&`).
          args:
            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg-router && python3 -m dynamo.frontend --http-port=8000 --router-mode kv"
    # SGLang worker process (module `dynamo.sglang.worker`), one GPU, tp=1.
    SGLangDecodeWorker:
      envFromSecret: hf-token-secret
      # NOTE(review): both worker probes run `exit 0` and therefore always
      # pass; the liveness probe also sets no initialDelaySeconds.
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      dynamoNamespace: sglang-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          # No `command` is set here; only `args` are supplied to the container.
          args:
            - "python3"
            - "-m"
            - "dynamo.sglang.worker"
            - "--model-path"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--served-model-name"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--page-size"
            - "16"
            - "--tp"
            - "1"
            - "--trust-remote-code"
            - "--skip-tokenizer-init"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
---
# Disaggregated SGLang deployment: one frontend plus separate decode and
# prefill workers, all in the `sglang-disagg` Dynamo namespace. Both workers
# pass `--disaggregation-transfer-backend nixl`.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-disagg
spec:
  services:
    # HTTP entry point; serves on port 8000 (see `--http-port` in args below).
    Frontend:
      # Liveness is checked over HTTP against /health on the frontend port.
      livenessProbe:
        httpGet:
          path: /health
          port: 8000
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # NOTE(review): this exec probe runs `exit 0`, so it always succeeds and
      # does not reflect whether the frontend can actually serve traffic.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      dynamoNamespace: sglang-disagg
      componentType: main
      replicas: 1
      # Requests equal limits for both CPU and memory.
      resources:
        requests:
          cpu: "5"
          memory: "10Gi"
        limits:
          cpu: "5"
          memory: "10Gi"
      extraPodSpec:
        mainContainer:
          # Placeholder image reference; replace with a real registry/tag.
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          command: ["sh", "-c"]
          # Runs clear_namespace for this namespace first; the frontend only
          # starts if that step succeeds (`&&`).
          args:
            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg && python3 -m dynamo.frontend --http-port=8000"
    # Decode side of the disaggregated pair (`--disaggregation-mode decode`).
    SGLangDecodeWorker:
      envFromSecret: hf-token-secret
      # NOTE(review): both worker probes run `exit 0` and therefore always
      # pass; the liveness probe also sets no initialDelaySeconds.
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      dynamoNamespace: sglang-disagg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          # No `command` is set here; only `args` are supplied to the container.
          # NOTE(review): the disaggregated launch script starts the decode
          # side with `python3 -m dynamo.sglang.decode_worker`; confirm that
          # `dynamo.sglang.worker` is the intended module for this service.
          args:
            - "python3"
            - "-m"
            - "dynamo.sglang.worker"
            - "--model-path"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--served-model-name"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--page-size"
            - "16"
            - "--tp"
            - "1"
            - "--trust-remote-code"
            - "--skip-tokenizer-init"
            - "--disaggregation-mode"
            - "decode"
            - "--disaggregation-transfer-backend"
            - "nixl"
    # Prefill side of the disaggregated pair (`--disaggregation-mode prefill`).
    SGLangPrefillWorker:
      envFromSecret: hf-token-secret
      # NOTE(review): both worker probes run `exit 0` and therefore always
      # pass; the liveness probe also sets no initialDelaySeconds.
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      dynamoNamespace: sglang-disagg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          # No `command` is set here; only `args` are supplied to the container.
          args:
            - "python3"
            - "-m"
            - "dynamo.sglang.worker"
            - "--model-path"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--served-model-name"
            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
            - "--page-size"
            - "16"
            - "--tp"
            - "1"
            - "--trust-remote-code"
            - "--skip-tokenizer-init"
            - "--disaggregation-mode"
            - "prefill"
            - "--disaggregation-transfer-backend"
            - "nixl"
\ No newline at end of file
...@@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM ...@@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM
python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
# run ingress # run ingress
dynamo run in=http out=dyn --http-port=8000 & python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker
......
...@@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM ...@@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM
python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
# run ingress # run ingress
dynamo run in=http out=dyn --http-port=8000 & python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill worker # run prefill worker
......
...@@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM ...@@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM
python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
# run ingress # run ingress
dynamo run in=http out=dyn --http-port=8000 & python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill worker # run prefill worker
...@@ -33,7 +33,7 @@ python3 -m dynamo.sglang.worker \ ...@@ -33,7 +33,7 @@ python3 -m dynamo.sglang.worker \
PREFILL_PID=$! PREFILL_PID=$!
# run decode worker # run decode worker
CUDA_VISIBLE_DEVICES=2,3 python3 dynamo.sglang.decode_worker \ CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang.decode_worker \
--model-path silence09/DeepSeek-R1-Small-2layers \ --model-path silence09/DeepSeek-R1-Small-2layers \
--served-model-name silence09/DeepSeek-R1-Small-2layers \ --served-model-name silence09/DeepSeek-R1-Small-2layers \
--tp 2 \ --tp 2 \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment