Unverified Commit 26dc6281 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

chore: sglang k8s health/live, update doc (#2272)

parent 6fed066b
......@@ -88,14 +88,14 @@ docker pull nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.3.2
### Aggregated Serving
```bash
cd $DYNAMO_ROOT/components/backends/sglang
cd $DYNAMO_HOME/components/backends/sglang
./launch/agg.sh
```
### Aggregated Serving with KV Routing
```bash
cd $DYNAMO_ROOT/components/backends/sglang
cd $DYNAMO_HOME/components/backends/sglang
./launch/agg_router.sh
```
......@@ -119,7 +119,7 @@ Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead
> Disaggregated serving in SGLang currently requires each worker to have the same tensor parallel size [unless you are using an MLA-based model](https://github.com/sgl-project/sglang/pull/5922).
```bash
cd $DYNAMO_ROOT/components/backends/sglang
cd $DYNAMO_HOME/components/backends/sglang
./launch/disagg.sh
```
......@@ -129,12 +129,32 @@ You can use this configuration to test out disaggregated serving with dp attenti
```bash
# note this will require 4 GPUs
cd $DYNAMO_ROOT/components/backends/sglang
cd $DYNAMO_HOME/components/backends/sglang
./launch/disagg_dp_attn.sh
```
When using MoE models, you can also use our implementation of the native SGLang endpoints to record expert distribution data. The `disagg_dp_attn.sh` script automatically sets up the SGLang HTTP server, sets the environment variable that controls the expert distribution recording directory, and sets the expert distribution recording mode to `stat`. You can learn more about expert parallelism load balancing [here](docs/expert-distribution-eplb.md).
### Testing the Deployment
Send a test request to verify your deployment:
```bash
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
"role": "user",
"content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"
}
],
"stream": false,
"max_tokens": 30
}'
```
## Request Migration
You can enable [request migration](../../../docs/architecture/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:
......
......@@ -21,7 +21,7 @@ spec:
command:
- /bin/sh
- -c
- "exit 0"
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
......@@ -31,11 +31,11 @@ spec:
replicas: 1
resources:
requests:
cpu: "5"
cpu: "10"
memory: "10Gi"
limits:
cpu: "5"
memory: "10Gi"
cpu: "32"
memory: "40Gi"
extraPodSpec:
mainContainer:
image: my-registry/sglang-runtime:my-tag
......@@ -46,24 +46,20 @@ spec:
SGLangDecodeWorker:
envFromSecret: hf-token-secret
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 10
failureThreshold: 1
readinessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 60
periodSeconds: 60
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 10
failureThreshold: 60
dynamoNamespace: sglang-agg
componentType: worker
replicas: 1
......@@ -73,11 +69,24 @@ spec:
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: my-registry/sglang-runtime:my-tag
workingDir: /workspace/components/backends/sglang
args:
......
......@@ -21,7 +21,7 @@ spec:
command:
- /bin/sh
- -c
- "exit 0"
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
......@@ -31,11 +31,11 @@ spec:
replicas: 1
resources:
requests:
cpu: "5"
cpu: "10"
memory: "10Gi"
limits:
cpu: "5"
memory: "10Gi"
cpu: "32"
memory: "40Gi"
extraPodSpec:
mainContainer:
image: my-registry/sglang-runtime:my-tag
......@@ -46,24 +46,20 @@ spec:
SGLangDecodeWorker:
envFromSecret: hf-token-secret
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 10
failureThreshold: 1
readinessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 60
periodSeconds: 60
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 10
failureThreshold: 60
dynamoNamespace: sglang-agg-router
componentType: worker
replicas: 1
......@@ -73,11 +69,24 @@ spec:
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: my-registry/sglang-runtime:my-tag
workingDir: /workspace/components/backends/sglang
args:
......
......@@ -21,7 +21,7 @@ spec:
command:
- /bin/sh
- -c
- "exit 0"
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
......@@ -31,14 +31,14 @@ spec:
replicas: 1
resources:
requests:
cpu: "5"
cpu: "10"
memory: "10Gi"
limits:
cpu: "5"
memory: "10Gi"
cpu: "32"
memory: "40Gi"
extraPodSpec:
mainContainer:
image: my-registry/sglang-runtime:my-tag
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0804
workingDir: /workspace/components/backends/sglang
command: ["sh", "-c"]
args:
......@@ -46,24 +46,20 @@ spec:
SGLangDecodeWorker:
envFromSecret: hf-token-secret
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 10
failureThreshold: 1
readinessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 60
periodSeconds: 60
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 10
failureThreshold: 60
dynamoNamespace: sglang-disagg
componentType: worker
replicas: 1
......@@ -73,12 +69,25 @@ spec:
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
image: my-registry/sglang-runtime:my-tag
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0804
workingDir: /workspace/components/backends/sglang
args:
- "python3"
......@@ -101,24 +110,20 @@ spec:
SGLangPrefillWorker:
envFromSecret: hf-token-secret
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 10
failureThreshold: 1
readinessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
initialDelaySeconds: 60
periodSeconds: 60
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 10
failureThreshold: 60
dynamoNamespace: sglang-disagg
componentType: worker
replicas: 1
......@@ -128,12 +133,25 @@ spec:
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
image: my-registry/sglang-runtime:my-tag
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0804
workingDir: /workspace/components/backends/sglang
args:
- "python3"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment