Unverified Commit 59df10d1 authored by William Arnold's avatar William Arnold Committed by GitHub
Browse files

feat: Add glm-5 nvfp4 recipe (#7780)


Signed-off-by: default avatarWilliam Arnold <warnold@nvidia.com>
parent 9efa460c
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Build arguments declared before the first FROM; each stage re-declares the
# ones it uses.
ARG BASE_IMAGE="lmsysorg/sglang:v0.5.10.post1-cu130"
# NOTE(review): ARCH is not referenced by any instruction visible in this file.
ARG ARCH=arm64
# Git ref of ai-dynamo/dynamo that gets checked out and built.
ARG DYNAMO_COMMIT="main"
ARG CARGO_BUILD_JOBS="16"

# ======== Stage 1: Builder (compiles Rust, builds wheel) ========
FROM ${BASE_IMAGE} AS builder
ARG DYNAMO_COMMIT
ARG CARGO_BUILD_JOBS
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS}
WORKDIR /build

# Build prerequisites plus the Rust toolchain. The rustup installer is saved to
# a file before it is executed so that a failed download aborts the build under
# `set -e` instead of piping an empty script into `sh`.
RUN set -eux; \
    apt-get update; \
    apt-get install -y --no-install-recommends \
        ca-certificates curl build-essential pkg-config libclang-dev protobuf-compiler git; \
    rm -rf /var/lib/apt/lists/*; \
    curl --proto '=https' --tlsv1.2 -fsSL https://sh.rustup.rs -o /tmp/rustup-init.sh; \
    sh /tmp/rustup-init.sh -y --profile minimal --default-toolchain stable; \
    rm -f /tmp/rustup-init.sh

ENV CARGO_HOME=/root/.cargo \
    RUSTUP_HOME=/root/.rustup
ENV PATH="${CARGO_HOME}/bin:${PATH}"

RUN cargo install maturin --locked

# Fetch the Dynamo sources at the requested ref. The ref is quoted so that an
# empty or unusual value fails loudly instead of being word-split by the shell.
RUN git clone https://github.com/ai-dynamo/dynamo.git /build/dynamo && \
    git -C /build/dynamo checkout "${DYNAMO_COMMIT}"

# Build the Python bindings wheel straight into /build/dist (the path the final
# stage copies from), so the result does not depend on where Cargo places its
# target directory. The cargo registry and git caches persist across builds.
RUN --mount=type=cache,target=/root/.cargo/registry \
    --mount=type=cache,target=/root/.cargo/git \
    cd /build/dynamo/lib/bindings/python && \
    maturin build --release --out /build/dist
# ======== Stage 2: Final image ========
FROM ${BASE_IMAGE}
# NOTE(review): ARCH is not referenced by any instruction visible in this stage.
ARG ARCH=arm64

# Install flashinfer with matching jit-cache.
# Pin to 0.6.7 and install jit-cache from flashinfer wheel index so all three
# packages (python, cubin, jit-cache) are consistent.
# --no-cache-dir keeps pip's download cache out of the image layer, matching
# the dynamo install below.
RUN pip install --no-cache-dir "flashinfer-python==0.6.7" "flashinfer-cubin==0.6.7" && \
    pip install --no-cache-dir "flashinfer-jit-cache==0.6.7" --index-url https://flashinfer.ai/whl/cu130

# Install dynamo from source (includes _compat.py which handles sglang API changes)
# The wheel built in the builder stage is installed first; the source checkout
# is then installed in editable mode with the sglang extra and --no-deps.
COPY --from=builder /build/dynamo /sgl-workspace/dynamo
COPY --from=builder /build/dist/*.whl /tmp/
RUN pip install --no-cache-dir /tmp/*.whl && \
    cd /sgl-workspace/dynamo && \
    pip install --no-cache-dir -e ".[sglang]" --no-deps && \
    rm -f /tmp/*.whl
<!--
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->
# GLM-5 NVFP4 — Disaggregated Prefill/Decode on GB200
Serves [nvidia/GLM-5-NVFP4](https://huggingface.co/nvidia/GLM-5-NVFP4) using SGLang with
disaggregated prefill/decode and EAGLE speculative decoding via Dynamo on GB200 nodes.
## Topology
| Role | Nodes | GPUs/node | Total GPUs | Parallelism |
|---------|-------|-----------|------------|--------------------|
| Decode | 4 | 4 | 16 | TP16 / DP16 / EP16 |
| Prefill | 1 | 4 | 4 | TP4 |
## Prerequisites
- Five GB200 nodes with 4 GPUs each (4 for decode, 1 for prefill) in an NVL36 or NVL72 domain
- A Kubernetes cluster with the [Dynamo Operator](../../docs/kubernetes/README.md) installed
- Shared NFS PVC for model weights
## Step 1: Build the Container
This recipe currently requires using unreleased versions of SGLang and Dynamo.
Use the command below to build and push a container with the necessary dependencies.
```bash
docker buildx build \
--platform linux/arm64 \
--build-arg ARCH=arm64 \
-t <your-registry>/sglang-dynamo-glm5:latest \
-f recipes/glm-5-nvfp4/Dockerfile \
--push .
```
## Step 2: Download the Model
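Create the PVC, HuggingFace token secret, and download the model weights. Run these commands in the namespace you will deploy to (add `-n <your-namespace>` to each command, or make it the default namespace of your current kubectl context), because the deployment and benchmark manifests look up the `model-cache` PVC and `hf-token-secret` in their own namespace: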
```bash
kubectl apply -f recipes/glm-5-nvfp4/model-cache/model-cache.yaml
kubectl create secret generic hf-token-secret \
--from-literal=HF_TOKEN=<your-hf-token>
kubectl apply -f recipes/glm-5-nvfp4/model-cache/model-download.yaml
kubectl wait --for=condition=complete job/model-download --timeout=3600s
```
## Step 3: Deploy
Edit `sglang/disagg/deploy.yaml` and replace all `<placeholder>` values:
- `<your-namespace>` — your Kubernetes namespace
- `<your-registry>/sglang-dynamo-glm5:latest` — your built container image
```bash
kubectl apply -f recipes/glm-5-nvfp4/sglang/disagg/deploy.yaml
```
Monitor startup (decode takes ~15 min to load and capture CUDA graphs):
```bash
kubectl get pods -n <your-namespace> -l app.kubernetes.io/part-of=glm5-sglang -w
```
## Step 4: Test
```bash
kubectl port-forward svc/glm5-sglang-frontend 8000:8000 -n <your-namespace> &
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model":"nvidia/GLM-5-NVFP4","messages":[{"role":"user","content":"Hello!"}],"max_tokens":128}'
```
## Step 5: Benchmark (optional)
Edit `sglang/disagg/perf.yaml` to set your namespace and PVC, then run:
```bash
kubectl apply -f recipes/glm-5-nvfp4/sglang/disagg/perf.yaml
kubectl logs -f -l job-name=glm5-disagg-bench -n <your-namespace>
```
Default benchmark: ISL=1000, OSL=8192, concurrency=512 (32/GPU).
## Key Configuration Notes
### Speculative Decoding (EAGLE MTP)
Two environment variables are needed for speculative decoding to work correctly (~85-95% accept rate):
- `SGLANG_ENABLE_SPEC_V2=1` — uses EAGLEWorkerV2 with overlap scheduler
- `SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE=1` — quantizes BF16 MTP layer to FP8 at load
time, matching the base model's compute path
The MTP layer weights in `nvidia/GLM-5-NVFP4` are BF16 (split across shards 271-274)
and are fully indexed in the checkpoint's `model.safetensors.index.json`.
### KV Cache
Uses `--kv-cache-dtype fp8_e4m3` (NSA backend auto-selects this on SM100/GB200).
Saves ~50% KV memory vs BF16.
### Discovery
Uses Kubernetes service discovery. Worker registration is tied to pod lifetime
via Kubernetes EndpointSlices, preventing TTL expiry issues under high load.
## Performance (ISL=1k, OSL=8k, concurrency=512)
| Metric | Value |
|--------|-------|
| Output throughput | ~19,000 tokens/sec |
| TTFT p50 | ~850ms |
| ITL avg | ~24ms/token |
| Tokens/user/sec | ~41 |
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Shared volume that holds the model weights (and benchmark artifacts).
# The download Job, the serving pods and the benchmark Job all mount this
# claim, hence the ReadWriteMany access mode.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-cache
spec:
  # Placeholder: replace with a storage class available in your cluster.
  storageClassName: "your-storage-class-name"
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 400Gi
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# One-shot Job that downloads the nvidia/GLM-5-NVFP4 checkpoint into the
# Hugging Face cache on the shared model-cache PVC.
apiVersion: batch/v1
kind: Job
metadata:
  name: model-download
spec:
  # A single pod run to completion, retried up to 3 times on failure.
  completions: 1
  parallelism: 1
  backoffLimit: 3
  template:
    metadata:
      labels:
        app: model-download
    spec:
      restartPolicy: Never
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
      containers:
        - name: model-download
          image: python:3.10-slim
          securityContext:
            seccompProfile:
              type: RuntimeDefault
            capabilities:
              drop:
                - ALL
          # Every key of hf-token-secret becomes an environment variable.
          envFrom:
            - secretRef:
                name: hf-token-secret
          env:
            - name: MODEL_NAME
              value: nvidia/GLM-5-NVFP4
            # HF_HOME is the PVC mount point, so the download lands on the PVC.
            - name: HF_HOME
              value: /model-store
            - name: HF_HUB_ENABLE_HF_TRANSFER
              value: "1"
          volumeMounts:
            - name: model-cache
              mountPath: /model-store
          command:
            - sh
            - -c
          args:
            - |
              set -eux
              pip install --no-cache-dir huggingface_hub hf_transfer
              hf download $MODEL_NAME
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# GLM-5 (NVFP4) disaggregated prefill/decode on GB200 via SGLang + Dynamo
#
# Prerequisites:
# - dynamo-platform HelmRelease applied
# - Model weights for nvidia/GLM-5-NVFP4 downloaded into the Hugging Face cache on the shared model-cache PVC (mounted at /model-store; HF_HOME=/model-store)
# - Container built from recipes/glm-5-nvfp4/Dockerfile
# - HF_TOKEN secret created: kubectl create secret generic hf-token-secret \
# --from-literal=HF_TOKEN=<your-token> -n <namespace>
#
# Topology: 5 nodes total
# decode: 4 nodes x 4 GPUs = TP16 / DP16 / EP16 (EAGLE speculative decode)
# prefill: 1 node x 4 GPUs = TP4
# DynamoGraphDeployment describing the GLM-5 NVFP4 serving graph: a frontend
# plus SGLang decode and prefill workers (defined under `services`).
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: glm5-sglang
namespace: <your-namespace>
spec:
backendFramework: sglang
# Deployment-level environment: HF_TOKEN is read from the hf-token-secret
# Secret, and HF_HOME points at /model-store, the path where each service
# mounts the model-cache PVC.
# NOTE(review): presumably the operator injects these into every service
# container — confirm against the DynamoGraphDeployment CRD.
envs:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
- name: HF_HOME
value: /model-store
services:
# Frontend: a single replica running `python3 -m dynamo.frontend
# --no-decode-fallback`, with the model-cache PVC mounted at /model-store.
Frontend:
componentType: frontend
replicas: 1
extraPodSpec:
mainContainer:
command:
- python3
args:
- -m
- dynamo.frontend
- --no-decode-fallback
env:
# Pod UID exposed to the process through the downward API.
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
# Placeholder: replace with the image built from recipes/glm-5-nvfp4/Dockerfile.
image: <your-registry>/sglang-dynamo-glm5:latest
# NOTE(review): name and resources are left empty here; presumably the
# operator fills them in — confirm.
name: ""
resources: {}
volumeMounts:
- mountPath: /model-store
name: model-cache
volumes:
- name: model-cache
persistentVolumeClaim:
claimName: model-cache
# Decode worker: one replica spanning 4 nodes (multinode.nodeCount) with
# 4 GPUs per node, launched with TP16 / DP16 / EP16 and EAGLE speculative
# decoding in disaggregation mode `decode`.
decode:
componentType: worker
subComponentType: decode
replicas: 1
multinode:
nodeCount: 4
resources:
limits:
gpu: "4"
# References the compute-domain-channel resource claim declared under
# resourceClaims further down in this service.
claims:
- name: compute-domain-channel
extraPodSpec:
mainContainer:
command:
- bash
- -c
# The folded scalar below is a single shell command line: it raises the
# locked-memory and open-file limits, then execs the SGLang worker.
args:
- >-
ulimit -l unlimited && ulimit -n 1048576 &&
exec python3 -m dynamo.sglang
--model-path nvidia/GLM-5-NVFP4
--served-model-name nvidia/GLM-5-NVFP4
--trust-remote-code
--quantization modelopt_fp4
--kv-cache-dtype fp8_e4m3
--tensor-parallel-size 16
--data-parallel-size 16
--expert-parallel-size 16
--enable-dp-attention
--enable-dp-lm-head
--max-running-requests 2048
--cuda-graph-max-bs 128
--mem-fraction-static 0.72
--chunked-prefill-size 32768
--max-prefill-tokens 32768
--enable-flashinfer-allreduce-fusion
--attention-backend nsa
--nsa-decode-backend trtllm
--nsa-prefill-backend trtllm
--moe-runner-backend flashinfer_trtllm
--speculative-algorithm EAGLE
--speculative-num-steps 2
--speculative-eagle-topk 1
--speculative-num-draft-tokens 3
--stream-interval 50
--incremental-streaming-output
--model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 32}'
--host 0.0.0.0
--disaggregation-mode decode
--disaggregation-transfer-backend nixl
--disaggregation-bootstrap-port 30001
--prefill-round-robin-balance
--watchdog-timeout 3600
env:
# Pod UID exposed to the process through the downward API.
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
# FlashInfer / SGLang toggles. Per the recipe README, SGLANG_ENABLE_SPEC_V2
# selects EAGLEWorkerV2 with the overlap scheduler, and
# SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE quantizes the BF16 MTP layer to FP8 at load.
- name: FLASHINFER_DISABLE_VERSION_CHECK
value: "1"
- name: SGLANG_ENABLE_JIT_DEEPGEMM
value: "1"
- name: SGLANG_ENABLE_SPEC_V2
value: "1"
- name: SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE
value: "1"
- name: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
value: "1024"
# NCCL settings. All values are quoted so they stay strings.
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_CUMEM_ENABLE
value: "1"
- name: NCCL_NVLS_ENABLE
value: "1"
- name: NVIDIA_GDRCOPY
value: "1"
- name: NCCL_STORE_TIMEOUT
value: "7200"
# Set on decode only; the prefill service does not set this variable.
- name: NCCL_GRAPH_MIXING_SUPPORT
value: "0"
# UCX transport settings, followed by UCX / NIXL log levels.
- name: UCX_TLS
value: "cuda_ipc,cuda_copy,rc"
- name: UCX_IB_GID_INDEX
value: "3"
- name: UCX_RC_TIMEOUT
value: "600s"
- name: UCX_KEEPALIVE_INTERVAL
value: "300s"
- name: UCX_LOG_LEVEL
value: error
- name: NIXL_LOG_LEVEL
value: ERROR
# Placeholder: replace with the image built from recipes/glm-5-nvfp4/Dockerfile.
image: <your-registry>/sglang-dynamo-glm5:latest
# NOTE(review): name and resources are left empty here; presumably the
# operator fills them in — confirm.
name: ""
resources: {}
# IPC_LOCK accompanies the `ulimit -l unlimited` call in the command above.
securityContext:
capabilities:
add:
- IPC_LOCK
# 60 failures x 60 s period: the probe tolerates roughly an hour of startup.
# The README quotes ~15 min for decode to load and capture CUDA graphs.
startupProbe:
failureThreshold: 60
httpGet:
path: /live
port: 9090
periodSeconds: 60
timeoutSeconds: 5
volumeMounts:
- mountPath: /model-store
name: model-cache
workingDir: /workspace/
# Binds the claim name used under `resources.claims` to the
# glm5-compute-domain-channel ResourceClaimTemplate.
resourceClaims:
- name: compute-domain-channel
resourceClaimTemplateName: glm5-compute-domain-channel
volumes:
- name: model-cache
persistentVolumeClaim:
claimName: model-cache
# Prefill worker: one replica with 4 GPUs, launched with TP4 in
# disaggregation mode `prefill`.
# NOTE(review): unlike decode, this service requests no
# compute-domain-channel claim — confirm that is intended.
prefill:
componentType: worker
subComponentType: prefill
replicas: 1
resources:
limits:
gpu: "4"
extraPodSpec:
mainContainer:
command:
- bash
- -c
# The folded scalar below is a single shell command line: it raises the
# locked-memory and open-file limits, then execs the SGLang worker.
args:
- >-
ulimit -l unlimited && ulimit -n 1048576 &&
exec python3 -m dynamo.sglang
--model-path nvidia/GLM-5-NVFP4
--served-model-name nvidia/GLM-5-NVFP4
--trust-remote-code
--quantization modelopt_fp4
--kv-cache-dtype fp8_e4m3
--tensor-parallel-size 4
--data-parallel-size 1
--expert-parallel-size 1
--enable-dp-lm-head
--max-running-requests 2048
--cuda-graph-max-bs 128
--mem-fraction-static 0.72
--chunked-prefill-size 32768
--max-prefill-tokens 32768
--enable-flashinfer-allreduce-fusion
--attention-backend nsa
--nsa-decode-backend trtllm
--nsa-prefill-backend trtllm
--moe-runner-backend flashinfer_trtllm
--stream-interval 50
--incremental-streaming-output
--model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 32}'
--host 0.0.0.0
--disaggregation-mode prefill
--disaggregation-transfer-backend nixl
--disaggregation-bootstrap-port 30001
--load-balance-method round_robin
--watchdog-timeout 3600
env:
# Pod UID exposed to the process through the downward API.
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
# FlashInfer / SGLang toggles, mirroring the decode service.
# NOTE(review): the speculative-decoding toggles (SGLANG_ENABLE_SPEC_V2,
# SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE) are set here although this command passes
# no --speculative-* flags — confirm whether prefill needs them.
- name: FLASHINFER_DISABLE_VERSION_CHECK
value: "1"
- name: SGLANG_ENABLE_JIT_DEEPGEMM
value: "1"
- name: SGLANG_ENABLE_SPEC_V2
value: "1"
- name: SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE
value: "1"
- name: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
value: "1024"
# NCCL settings. All values are quoted so they stay strings.
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_CUMEM_ENABLE
value: "1"
- name: NCCL_NVLS_ENABLE
value: "1"
- name: NVIDIA_GDRCOPY
value: "1"
- name: NCCL_STORE_TIMEOUT
value: "7200"
# UCX transport settings, followed by UCX / NIXL log levels.
- name: UCX_TLS
value: "cuda_ipc,cuda_copy,rc"
- name: UCX_IB_GID_INDEX
value: "3"
- name: UCX_RC_TIMEOUT
value: "600s"
- name: UCX_KEEPALIVE_INTERVAL
value: "300s"
- name: UCX_LOG_LEVEL
value: error
- name: NIXL_LOG_LEVEL
value: ERROR
# Placeholder: replace with the image built from recipes/glm-5-nvfp4/Dockerfile.
image: <your-registry>/sglang-dynamo-glm5:latest
# NOTE(review): name and resources are left empty here; presumably the
# operator fills them in — confirm.
name: ""
resources: {}
# IPC_LOCK accompanies the `ulimit -l unlimited` call in the command above.
securityContext:
capabilities:
add:
- IPC_LOCK
# 60 failures x 60 s period: the probe tolerates roughly an hour of startup.
startupProbe:
failureThreshold: 60
httpGet:
path: /live
port: 9090
periodSeconds: 60
timeoutSeconds: 5
volumeMounts:
- mountPath: /model-store
name: model-cache
workingDir: /workspace/
volumes:
- name: model-cache
persistentVolumeClaim:
claimName: model-cache
---
# ComputeDomain enables MNNVL scheduling for the 5-node decode+prefill group.
# NOTE(review): in this manifest only the decode service references the
# glm5-compute-domain-channel claim; the prefill service does not. Confirm
# that numNodes: 5 matches the set of pods that actually join the domain.
apiVersion: resource.nvidia.com/v1beta1
kind: ComputeDomain
metadata:
  name: glm5-compute-domain
  namespace: <your-namespace>
spec:
  numNodes: 5
  channel:
    allocationMode: Single
    # Name of the ResourceClaimTemplate that workloads reference to join
    # this domain.
    resourceClaimTemplate:
      name: glm5-compute-domain-channel
---
# ResourceClaimTemplate referenced by the decode service's resourceClaims
# entry; it requests exactly one compute-domain channel device.
# NOTE(review): the ComputeDomain in this file names the same template under
# channel.resourceClaimTemplate.name — confirm whether the NVIDIA DRA driver
# creates that template itself, in which case this document may be redundant.
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  name: glm5-compute-domain-channel
  namespace: <your-namespace>
spec:
  spec:
    devices:
      # One device named "channel", taken from the compute-domain channel class.
      requests:
        - name: channel
          exactly:
            deviceClassName: compute-domain-default-channel.nvidia.com
            allocationMode: ExactCount
            count: 1
      # Driver-specific (opaque) configuration applied to the "channel" request.
      config:
        - requests:
            - channel
          opaque:
            driver: compute-domain.nvidia.com
            parameters:
              apiVersion: resource.nvidia.com/v1beta1
              kind: ComputeDomainChannelConfig
              allocationMode: Single
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Benchmark Job: waits until the frontend lists the target model, then drives
# it with aiperf and stores the results on the shared model-cache PVC.
#
# The pod runs as an unprivileged user (uid/gid 1000), so the script must not
# depend on root-only operations: Python packages go into a virtualenv under
# /tmp, and the readiness probe uses the Python standard library instead of
# apt-installed curl/jq.
apiVersion: batch/v1
kind: Job
metadata:
  name: glm5-disagg-bench
  namespace: <your-namespace>
spec:
  backoffLimit: 1
  completions: 1
  parallelism: 1
  template:
    spec:
      restartPolicy: Never
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: bench
          image: python:3.12-slim
          securityContext:
            allowPrivilegeEscalation: false
          imagePullPolicy: IfNotPresent
          env:
            - name: TARGET_MODEL
              value: nvidia/GLM-5-NVFP4
            - name: ENDPOINT
              value: glm5-sglang-frontend:8000
            - name: ISL
              value: "1000"
            - name: OSL
              value: "8192"
            # Concurrency = CONCURRENCY_PER_GPU * num_decode_gpus
            # Default: 32 req/GPU * 16 GPUs (4 nodes x 4 GPUs) = 512
            - name: CONCURRENCY
              value: "512"
            - name: ARTIFACT_DIR
              value: /model-store/perf/glm5-disagg-isl1k-osl8k
            - name: TOKENIZER
              value: nvidia/GLM-5-NVFP4
            - name: HF_HOME
              value: /model-store
            - name: PYTHONUNBUFFERED
              value: "1"
            - name: COLUMNS
              value: "200"
          command:
            - /bin/sh
            - -c
            - |
              set -eu
              # uid 1000 cannot write to the system site-packages or to "/",
              # so give the tools a writable HOME and a private virtualenv.
              export HOME=/tmp
              python3 -m venv /tmp/venv
              . /tmp/venv/bin/activate
              pip install -q --no-cache-dir aiperf transformers tokenizers
              echo "Waiting for model at http://$ENDPOINT/v1/models..."
              # Exit status 0 only once /v1/models lists TARGET_MODEL; any
              # connection or parse error counts as "not ready".
              until python3 - "$ENDPOINT" "$TARGET_MODEL" 2>/dev/null <<'PY'
              import json
              import sys
              import urllib.request
              endpoint, model = sys.argv[1], sys.argv[2]
              try:
                  with urllib.request.urlopen(f"http://{endpoint}/v1/models", timeout=10) as resp:
                      entries = json.load(resp).get("data") or []
              except Exception:
                  sys.exit(1)
              sys.exit(0 if any(e.get("id") == model for e in entries) else 1)
              PY
              do
                echo "[$(date '+%H:%M:%S')] not ready, retrying in 10s..."
                sleep 10
              done
              echo "Model ready!"
              mkdir -p "$ARTIFACT_DIR"
              aiperf profile \
                --artifact-dir "$ARTIFACT_DIR" \
                --model "$TARGET_MODEL" \
                --tokenizer "$TOKENIZER" \
                --tokenizer-trust-remote-code \
                --endpoint-type chat \
                --endpoint /v1/chat/completions \
                --streaming \
                --url "http://$ENDPOINT" \
                --synthetic-input-tokens-mean $ISL \
                --synthetic-input-tokens-stddev 0 \
                --output-tokens-mean $OSL \
                --output-tokens-stddev 0 \
                --extra-inputs "max_tokens:$OSL" \
                --extra-inputs "min_tokens:$OSL" \
                --extra-inputs "ignore_eos:true" \
                --concurrency $CONCURRENCY \
                --request-count $((CONCURRENCY * 3)) \
                --warmup-request-count 16 \
                --num-dataset-entries 12800 \
                --random-seed 100 \
                --ui simple
              echo "Done. Results in $ARTIFACT_DIR"
              ls -la "$ARTIFACT_DIR"
          volumeMounts:
            - name: model-cache
              mountPath: /model-store
          workingDir: /workspace
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment