Unverified commit b73e6eb5, authored by Rohan Varma and committed by GitHub
Browse files

feat: restructure dsr1 recipes and add gb200 (#3891)


Signed-off-by: Rohan Varma <rohanv@nvidia.com>
parent e20adb44
......@@ -162,7 +162,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm
```bash
cd $DYNAMO_HOME/examples/backends/trtllm
export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml
export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
# nvidia/DeepSeek-R1-FP4 is a large model
export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
......
......@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:
```bash
# Default set in srun_aggregated.sh, but can customize here.
# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml"
# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"
# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
......@@ -165,8 +165,8 @@ deployment across 8 nodes:
```bash
# Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml"
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml"
# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
......
......@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
NUM_NODES=${NUM_NODES:-4}
NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml}"
export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml}"
# Automate settings of certain variables for convenience, but you are free
# to manually set these for more control as well.
......
......@@ -17,11 +17,11 @@ NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml}"
PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml}"
NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml}"
DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml}"
DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
......
......@@ -3,11 +3,11 @@
# Shared model-cache claim.
# The claim name must stay in sync with every `claimName:` / `pvcs[].name`
# that mounts it (the model-download Job, the deployments, and the perf Job
# all reference `model-cache-pvc`).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-cache-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      # NOTE(review): presumably sized to hold both checkpoints fetched by
      # the model-download Job - confirm against actual model sizes.
      storage: 1500Gi
  # Placeholder: replace with a storage class available in your cluster.
  storageClassName: "your-storage-class-name"
\ No newline at end of file
......@@ -14,31 +14,24 @@ spec:
app: model-download
spec:
restartPolicy: Never
tolerations: []
containers:
- name: model-download
image: python:3.10-slim
command: ["sh", "-c"]
envFrom:
- secretRef:
name: hf-token-secret
env:
- name: MODEL_NAME
value: deepseek-ai/DeepSeek-R1
- name: HF_HOME
value: /model-store
- name: HF_HUB_ENABLE_HF_TRANSFER
value: "1"
- name: MODEL_REVISION
value: 56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad
args:
- |
set -eux
pip install --no-cache-dir huggingface_hub hf_transfer
hf download $MODEL_NAME --revision $MODEL_REVISION
hf download nvidia/DeepSeek-R1-FP4 --local-dir /model-cache/deepseek-r1-fp4
hf download deepseek-ai/DeepSeek-R1 --local-dir /model-cache/deepseek-r1
volumeMounts:
- name: model-cache
mountPath: /model-store
mountPath: /model-cache
volumes:
- name: model-cache
persistentVolumeClaim:
claimName: model-cache
\ No newline at end of file
claimName: model-cache-pvc
\ No newline at end of file
......@@ -7,7 +7,7 @@ metadata:
name: sgl-dsr1-16gpu
spec:
pvcs:
- name: model-cache
- name: model-cache-pvc
create: false
services:
Frontend:
......@@ -34,8 +34,8 @@ spec:
limits:
gpu: "8"
volumeMounts:
- name: model-cache
mountPoint: /root/.cache/huggingface
- name: model-cache-pvc
mountPoint: /model-cache
sharedMemory:
size: 80Gi
extraPodSpec:
......@@ -55,7 +55,7 @@ spec:
- dynamo.sglang
args:
- --model-path
- deepseek-ai/DeepSeek-R1
- /model-cache/deepseek-r1
- --served-model-name
- deepseek-ai/DeepSeek-R1
- --tp
......@@ -87,8 +87,8 @@ spec:
limits:
gpu: "8"
volumeMounts:
- name: model-cache
mountPoint: /root/.cache/huggingface
- name: model-cache-pvc
mountPoint: /model-cache
sharedMemory:
size: 80Gi
extraPodSpec:
......@@ -108,7 +108,7 @@ spec:
- dynamo.sglang
args:
- --model-path
- deepseek-ai/DeepSeek-R1
- /model-cache/deepseek-r1
- --served-model-name
- deepseek-ai/DeepSeek-R1
- --tp
......
......@@ -7,7 +7,7 @@ metadata:
name: sgl-dsr1-8gpu
spec:
pvcs:
- name: model-cache
- name: model-cache-pvc
create: false
services:
Frontend:
......@@ -32,8 +32,8 @@ spec:
limits:
gpu: "8"
volumeMounts:
- name: model-cache
mountPoint: /root/.cache/huggingface
- name: model-cache-pvc
mountPoint: /model-cache
sharedMemory:
size: 80Gi
extraPodSpec:
......@@ -53,7 +53,7 @@ spec:
- dynamo.sglang
args:
- --model-path
- deepseek-ai/DeepSeek-R1
- /model-cache/deepseek-r1
- --served-model-name
- deepseek-ai/DeepSeek-R1
- --tp
......@@ -81,8 +81,8 @@ spec:
limits:
gpu: "8"
volumeMounts:
- name: model-cache
mountPoint: /root/.cache/huggingface
- name: model-cache-pvc
mountPoint: /model-cache
sharedMemory:
size: 80Gi
extraPodSpec:
......@@ -102,7 +102,7 @@ spec:
- dynamo.sglang
args:
- --model-path
- deepseek-ai/DeepSeek-R1
- /model-cache/deepseek-r1
- --served-model-name
- deepseek-ai/DeepSeek-R1
- --tp
......
......@@ -11,7 +11,7 @@ moe_config:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# 4096 = 256 * 16
# moe_max_num_tokens: 4096
load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
load_balancer: /mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml
tensor_parallel_size: 16
moe_expert_parallel_size: 16
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# moe_load_balancer settings for TRTLLM based on:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer
# NOTE(review): presumably the total number of expert slots managed by the
# load balancer - confirm the meaning and the value 288 against the README
# linked above.
num_slots: 288
# NOTE(review): presumably how many layers have their expert placement
# refreshed per iteration - confirm against the README linked above.
layer_updates_per_iter: 2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Instructions:
# 1. First, create the model cache PersistentVolumeClaim:
# kubectl apply -f model-cache.yaml -n <namespace>
# 2. Download the model to the model cache:
# kubectl apply -f model-download.yaml -n <namespace>
# 3. Once the above steps are complete, deploy the prefill and decode workers via this yaml:
# kubectl apply -f deploy.yaml -n <namespace>
# 4. To benchmark the service, run:
# kubectl apply -f perf.yaml -n <namespace>
# ConfigMap for prefill engine configuration
# This configuration sets up a DEP 4 prefill worker
# Engine arguments for the prefill worker.
# The `prefill_config.yaml` key becomes a file name once this ConfigMap is
# mounted; the prefill worker mounts it at /config and passes
# /config/prefill_config.yaml via --extra-engine-args.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prefill-config
data:
  prefill_config.yaml: |
    build_config:
      max_batch_size: 4
      max_num_tokens: 4608
      max_seq_len: 1227
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    cuda_graph_config: null
    print_iter_log: true
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 4608
      backend: DEFAULT
---
# ConfigMap for decode engine configuration
# This configuration sets up a DEP 32 decode worker
# Engine arguments for the decode worker.
# The `decode_config_dep32.yaml` key becomes a file name once this ConfigMap
# is mounted; the decode worker mounts it at /config and passes
# /config/decode_config_dep32.yaml via --extra-engine-args.
apiVersion: v1
kind: ConfigMap
metadata:
  name: decode-config
data:
  decode_config_dep32.yaml: |
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    pipeline_parallel_size: 1
    build_config:
      max_batch_size: 32
      max_num_tokens: 32
      max_seq_len: 2251
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 384
        - 512
        - 768
        - 1024
        - 2048
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: WIDEEP
    cache_transceiver_config:
      max_tokens_in_buffer: 4608
      backend: DEFAULT
    stream_interval: 20
---
# NOTE: The numNodes value should equal the total number of nodes across prefill and decode
# as specified in their respective sections below (prefill.multinode.nodeCount + decode.multinode.nodeCount).
# For autoscaling deployments, the compute domain will automatically adjust as needed.
# ComputeDomain spanning the prefill and decode nodes.
# The channel's resourceClaimTemplate name is what the workers reference via
# `resourceClaimTemplateName`.
apiVersion: resource.nvidia.com/v1beta1
kind: ComputeDomain
metadata:
  name: trtllm-test-compute-domain
spec:
  # 9 = 1 prefill node + 8 decode nodes (decode.multinode.nodeCount).
  numNodes: 9
  channel:
    resourceClaimTemplate:
      name: trtllm-test-compute-domain-channel
---
# Disaggregated TRT-LLM deployment: one frontend, one single-node prefill
# worker and one 8-node decode worker. Both workers mount the shared model
# cache at /model-cache and read their engine arguments from a ConfigMap
# mounted at /config.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: trtllm-disagg-multinode
spec:
  pvcs:
    # Pre-existing claim (create: false); it must be created beforehand.
    - name: model-cache-pvc
      create: false
  # Environment variables declared once at the spec level.
  envs:
    - name: NCCL_MNNVL_ENABLE
      value: "1"
    - name: NCCL_CUMEM_ENABLE
      value: "1"
    - name: TLLM_LOG_LEVEL
      value: "info"
    - name: TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER
      value: "1"
    - name: TRTLLM_ENABLE_PDL
      value: "1"
  backendFramework: trtllm
  services:
    Frontend:
      dynamoNamespace: trtllm-disagg-multinode
      componentType: frontend
      replicas: 1
      extraPodSpec:
        tolerations: []
        affinity: {}
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
          args:
            - |
              python3 -m dynamo.frontend --http-port 8000
          command:
            - /bin/sh
            - -c
    prefill:
      dynamoNamespace: trtllm-disagg-multinode
      componentType: worker
      replicas: 1
      # NOTE: Prefill uses 1 node (no multinode section = single node)
      # and contributes to ComputeDomain.numNodes (see above)
      volumeMounts:
        - name: model-cache-pvc
          mountPoint: /model-cache
      sharedMemory:
        size: 800Gi
      resources:
        requests:
          cpu: "130"
          memory: "850Gi"
        limits:
          cpu: "130"
          memory: "850Gi"
          gpu: "4"
        claims:
          - name: compute-domain-channel
      extraPodSpec:
        tolerations: []
        affinity: {}
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
          workingDir: /workspace/components/backends/trtllm
          # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
          # you might need to increase 'failureThreshold' below to allow more time for startup
          startupProbe:
            httpGet:
              path: /live
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 500
          volumeMounts:
            - name: prefill-config-volume
              mountPath: /config
          command:
            - /bin/sh
            - -c
          # Folded scalar: the lines below are joined into one command line,
          # so they must all stay at the same indentation.
          args:
            - >-
              python3 -m dynamo.trtllm
              --model-path /model-cache/deepseek-r1-fp4
              --served-model-name deepseek-ai/DeepSeek-R1
              --extra-engine-args /config/prefill_config.yaml
              --disaggregation-mode prefill
        resourceClaims:
          - name: compute-domain-channel
            resourceClaimTemplateName: trtllm-test-compute-domain-channel
        volumes:
          - name: prefill-config-volume
            configMap:
              name: prefill-config
    decode:
      dynamoNamespace: trtllm-disagg-multinode
      componentType: worker
      replicas: 1
      volumeMounts:
        - name: model-cache-pvc
          mountPoint: /model-cache
      multinode:
        # NOTE: This nodeCount contributes to ComputeDomain.numNodes (see above)
        nodeCount: 8
      sharedMemory:
        size: 800Gi
      resources:
        requests:
          cpu: "130"
          memory: "850Gi"
        limits:
          cpu: "130"
          memory: "850Gi"
          gpu: "4"
        claims:
          - name: compute-domain-channel
      extraPodSpec:
        tolerations: []
        affinity: {}
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
          workingDir: /workspace/components/backends/trtllm
          # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
          # you might need to increase 'failureThreshold' below to allow more time for startup
          startupProbe:
            httpGet:
              path: /live
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 500
          volumeMounts:
            - name: decode-config-volume
              mountPath: /config
          command:
            - /bin/sh
            - -c
          # Folded scalar: the lines below are joined into one command line,
          # so they must all stay at the same indentation.
          args:
            - >-
              python3 -m dynamo.trtllm
              --model-path /model-cache/deepseek-r1-fp4
              --served-model-name deepseek-ai/DeepSeek-R1
              --extra-engine-args /config/decode_config_dep32.yaml
              --disaggregation-mode decode
        resourceClaims:
          - name: compute-domain-channel
            resourceClaimTemplateName: trtllm-test-compute-domain-channel
        volumes:
          - name: decode-config-volume
            configMap:
              name: decode-config
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Benchmark Job: waits until the frontend lists TARGET_MODEL, then runs a
# single aiperf profile at CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT
# concurrency and stores the artifacts under ROOT_ARTIFACT_DIR on the shared
# model-cache volume.
apiVersion: batch/v1
kind: Job
metadata:
  name: deepseek-r1-bench
spec:
  backoffLimit: 1
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: deepseek-r1-bench
    spec:
      # Do not schedule the load generator on a host that already runs a pod
      # of the trtllm-disagg-multinode deployment.
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: nvidia.com/dynamo-graph-deployment-name
                    operator: In
                    values:
                      - trtllm-disagg-multinode
              topologyKey: kubernetes.io/hostname
      containers:
        - command:
            - /bin/sh
            - -c
            - |
              apt-get update && apt-get install -y curl jq procps git && apt-get clean
              pip install aiperf;
              echo "aiperf installation completed";
              sysctl -w net.ipv4.ip_local_port_range="1024 65000"
              cat /proc/sys/net/ipv4/ip_local_port_range
              export COLUMNS=200
              EPOCH=$(date +%s)
              ## utility functions -- can be moved to a bash script / configmap
              # Poll $ENDPOINT/v1/models every 5s until $TARGET_MODEL is listed.
              wait_for_model_ready() {
                echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
                while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
                  echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
                  sleep 5
                done
                echo "Model '$TARGET_MODEL' is now available!"
                curl -s "http://$ENDPOINT/v1/models" | jq .
              }
              # Run one aiperf profile.
              #   $1 = concurrency, $2 = input sequence length, $3 = output sequence length
              run_perf() {
                local concurrency=$1
                local isl=$2
                local osl=$3
                local key=concurrency_${concurrency}
                export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
                mkdir -p "$ARTIFACT_DIR"
                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
                # Every continued line ends in " \" (space + backslash) so that
                # adjacent arguments never fuse into a single word.
                aiperf profile --artifact-dir "$ARTIFACT_DIR" \
                  --model "$TARGET_MODEL" \
                  --tokenizer /model-cache/deepseek-r1-fp4 \
                  --endpoint-type chat \
                  --endpoint /v1/chat/completions \
                  --streaming \
                  --url "http://$ENDPOINT" \
                  --synthetic-input-tokens-mean $isl \
                  --synthetic-input-tokens-stddev 0 \
                  --output-tokens-mean $osl \
                  --output-tokens-stddev 0 \
                  --extra-inputs "max_tokens:$osl" \
                  --extra-inputs "min_tokens:$osl" \
                  --extra-inputs "ignore_eos:true" \
                  --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
                  --extra-inputs "repetition_penalty:1.0" \
                  --extra-inputs "temperature: 0.0" \
                  --concurrency $concurrency \
                  --request-count $((10*concurrency)) \
                  --warmup-request-count $concurrency \
                  --conversation-num 12800 \
                  --random-seed 100 \
                  --workers-max 252 \
                  -H 'Authorization: Bearer NOT USED' \
                  -H 'Accept: text/event-stream' \
                  --record-processors 32 \
                  --ui simple
                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
                ls -la "$ARTIFACT_DIR"
              }
              #### Actual execution ####
              wait_for_model_ready
              mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
              # Calculate total concurrency based on per-GPU concurrency and GPU count
              TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
              echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
              # Write input_config.json
              # (the EOF terminator must stay at the script's base indentation)
              cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
              {
                "gpu_count": $DEPLOYMENT_GPU_COUNT,
                "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
                "total_concurrency": $TOTAL_CONCURRENCY,
                "mode": "$DEPLOYMENT_MODE",
                "isl": $ISL,
                "osl": $OSL,
                "endpoint": "$ENDPOINT",
                "model endpoint": "$TARGET_MODEL"
              }
              EOF
              # Run perf with calculated total concurrency
              run_perf $TOTAL_CONCURRENCY $ISL $OSL
              echo "done with concurrency $TOTAL_CONCURRENCY"
          env:
            # Model id polled for on /v1/models and passed to aiperf --model.
            - name: TARGET_MODEL
              value: deepseek-ai/DeepSeek-R1
            - name: ENDPOINT
              value: trtllm-disagg-multinode-frontend:8000
            - name: CONCURRENCY_PER_GPU
              value: "30"
            # NOTE(review): presumably 4 prefill GPUs + 8 decode nodes x 4 GPUs
            # of the matching deployment - confirm when changing either side.
            - name: DEPLOYMENT_GPU_COUNT
              value: "36"
            - name: ISL
              value: "1024"
            - name: OSL
              value: "1024"
            - name: DEPLOYMENT_MODE
              value: disagg
            - name: AIPERF_HTTP_CONNECTION_LIMIT
              value: "252"
            # Taken from the pod's 'job-name' label; used in the artifact path.
            - name: JOB_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.labels['job-name']
            - name: ROOT_ARTIFACT_DIR
              value: /model-cache/perf
            - name: HF_HOME
              value: /model-cache
            - name: PYTHONUNBUFFERED
              value: "1"
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          name: perf
          # NOTE(review): presumably privileged so the `sysctl -w` call in the
          # script can succeed - confirm before tightening.
          securityContext:
            privileged: true
          volumeMounts:
            - name: model-cache
              mountPath: /model-cache
          workingDir: /workspace
      imagePullSecrets:
        - name: nvcrimagepullsecret
      restartPolicy: Never
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache-pvc
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment