feat: restructure dsr1 recipes and add gb200 (#3891)

Signed-off-by: Rohan Varma <rohanv@nvidia.com>

feat: restructure dsr1 recipes and add gb200 (#3891)
Signed-off-by: Rohan Varma <rohanv@nvidia.com>
b73e6eb5 · Rohan Varma · GitHub · e20adb44 · b73e6eb5 · b73e6eb5
Unverified Commit b73e6eb5 authored Oct 31, 2025 by Rohan Varma Committed by GitHub Nov 01, 2025
20 changed files
--- a/docs/backends/trtllm/README.md
+++ b/docs/backends/trtllm/README.md
@@ -162,7 +162,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm
 ```bash
 cd $DYNAMO_HOME/examples/backends/trtllm

-export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
+export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml
 export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
 # nvidia/DeepSeek-R1-FP4 is a large model
 export MODEL_PATH="nvidia/DeepSeek-R1-FP4"

--- a/docs/backends/trtllm/multinode/multinode-examples.md
+++ b/docs/backends/trtllm/multinode/multinode-examples.md
@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:

 ```bash
 # Default set in srun_aggregated.sh, but can customize here.
-# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml"
+# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"

 # Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
 # The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
@@ -165,8 +165,8 @@ deployment across 8 nodes:

 ```bash
 # Defaults set in srun_disaggregated.sh, but can customize here.
-# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml"
-# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml"
+# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml"
+# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml"

 # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
 # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG

--- a/examples/basics/multinode/trtllm/srun_aggregated.sh
+++ b/examples/basics/multinode/trtllm/srun_aggregated.sh
@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
 NUM_NODES=${NUM_NODES:-4}
 NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}

-export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml}"
+export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml}"

 # Automate settings of certain variables for convenience, but you are free
 # to manually set these for more control as well.

--- a/examples/basics/multinode/trtllm/srun_disaggregated.sh
+++ b/examples/basics/multinode/trtllm/srun_disaggregated.sh
@@ -17,11 +17,11 @@ NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}

 NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
 NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
-PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml}"
+PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml}"

 NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
 NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
-DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml}"
+DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml}"

 DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}


--- a/recipes/deepseek-r1/model-cache/model-cache.yaml
+++ b/recipes/deepseek-r1/model-cache/model-cache.yaml
@@ -3,11 +3,11 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: model-cache
+  name: model-cache-pvc
 spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
-      storage: 1000Gi
+      storage: 1500Gi
  storageClassName: "your-storage-class-name"
\ No newline at end of file
--- a/recipes/deepseek-r1/model-cache/model-download.yaml
+++ b/recipes/deepseek-r1/model-cache/model-download.yaml
@@ -14,31 +14,24 @@ spec:
        app: model-download
    spec:
      restartPolicy: Never
+      tolerations: []
      containers:
        - name: model-download
          image: python:3.10-slim
          command: ["sh", "-c"]
-          envFrom:
-            - secretRef:
-                name: hf-token-secret
          env:
-            - name: MODEL_NAME
-              value: deepseek-ai/DeepSeek-R1
-            - name: HF_HOME
-              value: /model-store
            - name: HF_HUB_ENABLE_HF_TRANSFER
              value: "1"
-            - name: MODEL_REVISION
-              value: 56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad
          args:
            - |
              set -eux
              pip install --no-cache-dir huggingface_hub hf_transfer
-              hf download $MODEL_NAME --revision $MODEL_REVISION
+              hf download nvidia/DeepSeek-R1-FP4 --local-dir /model-cache/deepseek-r1-fp4
+              hf download deepseek-ai/DeepSeek-R1 --local-dir /model-cache/deepseek-r1
          volumeMounts:
            - name: model-cache
-              mountPath: /model-store
+              mountPath: /model-cache
      volumes:
      - name: model-cache
        persistentVolumeClaim:
-          claimName: model-cache
\ No newline at end of file
+          claimName: model-cache-pvc
\ No newline at end of file
--- a/recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
+++ b/recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
@@ -7,7 +7,7 @@ metadata:
  name: sgl-dsr1-16gpu
 spec:
  pvcs:
-    - name: model-cache
+    - name: model-cache-pvc
      create: false
  services:
    Frontend:
@@ -34,8 +34,8 @@ spec:
        limits:
          gpu: "8"
      volumeMounts:
-        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+        - name: model-cache-pvc
+          mountPoint: /model-cache
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -55,7 +55,7 @@ spec:
            - dynamo.sglang
          args:
            - --model-path
-            - deepseek-ai/DeepSeek-R1
+            - /model-cache/deepseek-r1
            - --served-model-name
            - deepseek-ai/DeepSeek-R1
            - --tp
@@ -87,8 +87,8 @@ spec:
        limits:
          gpu: "8"
      volumeMounts:
-        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+        - name: model-cache-pvc
+          mountPoint: /model-cache
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -108,7 +108,7 @@ spec:
            - dynamo.sglang
          args:
            - --model-path
-            - deepseek-ai/DeepSeek-R1
+            - /model-cache/deepseek-r1
            - --served-model-name
            - deepseek-ai/DeepSeek-R1
            - --tp

--- a/recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml
+++ b/recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml
@@ -7,7 +7,7 @@ metadata:
  name: sgl-dsr1-8gpu
 spec:
  pvcs:
-    - name: model-cache
+    - name: model-cache-pvc
      create: false
  services:
    Frontend:
@@ -32,8 +32,8 @@ spec:
        limits:
          gpu: "8"
      volumeMounts:
-        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+        - name: model-cache-pvc
+          mountPoint: /model-cache
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -53,7 +53,7 @@ spec:
            - dynamo.sglang
          args:
            - --model-path
-            - deepseek-ai/DeepSeek-R1
+            - /model-cache/deepseek-r1
            - --served-model-name
            - deepseek-ai/DeepSeek-R1
            - --tp
@@ -81,8 +81,8 @@ spec:
        limits:
          gpu: "8"
      volumeMounts:
-        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+        - name: model-cache-pvc
+          mountPoint: /model-cache
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -102,7 +102,7 @@ spec:
            - dynamo.sglang
          args:
            - --model-path
-            - deepseek-ai/DeepSeek-R1
+            - /model-cache/deepseek-r1
            - --served-model-name
            - deepseek-ai/DeepSeek-R1
            - --tp

--- a/recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
+++ b/recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
--- a/recipes/deepseek-r1/trtllm/simple/agg.yaml
+++ b/recipes/deepseek-r1/trtllm/simple/agg.yaml
--- a/recipes/deepseek-r1/trtllm/wide_ep/dep16_agg.yaml
+++ b/recipes/deepseek-r1/trtllm/wide_ep/dep16_agg.yaml
--- a/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
+++ b/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
--- a/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml
+++ b/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml
@@ -11,7 +11,7 @@ moe_config:
  #   moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
  #   4096 = 256 * 16
  # moe_max_num_tokens: 4096
-  load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
+  load_balancer: /mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml

 tensor_parallel_size: 16
 moe_expert_parallel_size: 16

--- a/recipes/deepseek-r1/trtllm/mtp/mtp_decode.yaml
+++ b/recipes/deepseek-r1/trtllm/mtp/mtp_decode.yaml
--- a/recipes/deepseek-r1/trtllm/mtp/mtp_prefill.yaml
+++ b/recipes/deepseek-r1/trtllm/mtp/mtp_prefill.yaml
--- a/recipes/deepseek-r1/trtllm/simple/decode.yaml
+++ b/recipes/deepseek-r1/trtllm/simple/decode.yaml
--- a/recipes/deepseek-r1/trtllm/simple/prefill.yaml
+++ b/recipes/deepseek-r1/trtllm/simple/prefill.yaml
--- a/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
+++ b/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# moe_load_balancer settings for TRTLLM based on:
+# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer
+num_slots: 288
+layer_updates_per_iter: 2
--- a/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml
+++ b/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Instructions:
+# 1. First, create the model cache PersistentVolumeClaim
+#    (model-cache.yaml and model-download.yaml are located in recipes/deepseek-r1/model-cache/):
+#    kubectl apply -f model-cache.yaml -n <namespace>
+# 2. Download the model to the model cache:
+#    kubectl apply -f model-download.yaml -n <namespace>
+# 3. Once the above steps are complete, deploy the prefill and decode workers via this yaml:
+#    kubectl apply -f deploy.yaml -n <namespace>
+# 4. To benchmark the service, run:
+#    kubectl apply -f perf.yaml -n <namespace>
+
+# ConfigMap for prefill engine configuration
+# This configuration sets up a DEP 4 prefill worker
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prefill-config
+data:
+  prefill_config.yaml: |
+    build_config:
+        max_batch_size: 4
+        max_num_tokens: 4608
+        max_seq_len: 1227
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    cuda_graph_config: null
+    print_iter_log: true
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: DEFAULT
+---
+
+# ConfigMap for decode engine configuration
+# This configuration sets up a DEP 32 decode worker
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: decode-config
+data:
+  decode_config_dep32.yaml: |
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    build_config:
+        max_batch_size: 32
+        max_num_tokens: 32
+        max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 384
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: DEFAULT
+    stream_interval: 20
+---
+
+# NOTE: The numNodes value should equal the total number of nodes used by the prefill and decode
+#       workers defined below. A worker without a multinode section counts as 1 node; otherwise it
+#       counts as multinode.nodeCount nodes (here: 1 prefill node + 8 decode nodes = 9).
+#       For autoscaling deployments, the compute domain will automatically adjust as needed.
+apiVersion: resource.nvidia.com/v1beta1
+kind: ComputeDomain
+metadata:
+  name: trtllm-test-compute-domain
+spec:
+  numNodes: 9
+  channel:
+    resourceClaimTemplate:
+      name: trtllm-test-compute-domain-channel
+---
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: trtllm-disagg-multinode
+spec:
+  pvcs:
+    - name: model-cache-pvc
+      create: false
+  envs:
+    - name: NCCL_MNNVL_ENABLE
+      value: "1"
+    - name: NCCL_CUMEM_ENABLE
+      value: "1"
+    - name: TLLM_LOG_LEVEL
+      value: "info"
+    - name: TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER
+      value: "1"
+    - name: TRTLLM_ENABLE_PDL
+      value: "1"
+  backendFramework: trtllm
+  services:
+    Frontend:
+      dynamoNamespace: trtllm-disagg-multinode
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        tolerations: []
+        affinity: {}
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
+          args:
+          - |
+            python3 -m dynamo.frontend --http-port 8000
+          command:
+          - /bin/sh
+          - -c
+    prefill:
+      dynamoNamespace: trtllm-disagg-multinode
+      componentType: worker
+      replicas: 1
+      # NOTE: Prefill uses 1 node (no multinode section = single node)
+      #       and contributes to ComputeDomain.numNodes (see above)
+      volumeMounts:
+        - name: model-cache-pvc
+          mountPoint: /model-cache
+      sharedMemory:
+        size: 800Gi
+      resources:
+        requests:
+          cpu: "130"
+          memory: "850Gi"
+        limits:
+          cpu: "130"
+          memory: "850Gi"
+          gpu: "4"
+        claims:
+          - name: compute-domain-channel
+      extraPodSpec:
+        tolerations: []
+        affinity: {}
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
+          workingDir: /workspace/components/backends/trtllm
+          # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
+          #       you might need to increase 'failureThreshold' below to allow more time for startup
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 500
+          volumeMounts:
+            - name: prefill-config-volume
+              mountPath: /config
+          command:
+          - /bin/sh
+          - -c
+          args:
+          - >-
+            python3 -m dynamo.trtllm
+            --model-path /model-cache/deepseek-r1-fp4
+            --served-model-name deepseek-ai/DeepSeek-R1
+            --extra-engine-args /config/prefill_config.yaml
+            --disaggregation-mode prefill
+        resourceClaims:
+          - name: compute-domain-channel
+            resourceClaimTemplateName: trtllm-test-compute-domain-channel
+        volumes:
+          - name: prefill-config-volume
+            configMap:
+              name: prefill-config
+    decode:
+      dynamoNamespace: trtllm-disagg-multinode
+      componentType: worker
+      replicas: 1
+      volumeMounts:
+        - name: model-cache-pvc
+          mountPoint: /model-cache
+      multinode:
+        # NOTE: This nodeCount contributes to ComputeDomain.numNodes (see above)
+        nodeCount: 8
+      sharedMemory:
+        size: 800Gi
+      resources:
+        requests:
+          cpu: "130"
+          memory: "850Gi"
+        limits:
+          cpu: "130"
+          memory: "850Gi"
+          gpu: "4"
+        claims:
+          - name: compute-domain-channel
+      extraPodSpec:
+        tolerations: []
+        affinity: {}
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
+          workingDir: /workspace/components/backends/trtllm
+          # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
+          #       you might need to increase 'failureThreshold' below to allow more time for startup
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 500
+          volumeMounts:
+            - name: decode-config-volume
+              mountPath: /config
+          command:
+          - /bin/sh
+          - -c
+          args:
+          - >-
+            python3 -m dynamo.trtllm
+            --model-path /model-cache/deepseek-r1-fp4
+            --served-model-name deepseek-ai/DeepSeek-R1
+            --extra-engine-args /config/decode_config_dep32.yaml
+            --disaggregation-mode decode
+        resourceClaims:
+          - name: compute-domain-channel
+            resourceClaimTemplateName: trtllm-test-compute-domain-channel
+        volumes:
+          - name: decode-config-volume
+            configMap:
+              name: decode-config
--- a/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml
+++ b/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: deepseek-r1-bench
+spec:
+  backoffLimit: 1
+  completions: 1
+  parallelism: 1
+  template:
+    metadata:
+      labels:
+        app: deepseek-r1-bench
+    spec:
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                  - key: nvidia.com/dynamo-graph-deployment-name
+                    operator: In
+                    values:
+                      - trtllm-disagg-multinode
+              topologyKey: kubernetes.io/hostname
+      containers:
+      - command:
+        - /bin/sh
+        - -c
+        - |
+          apt-get update && apt-get install -y curl jq procps git && apt-get clean
+          pip install aiperf;
+          echo "aiperf installation completed";
+          sysctl -w net.ipv4.ip_local_port_range="1024 65000"
+          cat /proc/sys/net/ipv4/ip_local_port_range
+          export COLUMNS=200
+          EPOCH=$(date +%s)
+          ## utility functions -- can be moved to a bash script / configmap
+          # Block until $TARGET_MODEL is listed by the frontend's /v1/models endpoint.
+          # Reads TARGET_MODEL and ENDPOINT (host:port) from the environment, polls
+          # every 5 seconds with no overall deadline, then prints the model listing.
+          wait_for_model_ready() {
+            echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
+            # --max-time keeps a single stalled connection from blocking the poll loop forever.
+            # The loop condition is jq's exit status: -e makes it non-zero when no entry matches.
+            while ! curl -s --max-time 10 "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
+                echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
+                sleep 5
+            done
+            echo "Model '$TARGET_MODEL' is now available!"
+            curl -s --max-time 10 "http://$ENDPOINT/v1/models" | jq .
+          }
+          # Run one aiperf profiling pass against the frontend.
+          #   $1 - total request concurrency
+          #   $2 - input sequence length (synthetic input tokens mean, stddev 0)
+          #   $3 - output sequence length (output tokens mean, also sent as max/min tokens)
+          # Reads ROOT_ARTIFACT_DIR, EPOCH, JOB_NAME, TARGET_MODEL and ENDPOINT.
+          # Exports ARTIFACT_DIR (created if missing) and lists its contents when done.
+          run_perf() {
+            local concurrency=$1
+            local isl=$2
+            local osl=$3
+            # Keep the per-run key local, like the other function variables.
+            local key=concurrency_${concurrency}
+            export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
+            mkdir -p "$ARTIFACT_DIR"
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
+            # Expansions are quoted so an empty or unusual value cannot shift the
+            # remaining arguments. Sends 10x concurrency requests after a warmup
+            # of one request per concurrent slot.
+            aiperf profile --artifact-dir "$ARTIFACT_DIR" \
+                --model "$TARGET_MODEL" \
+                --tokenizer /model-cache/deepseek-r1-fp4  \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
+                --streaming \
+                --url "http://$ENDPOINT" \
+                --synthetic-input-tokens-mean "$isl" \
+                --synthetic-input-tokens-stddev 0 \
+                --output-tokens-mean "$osl" \
+                --output-tokens-stddev 0 \
+                --extra-inputs "max_tokens:$osl" \
+                --extra-inputs "min_tokens:$osl" \
+                --extra-inputs "ignore_eos:true" \
+                --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
+                --extra-inputs "repetition_penalty:1.0" \
+                --extra-inputs "temperature: 0.0" \
+                --concurrency "$concurrency" \
+                --request-count "$((10*concurrency))" \
+                --warmup-request-count "$concurrency" \
+                --conversation-num 12800 \
+                --random-seed 100 \
+                --workers-max 252 \
+                -H 'Authorization: Bearer NOT USED' \
+                -H 'Accept: text/event-stream' \
+                --record-processors 32 \
+                --ui simple
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
+            ls -la "$ARTIFACT_DIR"
+          }
+          #### Actual execution ####
+          wait_for_model_ready
+          mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
+          # Calculate total concurrency based on per-GPU concurrency and GPU count
+          TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
+          echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
+          # Write input_config.json
+          cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
+          {
+            "gpu_count": $DEPLOYMENT_GPU_COUNT,
+            "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
+            "total_concurrency": $TOTAL_CONCURRENCY,
+            "mode": "$DEPLOYMENT_MODE",
+            "isl": $ISL,
+            "osl": $OSL,
+            "endpoint": "$ENDPOINT",
+            "model endpoint": "$TARGET_MODEL"
+          }
+          EOF
+
+          # Run perf with calculated total concurrency
+          run_perf $TOTAL_CONCURRENCY $ISL $OSL
+          echo "done with concurrency $TOTAL_CONCURRENCY"
+        env:
+        - name: TARGET_MODEL
+          value: deepseek-ai/DeepSeek-R1
+        - name: ENDPOINT
+          value: trtllm-disagg-multinode-frontend:8000
+        - name: CONCURRENCY_PER_GPU
+          value: "30"
+        - name: DEPLOYMENT_GPU_COUNT
+          value: "36"
+        - name: ISL
+          value: "1024"
+        - name: OSL
+          value: "1024"
+        - name: DEPLOYMENT_MODE
+          value: disagg
+        - name: AIPERF_HTTP_CONNECTION_LIMIT
+          value: "252"
+        - name: JOB_NAME
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.labels['job-name']
+        - name: ROOT_ARTIFACT_DIR
+          value: /model-cache/perf
+        - name: HF_HOME
+          value: /model-cache
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        image: python:3.12-slim
+        imagePullPolicy: IfNotPresent
+        name: perf
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: model-cache
+          mountPath: /model-cache
+        workingDir: /workspace
+      imagePullSecrets:
+      - name: nvcrimagepullsecret
+      restartPolicy: Never
+      volumes:
+      - name: model-cache
+        persistentVolumeClaim:
+          claimName: model-cache-pvc
+