fix: update model recipe for llama-3 70b to match the common recipe template (#3637)

13fc3c65 · Biswa Panda · GitHub · 15a01f75 · 13fc3c65 · 13fc3c65
Unverified Commit 13fc3c65 authored Oct 15, 2025 by Biswa Panda Committed by GitHub Oct 15, 2025
7 changed files
--- a/recipes/llama-3-70b/model-cache/model-download.yaml
+++ b/recipes/llama-3-70b/model-cache/model-download.yaml
@@ -22,24 +22,22 @@ spec:
            - secretRef:
                name: hf-token-secret
          env:
-            # NOTE: This is the model name for the llama-3-70b model
-            # Update this to model name for the model you are downloading
            - name: MODEL_NAME
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-            - name: HF_TOKEN
+            - name: HF_HOME
-              valueFrom:
+              value: /model-store
-                secretKeyRef:
+            - name: HF_HUB_ENABLE_HF_TRANSFER
-                  name: hf-token-secret
+              value: "1"
-                  key: HF_TOKEN
+            - name: MODEL_REVISION
+              value: ddb4128556dfcff99e0c41aee159ea6c3e655dcd
          args:
            - |
              set -eux
              pip install --no-cache-dir huggingface_hub hf_transfer
-              export HF_HUB_ENABLE_HF_TRANSFER=1
+              hf download $MODEL_NAME --revision $MODEL_REVISION
-              huggingface-cli download $MODEL_NAME
          volumeMounts:
            - name: model-cache
-              mountPath: /root/.cache/huggingface/hub
+              mountPath: /model-store
      volumes:
      - name: model-cache
        persistentVolumeClaim:

--- a/recipes/llama-3-70b/vllm/agg/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/agg/deploy.yaml
@@ -32,8 +32,13 @@ spec:
        size: 20Gi
      extraPodSpec:
        mainContainer:
+          env:
+            - name: SERVED_MODEL_NAME
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: MODEL_PATH
+              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
          args:
-          - "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c
@@ -42,6 +47,6 @@ spec:
      replicas: 1
      resources:
        limits:
-          gpu: "8"
+          gpu: "4"
        requests:
-          gpu: "8"
+          gpu: "4"
\ No newline at end of file
--- a/recipes/llama-3-70b/vllm/agg/perf.yaml
+++ b/recipes/llama-3-70b/vllm/agg/perf.yaml
@@ -5,7 +5,7 @@ kind: Job
 metadata:
  name: llama3-70b-agg-perf
 spec:
-  backoffLimit: 3
+  backoffLimit: 1
  completions: 1
  parallelism: 1
  template:
@@ -15,58 +15,129 @@ spec:
    spec:
      restartPolicy: Never
      containers:
-      - name: perf
+      - command:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
-        workingDir: /workspace/components/backends/vllm
-        command:
        - /bin/sh
        - -c
        - |
-          # wait for the model to be ready
+          apt-get update && apt-get install -y curl jq procps git && apt-get clean
-          export ENDPOINT=llama3-70b-agg-0:8000
+          pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
-          export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
+          echo "aiperf installation completed";
-          export INTERVAL=5
+          sysctl -w net.ipv4.ip_local_port_range="1024 65000"
-          echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
+          cat /proc/sys/net/ipv4/ip_local_port_range
+          export COLUMNS=200
+          EPOCH=$(date +%s)
+          ## utility functions -- can be moved to a bash script / configmap
+          wait_for_model_ready() {
+            echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
            while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
-              echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
+                echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
-              sleep $INTERVAL
+                sleep 5
            done
            echo "✅ Model '$TARGET_MODEL' is now available!"
+            echo "Model '$TARGET_MODEL' is now available!"
            curl -s "http://$ENDPOINT/v1/models" | jq .
-          # now run the benchmark
+          }
-          export ARTIFACT_DIR="/tmp/genai"
+          run_perf() {
+            local concurrency=$1
+            local isl=$2
+            local osl=$3
+            local max_threads=${concurrency}
+            key=concurrency_${concurrency}
+            export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
            mkdir -p "$ARTIFACT_DIR"
-          echo "Running benchmark..."
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
-          export COLUMNS=200
+            aiperf profile --artifact-dir $ARTIFACT_DIR \
-          aiperf profile \
+                --model $TARGET_MODEL \
-            --model "$TARGET_MODEL" \
+                --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-            --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
+                --endpoint-type chat  --endpoint /v1/chat/completions \
-            --endpoint-type chat --url "$ENDPOINT" --streaming \
+                --streaming \
-            --concurrency 64 \
+                --url http://$ENDPOINT \
-            --warmup-request-count 2 \
+                --synthetic-input-tokens-mean $isl \
-            --request-count 320 \
-            --extra-inputs max_tokens:1024 \
-            --synthetic-input-tokens-mean 8192 \
                --synthetic-input-tokens-stddev 0 \
-            --output-tokens-mean 1024 \
+                --output-tokens-mean $osl \
                --output-tokens-stddev 0 \
-            --extra-inputs min_tokens:1024 \
+                --extra-inputs max_tokens:$osl \
+                --extra-inputs min_tokens:$osl \
                --extra-inputs ignore_eos:true \
+                --extra-inputs repetition_penalty:1.0 \
+                --extra-inputs temperature:0.0 \
                --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
-            --random-seed 1418186270 \
+                --concurrency $concurrency \
-            --artifact-dir $ARTIFACT_DIR \
+                --request-count $((10*concurrency)) \
-            --num-dataset-entries=3000 -- \
+                --warmup-request-count $concurrency \
-            --max-threads 64
+                --conversation-num 12800 \
-          echo "----------------json----------------"
+                --random-seed 100 \
-          PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
+                --workers-max $max_threads \
-          cat $PERF_JSON | jq .
+                -H 'Authorization: Bearer NOT USED' \
-          echo "----------------csv-----------------"
+                -H 'Accept: text/event-stream'\
-          PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
+                --record-processors 32 \
-          cat $PERF_CSV
+                --ui simple
-          echo "Benchmark completed successfully!"
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
+            ls -la $ARTIFACT_DIR
+          }
+          #### Actual execution ####
+          wait_for_model_ready
+          mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
+          # Calculate total concurrency based on per-GPU concurrency and GPU count
+          TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
+          echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
+          # Write input_config.json (run_perf's max_threads is function-local, so record TOTAL_CONCURRENCY, the value it is set to)
+          cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
+          {
+            "gpu_count": $DEPLOYMENT_GPU_COUNT,
+            "max_threads": $TOTAL_CONCURRENCY,
+            "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
+            "total_concurrency": $TOTAL_CONCURRENCY,
+            "mode": "$DEPLOYMENT_MODE",
+            "isl": $ISL,
+            "osl": $OSL,
+            "endpoint": "$ENDPOINT",
+            "model endpoint": "$TARGET_MODEL"
+          }
+          EOF
+          # Run perf with calculated total concurrency
+          run_perf $TOTAL_CONCURRENCY $ISL $OSL
+          echo "done with concurrency $TOTAL_CONCURRENCY"
+        env:
+        - name: TARGET_MODEL
+          value: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
+        - name: ENDPOINT
+          value: llama3-70b-agg-frontend:8000
+        - name: CONCURRENCY_PER_GPU
+          value: "16"
+        - name: DEPLOYMENT_GPU_COUNT
+          value: "4"
+        - name: ISL
+          value: "8192"
+        - name: OSL
+          value: "1024"
+        - name: DEPLOYMENT_MODE
+          value: agg
+        - name: AIPERF_HTTP_CONNECTION_LIMIT
+          value: "200"
+        - name: JOB_NAME
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.labels['job-name']
+        - name: ROOT_ARTIFACT_DIR
+          value: /root/.cache/huggingface/perf
+        - name: HF_HOME
+          value: /root/.cache/huggingface
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        image: python:3.12-slim
+        imagePullPolicy: IfNotPresent
+        name: perf
+        securityContext:
+          privileged: true
        volumeMounts:
        - name: model-cache
          mountPath: /root/.cache/huggingface
+        workingDir: /workspace
+      imagePullSecrets:
+      - name: nvcrimagepullsecret
      volumes:
      - name: model-cache
        persistentVolumeClaim:

--- a/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
@@ -32,8 +32,13 @@ spec:
        size: 80Gi
      extraPodSpec:
        mainContainer:
+          env:
+            - name: SERVED_MODEL_NAME
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: MODEL_PATH
+              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
          args:
-          - "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c
@@ -56,8 +61,13 @@ spec:
        size: 80Gi
      extraPodSpec:
        mainContainer:
+          env:
+            - name: SERVED_MODEL_NAME
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: MODEL_PATH
+              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
          args:
-          - "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c

--- a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
@@ -5,7 +5,7 @@ kind: Job
 metadata:
  name: llama3-70b-disagg-mn-perf
 spec:
-  backoffLimit: 3
+  backoffLimit: 1
  completions: 1
  parallelism: 1
  template:
@@ -15,58 +15,129 @@ spec:
    spec:
      restartPolicy: Never
      containers:
-      - name: perf
+      - command:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
-        workingDir: /workspace/components/backends/vllm
-        command:
        - /bin/sh
        - -c
        - |
-          # wait for the model to be ready
+          apt-get update && apt-get install -y curl jq procps git && apt-get clean
-          export ENDPOINT=llama3-70b-disagg-mn-frontend:8000
+          pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
-          export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
+          echo "aiperf installation completed";
-          export INTERVAL=5
+          sysctl -w net.ipv4.ip_local_port_range="1024 65000"
-          echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
+          cat /proc/sys/net/ipv4/ip_local_port_range
+          export COLUMNS=200
+          EPOCH=$(date +%s)
+          ## utility functions -- can be moved to a bash script / configmap
+          wait_for_model_ready() {
+            echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
            while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
-              echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
+                echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
-              sleep $INTERVAL
+                sleep 5
            done
            echo "✅ Model '$TARGET_MODEL' is now available!"
+            echo "Model '$TARGET_MODEL' is now available!"
            curl -s "http://$ENDPOINT/v1/models" | jq .
-          # now run the benchmark
+          }
-          export ARTIFACT_DIR="/tmp/genai"
+          run_perf() {
+            local concurrency=$1
+            local isl=$2
+            local osl=$3
+            local max_threads=${concurrency}
+            key=concurrency_${concurrency}
+            export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
            mkdir -p "$ARTIFACT_DIR"
-          echo "Running benchmark..."
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
-          export COLUMNS=200
+            aiperf profile --artifact-dir $ARTIFACT_DIR \
-          aiperf profile \
+                --model $TARGET_MODEL \
-            --model "$TARGET_MODEL" \
+                --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-            --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
+                --endpoint-type chat  --endpoint /v1/chat/completions \
-            --endpoint-type chat --url "$ENDPOINT" --streaming \
+                --streaming \
-            --concurrency 64 \
+                --url http://$ENDPOINT \
-            --warmup-request-count 2 \
+                --synthetic-input-tokens-mean $isl \
-            --request-count 320 \
-            --extra-inputs max_tokens:1024 \
-            --synthetic-input-tokens-mean 8192 \
                --synthetic-input-tokens-stddev 0 \
-            --output-tokens-mean 1024 \
+                --output-tokens-mean $osl \
                --output-tokens-stddev 0 \
-            --extra-inputs min_tokens:1024 \
+                --extra-inputs max_tokens:$osl \
+                --extra-inputs min_tokens:$osl \
                --extra-inputs ignore_eos:true \
+                --extra-inputs repetition_penalty:1.0 \
+                --extra-inputs temperature:0.0 \
                --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
-            --random-seed 1418186270 \
+                --concurrency $concurrency \
-            --artifact-dir $ARTIFACT_DIR \
+                --request-count $((10*concurrency)) \
-            --num-dataset-entries=3000 -- \
+                --warmup-request-count $concurrency \
-            --max-threads 64
+                --conversation-num 12800 \
-          echo "----------------json----------------"
+                --random-seed 100 \
-          PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
+                --workers-max $max_threads \
-          cat $PERF_JSON | jq .
+                -H 'Authorization: Bearer NOT USED' \
-          echo "----------------csv-----------------"
+                -H 'Accept: text/event-stream'\
-          PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
+                --record-processors 32 \
-          cat $PERF_CSV
+                --ui simple
-          echo "Benchmark completed successfully!"
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
+            ls -la $ARTIFACT_DIR
+          }
+          #### Actual execution ####
+          wait_for_model_ready
+          mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
+          # Calculate total concurrency based on per-GPU concurrency and GPU count
+          TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
+          echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
+          # Write input_config.json (run_perf's max_threads is function-local, so record TOTAL_CONCURRENCY, the value it is set to)
+          cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
+          {
+            "gpu_count": $DEPLOYMENT_GPU_COUNT,
+            "max_threads": $TOTAL_CONCURRENCY,
+            "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
+            "total_concurrency": $TOTAL_CONCURRENCY,
+            "mode": "$DEPLOYMENT_MODE",
+            "isl": $ISL,
+            "osl": $OSL,
+            "endpoint": "$ENDPOINT",
+            "model endpoint": "$TARGET_MODEL"
+          }
+          EOF
+          # Run perf with calculated total concurrency
+          run_perf $TOTAL_CONCURRENCY $ISL $OSL
+          echo "done with concurrency $TOTAL_CONCURRENCY"
+        env:
+        - name: TARGET_MODEL
+          value: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
+        - name: ENDPOINT
+          value: llama3-70b-disagg-mn-frontend:8000
+        - name: CONCURRENCY_PER_GPU
+          value: "16"
+        - name: DEPLOYMENT_GPU_COUNT
+          value: "16"
+        - name: ISL
+          value: "8192"
+        - name: OSL
+          value: "1024"
+        - name: DEPLOYMENT_MODE
+          value: disagg-mn
+        - name: AIPERF_HTTP_CONNECTION_LIMIT
+          value: "200"
+        - name: JOB_NAME
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.labels['job-name']
+        - name: ROOT_ARTIFACT_DIR
+          value: /root/.cache/huggingface/perf
+        - name: HF_HOME
+          value: /root/.cache/huggingface
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        image: python:3.12-slim
+        imagePullPolicy: IfNotPresent
+        name: perf
+        securityContext:
+          privileged: true
        volumeMounts:
        - name: model-cache
          mountPath: /root/.cache/huggingface
+        workingDir: /workspace
+      imagePullSecrets:
+      - name: nvcrimagepullsecret
      volumes:
      - name: model-cache
        persistentVolumeClaim:

--- a/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
@@ -42,8 +42,13 @@ spec:
                        - worker
                topologyKey: kubernetes.io/hostname
        mainContainer:
+          env:
+            - name: SERVED_MODEL_NAME
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: MODEL_PATH
+              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
          args:
-          - "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c
@@ -76,8 +81,13 @@ spec:
                        - worker
                topologyKey: kubernetes.io/hostname
        mainContainer:
+          env:
+            - name: SERVED_MODEL_NAME
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: MODEL_PATH
+              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
          args:
-          - "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c

--- a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
@@ -5,7 +5,7 @@ kind: Job
 metadata:
  name: llama3-70b-disagg-sn-perf
 spec:
-  backoffLimit: 3
+  backoffLimit: 1
  completions: 1
  parallelism: 1
  template:
@@ -15,58 +15,129 @@ spec:
    spec:
      restartPolicy: Never
      containers:
-      - name: perf
+      - command:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
-        workingDir: /workspace/components/backends/vllm
-        command:
        - /bin/sh
        - -c
        - |
-          # wait for the model to be ready
+          apt-get update && apt-get install -y curl jq procps git && apt-get clean
-          export ENDPOINT=llama3-70b-disagg-sn-frontend:8000
+          pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
-          export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
+          echo "aiperf installation completed";
-          export INTERVAL=5
+          sysctl -w net.ipv4.ip_local_port_range="1024 65000"
-          echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
+          cat /proc/sys/net/ipv4/ip_local_port_range
+          export COLUMNS=200
+          EPOCH=$(date +%s)
+          ## utility functions -- can be moved to a bash script / configmap
+          wait_for_model_ready() {
+            echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
            while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
-              echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
+                echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
-              sleep $INTERVAL
+                sleep 5
            done
            echo "✅ Model '$TARGET_MODEL' is now available!"
+            echo "Model '$TARGET_MODEL' is now available!"
            curl -s "http://$ENDPOINT/v1/models" | jq .
-          # now run the benchmark
+          }
-          export ARTIFACT_DIR="/tmp/genai-$RANDOM"
+          run_perf() {
+            local concurrency=$1
+            local isl=$2
+            local osl=$3
+            local max_threads=${concurrency}
+            key=concurrency_${concurrency}
+            export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
            mkdir -p "$ARTIFACT_DIR"
-          echo "Running benchmark..."
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
-          export COLUMNS=200
+            aiperf profile --artifact-dir $ARTIFACT_DIR \
-          aiperf profile \
+                --model $TARGET_MODEL \
-            --model "$TARGET_MODEL" \
+                --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-            --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
+                --endpoint-type chat  --endpoint /v1/chat/completions \
-            --endpoint-type chat --url "$ENDPOINT" --streaming \
+                --streaming \
-            --concurrency 64 \
+                --url http://$ENDPOINT \
-            --warmup-request-count 2 \
+                --synthetic-input-tokens-mean $isl \
-            --request-count 320 \
-            --extra-inputs max_tokens:1024 \
-            --synthetic-input-tokens-mean 8192 \
                --synthetic-input-tokens-stddev 0 \
-            --output-tokens-mean 1024 \
+                --output-tokens-mean $osl \
                --output-tokens-stddev 0 \
-            --extra-inputs min_tokens:1024 \
+                --extra-inputs max_tokens:$osl \
+                --extra-inputs min_tokens:$osl \
                --extra-inputs ignore_eos:true \
+                --extra-inputs repetition_penalty:1.0 \
+                --extra-inputs temperature:0.0 \
                --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
-            --random-seed 1418186270 \
+                --concurrency $concurrency \
-            --artifact-dir $ARTIFACT_DIR \
+                --request-count $((10*concurrency)) \
-            --num-dataset-entries=3000 -- \
+                --warmup-request-count $concurrency \
-            --max-threads 64
+                --conversation-num 12800 \
-          echo "----------------json----------------"
+                --random-seed 100 \
-          PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
+                --workers-max $max_threads \
-          cat $PERF_JSON | jq .
+                -H 'Authorization: Bearer NOT USED' \
-          echo "----------------csv-----------------"
+                -H 'Accept: text/event-stream'\
-          PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
+                --record-processors 32 \
-          cat $PERF_CSV
+                --ui simple
-          echo "Benchmark completed successfully!"
+            echo "ARTIFACT_DIR: $ARTIFACT_DIR"
+            ls -la $ARTIFACT_DIR
+          }
+          #### Actual execution ####
+          wait_for_model_ready
+          mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
+          # Calculate total concurrency based on per-GPU concurrency and GPU count
+          TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
+          echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
+          # Write input_config.json (run_perf's max_threads is function-local, so record TOTAL_CONCURRENCY, the value it is set to)
+          cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
+          {
+            "gpu_count": $DEPLOYMENT_GPU_COUNT,
+            "max_threads": $TOTAL_CONCURRENCY,
+            "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
+            "total_concurrency": $TOTAL_CONCURRENCY,
+            "mode": "$DEPLOYMENT_MODE",
+            "isl": $ISL,
+            "osl": $OSL,
+            "endpoint": "$ENDPOINT",
+            "model endpoint": "$TARGET_MODEL"
+          }
+          EOF
+          # Run perf with calculated total concurrency
+          run_perf $TOTAL_CONCURRENCY $ISL $OSL
+          echo "done with concurrency $TOTAL_CONCURRENCY"
+        env:
+        - name: TARGET_MODEL
+          value: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
+        - name: ENDPOINT
+          value: llama3-70b-disagg-sn-frontend:8000
+        - name: CONCURRENCY_PER_GPU
+          value: "16"
+        - name: DEPLOYMENT_GPU_COUNT
+          value: "8"
+        - name: ISL
+          value: "8192"
+        - name: OSL
+          value: "1024"
+        - name: DEPLOYMENT_MODE
+          value: disagg-sn
+        - name: AIPERF_HTTP_CONNECTION_LIMIT
+          value: "200"
+        - name: JOB_NAME
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.labels['job-name']
+        - name: ROOT_ARTIFACT_DIR
+          value: /root/.cache/huggingface/perf
+        - name: HF_HOME
+          value: /root/.cache/huggingface
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        image: python:3.12-slim
+        imagePullPolicy: IfNotPresent
+        name: perf
+        securityContext:
+          privileged: true
        volumeMounts:
        - name: model-cache
          mountPath: /root/.cache/huggingface
+        workingDir: /workspace
+      imagePullSecrets:
+      - name: nvcrimagepullsecret
      volumes:
      - name: model-cache
        persistentVolumeClaim: