"lib/bindings/c/vscode:/vscode.git/clone" did not exist on "432fae67b17b9aeb18d19fe2cfb595bd75e34938"
Unverified Commit 23033136 authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat: add single-liner deployment and benchmarking recipe for llama3-70b models (#2792)


Signed-off-by: default avatarBiswa Panda <biswa.panda@gmail.com>
parent dcd331ab
# Dynamo model serving recipes
| Model family | Backend | Mode | Deployment | Benchmark |
|---------------|---------|---------------------|------------|-----------|
| llama-3-70b | vllm | agg | ✓ | ✓ |
| llama-3-70b | vllm | disagg-multi-node | ✓ | ✓ |
| llama-3-70b | vllm | disagg-single-node | ✓ | ✓ |
| oss-gpt | trtllm | aggregated | ✓ | ✓ |
| DeepSeek-R1 | sglang | disaggregated | 🚧 | 🚧 |
## Prerequisites
1. Create a namespace and populate NAMESPACE environment variable
This environment variable is used in later steps to deploy and perf-test the model.
```bash
export NAMESPACE=your-namespace
kubectl create namespace ${NAMESPACE}
```
2. **Dynamo Cloud Platform installed** - Follow [Quickstart Guide](../docs/guides/dynamo_deploy/README.md)
3. **Kubernetes cluster with GPU support**
4. **Container registry access** for vLLM runtime images
5. **HuggingFace token secret** (referenced as `envFromSecret: hf-token-secret`)
Update the `hf_hub_secret/hf_hub_secret.yaml` file with your HuggingFace token, then apply it to your namespace:
```bash
kubectl apply -f hf_hub_secret/hf_hub_secret.yaml -n ${NAMESPACE}
```
6. (Optional) Create a shared model cache pvc to store the model weights.
Choose a storage class to create the model cache pvc. You'll need to use this storage class name to update the `storageClass` field in the model-cache/model-cache.yaml file.
```bash
kubectl get storageclass
```
## Running the recipes
Run the recipe to deploy a model:
```bash
./run.sh --model <model> --framework <framework> <deployment-type>
```
Arguments:
<deployment-type> Deployment type (e.g., agg, disagg-single-node, disagg-multi-node)
Required Options:
--model <model> Model name (e.g., llama-3-70b)
--framework <fw> Framework one of VLLM TRTLLM SGLANG (default: VLLM)
Optional:
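--namespace <ns> Kubernetes namespace (default: dynamo)
--skip-model-cache Skip model downloading (assumes model cache already exists)
--dry-run Print commands without executing them
-h, --help Show this help message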
Environment Variables:
NAMESPACE Kubernetes namespace (default: dynamo)
Examples:
./run.sh --model llama-3-70b --framework vllm agg
./run.sh --skip-model-cache --model llama-3-70b --framework vllm agg
./run.sh --model llama-3-70b --framework trtllm disagg-single-node
Example:
```bash
./run.sh --model llama-3-70b --framework vllm agg
```
## Dry run mode
To dry run the recipe, add the `--dry-run` flag.
```bash
./run.sh --dry-run --model llama-3-70b --framework vllm agg
```
## (Optional) Running the recipes with model cache
You may need to cache the model weights on a PVC to avoid repeated downloads of the model weights.
See the [Prerequisites](#prerequisites) section for more details.
```bash
./run.sh --skip-model-cache --model llama-3-70b --framework vllm agg
```
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Benchmark Job: waits until the model is served, then runs `aiperf profile`
# once for every concurrency level listed in CONCURRENCIES.
apiVersion: batch/v1
kind: Job
metadata:
name: oss-gpt120b-bench
spec:
backoffLimit: 1
completions: 1
parallelism: 1
template:
metadata:
labels:
app: oss-gpt120b
spec:
restartPolicy: Never
containers:
- name: perf
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:aiperf-0637181
workingDir: /workspace/components/backends/vllm
env:
# Model id the script looks for in /v1/models and passes to aiperf --model.
- name: TARGET_MODEL
value: openai/gpt-oss-120b
# host:port of the served endpoint; the script prefixes it with http://.
- name: ENDPOINT
value: gpt-oss-agg-trtllmworker:8000
# Space-separated concurrency levels; the script makes one aiperf run per entry.
# NOTE(review): 1400 breaks the ascending 13000/13500 sequence - confirm it is not a typo.
- name: CONCURRENCIES
value: "13000 13500 1400"
# Mean synthetic input (ISL) and output (OSL) token counts handed to aiperf.
- name: ISL
value: "16"
- name: OSL
value: "1000"
# The next two values are only recorded in input_config.json by the script.
# NOTE(review): the gpt-oss deployment in this change requests 4 GPUs with
# 1 replica, yet 32 is recorded here - confirm the intended value.
- name: DEPLOYMENT_MODE
value: "agg"
- name: DEPLOYMENT_GPU_COUNT
value: "32"
# Taken from the pod's job-name label; used to build the artifact directory path.
- name: JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['job-name']
# Located under the model-cache volume mount (/root/.cache/huggingface),
# so benchmark artifacts are written to that volume.
- name: ROOT_ARTIFACT_DIR
value: /root/.cache/huggingface/hub/perf
# The whole benchmark is an inline /bin/sh script.
command:
- /bin/sh
- -c
- |
#TODO: this can be baked into the aiperf image
apt-get update && apt-get install -y curl jq
export COLUMNS=200
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
# Block until the served model is listed by the /v1/models endpoint, then
# print the endpoint's full model list.
# Globals:   TARGET_MODEL (read) - model id to look for
#            ENDPOINT (read)     - host:port of the endpoint
# Outputs:   progress messages and the final model list on stdout
# Note:      polls every 5 seconds with no upper bound on the number of tries.
wait_for_model_ready() {
  echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
  # jq -e exits non-zero when no entry matches, so the same test also covers
  # an unreachable endpoint or a reply that is not JSON.
  while ! curl -s "http://$ENDPOINT/v1/models" \
    | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
    echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting 5s..."
    sleep 5
  done
  # Announce readiness once (the message used to be printed twice).
  echo "✅ Model '$TARGET_MODEL' is now available!"
  curl -s "http://$ENDPOINT/v1/models" | jq .
}
# Run one `aiperf profile` pass at a fixed concurrency and list the artifacts.
# Globals:   ROOT_ARTIFACT_DIR, EPOCH, JOB_NAME, TARGET_MODEL, ENDPOINT (read)
#            ARTIFACT_DIR (exported) - per-concurrency output directory
# Arguments: $1 - concurrency level
#            $2 - mean synthetic input token count
#            $3 - output token count (also sent as max_tokens/min_tokens)
# Outputs:   aiperf output, then the artifact directory path and its listing
run_perf() {
  local concurrency=$1
  local isl=$2
  local osl=$3
  # Kept local so the helper does not leak a global named "key".
  local key="concurrency_${concurrency}"
  export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
  mkdir -p "$ARTIFACT_DIR"
  # Three requests per concurrent slot, plus one warm-up request per slot.
  # Every continuation backslash is preceded by a space so that no argument
  # can fuse with the option on the following line.
  aiperf profile --artifact-dir "$ARTIFACT_DIR" \
    --model "$TARGET_MODEL" \
    --tokenizer ~/.cache/huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \
    --endpoint-type chat \
    --endpoint /v1/chat/completions \
    --streaming \
    --url "http://$ENDPOINT" \
    --synthetic-input-tokens-mean "$isl" \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean "$osl" \
    --output-tokens-stddev 0 \
    --extra-inputs "{\"max_tokens\":$osl}" \
    --extra-inputs "{\"min_tokens\":$osl}" \
    --extra-inputs "{\"ignore_eos\":true}" \
    --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
    --concurrency "$concurrency" \
    --request-count "$((3*concurrency))" \
    --warmup-request-count "$concurrency" \
    --conversation-num 1 \
    --random-seed 100 \
    --request-rate 100000 \
    --workers-max 128 \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream' \
    --record-processors 32 \
    --ui simple
  echo "ARTIFACT_DIR: $ARTIFACT_DIR"
  ls -la "$ARTIFACT_DIR"
}
#### Actual execution ####
# Do not start measuring until the endpoint lists the target model.
wait_for_model_ready
# One directory per job run (start epoch + job name); each concurrency level
# gets its own sub-directory inside it, created by run_perf.
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Write input_config.json
# Records the run parameters next to the results. The heredoc delimiter is
# unquoted, so the variables below are expanded when the file is written;
# the numeric ones are emitted as bare JSON numbers.
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf for each concurrency
# CONCURRENCIES is left unquoted on purpose: it is a space-separated list and
# relies on word splitting. A 10 second pause follows every run.
for concurrency in $CONCURRENCIES; do
run_perf $concurrency $ISL $OSL
sleep 10
done
# The claim is mounted as the HuggingFace cache root; the script reads the
# tokenizer snapshot from it and writes benchmark artifacts under hub/perf.
# NOTE(review): the gpt-oss deployment in this change mounts a claim called
# model-cache-oss-gpt120b, while this Job uses model-cache - confirm both
# refer to the volume that actually holds the tokenizer snapshot.
volumeMounts:
- name: model-cache
mountPath: /root/.cache/huggingface
imagePullSecrets:
- name: nvcrimagepullsecret
volumes:
- name: model-cache
persistentVolumeClaim:
claimName: model-cache
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# ConfigMap holding a single config.yaml document.
# Presumably this is the engine configuration passed to the TensorRT-LLM
# worker via --extra-engine-args - TODO confirm.
# NOTE(review): no manifest in this change mounts this ConfigMap; the gpt-oss
# worker reads /root/.cache/huggingface/gpt-oss-120b/config.yaml from its
# volume by default, so verify how this content gets there.
# The batch size (640), token limit (20000) and free GPU memory fraction (0.9)
# below repeat the values the worker also passes as command-line flags.
apiVersion: v1
kind: ConfigMap
metadata:
name: llm-config
data:
config.yaml: |
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: true
build_config:
max_batch_size: 640
max_num_tokens: 20000
moe_config:
backend: CUTLASS
cuda_graph_config:
max_batch_size: 640
enable_padding: true
kv_cache_config:
free_gpu_memory_fraction: 0.9
enable_block_reuse: false
print_iter_log: false
stream_interval: 50
use_torch_sampler: true
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# DynamoGraphDeployment with a single TrtllmWorker service (1 replica, 4 GPUs).
# The container's shell script starts the HTTP frontend in the background and
# the TensorRT-LLM worker in the foreground, so both run in the same container.
# NOTE(review): the benchmark Job and the Service in this change address the
# worker as gpt-oss-agg-trtllmworker, while this deployment is called
# gpt-oss-agg-shm - confirm the resulting pod labels match that selector.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: gpt-oss-agg-shm
spec:
backendFramework: trtllm
services:
TrtllmWorker:
componentType: main
dynamoNamespace: gpt-oss-agg-shm
envFromSecret: hf-token-secret
# Uses an existing claim (create: false) mounted as the HuggingFace cache root.
# NOTE(review): the other manifests in this change use a claim called
# model-cache - confirm this one is created elsewhere.
pvc:
create: false
name: model-cache-oss-gpt120b
mountPoint: /root/.cache/huggingface
sharedMemory:
size: 80Gi
extraPodSpec:
# Tolerate nodes tainted dedicated=user-workload for both scheduling and eviction.
tolerations:
- key: "dedicated"
operator: "Equal"
value: "user-workload"
effect: "NoSchedule"
- key: "dedicated"
operator: "Equal"
value: "user-workload"
effect: "NoExecute"
# Only schedule onto nodes labelled nvidia.com/gpu.present=true.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu.present
operator: In
values:
- "true"
mainContainer:
# Script run through the /bin/sh -c command declared after it.
# ENGINE_ARGS, MODEL_PATH and SERVED_MODEL_NAME can be overridden through the
# environment (ENGINE_ARGS via AGG_ENGINE_ARGS); the defaults point at files
# on the mounted volume. The EXIT trap kills the whole process group so the
# background frontend does not outlive the worker.
args:
- |
export TRTLLM_ENABLE_PDL=1
export TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True
export ENGINE_ARGS=${AGG_ENGINE_ARGS:-"/root/.cache/huggingface/gpt-oss-120b/config.yaml"}
export MODEL_PATH=${MODEL_PATH:-"/root/.cache/huggingface/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
trap 'echo Cleaning up...; kill 0' EXIT
python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$ENGINE_ARGS" \
--max-num-tokens 20000 \
--max-batch-size 640 \
--free-gpu-memory-fraction 0.9
command:
- /bin/sh
- -c
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:gpt-oss-dynamo-nvl72-debug-trtllm-tot
workingDir: /workspace/components/backends/trtllm
replicas: 1
resources:
limits:
gpu: "4"
requests:
gpu: "4"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Service exposing TCP port 8000 of the pods labelled
# nvidia.com/selector=gpt-oss-agg-trtllmworker. The gpt-oss benchmark Job in
# this change reaches the model through this Service (its ENDPOINT value).
# NOTE(review): verify that the worker pods really carry this label; the
# deployment in this change is called gpt-oss-agg-shm.
apiVersion: v1
kind: Service
metadata:
name: gpt-oss-agg-trtllmworker
spec:
selector:
nvidia.com/selector: gpt-oss-agg-trtllmworker
ports:
- protocol: TCP
port: 8000
targetPort: 8000
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Secret that provides the HuggingFace token as HF_TOKEN. The deployments
# reference it through envFromSecret and the model-download Job through
# envFrom / secretKeyRef.
# The value below is a placeholder: replace it with a real token before
# applying, and do not commit the real token.
apiVersion: v1
kind: Secret
metadata:
name: hf-token-secret
type: Opaque
stringData:
HF_TOKEN: "<Huggingface token with access to the model>"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Shared model cache volume claim (100Gi, ReadWriteMany) that the llama-3-70b
# download Job, deployments and benchmark Jobs mount.
# storageClassName is a placeholder: set it to a storage class available in
# your cluster (see `kubectl get storageclass`) before applying.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: model-cache
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 100Gi
storageClassName: "your-storage-class-name"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Job that pre-populates the shared model cache volume with the model weights
# by running `huggingface-cli download` in a plain Python image.
apiVersion: batch/v1
kind: Job
metadata:
  name: model-download
spec:
  backoffLimit: 3
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: model-download
    spec:
      restartPolicy: Never
      containers:
        - name: model-download
          image: python:3.10-slim
          command: ["sh", "-c"]
          envFrom:
            - secretRef:
                name: hf-token-secret
          env:
            # NOTE: This is the model name for the llama-3-70b model
            # Update this to model name for the model you are downloading
            - name: MODEL_NAME
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            # Also supplied by envFrom above; kept explicit so the dependency
            # on the HF_TOKEN key is visible.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: HF_TOKEN
          args:
            - |
              set -eux
              pip install --no-cache-dir huggingface_hub hf_transfer
              export HF_HUB_ENABLE_HF_TRANSFER=1
              huggingface-cli download "$MODEL_NAME"
          volumeMounts:
            # Mounted as the HuggingFace cache root, exactly like the
            # deployments and benchmark Jobs do, so the download lands in
            # hub/models--... on the volume - the path those consumers read.
            # Mounting at .../huggingface/hub instead would place the files at
            # the volume root, where the consumers do not look.
            - name: model-cache
              mountPath: /root/.cache/huggingface
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Aggregated vLLM deployment: one frontend and one worker service that runs
# the model with tensor parallel size 8 on 8 GPUs.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: llama3-70b-agg
spec:
backendFramework: vllm
services:
# Frontend: uses the runtime image defaults (no command/args override) and
# mounts the existing model-cache claim.
Frontend:
componentType: frontend
dynamoNamespace: llama3-70b-agg
pvc:
create: false
name: model-cache
mountPoint: /root/.cache/huggingface
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
workingDir: /workspace/components/backends/vllm
replicas: 1
# NOTE(review): this service key says "Prefill", but the command below does
# not pass --is-prefill-worker (the disaggregated recipes do) - confirm the
# key is only a label and nothing depends on it.
VllmPrefillWorker:
componentType: worker
dynamoNamespace: llama3-70b-agg
envFromSecret: hf-token-secret
pvc:
create: false
name: model-cache
mountPoint: /root/.cache/huggingface
sharedMemory:
size: 20Gi
extraPodSpec:
mainContainer:
# Single shell command line executed through /bin/sh -c.
args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
workingDir: /workspace/components/backends/vllm
replicas: 1
# GPU count matches --tensor-parallel-size 8 above.
resources:
limits:
gpu: "8"
requests:
gpu: "8"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Benchmark Job for the aggregated llama3-70b deployment: waits until the
# frontend lists the model, runs one fixed genai-perf profile, and prints the
# JSON and CSV exports to the pod log.
apiVersion: batch/v1
kind: Job
metadata:
  name: llama3-70b-agg-perf
spec:
  backoffLimit: 3
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: llama3-70b-agg-perf
    spec:
      restartPolicy: Never
      containers:
        - name: perf
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
            - -c
            - |
              # Stop at the first failing command so that a failed benchmark
              # fails the pod instead of reaching the success message below.
              set -e
              # wait for the model to be ready
              # Frontend service of the llama3-70b-agg deployment; same
              # <deployment>-frontend form the disaggregated perf Jobs use.
              export ENDPOINT=llama3-70b-agg-frontend:8000
              export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
              export INTERVAL=5
              echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
              # jq -e is non-zero until the model id shows up; as a loop
              # condition it is not subject to set -e.
              while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
                echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
                sleep "$INTERVAL"
              done
              echo "✅ Model '$TARGET_MODEL' is now available!"
              curl -s "http://$ENDPOINT/v1/models" | jq .
              # now run the benchmark
              export ARTIFACT_DIR="/tmp/genai"
              mkdir -p "$ARTIFACT_DIR"
              echo "Running benchmark..."
              export COLUMNS=200
              genai-perf profile \
                --model "$TARGET_MODEL" \
                --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
                --endpoint-type chat --url "$ENDPOINT" --streaming \
                --concurrency 64 \
                --warmup-request-count 2 \
                --request-count 320 \
                --extra-inputs max_tokens:1024 \
                --synthetic-input-tokens-mean 8192 \
                --synthetic-input-tokens-stddev 0 \
                --output-tokens-mean 1024 \
                --output-tokens-stddev 0 \
                --extra-inputs min_tokens:1024 \
                --extra-inputs ignore_eos:true \
                --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
                --random-seed 1418186270 \
                --artifact-dir "$ARTIFACT_DIR" \
                --num-dataset-entries=3000 -- \
                --max-threads 64
              # Print every export found; -exec keeps paths intact without
              # relying on word splitting of a variable.
              echo "----------------json----------------"
              find "$ARTIFACT_DIR" -name profile_export_genai_perf.json -exec jq . {} +
              echo "----------------csv-----------------"
              find "$ARTIFACT_DIR" -name profile_export_genai_perf.csv -exec cat {} +
              echo "Benchmark completed successfully!"
          volumeMounts:
            - name: model-cache
              mountPath: /root/.cache/huggingface
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Disaggregated vLLM deployment ("mn" presumably stands for multi-node -
# TODO confirm): one frontend, one prefill worker and one decode worker.
# Each worker requests 8 GPUs and runs with tensor parallel size 8; no
# affinity rules are set here.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: llama3-70b-disagg-mn
spec:
backendFramework: vllm
services:
# Frontend: runtime image defaults, existing model-cache claim mounted.
Frontend:
componentType: frontend
dynamoNamespace: llama3-70b-disagg-mn
pvc:
create: false
name: model-cache
mountPoint: /root/.cache/huggingface
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
workingDir: /workspace/components/backends/vllm
replicas: 1
# Prefill worker: started with --is-prefill-worker and GPU memory utilization 0.95.
VllmPrefillWorker:
componentType: worker
dynamoNamespace: llama3-70b-disagg-mn
envFromSecret: hf-token-secret
pvc:
create: false
name: model-cache
mountPoint: /root/.cache/huggingface
sharedMemory:
size: 80Gi
extraPodSpec:
mainContainer:
args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
workingDir: /workspace/components/backends/vllm
replicas: 1
resources:
limits:
gpu: "8"
requests:
gpu: "8"
# Decode worker: same model and parallelism, without --is-prefill-worker and
# with GPU memory utilization 0.90.
VllmDecodeWorker:
componentType: worker
dynamoNamespace: llama3-70b-disagg-mn
envFromSecret: hf-token-secret
pvc:
create: false
name: model-cache
mountPoint: /root/.cache/huggingface
sharedMemory:
size: 80Gi
extraPodSpec:
mainContainer:
args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
workingDir: /workspace/components/backends/vllm
replicas: 1
resources:
limits:
gpu: "8"
requests:
gpu: "8"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Benchmark Job for the llama3-70b-disagg-mn deployment: waits until the
# frontend lists the model, runs one fixed genai-perf profile, and prints the
# JSON and CSV exports to the pod log.
apiVersion: batch/v1
kind: Job
metadata:
  name: llama3-70b-disagg-mn-perf
spec:
  backoffLimit: 3
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: llama3-70b-disagg-mn-perf
    spec:
      restartPolicy: Never
      containers:
        - name: perf
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
            - -c
            - |
              # Stop at the first failing command so that a failed benchmark
              # fails the pod instead of reaching the success message below.
              set -e
              # wait for the model to be ready
              export ENDPOINT=llama3-70b-disagg-mn-frontend:8000
              export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
              export INTERVAL=5
              echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
              # jq -e is non-zero until the model id shows up; as a loop
              # condition it is not subject to set -e.
              while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
                echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
                sleep "$INTERVAL"
              done
              echo "✅ Model '$TARGET_MODEL' is now available!"
              curl -s "http://$ENDPOINT/v1/models" | jq .
              # now run the benchmark
              export ARTIFACT_DIR="/tmp/genai"
              mkdir -p "$ARTIFACT_DIR"
              echo "Running benchmark..."
              export COLUMNS=200
              genai-perf profile \
                --model "$TARGET_MODEL" \
                --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
                --endpoint-type chat --url "$ENDPOINT" --streaming \
                --concurrency 64 \
                --warmup-request-count 2 \
                --request-count 320 \
                --extra-inputs max_tokens:1024 \
                --synthetic-input-tokens-mean 8192 \
                --synthetic-input-tokens-stddev 0 \
                --output-tokens-mean 1024 \
                --output-tokens-stddev 0 \
                --extra-inputs min_tokens:1024 \
                --extra-inputs ignore_eos:true \
                --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
                --random-seed 1418186270 \
                --artifact-dir "$ARTIFACT_DIR" \
                --num-dataset-entries=3000 -- \
                --max-threads 64
              # Print every export found; -exec keeps paths intact without
              # relying on word splitting of a variable.
              echo "----------------json----------------"
              find "$ARTIFACT_DIR" -name profile_export_genai_perf.json -exec jq . {} +
              echo "----------------csv-----------------"
              find "$ARTIFACT_DIR" -name profile_export_genai_perf.csv -exec cat {} +
              echo "Benchmark completed successfully!"
          volumeMounts:
            - name: model-cache
              mountPath: /root/.cache/huggingface
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Disaggregated vLLM deployment ("sn" presumably stands for single-node -
# TODO confirm): one frontend, two prefill worker replicas with 2 GPUs each
# and one decode worker with 4 GPUs, i.e. 8 GPUs requested in total.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: llama3-70b-disagg-sn
spec:
backendFramework: vllm
services:
# Frontend: runtime image defaults, existing model-cache claim mounted.
Frontend:
componentType: frontend
dynamoNamespace: llama3-70b-disagg-sn
pvc:
create: false
name: model-cache
mountPoint: /root/.cache/huggingface
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
workingDir: /workspace/components/backends/vllm
replicas: 1
# Prefill workers: 2 replicas, tensor parallel size 2, --is-prefill-worker.
VllmPrefillWorker:
componentType: worker
dynamoNamespace: llama3-70b-disagg-sn
envFromSecret: hf-token-secret
pvc:
create: false
name: model-cache
mountPoint: /root/.cache/huggingface
sharedMemory:
size: 80Gi
extraPodSpec:
# Require scheduling on a host that already runs a pod labelled
# nvidia.com/dynamo-component-type=worker, which keeps the workers together
# on one node. Presumably the operator puts this label on every worker pod -
# verify, otherwise the rule can never be satisfied.
affinity:
podAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
workingDir: /workspace/components/backends/vllm
replicas: 2
resources:
limits:
gpu: "2"
requests:
gpu: "2"
# Decode worker: 1 replica, tensor parallel size 4, same co-location rule.
VllmDecodeWorker:
componentType: worker
dynamoNamespace: llama3-70b-disagg-sn
envFromSecret: hf-token-secret
pvc:
create: false
name: model-cache
mountPoint: /root/.cache/huggingface
sharedMemory:
size: 80Gi
extraPodSpec:
affinity:
podAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
workingDir: /workspace/components/backends/vllm
replicas: 1
resources:
limits:
gpu: "4"
requests:
gpu: "4"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Benchmark Job for the llama3-70b-disagg-sn deployment: waits until the
# frontend lists the model, runs one fixed genai-perf profile, and prints the
# JSON and CSV exports to the pod log.
apiVersion: batch/v1
kind: Job
metadata:
  name: llama3-70b-disagg-sn-perf
spec:
  backoffLimit: 3
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: llama3-70b-disagg-sn-perf
    spec:
      restartPolicy: Never
      containers:
        - name: perf
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
            - -c
            - |
              # Stop at the first failing command so that a failed benchmark
              # fails the pod instead of reaching the success message below.
              set -e
              # wait for the model to be ready
              export ENDPOINT=llama3-70b-disagg-sn-frontend:8000
              export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
              export INTERVAL=5
              echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
              # jq -e is non-zero until the model id shows up; as a loop
              # condition it is not subject to set -e.
              while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
                echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
                sleep "$INTERVAL"
              done
              echo "✅ Model '$TARGET_MODEL' is now available!"
              curl -s "http://$ENDPOINT/v1/models" | jq .
              # now run the benchmark
              # $RANDOM is not defined by POSIX sh, so the unique directory
              # is created with mktemp instead.
              ARTIFACT_DIR=$(mktemp -d /tmp/genai-XXXXXX)
              export ARTIFACT_DIR
              echo "Running benchmark..."
              export COLUMNS=200
              genai-perf profile \
                --model "$TARGET_MODEL" \
                --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
                --endpoint-type chat --url "$ENDPOINT" --streaming \
                --concurrency 64 \
                --warmup-request-count 2 \
                --request-count 320 \
                --extra-inputs max_tokens:1024 \
                --synthetic-input-tokens-mean 8192 \
                --synthetic-input-tokens-stddev 0 \
                --output-tokens-mean 1024 \
                --output-tokens-stddev 0 \
                --extra-inputs min_tokens:1024 \
                --extra-inputs ignore_eos:true \
                --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
                --random-seed 1418186270 \
                --artifact-dir "$ARTIFACT_DIR" \
                --num-dataset-entries=3000 -- \
                --max-threads 64
              # Print every export found; -exec keeps paths intact without
              # relying on word splitting of a variable.
              echo "----------------json----------------"
              find "$ARTIFACT_DIR" -name profile_export_genai_perf.json -exec jq . {} +
              echo "----------------csv-----------------"
              find "$ARTIFACT_DIR" -name profile_export_genai_perf.csv -exec cat {} +
              echo "Benchmark completed successfully!"
          volumeMounts:
            - name: model-cache
              mountPath: /root/.cache/huggingface
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Absolute path of the directory containing this script; recipes are looked
# up relative to it.
RECIPES_DIR="$(cd "$(dirname "$0")" && pwd)"

# Option defaults. A NAMESPACE already present in the environment wins;
# otherwise the namespace is "dynamo".
: "${NAMESPACE:=dynamo}"
DOWNLOAD_MODEL=true
DEPLOY_TYPE="" MODEL="" FRAMEWORK="" DRY_RUN=""

# Supported frameworks (same pattern as container/build.sh). Only the keys
# are consulted when validating --framework.
declare -A FRAMEWORKS
FRAMEWORKS["VLLM"]=1
FRAMEWORKS["TRTLLM"]=2
FRAMEWORKS["SGLANG"]=3
DEFAULT_FRAMEWORK=VLLM
# Function to show usage
# Print the command-line help to stdout and terminate the script.
# Globals:   FRAMEWORKS (read) - its keys are listed as the valid frameworks
#            DEFAULT_FRAMEWORK (read)
# Exits:     always, with status 1
usage() {
  # Unquoted delimiter: $0 and the framework variables are expanded.
  cat <<EOF
Usage: $0 [OPTIONS] --model <model> --framework <framework> <deployment-type>

Arguments:
 <deployment-type> Deployment type (e.g., agg, disagg-single-node, disagg-multi-node)

Required Options:
 --model <model> Model name (e.g., llama-3-70b)
 --framework <fw> Framework one of ${!FRAMEWORKS[*]} (default: ${DEFAULT_FRAMEWORK})

Optional:
 --namespace <ns> Kubernetes namespace (default: dynamo)
 --skip-model-cache Skip model downloading (assumes model cache already exists)
 --dry-run Print commands without executing them
 -h, --help Show this help message

Environment Variables:
 NAMESPACE Kubernetes namespace (default: dynamo)

Examples:
 $0 --model llama-3-70b --framework vllm agg
 $0 --skip-model-cache --model llama-3-70b --framework vllm agg
 $0 --namespace my-ns --model llama-3-70b --framework trtllm disagg-single-node
EOF
  exit 1
}
# Report an option that was given without its value, then show the help text.
# Arguments: $1 - the option as it was typed (e.g. --model)
# Outputs:   the error line on stdout, followed by whatever usage prints
missing_requirement() {
  local option=$1
  printf 'ERROR: %s requires an argument.\n' "$option"
  usage
}
# Print a two-part message to stderr and terminate the script.
# Arguments: $1 - message prefix
#            $2 - detail appended after a single space
# Exits:     always, with status 1
error() {
  local prefix=$1 detail=$2
  printf '%s\n' "${prefix} ${detail}" >&2
  exit 1
}
#######################################
# Parse the command line into the script's option globals.
# Globals:   DOWNLOAD_MODEL, DRY_RUN, MODEL, FRAMEWORK, NAMESPACE,
#            DEPLOY_TYPE (written)
# Arguments: the script's own "$@"
# Exits:     0 after printing the help for -h/--help;
#            1 (through error / missing_requirement) for an unknown option,
#            an option without its value, or a second positional argument.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-model-cache)
        DOWNLOAD_MODEL=false
        shift
        ;;
      --dry-run)
        # Commands are later prefixed with $DRY_RUN, so "echo" makes the
        # script print them instead of running them.
        DRY_RUN="echo"
        shift
        ;;
      --model)
        if [[ -n "${2:-}" ]]; then
          MODEL=$2
          shift 2
        else
          missing_requirement "$1"
        fi
        ;;
      --framework)
        if [[ -n "${2:-}" ]]; then
          FRAMEWORK=$2
          shift 2
        else
          missing_requirement "$1"
        fi
        ;;
      --namespace)
        if [[ -n "${2:-}" ]]; then
          NAMESPACE=$2
          shift 2
        else
          missing_requirement "$1"
        fi
        ;;
      -h|--help)
        # Help was requested explicitly, so this is not a failure: print the
        # usage text in a subshell (usage ends with a non-zero exit of its
        # own) and finish with status 0.
        (usage)
        exit 0
        ;;
      -*)
        error 'ERROR: Unknown option: ' "$1"
        ;;
      *)
        # The only positional argument is the deployment type.
        if [[ -z "$DEPLOY_TYPE" ]]; then
          DEPLOY_TYPE="$1"
        else
          error "ERROR: Multiple deployment type arguments provided: " "$1"
        fi
        shift
        ;;
    esac
  done
}
parse_args "$@"
# Use the default framework when none was given, normalise the name to upper
# case and reject anything that is not a key of FRAMEWORKS.
FRAMEWORK="${FRAMEWORK:-$DEFAULT_FRAMEWORK}"
if [[ -n "$FRAMEWORK" ]]; then
  FRAMEWORK="${FRAMEWORK^^}"
  [[ -n "${FRAMEWORKS[$FRAMEWORK]}" ]] || error 'ERROR: Unknown framework: ' "$FRAMEWORK"
fi

# Validate required arguments: report every missing one, then show the help.
if [[ -z "$MODEL" || -z "$DEPLOY_TYPE" ]]; then
  [[ -n "$MODEL" ]] || echo "ERROR: --model argument is required"
  [[ -n "$DEPLOY_TYPE" ]] || echo "ERROR: deployment-type argument is required"
  echo ""
  usage
fi
# Recipe layout: recipes/<model>/<framework>/<deployment-type>/, with the
# framework directory in lower case. Each deployment directory must contain
# deploy.yaml and perf.yaml.
MODEL_DIR="$RECIPES_DIR/$MODEL"
FRAMEWORK_DIR="$MODEL_DIR/${FRAMEWORK,,}"
DEPLOY_PATH="$FRAMEWORK_DIR/$DEPLOY_TYPE"
DEPLOY_FILE="$DEPLOY_PATH/deploy.yaml"
PERF_FILE="$DEPLOY_PATH/perf.yaml"

# Walk the hierarchy top-down; at the first missing level, list what is
# available at that level (scripts, markdown files and, at the top level,
# model-cache are filtered out) and stop.
[[ -d "$MODEL_DIR" ]] || {
  echo "Error: Model directory '$MODEL' does not exist in $RECIPES_DIR"
  echo "Available models:"
  ls -1 "$RECIPES_DIR" | grep -v "\.sh$\|\.md$\|model-cache$" | sed 's/^/ /'
  exit 1
}

[[ -d "$FRAMEWORK_DIR" ]] || {
  echo "Error: Framework directory '${FRAMEWORK,,}' does not exist in $MODEL_DIR"
  echo "Available frameworks for $MODEL:"
  ls -1 "$MODEL_DIR" | grep -v "\.sh$\|\.md$" | sed 's/^/ /'
  exit 1
}

[[ -d "$DEPLOY_PATH" ]] || {
  echo "Error: Deployment type '$DEPLOY_TYPE' does not exist in $FRAMEWORK_DIR"
  echo "Available deployment types for $MODEL/${FRAMEWORK,,}:"
  ls -1 "$FRAMEWORK_DIR" | grep -v "\.sh$\|\.md$" | sed 's/^/ /'
  exit 1
}

# Both manifests are required before anything is applied to the cluster.
[[ -f "$DEPLOY_FILE" ]] || {
  echo "Error: Deployment file '$DEPLOY_FILE' not found"
  exit 1
}

[[ -f "$PERF_FILE" ]] || {
  echo "Error: Performance file '$PERF_FILE' not found"
  exit 1
}
# Summarise the resolved settings before touching the cluster.
cat <<EOF
======================================
Dynamo Recipe Deployment
======================================
Model: $MODEL
Framework: ${FRAMEWORK,,}
Deployment Type: $DEPLOY_TYPE
Namespace: $NAMESPACE
Model Download: $DOWNLOAD_MODEL
======================================
EOF
# Handle model downloading
MODEL_CACHE_DIR="$MODEL_DIR/model-cache"

# Run a command, or only print it in dry-run mode, and stop the script when
# it fails so later steps never wait on resources that were not created.
# Globals:   DRY_RUN (read) - empty, or "echo" in dry-run mode
# Arguments: the command and its arguments
run_step() {
  # DRY_RUN is intentionally unquoted: an empty value must vanish rather
  # than become an empty command name.
  # shellcheck disable=SC2086
  $DRY_RUN "$@" || {
    printf 'ERROR: command failed: %s\n' "$*" >&2
    exit 1
  }
}

# Print the value of the first "name:" key in a manifest. For the
# single-Job manifests used here that is the Job's metadata name.
# Arguments: $1 - path to the manifest
manifest_job_name() {
  awk '$1 == "name:" { print $2; exit }' "$1"
}

if [[ "$DOWNLOAD_MODEL" == "true" ]]; then
  echo "Creating PVC for model cache and downloading model..."
  run_step kubectl apply -n "$NAMESPACE" -f "$MODEL_CACHE_DIR/model-cache.yaml"
  run_step kubectl apply -n "$NAMESPACE" -f "$MODEL_CACHE_DIR/model-download.yaml"
  # Wait on the Job the manifest actually declares; a job name derived from
  # the model would never match it.
  DOWNLOAD_JOB_NAME=$(manifest_job_name "$MODEL_CACHE_DIR/model-download.yaml")
  if [[ -z "$DOWNLOAD_JOB_NAME" ]]; then
    echo "Error: could not determine the job name from '$MODEL_CACHE_DIR/model-download.yaml'" >&2
    exit 1
  fi
  # Wait for the model download to complete
  echo "Waiting for the model download to complete..."
  run_step kubectl wait --for=condition=Complete "job/$DOWNLOAD_JOB_NAME" -n "$NAMESPACE" --timeout=6000s
else
  echo "Skipping model download (using existing model cache)..."
  # Still create the PVC in case it doesn't exist. This is best effort: the
  # cache is expected to be there already, so a failure only produces a warning.
  # shellcheck disable=SC2086
  $DRY_RUN kubectl apply -n "$NAMESPACE" -f "$MODEL_CACHE_DIR/model-cache.yaml" \
    || echo "Warning: could not apply '$MODEL_CACHE_DIR/model-cache.yaml'; continuing" >&2
fi

# Deploy the specified configuration
echo "Deploying $MODEL ${FRAMEWORK,,} $DEPLOY_TYPE configuration..."
run_step kubectl apply -n "$NAMESPACE" -f "$DEPLOY_FILE"

# Launch the benchmark job
echo "Launching benchmark job..."
run_step kubectl apply -n "$NAMESPACE" -f "$PERF_FILE"

# Construct job name from the perf file
JOB_NAME=$(manifest_job_name "$PERF_FILE")
if [[ -z "$JOB_NAME" ]]; then
  echo "Error: could not determine the job name from '$PERF_FILE'" >&2
  exit 1
fi
echo "Waiting for job '$JOB_NAME' to complete..."
run_step kubectl wait --for=condition=Complete "job/$JOB_NAME" -n "$NAMESPACE" --timeout=6000s

# Print logs from the benchmark job
echo "======================================"
echo "Benchmark completed. Logs:"
echo "======================================"
run_step kubectl logs "job/$JOB_NAME" -n "$NAMESPACE"
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment