# Unverified commit 48b622c5 (parent 818d72ae), authored by Alec, committed by GitHub.
#
# docs: initial trtllm recipe for qwen32b-fp8 (#3827)
#
# Signed-off-by: alec-flowers <aflowers@nvidia.com>
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Volume claim named `model-cache`. The download Job, the worker services and
# the benchmark Jobs in this file all refer to it by this name.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-cache
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 100Gi
  # Placeholder value: replace with a storage class that exists in the target cluster.
  storageClassName: "your-storage-class-name"
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# One-shot Job that installs the Hugging Face CLI and downloads a pinned
# revision of the model with HF_HOME pointing at the `model-cache` PVC mount.
apiVersion: batch/v1
kind: Job
metadata:
  name: model-download
spec:
  backoffLimit: 3
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: model-download
    spec:
      restartPolicy: Never
      containers:
        - name: model-download
          image: python:3.10-slim
          command: ["sh", "-c"]
          # Environment is also populated from the `hf-token-secret` Secret.
          envFrom:
            - secretRef:
                name: hf-token-secret
          env:
            - name: MODEL_NAME
              value: Qwen/Qwen3-32B-FP8
            # Same path as the PVC mount below.
            - name: HF_HOME
              value: /model-store
            - name: HF_HUB_ENABLE_HF_TRANSFER
              value: "1"
            # Pinned revision; quoted so it is always read as a string.
            - name: MODEL_REVISION
              value: "aa55da1ecc13d006e8b8e4f54579b1ea8c3db2df"
          args:
            # `set -eux` aborts on the first failing command and echoes each command.
            - |
              set -eux
              pip install --no-cache-dir huggingface_hub hf_transfer
              hf download "$MODEL_NAME" --revision "$MODEL_REVISION"
          volumeMounts:
            - name: model-cache
              mountPath: /model-store
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Engine configuration for the aggregated deployment. The worker mounts this
# ConfigMap at /opt/dynamo/configs and passes config.yaml via --extra-engine-args.
apiVersion: v1
kind: ConfigMap
metadata:
  name: llm-config
data:
  config.yaml: |
    backend: pytorch
    tensor_parallel_size: 2
    pipeline_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    max_batch_size: 96
    max_num_tokens: 7964
    max_seq_len: 7964
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 96
    disable_overlap_scheduler: false
    print_iter_log: false
---
# Aggregated deployment: one frontend plus one TensorRT-LLM worker service
# that requests 2 GPUs and runs with tensor parallel size 2.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: qwen3-32b-fp8-agg
spec:
  backendFramework: trtllm
  # The PVC is referenced, not created, by this resource.
  pvcs:
    - name: model-cache
      create: false
  services:
    Frontend:
      componentType: frontend
      dynamoNamespace: qwen3-32b-fp8-agg
      extraPodSpec:
        affinity:
          podAntiAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              # NOTE(review): the selector value carries a `-frontend` suffix while
              # metadata.name does not; confirm it matches the label actually set on pods.
              - labelSelector:
                  matchExpressions:
                    - key: nvidia.com/dynamo-graph-deployment-name
                      operator: In
                      values:
                        - qwen3-32b-fp8-agg-frontend
                topologyKey: kubernetes.io/hostname
        mainContainer:
          args:
            - python3 -m dynamo.frontend --router-mode round-robin --http-port 8000
          command:
            - /bin/sh
            - -c
          # `my-tag` is a placeholder image tag.
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
      replicas: 1
    TrtllmWorker:
      componentType: main
      dynamoNamespace: qwen3-32b-fp8-agg
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
          mountPoint: /root/.cache/huggingface
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: nvidia.com/gpu.present
                      operator: In
                      values:
                        - "true"
        mainContainer:
          # NOTE(review): the CLI passes --free-gpu-memory-fraction 0.9 while the
          # mounted engine config sets kv_cache_config.free_gpu_memory_fraction: 0.7;
          # confirm which value takes precedence.
          args:
            - |
              python3 -m dynamo.trtllm \
                --model-path "${MODEL_PATH}" \
                --served-model-name "${MODEL_PATH}" \
                --extra-engine-args "${ENGINE_ARGS}" \
                --tensor-parallel-size 2 \
                --max-batch-size 96 \
                --free-gpu-memory-fraction 0.9
          command:
            - /bin/sh
            - -c
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
          env:
            - name: TRTLLM_ENABLE_PDL
              value: "1"
            - name: TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL
              value: "True"
            # Path of the engine config file inside the ConfigMap mount below.
            - name: ENGINE_ARGS
              value: "/opt/dynamo/configs/config.yaml"
            - name: MODEL_PATH
              value: "Qwen/Qwen3-32B-FP8"
          volumeMounts:
            - mountPath: /opt/dynamo/configs
              name: llm-config
              readOnly: true
          workingDir: /workspace/components/backends/trtllm
        volumes:
          - configMap:
              name: llm-config
            name: llm-config
      replicas: 1
      resources:
        limits:
          gpu: "2"
        requests:
          gpu: "2"
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Benchmark Job for the aggregated deployment. The script installs aiperf,
# polls the frontend's /v1/models until TARGET_MODEL is listed, then runs one
# `aiperf profile` pass at CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT concurrency
# and writes artifacts under ROOT_ARTIFACT_DIR on the `model-cache` PVC.
# NOTE(review): a second Job manifest in this file uses the same metadata.name;
# the two cannot coexist in one namespace.
apiVersion: batch/v1
kind: Job
metadata:
  name: qwen3-32b-fp8-bench
spec:
  backoffLimit: 1
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: qwen3-32b-fp8-bench
    spec:
      affinity:
        # Do not schedule onto a node that already hosts pods labelled with the
        # deployment name below.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: nvidia.com/dynamo-graph-deployment-name
                    operator: In
                    values:
                      - qwen3-32b-fp8-agg
              topologyKey: kubernetes.io/hostname
      containers:
        - command:
            - /bin/sh
            - -c
            - |
              apt-get update && apt-get install -y curl jq procps git && apt-get clean
              pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
              echo "aiperf installation completed";
              sysctl -w net.ipv4.ip_local_port_range="1024 65000"
              cat /proc/sys/net/ipv4/ip_local_port_range
              export COLUMNS=200
              EPOCH=$(date +%s)
              ## utility functions -- can be moved to a bash script / configmap
              wait_for_model_ready() {
                echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
                while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
                  echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
                  sleep 5
                done
                echo "✅ Model '$TARGET_MODEL' is now available!"
                echo "Model '$TARGET_MODEL' is now available!"
                curl -s "http://$ENDPOINT/v1/models" | jq .
              }
              run_perf() {
                local concurrency=$1
                local isl=$2
                local osl=$3
                key=concurrency_${concurrency}
                export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
                mkdir -p "$ARTIFACT_DIR"
                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
                aiperf profile --artifact-dir "$ARTIFACT_DIR" \
                  --model $TARGET_MODEL \
                  --tokenizer $TARGET_MODEL \
                  --endpoint-type chat \
                  --endpoint /v1/chat/completions \
                  --streaming \
                  --url http://$ENDPOINT \
                  --synthetic-input-tokens-mean $isl \
                  --synthetic-input-tokens-stddev 0 \
                  --output-tokens-mean $osl \
                  --output-tokens-stddev 0 \
                  --extra-inputs "max_tokens:$osl" \
                  --extra-inputs "min_tokens:$osl" \
                  --extra-inputs "ignore_eos:true" \
                  --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
                  --extra-inputs "repetition_penalty:1.0" \
                  --extra-inputs "temperature: 0.0" \
                  --concurrency $concurrency \
                  --request-count $((10*concurrency)) \
                  --warmup-request-count $concurrency \
                  --conversation-num 12800 \
                  --random-seed 100 \
                  --workers-max 252 \
                  -H 'Authorization: Bearer NOT USED' \
                  -H 'Accept: text/event-stream' \
                  --record-processors 32 \
                  --ui simple
                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
                ls -la "$ARTIFACT_DIR"
              }
              #### Actual execution ####
              wait_for_model_ready
              mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
              # Calculate total concurrency based on per-GPU concurrency and GPU count
              TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
              echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
              # Write input_config.json
              cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
              {
                "gpu_count": $DEPLOYMENT_GPU_COUNT,
                "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
                "total_concurrency": $TOTAL_CONCURRENCY,
                "mode": "$DEPLOYMENT_MODE",
                "isl": $ISL,
                "osl": $OSL,
                "endpoint": "$ENDPOINT",
                "model endpoint": "$TARGET_MODEL"
              }
              EOF
              # Run perf with calculated total concurrency
              run_perf $TOTAL_CONCURRENCY $ISL $OSL
              echo "done with concurrency $TOTAL_CONCURRENCY"
          env:
            - name: TARGET_MODEL
              value: Qwen/Qwen3-32B-FP8
            # host:port of the frontend; the script prefixes it with http://.
            - name: ENDPOINT
              value: qwen3-32b-fp8-agg-frontend:8000
            - name: CONCURRENCY_PER_GPU
              value: "2"
            - name: DEPLOYMENT_GPU_COUNT
              value: "2"
            # Passed to run_perf as the input and output token means.
            - name: ISL
              value: "4000"
            - name: OSL
              value: "500"
            - name: DEPLOYMENT_MODE
              value: agg
            - name: AIPERF_HTTP_CONNECTION_LIMIT
              value: "200"
            # Taken from the pod's `job-name` label; used in the artifact directory name.
            - name: JOB_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.labels['job-name']
            - name: ROOT_ARTIFACT_DIR
              value: /model-cache/perf
            - name: HF_HOME
              value: /model-cache
            - name: PYTHONUNBUFFERED
              value: "1"
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          name: perf
          # NOTE(review): presumably privileged so the script's `sysctl -w` call can
          # succeed; confirm before running in a restricted cluster.
          securityContext:
            privileged: true
          volumeMounts:
            - name: model-cache
              mountPath: /model-cache
          workingDir: /workspace
      imagePullSecrets:
        - name: nvcrimagepullsecret
      restartPolicy: Never
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Engine configuration for the decode workers of the disaggregated deployment.
# The decode worker mounts this ConfigMap at /opt/dynamo/configs and passes
# config-decode.yaml via --extra-engine-args. CUDA-graph batch sizes list every
# value from 1 through max_batch_size (128).
apiVersion: v1
kind: ConfigMap
metadata:
  name: llm-config-decode
data:
  config-decode.yaml: |
    backend: pytorch
    tensor_parallel_size: 2
    pipeline_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    max_batch_size: 128
    max_num_tokens: 7800
    max_seq_len: 7800
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    cache_transceiver_config:
      backend: DEFAULT
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 3
        - 4
        - 5
        - 6
        - 7
        - 8
        - 9
        - 10
        - 11
        - 12
        - 13
        - 14
        - 15
        - 16
        - 17
        - 18
        - 19
        - 20
        - 21
        - 22
        - 23
        - 24
        - 25
        - 26
        - 27
        - 28
        - 29
        - 30
        - 31
        - 32
        - 33
        - 34
        - 35
        - 36
        - 37
        - 38
        - 39
        - 40
        - 41
        - 42
        - 43
        - 44
        - 45
        - 46
        - 47
        - 48
        - 49
        - 50
        - 51
        - 52
        - 53
        - 54
        - 55
        - 56
        - 57
        - 58
        - 59
        - 60
        - 61
        - 62
        - 63
        - 64
        - 65
        - 66
        - 67
        - 68
        - 69
        - 70
        - 71
        - 72
        - 73
        - 74
        - 75
        - 76
        - 77
        - 78
        - 79
        - 80
        - 81
        - 82
        - 83
        - 84
        - 85
        - 86
        - 87
        - 88
        - 89
        - 90
        - 91
        - 92
        - 93
        - 94
        - 95
        - 96
        - 97
        - 98
        - 99
        - 100
        - 101
        - 102
        - 103
        - 104
        - 105
        - 106
        - 107
        - 108
        - 109
        - 110
        - 111
        - 112
        - 113
        - 114
        - 115
        - 116
        - 117
        - 118
        - 119
        - 120
        - 121
        - 122
        - 123
        - 124
        - 125
        - 126
        - 127
        - 128
    disable_overlap_scheduler: false
    print_iter_log: false
---
# Engine configuration for the prefill workers of the disaggregated deployment.
# The prefill worker mounts this ConfigMap at /opt/dynamo/configs and passes
# config-prefill.yaml via --extra-engine-args.
apiVersion: v1
kind: ConfigMap
metadata:
  name: llm-config-prefill
data:
  # NOTE(review): max_batch_size is 1 but cuda_graph_config.batch_sizes lists
  # values up to 256; confirm the larger entries are intended.
  config-prefill.yaml: |
    backend: pytorch
    tensor_parallel_size: 1
    pipeline_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    max_batch_size: 1
    max_num_tokens: 7800
    max_seq_len: 7800
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    cache_transceiver_config:
      backend: DEFAULT
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
    disable_overlap_scheduler: true
    print_iter_log: false
---
# Disaggregated deployment: one frontend, 4 prefill worker replicas (1 GPU
# each, tensor parallel size 1) and 2 decode worker replicas (2 GPUs each,
# tensor parallel size 2).
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: qwen3-32b-fp8-disagg
spec:
  backendFramework: trtllm
  # The PVC is referenced, not created, by this resource.
  pvcs:
    - name: model-cache
      create: false
  services:
    Frontend:
      componentType: frontend
      dynamoNamespace: qwen3-32b-fp8-disagg
      extraPodSpec:
        affinity:
          podAntiAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              # NOTE(review): the selector value carries a `-frontend` suffix while
              # metadata.name does not; confirm it matches the label actually set on pods.
              - labelSelector:
                  matchExpressions:
                    - key: nvidia.com/dynamo-graph-deployment-name
                      operator: In
                      values:
                        - qwen3-32b-fp8-disagg-frontend
                topologyKey: kubernetes.io/hostname
        mainContainer:
          args:
            - python3 -m dynamo.frontend --router-mode round-robin --http-port 8000
          command:
            - /bin/sh
            - -c
          # `my-tag` is a placeholder image tag.
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
      replicas: 1
    TrtllmPrefillWorker:
      componentType: worker
      subComponentType: prefill
      dynamoNamespace: qwen3-32b-fp8-disagg
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
          mountPoint: /root/.cache/huggingface
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: nvidia.com/gpu.present
                      operator: In
                      values:
                        - "true"
        mainContainer:
          # NOTE(review): the CLI passes --free-gpu-memory-fraction 0.9 while the
          # mounted engine config sets kv_cache_config.free_gpu_memory_fraction: 0.7;
          # confirm which value takes precedence.
          args:
            - |
              python3 -m dynamo.trtllm \
                --model-path "${MODEL_PATH}" \
                --served-model-name "${MODEL_PATH}" \
                --extra-engine-args "${ENGINE_ARGS}" \
                --tensor-parallel-size 1 \
                --max-batch-size 1 \
                --free-gpu-memory-fraction 0.9 \
                --disaggregation-mode prefill \
                --disaggregation-strategy prefill_first
          command:
            - /bin/sh
            - -c
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
          env:
            - name: TRTLLM_ENABLE_PDL
              value: "1"
            - name: TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL
              value: "True"
            # Path of the prefill engine config inside the ConfigMap mount below.
            - name: ENGINE_ARGS
              value: "/opt/dynamo/configs/config-prefill.yaml"
            - name: MODEL_PATH
              value: "Qwen/Qwen3-32B-FP8"
          volumeMounts:
            - mountPath: /opt/dynamo/configs
              name: llm-config-prefill
              readOnly: true
          workingDir: /workspace/components/backends/trtllm
        volumes:
          - configMap:
              name: llm-config-prefill
            name: llm-config-prefill
      replicas: 4
      resources:
        limits:
          gpu: "1"
        requests:
          gpu: "1"
    TrtllmDecodeWorker:
      componentType: worker
      subComponentType: decode
      dynamoNamespace: qwen3-32b-fp8-disagg
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
          mountPoint: /root/.cache/huggingface
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: nvidia.com/gpu.present
                      operator: In
                      values:
                        - "true"
        mainContainer:
          # NOTE(review): same 0.9 (CLI) versus 0.7 (engine config) memory-fraction
          # mismatch as the prefill worker; confirm which value takes precedence.
          args:
            - |
              python3 -m dynamo.trtllm \
                --model-path "${MODEL_PATH}" \
                --served-model-name "${MODEL_PATH}" \
                --extra-engine-args "${ENGINE_ARGS}" \
                --tensor-parallel-size 2 \
                --max-batch-size 128 \
                --free-gpu-memory-fraction 0.9 \
                --disaggregation-mode decode \
                --disaggregation-strategy prefill_first
          command:
            - /bin/sh
            - -c
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
          env:
            - name: TRTLLM_ENABLE_PDL
              value: "1"
            - name: TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL
              value: "True"
            # Path of the decode engine config inside the ConfigMap mount below.
            - name: ENGINE_ARGS
              value: "/opt/dynamo/configs/config-decode.yaml"
            - name: MODEL_PATH
              value: "Qwen/Qwen3-32B-FP8"
          volumeMounts:
            - mountPath: /opt/dynamo/configs
              name: llm-config-decode
              readOnly: true
          workingDir: /workspace/components/backends/trtllm
        volumes:
          - configMap:
              name: llm-config-decode
            name: llm-config-decode
      replicas: 2
      resources:
        limits:
          gpu: "2"
        requests:
          gpu: "2"
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Benchmark Job for the disaggregated deployment. The script installs aiperf,
# polls the frontend's /v1/models until TARGET_MODEL is listed, then runs one
# `aiperf profile` pass at CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT concurrency
# and writes artifacts under ROOT_ARTIFACT_DIR on the `model-cache` PVC.
# NOTE(review): a second Job manifest in this file uses the same metadata.name;
# the two cannot coexist in one namespace.
apiVersion: batch/v1
kind: Job
metadata:
  name: qwen3-32b-fp8-bench
spec:
  backoffLimit: 1
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: qwen3-32b-fp8-bench
    spec:
      affinity:
        # Do not schedule onto a node that already hosts pods labelled with the
        # deployment name below.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: nvidia.com/dynamo-graph-deployment-name
                    operator: In
                    values:
                      - qwen3-32b-fp8-disagg
              topologyKey: kubernetes.io/hostname
      containers:
        - command:
            - /bin/sh
            - -c
            - |
              apt-get update && apt-get install -y curl jq procps git && apt-get clean
              pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
              echo "aiperf installation completed";
              sysctl -w net.ipv4.ip_local_port_range="1024 65000"
              cat /proc/sys/net/ipv4/ip_local_port_range
              export COLUMNS=200
              EPOCH=$(date +%s)
              ## utility functions -- can be moved to a bash script / configmap
              wait_for_model_ready() {
                echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
                while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
                  echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
                  sleep 5
                done
                echo "✅ Model '$TARGET_MODEL' is now available!"
                echo "Model '$TARGET_MODEL' is now available!"
                curl -s "http://$ENDPOINT/v1/models" | jq .
              }
              run_perf() {
                local concurrency=$1
                local isl=$2
                local osl=$3
                key=concurrency_${concurrency}
                export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
                mkdir -p "$ARTIFACT_DIR"
                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
                aiperf profile --artifact-dir "$ARTIFACT_DIR" \
                  --model $TARGET_MODEL \
                  --tokenizer $TARGET_MODEL \
                  --endpoint-type chat \
                  --endpoint /v1/chat/completions \
                  --streaming \
                  --url http://$ENDPOINT \
                  --synthetic-input-tokens-mean $isl \
                  --synthetic-input-tokens-stddev 0 \
                  --output-tokens-mean $osl \
                  --output-tokens-stddev 0 \
                  --extra-inputs "max_tokens:$osl" \
                  --extra-inputs "min_tokens:$osl" \
                  --extra-inputs "ignore_eos:true" \
                  --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
                  --extra-inputs "repetition_penalty:1.0" \
                  --extra-inputs "temperature: 0.0" \
                  --concurrency $concurrency \
                  --request-count $((10*concurrency)) \
                  --warmup-request-count $concurrency \
                  --conversation-num 12800 \
                  --random-seed 100 \
                  --workers-max 252 \
                  -H 'Authorization: Bearer NOT USED' \
                  -H 'Accept: text/event-stream' \
                  --record-processors 32 \
                  --ui simple
                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
                ls -la "$ARTIFACT_DIR"
              }
              #### Actual execution ####
              wait_for_model_ready
              mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
              # Calculate total concurrency based on per-GPU concurrency and GPU count
              TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
              echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
              # Write input_config.json
              cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
              {
                "gpu_count": $DEPLOYMENT_GPU_COUNT,
                "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
                "total_concurrency": $TOTAL_CONCURRENCY,
                "mode": "$DEPLOYMENT_MODE",
                "isl": $ISL,
                "osl": $OSL,
                "endpoint": "$ENDPOINT",
                "model endpoint": "$TARGET_MODEL"
              }
              EOF
              # Run perf with calculated total concurrency
              run_perf $TOTAL_CONCURRENCY $ISL $OSL
              echo "done with concurrency $TOTAL_CONCURRENCY"
          env:
            - name: TARGET_MODEL
              value: "Qwen/Qwen3-32B-FP8"
            # host:port of the frontend; the script prefixes it with http://.
            - name: ENDPOINT
              value: qwen3-32b-fp8-disagg-frontend:8000
            - name: CONCURRENCY_PER_GPU
              value: "6"
            - name: DEPLOYMENT_GPU_COUNT
              value: "8"
            # Passed to run_perf as the input and output token means.
            - name: ISL
              value: "4000"
            - name: OSL
              value: "500"
            - name: DEPLOYMENT_MODE
              value: disagg
            - name: AIPERF_HTTP_CONNECTION_LIMIT
              value: "200"
            # Taken from the pod's `job-name` label; used in the artifact directory name.
            - name: JOB_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.labels['job-name']
            - name: ROOT_ARTIFACT_DIR
              value: /model-cache/perf
            - name: HF_HOME
              value: /model-cache
            - name: PYTHONUNBUFFERED
              value: "1"
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          name: perf
          # NOTE(review): presumably privileged so the script's `sysctl -w` call can
          # succeed; confirm before running in a restricted cluster.
          securityContext:
            privileged: true
          volumeMounts:
            - name: model-cache
              mountPath: /model-cache
          workingDir: /workspace
      imagePullSecrets:
        - name: nvcrimagepullsecret
      restartPolicy: Never
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment