Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
13fc3c65
Unverified
Commit
13fc3c65
authored
Oct 15, 2025
by
Biswa Panda
Committed by
GitHub
Oct 15, 2025
Browse files
fix: update model recipe for llama-3 70b to match with common recipe template (#3637)
parent
15a01f75
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
391 additions
and
155 deletions
+391
-155
recipes/llama-3-70b/model-cache/model-download.yaml
recipes/llama-3-70b/model-cache/model-download.yaml
+8
-10
recipes/llama-3-70b/vllm/agg/deploy.yaml
recipes/llama-3-70b/vllm/agg/deploy.yaml
+8
-3
recipes/llama-3-70b/vllm/agg/perf.yaml
recipes/llama-3-70b/vllm/agg/perf.yaml
+117
-46
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
+12
-2
recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
+117
-46
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
+12
-2
recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
+117
-46
No files found.
recipes/llama-3-70b/model-cache/model-download.yaml
View file @
13fc3c65
...
...
@@ -22,24 +22,22 @@ spec:
-
secretRef
:
name
:
hf-token-secret
env
:
# NOTE: This is the model name for the llama-3-70b model
# Update this to model name for the model you are downloading
-
name
:
MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
HF_TOKEN
valueFrom
:
secretKeyRef
:
name
:
hf-token-secret
key
:
HF_TOKEN
-
name
:
HF_HOME
value
:
/model-store
-
name
:
HF_HUB_ENABLE_HF_TRANSFER
value
:
"
1"
-
name
:
MODEL_REVISION
value
:
ddb4128556dfcff99e0c41aee159ea6c3e655dcd
args
:
-
|
set -eux
pip install --no-cache-dir huggingface_hub hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1
huggingface-cli download $MODEL_NAME
hf download $MODEL_NAME --revision $MODEL_REVISION
volumeMounts
:
-
name
:
model-cache
mountPath
:
/
root/.cache/huggingface/hub
mountPath
:
/
model-store
volumes
:
-
name
:
model-cache
persistentVolumeClaim
:
...
...
recipes/llama-3-70b/vllm/agg/deploy.yaml
View file @
13fc3c65
...
...
@@ -32,8 +32,13 @@ spec:
size
:
20Gi
extraPodSpec
:
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
@@ -42,6 +47,6 @@ spec:
replicas
:
1
resources
:
limits
:
gpu
:
"
8
"
gpu
:
"
4
"
requests
:
gpu
:
"
8"
\ No newline at end of file
gpu
:
"
4"
\ No newline at end of file
recipes/llama-3-70b/vllm/agg/perf.yaml
View file @
13fc3c65
...
...
@@ -5,7 +5,7 @@ kind: Job
metadata
:
name
:
llama3-70b-agg-perf
spec
:
backoffLimit
:
3
backoffLimit
:
1
completions
:
1
parallelism
:
1
template
:
...
...
@@ -15,58 +15,129 @@ spec:
spec
:
restartPolicy
:
Never
containers
:
-
name
:
perf
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/backends/vllm
command
:
-
command
:
-
/bin/sh
-
-c
-
|
# wait for the model to be ready
export ENDPOINT=llama3-70b-agg-0:8000
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
export INTERVAL=5
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
sleep $INTERVAL
done
echo "✅ Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark
export ARTIFACT_DIR="/tmp/genai"
mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..."
apt-get update && apt-get install -y curl jq procps git && apt-get clean
pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
echo "aiperf installation completed";
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
aiperf profile \
--model "$TARGET_MODEL" \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --url "$ENDPOINT" --streaming \
--concurrency 64 \
--warmup-request-count 2 \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 1024 \
--output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \
--extra-inputs ignore_eos:true \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \
--artifact-dir $ARTIFACT_DIR \
--num-dataset-entries=3000 -- \
--max-threads 64
echo "----------------json----------------"
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
cat $PERF_JSON | jq .
echo "----------------csv-----------------"
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
cat $PERF_CSV
echo "Benchmark completed successfully!"
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
# Poll the /v1/models listing served at $ENDPOINT every 5 seconds until it
# contains an entry whose id equals $TARGET_MODEL, then print the listing.
# Globals read: ENDPOINT, TARGET_MODEL
# Returns: exit status of the final `curl | jq .` pipeline
wait_for_model_ready() {
  local models_url="http://$ENDPOINT/v1/models"

  echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
  # jq -e exits non-zero when the filter yields no result, which keeps the loop going.
  until curl -s "$models_url" \
    | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
    echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again $models_url"
    sleep 5
  done

  echo "✅ Model '$TARGET_MODEL' is now available!"
  echo "Model '$TARGET_MODEL' is now available!"
  curl -s "$models_url" | jq .
}
# Run one aiperf profiling pass against the deployed endpoint.
# Arguments:
#   $1 - concurrency; also used as the warmup request count and worker cap,
#        and multiplied by 10 to obtain the request count
#   $2 - input sequence length (synthetic input tokens mean)
#   $3 - output sequence length (output tokens mean, max_tokens and min_tokens)
# Globals read:    ROOT_ARTIFACT_DIR, EPOCH, JOB_NAME, TARGET_MODEL, ENDPOINT
# Globals written: key, ARTIFACT_DIR (exported)
# Returns: exit status of the final `ls` on the artifact directory
run_perf() {
  local concurrency="$1"
  local isl="$2"
  local osl="$3"
  local max_threads="${concurrency}"

  # key is not declared local, so it stays visible to the caller after return.
  key="concurrency_${concurrency}"
  export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
  mkdir -p "$ARTIFACT_DIR"
  echo "ARTIFACT_DIR: $ARTIFACT_DIR"

  # Every expansion is quoted so that values containing whitespace reach aiperf
  # as a single argument, and every continuation backslash is preceded by a
  # space so adjacent arguments cannot fuse when the next line is unindented.
  aiperf profile --artifact-dir "$ARTIFACT_DIR" \
    --model "$TARGET_MODEL" \
    --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
    --endpoint-type chat --endpoint /v1/chat/completions \
    --streaming \
    --url "http://$ENDPOINT" \
    --synthetic-input-tokens-mean "$isl" \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean "$osl" \
    --output-tokens-stddev 0 \
    --extra-inputs "max_tokens:$osl" \
    --extra-inputs "min_tokens:$osl" \
    --extra-inputs ignore_eos:true \
    --extra-inputs repetition_penalty:1.0 \
    --extra-inputs temperature:0.0 \
    --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
    --concurrency "$concurrency" \
    --request-count "$((10 * concurrency))" \
    --warmup-request-count "$concurrency" \
    --conversation-num 12800 \
    --random-seed 100 \
    --workers-max "$max_threads" \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream' \
    --record-processors 32 \
    --ui simple

  echo "ARTIFACT_DIR: $ARTIFACT_DIR"
  ls -la "$ARTIFACT_DIR"
}
#### Actual execution ####
# Block until the endpoint lists TARGET_MODEL, then create this run's output directory.
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json
# max_threads is recorded as TOTAL_CONCURRENCY: the max_threads variable only
# exists as a local inside run_perf (where it equals the concurrency argument),
# so expanding it here would produce an empty value and invalid JSON.
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $TOTAL_CONCURRENCY,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf "$TOTAL_CONCURRENCY" "$ISL" "$OSL"
echo "done with concurrency $TOTAL_CONCURRENCY"
env
:
-
name
:
TARGET_MODEL
value
:
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
-
name
:
ENDPOINT
value
:
llama3-70b-agg-frontend:8000
-
name
:
CONCURRENCY_PER_GPU
value
:
"
16"
-
name
:
DEPLOYMENT_GPU_COUNT
value
:
"
4"
-
name
:
ISL
value
:
"
8192"
-
name
:
OSL
value
:
"
1024"
-
name
:
DEPLOYMENT_MODE
value
:
agg
-
name
:
AIPERF_HTTP_CONNECTION_LIMIT
value
:
"
200"
-
name
:
JOB_NAME
valueFrom
:
fieldRef
:
apiVersion
:
v1
fieldPath
:
metadata.labels['job-name']
-
name
:
ROOT_ARTIFACT_DIR
value
:
/root/.cache/huggingface/perf
-
name
:
HF_HOME
value
:
/root/.cache/huggingface
-
name
:
PYTHONUNBUFFERED
value
:
"
1"
image
:
python:3.12-slim
imagePullPolicy
:
IfNotPresent
name
:
perf
securityContext
:
privileged
:
true
volumeMounts
:
-
name
:
model-cache
mountPath
:
/root/.cache/huggingface
workingDir
:
/workspace
imagePullSecrets
:
-
name
:
nvcrimagepullsecret
volumes
:
-
name
:
model-cache
persistentVolumeClaim
:
...
...
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
View file @
13fc3c65
...
...
@@ -32,8 +32,13 @@ spec:
size
:
80Gi
extraPodSpec
:
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
@@ -56,8 +61,13 @@ spec:
size
:
80Gi
extraPodSpec
:
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
View file @
13fc3c65
...
...
@@ -5,7 +5,7 @@ kind: Job
metadata
:
name
:
llama3-70b-disagg-mn-perf
spec
:
backoffLimit
:
3
backoffLimit
:
1
completions
:
1
parallelism
:
1
template
:
...
...
@@ -15,58 +15,129 @@ spec:
spec
:
restartPolicy
:
Never
containers
:
-
name
:
perf
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/backends/vllm
command
:
-
command
:
-
/bin/sh
-
-c
-
|
# wait for the model to be ready
export ENDPOINT=llama3-70b-disagg-mn-frontend:8000
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
export INTERVAL=5
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
sleep $INTERVAL
done
echo "✅ Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark
export ARTIFACT_DIR="/tmp/genai"
mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..."
apt-get update && apt-get install -y curl jq procps git && apt-get clean
pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
echo "aiperf installation completed";
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
aiperf profile \
--model "$TARGET_MODEL" \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --url "$ENDPOINT" --streaming \
--concurrency 64 \
--warmup-request-count 2 \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 1024 \
--output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \
--extra-inputs ignore_eos:true \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \
--artifact-dir $ARTIFACT_DIR \
--num-dataset-entries=3000 -- \
--max-threads 64
echo "----------------json----------------"
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
cat $PERF_JSON | jq .
echo "----------------csv-----------------"
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
cat $PERF_CSV
echo "Benchmark completed successfully!"
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
# Poll the /v1/models listing served at $ENDPOINT every 5 seconds until it
# contains an entry whose id equals $TARGET_MODEL, then print the listing.
# Globals read: ENDPOINT, TARGET_MODEL
# Returns: exit status of the final `curl | jq .` pipeline
wait_for_model_ready() {
  local models_url="http://$ENDPOINT/v1/models"

  echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
  # jq -e exits non-zero when the filter yields no result, which keeps the loop going.
  until curl -s "$models_url" \
    | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
    echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again $models_url"
    sleep 5
  done

  echo "✅ Model '$TARGET_MODEL' is now available!"
  echo "Model '$TARGET_MODEL' is now available!"
  curl -s "$models_url" | jq .
}
# Run one aiperf profiling pass against the deployed endpoint.
# Arguments:
#   $1 - concurrency; also used as the warmup request count and worker cap,
#        and multiplied by 10 to obtain the request count
#   $2 - input sequence length (synthetic input tokens mean)
#   $3 - output sequence length (output tokens mean, max_tokens and min_tokens)
# Globals read:    ROOT_ARTIFACT_DIR, EPOCH, JOB_NAME, TARGET_MODEL, ENDPOINT
# Globals written: key, ARTIFACT_DIR (exported)
# Returns: exit status of the final `ls` on the artifact directory
run_perf() {
  local concurrency="$1"
  local isl="$2"
  local osl="$3"
  local max_threads="${concurrency}"

  # key is not declared local, so it stays visible to the caller after return.
  key="concurrency_${concurrency}"
  export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
  mkdir -p "$ARTIFACT_DIR"
  echo "ARTIFACT_DIR: $ARTIFACT_DIR"

  # Every expansion is quoted so that values containing whitespace reach aiperf
  # as a single argument, and every continuation backslash is preceded by a
  # space so adjacent arguments cannot fuse when the next line is unindented.
  aiperf profile --artifact-dir "$ARTIFACT_DIR" \
    --model "$TARGET_MODEL" \
    --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
    --endpoint-type chat --endpoint /v1/chat/completions \
    --streaming \
    --url "http://$ENDPOINT" \
    --synthetic-input-tokens-mean "$isl" \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean "$osl" \
    --output-tokens-stddev 0 \
    --extra-inputs "max_tokens:$osl" \
    --extra-inputs "min_tokens:$osl" \
    --extra-inputs ignore_eos:true \
    --extra-inputs repetition_penalty:1.0 \
    --extra-inputs temperature:0.0 \
    --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
    --concurrency "$concurrency" \
    --request-count "$((10 * concurrency))" \
    --warmup-request-count "$concurrency" \
    --conversation-num 12800 \
    --random-seed 100 \
    --workers-max "$max_threads" \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream' \
    --record-processors 32 \
    --ui simple

  echo "ARTIFACT_DIR: $ARTIFACT_DIR"
  ls -la "$ARTIFACT_DIR"
}
#### Actual execution ####
# Block until the endpoint lists TARGET_MODEL, then create this run's output directory.
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json
# max_threads is recorded as TOTAL_CONCURRENCY: the max_threads variable only
# exists as a local inside run_perf (where it equals the concurrency argument),
# so expanding it here would produce an empty value and invalid JSON.
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $TOTAL_CONCURRENCY,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf "$TOTAL_CONCURRENCY" "$ISL" "$OSL"
echo "done with concurrency $TOTAL_CONCURRENCY"
env
:
-
name
:
TARGET_MODEL
value
:
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
-
name
:
ENDPOINT
value
:
llama3-70b-disagg-mn-frontend:8000
-
name
:
CONCURRENCY_PER_GPU
value
:
"
16"
-
name
:
DEPLOYMENT_GPU_COUNT
value
:
"
16"
-
name
:
ISL
value
:
"
8192"
-
name
:
OSL
value
:
"
1024"
-
name
:
DEPLOYMENT_MODE
value
:
disagg-mn
-
name
:
AIPERF_HTTP_CONNECTION_LIMIT
value
:
"
200"
-
name
:
JOB_NAME
valueFrom
:
fieldRef
:
apiVersion
:
v1
fieldPath
:
metadata.labels['job-name']
-
name
:
ROOT_ARTIFACT_DIR
value
:
/root/.cache/huggingface/perf
-
name
:
HF_HOME
value
:
/root/.cache/huggingface
-
name
:
PYTHONUNBUFFERED
value
:
"
1"
image
:
python:3.12-slim
imagePullPolicy
:
IfNotPresent
name
:
perf
securityContext
:
privileged
:
true
volumeMounts
:
-
name
:
model-cache
mountPath
:
/root/.cache/huggingface
workingDir
:
/workspace
imagePullSecrets
:
-
name
:
nvcrimagepullsecret
volumes
:
-
name
:
model-cache
persistentVolumeClaim
:
...
...
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
View file @
13fc3c65
...
...
@@ -42,8 +42,13 @@ spec:
-
worker
topologyKey
:
kubernetes.io/hostname
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
2
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
2
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
@@ -76,8 +81,13 @@ spec:
-
worker
topologyKey
:
kubernetes.io/hostname
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
View file @
13fc3c65
...
...
@@ -5,7 +5,7 @@ kind: Job
metadata
:
name
:
llama3-70b-disagg-sn-perf
spec
:
backoffLimit
:
3
backoffLimit
:
1
completions
:
1
parallelism
:
1
template
:
...
...
@@ -15,58 +15,129 @@ spec:
spec
:
restartPolicy
:
Never
containers
:
-
name
:
perf
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/backends/vllm
command
:
-
command
:
-
/bin/sh
-
-c
-
|
# wait for the model to be ready
export ENDPOINT=llama3-70b-disagg-sn-frontend:8000
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
export INTERVAL=5
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
sleep $INTERVAL
done
echo "✅ Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark
export ARTIFACT_DIR="/tmp/genai-$RANDOM"
mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..."
apt-get update && apt-get install -y curl jq procps git && apt-get clean
pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
echo "aiperf installation completed";
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
aiperf profile \
--model "$TARGET_MODEL" \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --url "$ENDPOINT" --streaming \
--concurrency 64 \
--warmup-request-count 2 \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 1024 \
--output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \
--extra-inputs ignore_eos:true \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \
--artifact-dir $ARTIFACT_DIR \
--num-dataset-entries=3000 -- \
--max-threads 64
echo "----------------json----------------"
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
cat $PERF_JSON | jq .
echo "----------------csv-----------------"
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
cat $PERF_CSV
echo "Benchmark completed successfully!"
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
# Poll the /v1/models listing served at $ENDPOINT every 5 seconds until it
# contains an entry whose id equals $TARGET_MODEL, then print the listing.
# Globals read: ENDPOINT, TARGET_MODEL
# Returns: exit status of the final `curl | jq .` pipeline
wait_for_model_ready() {
  local models_url="http://$ENDPOINT/v1/models"

  echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
  # jq -e exits non-zero when the filter yields no result, which keeps the loop going.
  until curl -s "$models_url" \
    | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
    echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again $models_url"
    sleep 5
  done

  echo "✅ Model '$TARGET_MODEL' is now available!"
  echo "Model '$TARGET_MODEL' is now available!"
  curl -s "$models_url" | jq .
}
# Run one aiperf profiling pass against the deployed endpoint.
# Arguments:
#   $1 - concurrency; also used as the warmup request count and worker cap,
#        and multiplied by 10 to obtain the request count
#   $2 - input sequence length (synthetic input tokens mean)
#   $3 - output sequence length (output tokens mean, max_tokens and min_tokens)
# Globals read:    ROOT_ARTIFACT_DIR, EPOCH, JOB_NAME, TARGET_MODEL, ENDPOINT
# Globals written: key, ARTIFACT_DIR (exported)
# Returns: exit status of the final `ls` on the artifact directory
run_perf() {
  local concurrency="$1"
  local isl="$2"
  local osl="$3"
  local max_threads="${concurrency}"

  # key is not declared local, so it stays visible to the caller after return.
  key="concurrency_${concurrency}"
  export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
  mkdir -p "$ARTIFACT_DIR"
  echo "ARTIFACT_DIR: $ARTIFACT_DIR"

  # Every expansion is quoted so that values containing whitespace reach aiperf
  # as a single argument, and every continuation backslash is preceded by a
  # space so adjacent arguments cannot fuse when the next line is unindented.
  aiperf profile --artifact-dir "$ARTIFACT_DIR" \
    --model "$TARGET_MODEL" \
    --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
    --endpoint-type chat --endpoint /v1/chat/completions \
    --streaming \
    --url "http://$ENDPOINT" \
    --synthetic-input-tokens-mean "$isl" \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean "$osl" \
    --output-tokens-stddev 0 \
    --extra-inputs "max_tokens:$osl" \
    --extra-inputs "min_tokens:$osl" \
    --extra-inputs ignore_eos:true \
    --extra-inputs repetition_penalty:1.0 \
    --extra-inputs temperature:0.0 \
    --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
    --concurrency "$concurrency" \
    --request-count "$((10 * concurrency))" \
    --warmup-request-count "$concurrency" \
    --conversation-num 12800 \
    --random-seed 100 \
    --workers-max "$max_threads" \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream' \
    --record-processors 32 \
    --ui simple

  echo "ARTIFACT_DIR: $ARTIFACT_DIR"
  ls -la "$ARTIFACT_DIR"
}
#### Actual execution ####
# Block until the endpoint lists TARGET_MODEL, then create this run's output directory.
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json
# max_threads is recorded as TOTAL_CONCURRENCY: the max_threads variable only
# exists as a local inside run_perf (where it equals the concurrency argument),
# so expanding it here would produce an empty value and invalid JSON.
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $TOTAL_CONCURRENCY,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf "$TOTAL_CONCURRENCY" "$ISL" "$OSL"
echo "done with concurrency $TOTAL_CONCURRENCY"
env
:
-
name
:
TARGET_MODEL
value
:
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
-
name
:
ENDPOINT
value
:
llama3-70b-disagg-sn-frontend:8000
-
name
:
CONCURRENCY_PER_GPU
value
:
"
16"
-
name
:
DEPLOYMENT_GPU_COUNT
value
:
"
8"
-
name
:
ISL
value
:
"
8192"
-
name
:
OSL
value
:
"
1024"
-
name
:
DEPLOYMENT_MODE
value
:
disagg-sn
-
name
:
AIPERF_HTTP_CONNECTION_LIMIT
value
:
"
200"
-
name
:
JOB_NAME
valueFrom
:
fieldRef
:
apiVersion
:
v1
fieldPath
:
metadata.labels['job-name']
-
name
:
ROOT_ARTIFACT_DIR
value
:
/root/.cache/huggingface/perf
-
name
:
HF_HOME
value
:
/root/.cache/huggingface
-
name
:
PYTHONUNBUFFERED
value
:
"
1"
image
:
python:3.12-slim
imagePullPolicy
:
IfNotPresent
name
:
perf
securityContext
:
privileged
:
true
volumeMounts
:
-
name
:
model-cache
mountPath
:
/root/.cache/huggingface
workingDir
:
/workspace
imagePullSecrets
:
-
name
:
nvcrimagepullsecret
volumes
:
-
name
:
model-cache
persistentVolumeClaim
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment