Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
13fc3c65
Unverified
Commit
13fc3c65
authored
Oct 15, 2025
by
Biswa Panda
Committed by
GitHub
Oct 15, 2025
Browse files
fix: update model recipe for llama-3 70b to match with common recipe template (#3637)
parent
15a01f75
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
391 additions
and
155 deletions
+391
-155
recipes/llama-3-70b/model-cache/model-download.yaml
recipes/llama-3-70b/model-cache/model-download.yaml
+8
-10
recipes/llama-3-70b/vllm/agg/deploy.yaml
recipes/llama-3-70b/vllm/agg/deploy.yaml
+8
-3
recipes/llama-3-70b/vllm/agg/perf.yaml
recipes/llama-3-70b/vllm/agg/perf.yaml
+117
-46
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
+12
-2
recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
+117
-46
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
+12
-2
recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
+117
-46
No files found.
recipes/llama-3-70b/model-cache/model-download.yaml
View file @
13fc3c65
...
@@ -22,24 +22,22 @@ spec:
...
@@ -22,24 +22,22 @@ spec:
-
secretRef
:
-
secretRef
:
name
:
hf-token-secret
name
:
hf-token-secret
env
:
env
:
# NOTE: This is the model name for the llama-3-70b model
# Update this to model name for the model you are downloading
-
name
:
MODEL_NAME
-
name
:
MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
HF_TOKEN
-
name
:
HF_HOME
valueFrom
:
value
:
/model-store
secretKeyRef
:
-
name
:
HF_HUB_ENABLE_HF_TRANSFER
name
:
hf-token-secret
value
:
"
1"
key
:
HF_TOKEN
-
name
:
MODEL_REVISION
value
:
ddb4128556dfcff99e0c41aee159ea6c3e655dcd
args
:
args
:
-
|
-
|
set -eux
set -eux
pip install --no-cache-dir huggingface_hub hf_transfer
pip install --no-cache-dir huggingface_hub hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1
hf download $MODEL_NAME --revision $MODEL_REVISION
huggingface-cli download $MODEL_NAME
volumeMounts
:
volumeMounts
:
-
name
:
model-cache
-
name
:
model-cache
mountPath
:
/
root/.cache/huggingface/hub
mountPath
:
/
model-store
volumes
:
volumes
:
-
name
:
model-cache
-
name
:
model-cache
persistentVolumeClaim
:
persistentVolumeClaim
:
...
...
recipes/llama-3-70b/vllm/agg/deploy.yaml
View file @
13fc3c65
...
@@ -32,8 +32,13 @@ spec:
...
@@ -32,8 +32,13 @@ spec:
size
:
20Gi
size
:
20Gi
extraPodSpec
:
extraPodSpec
:
mainContainer
:
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
command
:
-
/bin/sh
-
/bin/sh
-
-c
-
-c
...
@@ -42,6 +47,6 @@ spec:
...
@@ -42,6 +47,6 @@ spec:
replicas
:
1
replicas
:
1
resources
:
resources
:
limits
:
limits
:
gpu
:
"
8
"
gpu
:
"
4
"
requests
:
requests
:
gpu
:
"
8"
gpu
:
"
4"
\ No newline at end of file
\ No newline at end of file
recipes/llama-3-70b/vllm/agg/perf.yaml
View file @
13fc3c65
...
@@ -5,7 +5,7 @@ kind: Job
...
@@ -5,7 +5,7 @@ kind: Job
metadata
:
metadata
:
name
:
llama3-70b-agg-perf
name
:
llama3-70b-agg-perf
spec
:
spec
:
backoffLimit
:
3
backoffLimit
:
1
completions
:
1
completions
:
1
parallelism
:
1
parallelism
:
1
template
:
template
:
...
@@ -15,58 +15,129 @@ spec:
...
@@ -15,58 +15,129 @@ spec:
spec
:
spec
:
restartPolicy
:
Never
restartPolicy
:
Never
containers
:
containers
:
-
name
:
perf
-
command
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/backends/vllm
command
:
-
/bin/sh
-
/bin/sh
-
-c
-
-c
-
|
-
|
# wait for the model to be ready
apt-get update && apt-get install -y curl jq procps git && apt-get clean
export ENDPOINT=llama3-70b-agg-0:8000
pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
echo "aiperf installation completed";
export INTERVAL=5
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet,
waiting ${INTERVAL}s...
"
echo "[$(date '+%H:%M:%S')] Model not ready yet,
sleeping 5s before checking again http://$ENDPOINT/v1/models
"
sleep
$INTERVAL
sleep
5
done
done
echo "✅ Model '$TARGET_MODEL' is now available!"
echo "✅ Model '$TARGET_MODEL' is now available!"
echo "Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq .
curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark
}
export ARTIFACT_DIR="/tmp/genai"
run_perf() {
local concurrency=$1
local isl=$2
local osl=$3
local max_threads=${concurrency}
key=concurrency_${concurrency}
export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
mkdir -p "$ARTIFACT_DIR"
mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..."
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
export COLUMNS=200
aiperf profile --artifact-dir $ARTIFACT_DIR \
aiperf profile \
--model $TARGET_MODEL \
--model "$TARGET_MODEL" \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat --url "$ENDPOINT" --streaming \
--streaming \
--concurrency 64 \
--url http://$ENDPOINT \
--warmup-request-count 2 \
--synthetic-input-tokens-mean $isl \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean
1024
\
--output-tokens-mean
$osl
\
--output-tokens-stddev 0 \
--output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \
--extra-inputs max_tokens:$osl \
--extra-inputs min_tokens:$osl \
--extra-inputs ignore_eos:true \
--extra-inputs ignore_eos:true \
--extra-inputs repetition_penalty:1.0 \
--extra-inputs temperature:0.0 \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \
--concurrency $concurrency \
--artifact-dir $ARTIFACT_DIR \
--request-count $((10*concurrency)) \
--num-dataset-entries=3000 -- \
--warmup-request-count $concurrency \
--max-threads 64
--conversation-num 12800 \
echo "----------------json----------------"
--random-seed 100 \
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
--workers-max $max_threads \
cat $PERF_JSON | jq .
-H 'Authorization: Bearer NOT USED' \
echo "----------------csv-----------------"
-H 'Accept: text/event-stream'\
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
--record-processors 32 \
cat $PERF_CSV
--ui simple
echo "Benchmark completed successfully!"
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
ls -la $ARTIFACT_DIR
}
#### Actual execution ####
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $max_threads,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf $TOTAL_CONCURRENCY $ISL $OSL
echo "done with concurrency $TOTAL_CONCURRENCY"
env
:
-
name
:
TARGET_MODEL
value
:
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
-
name
:
ENDPOINT
value
:
llama3-70b-agg-frontend:8000
-
name
:
CONCURRENCY_PER_GPU
value
:
"
16"
-
name
:
DEPLOYMENT_GPU_COUNT
value
:
"
4"
-
name
:
ISL
value
:
"
8192"
-
name
:
OSL
value
:
"
1024"
-
name
:
DEPLOYMENT_MODE
value
:
agg
-
name
:
AIPERF_HTTP_CONNECTION_LIMIT
value
:
"
200"
-
name
:
JOB_NAME
valueFrom
:
fieldRef
:
apiVersion
:
v1
fieldPath
:
metadata.labels['job-name']
-
name
:
ROOT_ARTIFACT_DIR
value
:
/root/.cache/huggingface/perf
-
name
:
HF_HOME
value
:
/root/.cache/huggingface
-
name
:
PYTHONUNBUFFERED
value
:
"
1"
image
:
python:3.12-slim
imagePullPolicy
:
IfNotPresent
name
:
perf
securityContext
:
privileged
:
true
volumeMounts
:
volumeMounts
:
-
name
:
model-cache
-
name
:
model-cache
mountPath
:
/root/.cache/huggingface
mountPath
:
/root/.cache/huggingface
workingDir
:
/workspace
imagePullSecrets
:
-
name
:
nvcrimagepullsecret
volumes
:
volumes
:
-
name
:
model-cache
-
name
:
model-cache
persistentVolumeClaim
:
persistentVolumeClaim
:
...
...
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
View file @
13fc3c65
...
@@ -32,8 +32,13 @@ spec:
...
@@ -32,8 +32,13 @@ spec:
size
:
80Gi
size
:
80Gi
extraPodSpec
:
extraPodSpec
:
mainContainer
:
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
command
:
command
:
-
/bin/sh
-
/bin/sh
-
-c
-
-c
...
@@ -56,8 +61,13 @@ spec:
...
@@ -56,8 +61,13 @@ spec:
size
:
80Gi
size
:
80Gi
extraPodSpec
:
extraPodSpec
:
mainContainer
:
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
command
:
-
/bin/sh
-
/bin/sh
-
-c
-
-c
...
...
recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
View file @
13fc3c65
...
@@ -5,7 +5,7 @@ kind: Job
...
@@ -5,7 +5,7 @@ kind: Job
metadata
:
metadata
:
name
:
llama3-70b-disagg-mn-perf
name
:
llama3-70b-disagg-mn-perf
spec
:
spec
:
backoffLimit
:
3
backoffLimit
:
1
completions
:
1
completions
:
1
parallelism
:
1
parallelism
:
1
template
:
template
:
...
@@ -15,58 +15,129 @@ spec:
...
@@ -15,58 +15,129 @@ spec:
spec
:
spec
:
restartPolicy
:
Never
restartPolicy
:
Never
containers
:
containers
:
-
name
:
perf
-
command
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/backends/vllm
command
:
-
/bin/sh
-
/bin/sh
-
-c
-
-c
-
|
-
|
# wait for the model to be ready
apt-get update && apt-get install -y curl jq procps git && apt-get clean
export ENDPOINT=llama3-70b-disagg-mn-frontend:8000
pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
echo "aiperf installation completed";
export INTERVAL=5
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet,
waiting ${INTERVAL}s...
"
echo "[$(date '+%H:%M:%S')] Model not ready yet,
sleeping 5s before checking again http://$ENDPOINT/v1/models
"
sleep
$INTERVAL
sleep
5
done
done
echo "✅ Model '$TARGET_MODEL' is now available!"
echo "✅ Model '$TARGET_MODEL' is now available!"
echo "Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq .
curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark
}
export ARTIFACT_DIR="/tmp/genai"
run_perf() {
local concurrency=$1
local isl=$2
local osl=$3
local max_threads=${concurrency}
key=concurrency_${concurrency}
export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
mkdir -p "$ARTIFACT_DIR"
mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..."
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
export COLUMNS=200
aiperf profile --artifact-dir $ARTIFACT_DIR \
aiperf profile \
--model $TARGET_MODEL \
--model "$TARGET_MODEL" \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat --url "$ENDPOINT" --streaming \
--streaming \
--concurrency 64 \
--url http://$ENDPOINT \
--warmup-request-count 2 \
--synthetic-input-tokens-mean $isl \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean
1024
\
--output-tokens-mean
$osl
\
--output-tokens-stddev 0 \
--output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \
--extra-inputs max_tokens:$osl \
--extra-inputs min_tokens:$osl \
--extra-inputs ignore_eos:true \
--extra-inputs ignore_eos:true \
--extra-inputs repetition_penalty:1.0 \
--extra-inputs temperature:0.0 \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \
--concurrency $concurrency \
--artifact-dir $ARTIFACT_DIR \
--request-count $((10*concurrency)) \
--num-dataset-entries=3000 -- \
--warmup-request-count $concurrency \
--max-threads 64
--conversation-num 12800 \
echo "----------------json----------------"
--random-seed 100 \
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
--workers-max $max_threads \
cat $PERF_JSON | jq .
-H 'Authorization: Bearer NOT USED' \
echo "----------------csv-----------------"
-H 'Accept: text/event-stream'\
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
--record-processors 32 \
cat $PERF_CSV
--ui simple
echo "Benchmark completed successfully!"
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
ls -la $ARTIFACT_DIR
}
#### Actual execution ####
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $max_threads,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf $TOTAL_CONCURRENCY $ISL $OSL
echo "done with concurrency $TOTAL_CONCURRENCY"
env
:
-
name
:
TARGET_MODEL
value
:
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
-
name
:
ENDPOINT
value
:
llama3-70b-disagg-mn-frontend:8000
-
name
:
CONCURRENCY_PER_GPU
value
:
"
16"
-
name
:
DEPLOYMENT_GPU_COUNT
value
:
"
16"
-
name
:
ISL
value
:
"
8192"
-
name
:
OSL
value
:
"
1024"
-
name
:
DEPLOYMENT_MODE
value
:
disagg-mn
-
name
:
AIPERF_HTTP_CONNECTION_LIMIT
value
:
"
200"
-
name
:
JOB_NAME
valueFrom
:
fieldRef
:
apiVersion
:
v1
fieldPath
:
metadata.labels['job-name']
-
name
:
ROOT_ARTIFACT_DIR
value
:
/root/.cache/huggingface/perf
-
name
:
HF_HOME
value
:
/root/.cache/huggingface
-
name
:
PYTHONUNBUFFERED
value
:
"
1"
image
:
python:3.12-slim
imagePullPolicy
:
IfNotPresent
name
:
perf
securityContext
:
privileged
:
true
volumeMounts
:
volumeMounts
:
-
name
:
model-cache
-
name
:
model-cache
mountPath
:
/root/.cache/huggingface
mountPath
:
/root/.cache/huggingface
workingDir
:
/workspace
imagePullSecrets
:
-
name
:
nvcrimagepullsecret
volumes
:
volumes
:
-
name
:
model-cache
-
name
:
model-cache
persistentVolumeClaim
:
persistentVolumeClaim
:
...
...
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
View file @
13fc3c65
...
@@ -42,8 +42,13 @@ spec:
...
@@ -42,8 +42,13 @@ spec:
-
worker
-
worker
topologyKey
:
kubernetes.io/hostname
topologyKey
:
kubernetes.io/hostname
mainContainer
:
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
2
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
2
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
command
:
command
:
-
/bin/sh
-
/bin/sh
-
-c
-
-c
...
@@ -76,8 +81,13 @@ spec:
...
@@ -76,8 +81,13 @@ spec:
-
worker
-
worker
topologyKey
:
kubernetes.io/hostname
topologyKey
:
kubernetes.io/hostname
mainContainer
:
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args
:
args
:
-
"
python3
-m
dynamo.vllm
--model
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
command
:
-
/bin/sh
-
/bin/sh
-
-c
-
-c
...
...
recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
View file @
13fc3c65
...
@@ -5,7 +5,7 @@ kind: Job
...
@@ -5,7 +5,7 @@ kind: Job
metadata
:
metadata
:
name
:
llama3-70b-disagg-sn-perf
name
:
llama3-70b-disagg-sn-perf
spec
:
spec
:
backoffLimit
:
3
backoffLimit
:
1
completions
:
1
completions
:
1
parallelism
:
1
parallelism
:
1
template
:
template
:
...
@@ -15,58 +15,129 @@ spec:
...
@@ -15,58 +15,129 @@ spec:
spec
:
spec
:
restartPolicy
:
Never
restartPolicy
:
Never
containers
:
containers
:
-
name
:
perf
-
command
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/backends/vllm
command
:
-
/bin/sh
-
/bin/sh
-
-c
-
-c
-
|
-
|
# wait for the model to be ready
apt-get update && apt-get install -y curl jq procps git && apt-get clean
export ENDPOINT=llama3-70b-disagg-sn-frontend:8000
pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
echo "aiperf installation completed";
export INTERVAL=5
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet,
waiting ${INTERVAL}s...
"
echo "[$(date '+%H:%M:%S')] Model not ready yet,
sleeping 5s before checking again http://$ENDPOINT/v1/models
"
sleep
$INTERVAL
sleep
5
done
done
echo "✅ Model '$TARGET_MODEL' is now available!"
echo "✅ Model '$TARGET_MODEL' is now available!"
echo "Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq .
curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark
}
export ARTIFACT_DIR="/tmp/genai-$RANDOM"
run_perf() {
local concurrency=$1
local isl=$2
local osl=$3
local max_threads=${concurrency}
key=concurrency_${concurrency}
export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
mkdir -p "$ARTIFACT_DIR"
mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..."
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
export COLUMNS=200
aiperf profile --artifact-dir $ARTIFACT_DIR \
aiperf profile \
--model $TARGET_MODEL \
--model "$TARGET_MODEL" \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat --url "$ENDPOINT" --streaming \
--streaming \
--concurrency 64 \
--url http://$ENDPOINT \
--warmup-request-count 2 \
--synthetic-input-tokens-mean $isl \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean
1024
\
--output-tokens-mean
$osl
\
--output-tokens-stddev 0 \
--output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \
--extra-inputs max_tokens:$osl \
--extra-inputs min_tokens:$osl \
--extra-inputs ignore_eos:true \
--extra-inputs ignore_eos:true \
--extra-inputs repetition_penalty:1.0 \
--extra-inputs temperature:0.0 \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \
--concurrency $concurrency \
--artifact-dir $ARTIFACT_DIR \
--request-count $((10*concurrency)) \
--num-dataset-entries=3000 -- \
--warmup-request-count $concurrency \
--max-threads 64
--conversation-num 12800 \
echo "----------------json----------------"
--random-seed 100 \
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
--workers-max $max_threads \
cat $PERF_JSON | jq .
-H 'Authorization: Bearer NOT USED' \
echo "----------------csv-----------------"
-H 'Accept: text/event-stream'\
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
--record-processors 32 \
cat $PERF_CSV
--ui simple
echo "Benchmark completed successfully!"
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
ls -la $ARTIFACT_DIR
}
#### Actual execution ####
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $max_threads,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf $TOTAL_CONCURRENCY $ISL $OSL
echo "done with concurrency $TOTAL_CONCURRENCY"
env
:
-
name
:
TARGET_MODEL
value
:
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
-
name
:
ENDPOINT
value
:
llama3-70b-disagg-sn-frontend:8000
-
name
:
CONCURRENCY_PER_GPU
value
:
"
16"
-
name
:
DEPLOYMENT_GPU_COUNT
value
:
"
8"
-
name
:
ISL
value
:
"
8192"
-
name
:
OSL
value
:
"
1024"
-
name
:
DEPLOYMENT_MODE
value
:
disagg-sn
-
name
:
AIPERF_HTTP_CONNECTION_LIMIT
value
:
"
200"
-
name
:
JOB_NAME
valueFrom
:
fieldRef
:
apiVersion
:
v1
fieldPath
:
metadata.labels['job-name']
-
name
:
ROOT_ARTIFACT_DIR
value
:
/root/.cache/huggingface/perf
-
name
:
HF_HOME
value
:
/root/.cache/huggingface
-
name
:
PYTHONUNBUFFERED
value
:
"
1"
image
:
python:3.12-slim
imagePullPolicy
:
IfNotPresent
name
:
perf
securityContext
:
privileged
:
true
volumeMounts
:
volumeMounts
:
-
name
:
model-cache
-
name
:
model-cache
mountPath
:
/root/.cache/huggingface
mountPath
:
/root/.cache/huggingface
workingDir
:
/workspace
imagePullSecrets
:
-
name
:
nvcrimagepullsecret
volumes
:
volumes
:
-
name
:
model-cache
-
name
:
model-cache
persistentVolumeClaim
:
persistentVolumeClaim
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment