Unverified Commit 13fc3c65 authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

fix: update model recipe for llama-3 70b to match with common recipe template (#3637)

parent 15a01f75
...@@ -22,24 +22,22 @@ spec: ...@@ -22,24 +22,22 @@ spec:
- secretRef: - secretRef:
name: hf-token-secret name: hf-token-secret
env: env:
# NOTE: This is the model name for the llama-3-70b model
# Update this to model name for the model you are downloading
- name: MODEL_NAME - name: MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: HF_TOKEN - name: HF_HOME
valueFrom: value: /model-store
secretKeyRef: - name: HF_HUB_ENABLE_HF_TRANSFER
name: hf-token-secret value: "1"
key: HF_TOKEN - name: MODEL_REVISION
value: ddb4128556dfcff99e0c41aee159ea6c3e655dcd
args: args:
- | - |
set -eux set -eux
pip install --no-cache-dir huggingface_hub hf_transfer pip install --no-cache-dir huggingface_hub hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1 hf download $MODEL_NAME --revision $MODEL_REVISION
huggingface-cli download $MODEL_NAME
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPath: /root/.cache/huggingface/hub mountPath: /model-store
volumes: volumes:
- name: model-cache - name: model-cache
persistentVolumeClaim: persistentVolumeClaim:
......
...@@ -32,8 +32,13 @@ spec: ...@@ -32,8 +32,13 @@ spec:
size: 20Gi size: 20Gi
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args: args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
...@@ -42,6 +47,6 @@ spec: ...@@ -42,6 +47,6 @@ spec:
replicas: 1 replicas: 1
resources: resources:
limits: limits:
gpu: "8" gpu: "4"
requests: requests:
gpu: "8" gpu: "4"
\ No newline at end of file \ No newline at end of file
...@@ -5,7 +5,7 @@ kind: Job ...@@ -5,7 +5,7 @@ kind: Job
metadata: metadata:
name: llama3-70b-agg-perf name: llama3-70b-agg-perf
spec: spec:
backoffLimit: 3 backoffLimit: 1
completions: 1 completions: 1
parallelism: 1 parallelism: 1
template: template:
...@@ -15,58 +15,129 @@ spec: ...@@ -15,58 +15,129 @@ spec:
spec: spec:
restartPolicy: Never restartPolicy: Never
containers: containers:
- name: perf - command:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh - /bin/sh
- -c - -c
- | - |
# wait for the model to be ready apt-get update && apt-get install -y curl jq procps git && apt-get clean
export ENDPOINT=llama3-70b-agg-0:8000 pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic echo "aiperf installation completed";
export INTERVAL=5 sysctl -w net.ipv4.ip_local_port_range="1024 65000"
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..." cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..." echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
sleep $INTERVAL sleep 5
done done
echo "✅ Model '$TARGET_MODEL' is now available!" echo "✅ Model '$TARGET_MODEL' is now available!"
echo "Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq . curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark }
export ARTIFACT_DIR="/tmp/genai" run_perf() {
local concurrency=$1
local isl=$2
local osl=$3
local max_threads=${concurrency}
key=concurrency_${concurrency}
export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
mkdir -p "$ARTIFACT_DIR" mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..." echo "ARTIFACT_DIR: $ARTIFACT_DIR"
export COLUMNS=200 aiperf profile --artifact-dir $ARTIFACT_DIR \
aiperf profile \ --model $TARGET_MODEL \
--model "$TARGET_MODEL" \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat --url "$ENDPOINT" --streaming \ --streaming \
--concurrency 64 \ --url http://$ENDPOINT \
--warmup-request-count 2 \ --synthetic-input-tokens-mean $isl \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \ --synthetic-input-tokens-stddev 0 \
--output-tokens-mean 1024 \ --output-tokens-mean $osl \
--output-tokens-stddev 0 \ --output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \ --extra-inputs max_tokens:$osl \
--extra-inputs min_tokens:$osl \
--extra-inputs ignore_eos:true \ --extra-inputs ignore_eos:true \
--extra-inputs repetition_penalty:1.0 \
--extra-inputs temperature:0.0 \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \ --concurrency $concurrency \
--artifact-dir $ARTIFACT_DIR \ --request-count $((10*concurrency)) \
--num-dataset-entries=3000 -- \ --warmup-request-count $concurrency \
--max-threads 64 --conversation-num 12800 \
echo "----------------json----------------" --random-seed 100 \
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) --workers-max $max_threads \
cat $PERF_JSON | jq . -H 'Authorization: Bearer NOT USED' \
echo "----------------csv-----------------" -H 'Accept: text/event-stream'\
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) --record-processors 32 \
cat $PERF_CSV --ui simple
echo "Benchmark completed successfully!" echo "ARTIFACT_DIR: $ARTIFACT_DIR"
ls -la $ARTIFACT_DIR
}
#### Actual execution ####
# Block until the endpoint lists TARGET_MODEL, then create this run's artifact root.
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# run_perf declares max_threads as a function-local equal to its concurrency
# argument, so it is unset at script scope. Compute the same value here so the
# heredoc below does not emit `"max_threads": ,` (invalid JSON).
max_threads=$TOTAL_CONCURRENCY
# Write input_config.json
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $max_threads,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf $TOTAL_CONCURRENCY $ISL $OSL
echo "done with concurrency $TOTAL_CONCURRENCY"
# Environment consumed by the benchmark script in this container's command.
env:
# Model id the script waits for at /v1/models and passes to `aiperf profile --model`.
- name: TARGET_MODEL
value: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
# host:port of the service under test; the script prepends "http://" itself.
- name: ENDPOINT
value: llama3-70b-agg-frontend:8000
# Multiplied by DEPLOYMENT_GPU_COUNT in the script to get the total concurrency.
- name: CONCURRENCY_PER_GPU
value: "16"
# GPU count used for the concurrency calculation and recorded in input_config.json.
- name: DEPLOYMENT_GPU_COUNT
value: "4"
# Passed to run_perf as the synthetic input token mean.
- name: ISL
value: "8192"
# Passed to run_perf as the output token mean and the max_tokens/min_tokens extra inputs.
- name: OSL
value: "1024"
# Recorded as "mode" in input_config.json.
- name: DEPLOYMENT_MODE
value: agg
# Not referenced by the script itself; presumably read by aiperf — TODO confirm.
- name: AIPERF_HTTP_CONNECTION_LIMIT
value: "200"
# Taken from the pod's job-name label; used in the artifact directory name.
- name: JOB_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['job-name']
# Base directory under which the script creates <epoch>_<job name> artifact folders.
- name: ROOT_ARTIFACT_DIR
value: /root/.cache/huggingface/perf
# Not referenced by the script; presumably the Hugging Face cache root for tooling — TODO confirm.
- name: HF_HOME
value: /root/.cache/huggingface
# Standard Python setting that disables stdout/stderr buffering.
- name: PYTHONUNBUFFERED
value: "1"
image: python:3.12-slim
imagePullPolicy: IfNotPresent
name: perf
securityContext:
privileged: true
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPath: /root/.cache/huggingface mountPath: /root/.cache/huggingface
workingDir: /workspace
imagePullSecrets:
- name: nvcrimagepullsecret
volumes: volumes:
- name: model-cache - name: model-cache
persistentVolumeClaim: persistentVolumeClaim:
......
...@@ -32,8 +32,13 @@ spec: ...@@ -32,8 +32,13 @@ spec:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args: args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
...@@ -56,8 +61,13 @@ spec: ...@@ -56,8 +61,13 @@ spec:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args: args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
......
...@@ -5,7 +5,7 @@ kind: Job ...@@ -5,7 +5,7 @@ kind: Job
metadata: metadata:
name: llama3-70b-disagg-mn-perf name: llama3-70b-disagg-mn-perf
spec: spec:
backoffLimit: 3 backoffLimit: 1
completions: 1 completions: 1
parallelism: 1 parallelism: 1
template: template:
...@@ -15,58 +15,129 @@ spec: ...@@ -15,58 +15,129 @@ spec:
spec: spec:
restartPolicy: Never restartPolicy: Never
containers: containers:
- name: perf - command:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh - /bin/sh
- -c - -c
- | - |
# wait for the model to be ready apt-get update && apt-get install -y curl jq procps git && apt-get clean
export ENDPOINT=llama3-70b-disagg-mn-frontend:8000 pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic echo "aiperf installation completed";
export INTERVAL=5 sysctl -w net.ipv4.ip_local_port_range="1024 65000"
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..." cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..." echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
sleep $INTERVAL sleep 5
done done
echo "✅ Model '$TARGET_MODEL' is now available!" echo "✅ Model '$TARGET_MODEL' is now available!"
echo "Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq . curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark }
export ARTIFACT_DIR="/tmp/genai" run_perf() {
local concurrency=$1
local isl=$2
local osl=$3
local max_threads=${concurrency}
key=concurrency_${concurrency}
export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
mkdir -p "$ARTIFACT_DIR" mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..." echo "ARTIFACT_DIR: $ARTIFACT_DIR"
export COLUMNS=200 aiperf profile --artifact-dir $ARTIFACT_DIR \
aiperf profile \ --model $TARGET_MODEL \
--model "$TARGET_MODEL" \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat --url "$ENDPOINT" --streaming \ --streaming \
--concurrency 64 \ --url http://$ENDPOINT \
--warmup-request-count 2 \ --synthetic-input-tokens-mean $isl \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \ --synthetic-input-tokens-stddev 0 \
--output-tokens-mean 1024 \ --output-tokens-mean $osl \
--output-tokens-stddev 0 \ --output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \ --extra-inputs max_tokens:$osl \
--extra-inputs min_tokens:$osl \
--extra-inputs ignore_eos:true \ --extra-inputs ignore_eos:true \
--extra-inputs repetition_penalty:1.0 \
--extra-inputs temperature:0.0 \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \ --concurrency $concurrency \
--artifact-dir $ARTIFACT_DIR \ --request-count $((10*concurrency)) \
--num-dataset-entries=3000 -- \ --warmup-request-count $concurrency \
--max-threads 64 --conversation-num 12800 \
echo "----------------json----------------" --random-seed 100 \
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) --workers-max $max_threads \
cat $PERF_JSON | jq . -H 'Authorization: Bearer NOT USED' \
echo "----------------csv-----------------" -H 'Accept: text/event-stream'\
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) --record-processors 32 \
cat $PERF_CSV --ui simple
echo "Benchmark completed successfully!" echo "ARTIFACT_DIR: $ARTIFACT_DIR"
ls -la $ARTIFACT_DIR
}
#### Actual execution ####
# Block until the endpoint lists TARGET_MODEL, then create this run's artifact root.
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# run_perf declares max_threads as a function-local equal to its concurrency
# argument, so it is unset at script scope. Compute the same value here so the
# heredoc below does not emit `"max_threads": ,` (invalid JSON).
max_threads=$TOTAL_CONCURRENCY
# Write input_config.json
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $max_threads,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf $TOTAL_CONCURRENCY $ISL $OSL
echo "done with concurrency $TOTAL_CONCURRENCY"
# Environment consumed by the benchmark script in this container's command.
env:
# Model id the script waits for at /v1/models and passes to `aiperf profile --model`.
- name: TARGET_MODEL
value: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
# host:port of the service under test; the script prepends "http://" itself.
- name: ENDPOINT
value: llama3-70b-disagg-mn-frontend:8000
# Multiplied by DEPLOYMENT_GPU_COUNT in the script to get the total concurrency.
- name: CONCURRENCY_PER_GPU
value: "16"
# GPU count used for the concurrency calculation and recorded in input_config.json.
- name: DEPLOYMENT_GPU_COUNT
value: "16"
# Passed to run_perf as the synthetic input token mean.
- name: ISL
value: "8192"
# Passed to run_perf as the output token mean and the max_tokens/min_tokens extra inputs.
- name: OSL
value: "1024"
# Recorded as "mode" in input_config.json.
- name: DEPLOYMENT_MODE
value: disagg-mn
# Not referenced by the script itself; presumably read by aiperf — TODO confirm.
- name: AIPERF_HTTP_CONNECTION_LIMIT
value: "200"
# Taken from the pod's job-name label; used in the artifact directory name.
- name: JOB_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['job-name']
# Base directory under which the script creates <epoch>_<job name> artifact folders.
- name: ROOT_ARTIFACT_DIR
value: /root/.cache/huggingface/perf
# Not referenced by the script; presumably the Hugging Face cache root for tooling — TODO confirm.
- name: HF_HOME
value: /root/.cache/huggingface
# Standard Python setting that disables stdout/stderr buffering.
- name: PYTHONUNBUFFERED
value: "1"
image: python:3.12-slim
imagePullPolicy: IfNotPresent
name: perf
securityContext:
privileged: true
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPath: /root/.cache/huggingface mountPath: /root/.cache/huggingface
workingDir: /workspace
imagePullSecrets:
- name: nvcrimagepullsecret
volumes: volumes:
- name: model-cache - name: model-cache
persistentVolumeClaim: persistentVolumeClaim:
......
...@@ -42,8 +42,13 @@ spec: ...@@ -42,8 +42,13 @@ spec:
- worker - worker
topologyKey: kubernetes.io/hostname topologyKey: kubernetes.io/hostname
mainContainer: mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args: args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
...@@ -76,8 +81,13 @@ spec: ...@@ -76,8 +81,13 @@ spec:
- worker - worker
topologyKey: kubernetes.io/hostname topologyKey: kubernetes.io/hostname
mainContainer: mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args: args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
......
...@@ -5,7 +5,7 @@ kind: Job ...@@ -5,7 +5,7 @@ kind: Job
metadata: metadata:
name: llama3-70b-disagg-sn-perf name: llama3-70b-disagg-sn-perf
spec: spec:
backoffLimit: 3 backoffLimit: 1
completions: 1 completions: 1
parallelism: 1 parallelism: 1
template: template:
...@@ -15,58 +15,129 @@ spec: ...@@ -15,58 +15,129 @@ spec:
spec: spec:
restartPolicy: Never restartPolicy: Never
containers: containers:
- name: perf - command:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh - /bin/sh
- -c - -c
- | - |
# wait for the model to be ready apt-get update && apt-get install -y curl jq procps git && apt-get clean
export ENDPOINT=llama3-70b-disagg-sn-frontend:8000 pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic echo "aiperf installation completed";
export INTERVAL=5 sysctl -w net.ipv4.ip_local_port_range="1024 65000"
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..." cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..." echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
sleep $INTERVAL sleep 5
done done
echo "✅ Model '$TARGET_MODEL' is now available!" echo "✅ Model '$TARGET_MODEL' is now available!"
echo "Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq . curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark }
export ARTIFACT_DIR="/tmp/genai-$RANDOM" run_perf() {
local concurrency=$1
local isl=$2
local osl=$3
local max_threads=${concurrency}
key=concurrency_${concurrency}
export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
mkdir -p "$ARTIFACT_DIR" mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..." echo "ARTIFACT_DIR: $ARTIFACT_DIR"
export COLUMNS=200 aiperf profile --artifact-dir $ARTIFACT_DIR \
aiperf profile \ --model $TARGET_MODEL \
--model "$TARGET_MODEL" \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat --url "$ENDPOINT" --streaming \ --streaming \
--concurrency 64 \ --url http://$ENDPOINT \
--warmup-request-count 2 \ --synthetic-input-tokens-mean $isl \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \ --synthetic-input-tokens-stddev 0 \
--output-tokens-mean 1024 \ --output-tokens-mean $osl \
--output-tokens-stddev 0 \ --output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \ --extra-inputs max_tokens:$osl \
--extra-inputs min_tokens:$osl \
--extra-inputs ignore_eos:true \ --extra-inputs ignore_eos:true \
--extra-inputs repetition_penalty:1.0 \
--extra-inputs temperature:0.0 \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \ --concurrency $concurrency \
--artifact-dir $ARTIFACT_DIR \ --request-count $((10*concurrency)) \
--num-dataset-entries=3000 -- \ --warmup-request-count $concurrency \
--max-threads 64 --conversation-num 12800 \
echo "----------------json----------------" --random-seed 100 \
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) --workers-max $max_threads \
cat $PERF_JSON | jq . -H 'Authorization: Bearer NOT USED' \
echo "----------------csv-----------------" -H 'Accept: text/event-stream'\
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) --record-processors 32 \
cat $PERF_CSV --ui simple
echo "Benchmark completed successfully!" echo "ARTIFACT_DIR: $ARTIFACT_DIR"
ls -la $ARTIFACT_DIR
}
#### Actual execution ####
# Block until the endpoint lists TARGET_MODEL, then create this run's artifact root.
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# run_perf declares max_threads as a function-local equal to its concurrency
# argument, so it is unset at script scope. Compute the same value here so the
# heredoc below does not emit `"max_threads": ,` (invalid JSON).
max_threads=$TOTAL_CONCURRENCY
# Write input_config.json
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $max_threads,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf $TOTAL_CONCURRENCY $ISL $OSL
echo "done with concurrency $TOTAL_CONCURRENCY"
# Environment consumed by the benchmark script in this container's command.
env:
# Model id the script waits for at /v1/models and passes to `aiperf profile --model`.
- name: TARGET_MODEL
value: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
# host:port of the service under test; the script prepends "http://" itself.
- name: ENDPOINT
value: llama3-70b-disagg-sn-frontend:8000
# Multiplied by DEPLOYMENT_GPU_COUNT in the script to get the total concurrency.
- name: CONCURRENCY_PER_GPU
value: "16"
# GPU count used for the concurrency calculation and recorded in input_config.json.
- name: DEPLOYMENT_GPU_COUNT
value: "8"
# Passed to run_perf as the synthetic input token mean.
- name: ISL
value: "8192"
# Passed to run_perf as the output token mean and the max_tokens/min_tokens extra inputs.
- name: OSL
value: "1024"
# Recorded as "mode" in input_config.json.
- name: DEPLOYMENT_MODE
value: disagg-sn
# Not referenced by the script itself; presumably read by aiperf — TODO confirm.
- name: AIPERF_HTTP_CONNECTION_LIMIT
value: "200"
# Taken from the pod's job-name label; used in the artifact directory name.
- name: JOB_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['job-name']
# Base directory under which the script creates <epoch>_<job name> artifact folders.
- name: ROOT_ARTIFACT_DIR
value: /root/.cache/huggingface/perf
# Not referenced by the script; presumably the Hugging Face cache root for tooling — TODO confirm.
- name: HF_HOME
value: /root/.cache/huggingface
# Standard Python setting that disables stdout/stderr buffering.
- name: PYTHONUNBUFFERED
value: "1"
image: python:3.12-slim
imagePullPolicy: IfNotPresent
name: perf
securityContext:
privileged: true
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPath: /root/.cache/huggingface mountPath: /root/.cache/huggingface
workingDir: /workspace
imagePullSecrets:
- name: nvcrimagepullsecret
volumes: volumes:
- name: model-cache - name: model-cache
persistentVolumeClaim: persistentVolumeClaim:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment