Unverified Commit de6fdf0c authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat: update gpt-oss 120b model recipe (#3143)


Signed-off-by: default avatarBiswa Panda <biswa.panda@gmail.com>
parent 162065fe
# GPT-OSS-120B Recipe Guide
This guide will help you run the GPT-OSS-120B language model using Dynamo's optimized setup.
## Prerequisites
Follow the instructions in the recipes [README.md](../README.md) to create a namespace and a Kubernetes secret for your Hugging Face token.
## Quick Start
To run the model, simply execute this command in your terminal:
```bash
cd recipes
./run.sh --model gpt-oss-120b --framework trtllm agg
```
## (Alternative) Step by Step Guide
### 1. Download the Model
```bash
cd recipes/gpt-oss-120b
kubectl apply -n $NAMESPACE -f ./model-cache
```
### 2. Deploy and Benchmark the Model
```bash
cd recipes/gpt-oss-120b
kubectl apply -n $NAMESPACE -f ./trtllm/agg
```
### Container Image
This recipe was tested with the Dynamo TensorRT-LLM runtime container for ARM64 processors.
**Important Note:**
Before the Dynamo v0.5.1 release, the following container image is supported:
```
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
```
After the Dynamo v0.5.1 release, the following container image will be supported:
```
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1
```
## Notes
1. The benchmark container image uses a specific commit of aiperf to ensure reproducible results and compatibility with the benchmarking setup.
2. The storage class is not specified in the recipe; you need to specify it in the `deploy.yaml` file.
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# PersistentVolumeClaim that holds the downloaded model weights.
# The claim name `model-cache` is what the model-download Job references
# through `persistentVolumeClaim.claimName`.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-cache
spec:
  # ReadWriteMany: the volume is mounted by more than one pod
  # (the download Job and the serving/benchmark workloads).
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 100Gi
  # Placeholder value: replace with a storage class that exists in your
  # cluster before applying this manifest.
  storageClassName: "your-storage-class-name"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# One-shot Job that downloads the openai/gpt-oss-120b weights from the
# Hugging Face Hub into the `model-cache` PersistentVolumeClaim.
apiVersion: batch/v1
kind: Job
metadata:
  name: model-download
spec:
  # Retry a failed download pod at most three times; run exactly one pod
  # to completion.
  backoffLimit: 3
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: model-download
    spec:
      restartPolicy: Never
      containers:
        - name: model-download
          image: python:3.10-slim
          command: ["sh", "-c"]
          # Environment is also populated from the `hf-token-secret` secret.
          envFrom:
            - secretRef:
                name: hf-token-secret
          env:
            - name: MODEL_NAME
              value: openai/gpt-oss-120b
            # HF_HOME equals the PVC mount path below, so everything the
            # Hugging Face tooling caches is written onto the volume.
            - name: HF_HOME
              value: /model-store
            # Use the hf_transfer backend (installed in the script below).
            - name: HF_HUB_ENABLE_HF_TRANSFER
              value: "1"
            # Pinned revision so every run fetches the same snapshot.
            - name: MODEL_REVISION
              value: "b5c939de8f754692c1647ca79fbf85e8c1e70f8a"
          args:
            - |
              set -eux
              pip install --no-cache-dir huggingface_hub hf_transfer
              hf download "$MODEL_NAME" --revision "$MODEL_REVISION" --exclude "original/*" --exclude "metal/*"
          volumeMounts:
            - name: model-cache
              mountPath: /model-store
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
\ No newline at end of file
......@@ -6,20 +6,12 @@ metadata:
name: llm-config
data:
config.yaml: |
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: true
build_config:
max_batch_size: 640
max_num_tokens: 20000
moe_config:
backend: CUTLASS
cuda_graph_config:
max_batch_size: 640
max_batch_size: 800
enable_padding: true
kv_cache_config:
free_gpu_memory_fraction: 0.9
enable_block_reuse: false
print_iter_log: false
stream_interval: 50
use_torch_sampler: true
\ No newline at end of file
stream_interval: 20
moe_config:
backend: CUTLASS
\ No newline at end of file
......@@ -3,16 +3,42 @@
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: gpt-oss-agg-shm
name: gpt-oss-agg
spec:
backendFramework: trtllm
pvcs:
- name: model-cache-oss-gpt120b
create: false
services:
Frontend:
componentType: frontend
dynamoNamespace: gpt-oss-agg
extraPodSpec:
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-graph-deployment-name
operator: In
values:
- gpt-oss-agg-frontend
topologyKey: kubernetes.io/hostname
mainContainer:
args:
- python3 -m dynamo.frontend --router-mode round-robin --http-port 8000
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
pvc:
create: false
mountPoint: /model-store
name: model-cache
replicas: 18
TrtllmWorker:
componentType: main
dynamoNamespace: gpt-oss-agg-shm
dynamoNamespace: gpt-oss-agg
envFromSecret: hf-token-secret
volumeMounts:
- name: model-cache-oss-gpt120b
......@@ -20,15 +46,6 @@ spec:
sharedMemory:
size: 80Gi
extraPodSpec:
tolerations:
- key: "dedicated"
operator: "Equal"
value: "user-workload"
effect: "NoSchedule"
- key: "dedicated"
operator: "Equal"
value: "user-workload"
effect: "NoExecute"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
......@@ -41,26 +58,43 @@ spec:
mainContainer:
args:
- |
export TRTLLM_ENABLE_PDL=1
export TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True
export ENGINE_ARGS=${AGG_ENGINE_ARGS:-"/root/.cache/huggingface/gpt-oss-120b/config.yaml"}
export MODEL_PATH=${MODEL_PATH:-"/root/.cache/huggingface/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
trap 'echo Cleaning up...; kill 0' EXIT
python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$ENGINE_ARGS" \
--max-num-tokens 20000 \
--max-batch-size 640 \
--model-path "${MODEL_PATH}" \
--served-model-name "openai/gpt-oss-120b" \
--extra-engine-args "${ENGINE_ARGS}" \
--tensor-parallel-size 4 \
--expert-parallel-size 4 \
--max-batch-size 800 \
--free-gpu-memory-fraction 0.9
command:
- /bin/sh
- -c
image: my-registry/vllm-runtime:my-tag
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
env:
- name: TRTLLM_ENABLE_PDL
value: "1"
- name: TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL
value: "True"
- name: SERVED_MODEL_NAME
value: "openai/gpt-oss-120b"
- name: ENGINE_ARGS
value: "/opt/dynamo/configs/config.yaml"
# The model-download Job sets HF_HOME=/model-store, so the Hugging Face hub
# cache (and this snapshot) lives under /model-store/hub/ on the shared PVC.
# The benchmark job's tokenizer path uses the same hub/ layout.
- name: MODEL_PATH
value: "/model-store/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"
volumeMounts:
- mountPath: /opt/dynamo/configs
name: llm-config
readOnly: true
workingDir: /workspace/components/backends/trtllm
replicas: 1
volumes:
- configMap:
name: llm-config
name: llm-config
pvc:
create: false
mountPoint: /model-store
name: model-cache
replicas: 18
resources:
limits:
gpu: "4"
......
......@@ -11,47 +11,35 @@ spec:
template:
metadata:
labels:
app: oss-gpt120b
app: oss-gpt120b-bench
spec:
restartPolicy: Never
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-graph-deployment-name
operator: In
values:
- gpt-oss-agg
topologyKey: kubernetes.io/hostname
containers:
- name: perf
image: my-registry/vllm-runtime:my-tag
workingDir: /workspace/components/backends/vllm
env:
- name: TARGET_MODEL
value: openai/gpt-oss-120b
- name: ENDPOINT
value: gpt-oss-agg-trtllmworker:8000
- name: CONCURRENCIES
value: "13000 13500 1400"
- name: ISL
value: "16"
- name: OSL
value: "1000"
- name: DEPLOYMENT_MODE
value: "agg"
- name: DEPLOYMENT_GPU_COUNT
value: "32"
- name: JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['job-name']
- name: ROOT_ARTIFACT_DIR
value: /root/.cache/huggingface/hub/perf
command:
- command:
- /bin/sh
- -c
- |
#TODO: this can be baked into the aiperf image
apt-get update && apt-get install -y curl jq
apt-get update && apt-get install -y curl jq procps git && apt-get clean
pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
echo "aiperf installation completed";
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting 5s..."
echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
sleep 5
done
echo "✅ Model '$TARGET_MODEL' is now available!"
......@@ -65,11 +53,11 @@ spec:
key=concurrency_${concurrency}
export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
mkdir -p "$ARTIFACT_DIR"
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \
--tokenizer ~/.cache/huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \
--endpoint-type chat \
--endpoint /v1/chat/completions \
--tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \
--endpoint-type chat --endpoint /v1/chat/completions \
--streaming \
--url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \
......@@ -80,13 +68,14 @@ spec:
--extra-inputs "{\"min_tokens\":$osl}" \
--extra-inputs "{\"ignore_eos\":true}" \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--extra-inputs "{\"repetition_penalty\":1.0}" \
--extra-inputs "{\"temperature\": 0.0}" \
--concurrency $concurrency \
--request-count $((3*concurrency)) \
--request-count $((10*concurrency)) \
--warmup-request-count $concurrency \
--conversation-num 1 \
--conversation-num 12800 \
--random-seed 100 \
--request-rate 100000 \
--workers-max 128 \
--workers-max 252 \
-H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream'\
--record-processors 32 \
......@@ -97,10 +86,15 @@ spec:
#### Actual execution ####
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
......@@ -108,16 +102,50 @@ spec:
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf for each concurrency
for concurrency in $CONCURRENCIES; do
run_perf $concurrency $ISL $OSL
sleep 10
done
# Run perf with calculated total concurrency
run_perf $TOTAL_CONCURRENCY $ISL $OSL
echo "done with concurrency $TOTAL_CONCURRENCY"
env:
- name: TARGET_MODEL
value: openai/gpt-oss-120b
- name: ENDPOINT
value: gpt-oss-agg-frontend:8000
- name: CONCURRENCY_PER_GPU
value: "900"
- name: DEPLOYMENT_GPU_COUNT
value: "72"
- name: ISL
value: "128"
- name: OSL
value: "1000"
- name: DEPLOYMENT_MODE
value: agg
- name: AIPERF_HTTP_CONNECTION_LIMIT
value: "252"
- name: JOB_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['job-name']
- name: ROOT_ARTIFACT_DIR
value: /model-cache/perf
- name: HF_HOME
value: /model-cache
- name: PYTHONUNBUFFERED
value: "1"
image: python:3.12-slim
imagePullPolicy: IfNotPresent
name: perf
securityContext:
privileged: true
volumeMounts:
- name: model-cache
mountPath: /root/.cache/huggingface
mountPath: /model-cache
workingDir: /workspace
imagePullSecrets:
- name: nvcrimagepullsecret
restartPolicy: Never
volumes:
- name: model-cache
persistentVolumeClaim:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment