Unverified Commit de6fdf0c authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat: update gpt-oss 120b model recipe (#3143)


Signed-off-by: default avatarBiswa Panda <biswa.panda@gmail.com>
parent 162065fe
# GPT-OSS-120B Recipe Guide
This guide will help you run the GPT-OSS-120B language model using Dynamo's optimized setup.
## Prerequisites
Follow the instructions in the recipes [README.md](../README.md) to create a namespace and a Kubernetes secret for your Hugging Face token.
## Quick Start
To run the model, simply execute this command in your terminal:
```bash
cd recipes
./run.sh --model gpt-oss-120b --framework trtllm agg
```
## (Alternative) Step by Step Guide
### 1. Download the Model
```bash
cd recipes/gpt-oss-120b
kubectl apply -n $NAMESPACE -f ./model-cache
```
### 2. Deploy and Benchmark the Model
```bash
cd recipes/gpt-oss-120b
kubectl apply -n $NAMESPACE -f ./trtllm/agg
```
### Container Image
This recipe was tested with the Dynamo TensorRT-LLM runtime container for ARM64 processors.
**Important Note:**
Before the Dynamo v0.5.1 release, the following container image is supported:
```
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
```
After the Dynamo v0.5.1 release, the following container image will be supported:
```
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1
```
## Notes
1. The benchmark container image uses a specific commit of aiperf to ensure reproducible results and compatibility with the benchmarking setup.
2. The storage class is not specified in the recipe; you need to specify it in the `deploy.yaml` file.
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
apiVersion: v1 apiVersion: v1
kind: Service kind: PersistentVolumeClaim
metadata: metadata:
name: gpt-oss-agg-trtllmworker name: model-cache
spec: spec:
selector: accessModes:
nvidia.com/selector: gpt-oss-agg-trtllmworker - ReadWriteMany
ports: resources:
- protocol: TCP requests:
port: 8000 storage: 100Gi
targetPort: 8000 storageClassName: "your-storage-class-name"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# One-shot Job that downloads the pinned openai/gpt-oss-120b revision onto the
# `model-cache` PersistentVolumeClaim.
apiVersion: batch/v1
kind: Job
metadata:
  name: model-download
spec:
  # Retry a failed download up to three times; exactly one pod runs at a time.
  backoffLimit: 3
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: model-download
    spec:
      restartPolicy: Never
      containers:
        - name: model-download
          image: python:3.10-slim
          command: ["sh", "-c"]
          # NOTE(review): presumably this secret supplies the Hugging Face
          # token (e.g. HF_TOKEN) -- confirm against the recipes README.
          envFrom:
            - secretRef:
                name: hf-token-secret
          env:
            - name: MODEL_NAME
              value: openai/gpt-oss-120b
            # HF_HOME is the same path the PVC is mounted at (see volumeMounts),
            # so the Hugging Face cache is written to the persistent volume.
            - name: HF_HOME
              value: /model-store
            - name: HF_HUB_ENABLE_HF_TRANSFER
              value: "1"
            # Pinned revision so every run fetches the same snapshot.
            - name: MODEL_REVISION
              value: b5c939de8f754692c1647ca79fbf85e8c1e70f8a
          args:
            - |
              set -eux
              pip install --no-cache-dir huggingface_hub hf_transfer
              # Variables are quoted so an empty or space-containing value
              # fails loudly instead of being word-split by the shell.
              hf download "$MODEL_NAME" --revision "$MODEL_REVISION" --exclude "original/*" --exclude "metal/*"
          volumeMounts:
            - name: model-cache
              mountPath: /model-store
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache
\ No newline at end of file
...@@ -6,20 +6,12 @@ metadata: ...@@ -6,20 +6,12 @@ metadata:
name: llm-config name: llm-config
data: data:
config.yaml: | config.yaml: |
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: true enable_attention_dp: true
build_config:
max_batch_size: 640
max_num_tokens: 20000
moe_config:
backend: CUTLASS
cuda_graph_config: cuda_graph_config:
max_batch_size: 640 max_batch_size: 800
enable_padding: true enable_padding: true
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.9
enable_block_reuse: false enable_block_reuse: false
print_iter_log: false stream_interval: 20
stream_interval: 50 moe_config:
use_torch_sampler: true backend: CUTLASS
\ No newline at end of file \ No newline at end of file
...@@ -3,16 +3,42 @@ ...@@ -3,16 +3,42 @@
apiVersion: nvidia.com/v1alpha1 apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment kind: DynamoGraphDeployment
metadata: metadata:
name: gpt-oss-agg-shm name: gpt-oss-agg
spec: spec:
backendFramework: trtllm backendFramework: trtllm
pvcs: pvcs:
- name: model-cache-oss-gpt120b - name: model-cache-oss-gpt120b
create: false create: false
services: services:
Frontend:
componentType: frontend
dynamoNamespace: gpt-oss-agg
extraPodSpec:
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-graph-deployment-name
operator: In
values:
- gpt-oss-agg-frontend
topologyKey: kubernetes.io/hostname
mainContainer:
args:
- python3 -m dynamo.frontend --router-mode round-robin --http-port 8000
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
pvc:
create: false
mountPoint: /model-store
name: model-cache
replicas: 18
TrtllmWorker: TrtllmWorker:
componentType: main componentType: main
dynamoNamespace: gpt-oss-agg-shm dynamoNamespace: gpt-oss-agg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache-oss-gpt120b - name: model-cache-oss-gpt120b
...@@ -20,47 +46,55 @@ spec: ...@@ -20,47 +46,55 @@ spec:
sharedMemory: sharedMemory:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
tolerations:
- key: "dedicated"
operator: "Equal"
value: "user-workload"
effect: "NoSchedule"
- key: "dedicated"
operator: "Equal"
value: "user-workload"
effect: "NoExecute"
affinity: affinity:
nodeAffinity: nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution: requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms: nodeSelectorTerms:
- matchExpressions: - matchExpressions:
- key: nvidia.com/gpu.present - key: nvidia.com/gpu.present
operator: In operator: In
values: values:
- "true" - "true"
mainContainer: mainContainer:
args: args:
- | - |
export TRTLLM_ENABLE_PDL=1
export TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True
export ENGINE_ARGS=${AGG_ENGINE_ARGS:-"/root/.cache/huggingface/gpt-oss-120b/config.yaml"}
export MODEL_PATH=${MODEL_PATH:-"/root/.cache/huggingface/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
trap 'echo Cleaning up...; kill 0' EXIT
python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "${MODEL_PATH}" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "openai/gpt-oss-120b" \
--extra-engine-args "$ENGINE_ARGS" \ --extra-engine-args "${ENGINE_ARGS}" \
--max-num-tokens 20000 \ --tensor-parallel-size 4 \
--max-batch-size 640 \ --expert-parallel-size 4 \
--max-batch-size 800 \
--free-gpu-memory-fraction 0.9 --free-gpu-memory-fraction 0.9
command: command:
- /bin/sh - /bin/sh
- -c - -c
image: my-registry/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
env:
  - name: TRTLLM_ENABLE_PDL
    value: "1"
  - name: TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL
    value: "True"
  - name: SERVED_MODEL_NAME
    value: "openai/gpt-oss-120b"
  # Path of the engine config file inside the container; the llm-config
  # volume is mounted at /opt/dynamo/configs.
  - name: ENGINE_ARGS
    value: "/opt/dynamo/configs/config.yaml"
  # The model-download Job sets HF_HOME=/model-store, so the Hugging Face
  # cache places snapshots under /model-store/hub/models--<org>--<name>/.
  # The path must include the `hub/` directory to point at the downloaded
  # snapshot (the benchmark job uses the same `hub/` layout on this PVC).
  - name: MODEL_PATH
    value: "/model-store/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"
volumeMounts:
- mountPath: /opt/dynamo/configs
name: llm-config
readOnly: true
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
replicas: 1 volumes:
- configMap:
name: llm-config
name: llm-config
pvc:
create: false
mountPoint: /model-store
name: model-cache
replicas: 18
resources: resources:
limits: limits:
gpu: "4" gpu: "4"
......
...@@ -11,47 +11,35 @@ spec: ...@@ -11,47 +11,35 @@ spec:
template: template:
metadata: metadata:
labels: labels:
app: oss-gpt120b app: oss-gpt120b-bench
spec: spec:
restartPolicy: Never affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-graph-deployment-name
operator: In
values:
- gpt-oss-agg
topologyKey: kubernetes.io/hostname
containers: containers:
- name: perf - command:
image: my-registry/vllm-runtime:my-tag
workingDir: /workspace/components/backends/vllm
env:
- name: TARGET_MODEL
value: openai/gpt-oss-120b
- name: ENDPOINT
value: gpt-oss-agg-trtllmworker:8000
- name: CONCURRENCIES
value: "13000 13500 1400"
- name: ISL
value: "16"
- name: OSL
value: "1000"
- name: DEPLOYMENT_MODE
value: "agg"
- name: DEPLOYMENT_GPU_COUNT
value: "32"
- name: JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['job-name']
- name: ROOT_ARTIFACT_DIR
value: /root/.cache/huggingface/hub/perf
command:
- /bin/sh - /bin/sh
- -c - -c
- | - |
#TODO: this can be baked into the aiperf image apt-get update && apt-get install -y curl jq procps git && apt-get clean
apt-get update && apt-get install -y curl jq pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
echo "aiperf installation completed";
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200 export COLUMNS=200
EPOCH=$(date +%s) EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap ## utility functions -- can be moved to a bash script / configmap
wait_for_model_ready() { wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..." echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting 5s..." echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
sleep 5 sleep 5
done done
echo "✅ Model '$TARGET_MODEL' is now available!" echo "✅ Model '$TARGET_MODEL' is now available!"
...@@ -65,11 +53,11 @@ spec: ...@@ -65,11 +53,11 @@ spec:
key=concurrency_${concurrency} key=concurrency_${concurrency}
export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}" export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
mkdir -p "$ARTIFACT_DIR" mkdir -p "$ARTIFACT_DIR"
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
aiperf profile --artifact-dir $ARTIFACT_DIR \ aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \ --model $TARGET_MODEL \
--tokenizer ~/.cache/huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \ --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \
--endpoint-type chat \ --endpoint-type chat --endpoint /v1/chat/completions \
--endpoint /v1/chat/completions \
--streaming \ --streaming \
--url http://$ENDPOINT \ --url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \ --synthetic-input-tokens-mean $isl \
...@@ -80,13 +68,14 @@ spec: ...@@ -80,13 +68,14 @@ spec:
--extra-inputs "{\"min_tokens\":$osl}" \ --extra-inputs "{\"min_tokens\":$osl}" \
--extra-inputs "{\"ignore_eos\":true}" \ --extra-inputs "{\"ignore_eos\":true}" \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--extra-inputs "{\"repetition_penalty\":1.0}" \
--extra-inputs "{\"temperature\": 0.0}" \
--concurrency $concurrency \ --concurrency $concurrency \
--request-count $((3*concurrency)) \ --request-count $((10*concurrency)) \
--warmup-request-count $concurrency \ --warmup-request-count $concurrency \
--conversation-num 1 \ --conversation-num 12800 \
--random-seed 100 \ --random-seed 100 \
--request-rate 100000 \ --workers-max 252 \
--workers-max 128 \
-H 'Authorization: Bearer NOT USED' \ -H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream'\ -H 'Accept: text/event-stream'\
--record-processors 32 \ --record-processors 32 \
...@@ -97,10 +86,15 @@ spec: ...@@ -97,10 +86,15 @@ spec:
#### Actual execution #### #### Actual execution ####
wait_for_model_ready wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}" mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json # Write input_config.json
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{ {
"gpu_count": $DEPLOYMENT_GPU_COUNT, "gpu_count": $DEPLOYMENT_GPU_COUNT,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE", "mode": "$DEPLOYMENT_MODE",
"isl": $ISL, "isl": $ISL,
"osl": $OSL, "osl": $OSL,
...@@ -108,16 +102,50 @@ spec: ...@@ -108,16 +102,50 @@ spec:
"model endpoint": "$TARGET_MODEL" "model endpoint": "$TARGET_MODEL"
} }
EOF EOF
# Run perf for each concurrency
for concurrency in $CONCURRENCIES; do # Run perf with calculated total concurrency
run_perf $concurrency $ISL $OSL run_perf $TOTAL_CONCURRENCY $ISL $OSL
sleep 10 echo "done with concurrency $TOTAL_CONCURRENCY"
done env:
- name: TARGET_MODEL
value: openai/gpt-oss-120b
- name: ENDPOINT
value: gpt-oss-agg-frontend:8000
- name: CONCURRENCY_PER_GPU
value: "900"
- name: DEPLOYMENT_GPU_COUNT
value: "72"
- name: ISL
value: "128"
- name: OSL
value: "1000"
- name: DEPLOYMENT_MODE
value: agg
- name: AIPERF_HTTP_CONNECTION_LIMIT
value: "252"
- name: JOB_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['job-name']
- name: ROOT_ARTIFACT_DIR
value: /model-cache/perf
- name: HF_HOME
value: /model-cache
- name: PYTHONUNBUFFERED
value: "1"
image: python:3.12-slim
imagePullPolicy: IfNotPresent
name: perf
securityContext:
privileged: true
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPath: /root/.cache/huggingface mountPath: /model-cache
workingDir: /workspace
imagePullSecrets: imagePullSecrets:
- name: nvcrimagepullsecret - name: nvcrimagepullsecret
restartPolicy: Never
volumes: volumes:
- name: model-cache - name: model-cache
persistentVolumeClaim: persistentVolumeClaim:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment