"examples/vscode:/vscode.git/clone" did not exist on "14eceb43df78d11407df03059f0e857d88c991ea"
Unverified commit bc3456b4, authored by Neal Vaidya, committed by GitHub
Browse files

docs: fix issues in gpt-oss guide (#2304)

parent 20d284e8
......@@ -70,7 +70,7 @@ docker build -f container/Dockerfile.tensorrt_llm_prebuilt . \
```bash
export MODEL_PATH=<LOCAL_MODEL_DIRECTORY>
-huggingface-cli download openai/gpt-oss-120b --include "original/*" --local-dir $MODEL_PATH
+huggingface-cli download openai/gpt-oss-120b --exclude "original/*" --exclude "metal/*" --local-dir $MODEL_PATH
```
### 3. Run the Container
......@@ -84,7 +84,7 @@ docker run \
--rm \
--network host \
--volume $MODEL_PATH:/model \
---volume $PWD:/workspace/dynamo \
+--volume $PWD:/workspace \
--shm-size=10G \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
......@@ -149,7 +149,7 @@ You can use the provided launch script or run the components manually:
#### Option A: Using the Launch Script
```bash
-cd /workspace/dynamo/components/backends/trtllm
+cd /workspace/components/backends/trtllm
./launch/gpt_oss_disagg.sh
```
......@@ -170,7 +170,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
--model-path /model \
---served-model-name gpt-oss-120b \
+--served-model-name openai/gpt-oss-120b \
--extra-engine-args engine_configs/gpt_oss/prefill.yaml \
--disaggregation-mode prefill \
--disaggregation-strategy prefill_first \
......@@ -185,7 +185,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
```bash
CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
--model-path /model \
---served-model-name gpt-oss-120b \
+--served-model-name openai/gpt-oss-120b \
--extra-engine-args engine_configs/gpt_oss/decode.yaml \
--disaggregation-mode decode \
--disaggregation-strategy prefill_first \
......@@ -204,7 +204,7 @@ Send a test request to verify the deployment:
curl -X POST http://localhost:8000/v1/responses \
-H "Content-Type: application/json" \
-d '{
-"model": "gpt-oss-120b",
+"model": "openai/gpt-oss-120b",
"input": "Explain the concept of disaggregated serving in LLM inference in 3 sentences.",
"max_output_tokens": 200,
"stream": false
......@@ -227,7 +227,7 @@ mkdir -p /tmp/benchmark-results
# Run the benchmark - this command tests the deployment with high-concurrency synthetic workload
genai-perf profile \
---model gpt-oss-120b \
+--model openai/gpt-oss-120b \
--tokenizer /model \
--endpoint-type chat \
--endpoint /v1/chat/completions \
......
......@@ -4,7 +4,7 @@
# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"/model"}
-export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"gpt-oss-120b"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/gpt_oss/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/gpt_oss/decode.yaml"}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment