Unverified Commit 03bdced9 authored by Anant Sharma's avatar Anant Sharma Committed by GitHub
Browse files

refactor: remove sglang config and util files from components (#3467)


Signed-off-by: default avatarAnant Sharma <anants@nvidia.com>
parent 51f65757
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Print the command-line synopsis to stdout and terminate the script.
# Arguments: none
# Outputs:   four help lines describing <ip>, [port] and --type
# Returns:   never returns; always exits with status 1
usage() {
  printf '%s\n' \
    "Usage: $0 <ip> [port] [--type e2e|custom_completions|warmup]" \
    " ip: server IP address" \
    " port: server port (defaults to 8000)" \
    " --type: endpoint type - 'e2e' for chat completions, 'custom_completions' for completions, 'warmup' for warmup phases"
  exit 1
}
#######################################
# Parse the command line into the globals IP, PORT and TYPE.
# Globals:   IP, PORT, TYPE (written)
# Arguments: $1       - server IP address (required)
#            $2       - optional port; recognised only when it is all digits
#            --type V - one of e2e | custom_completions | warmup
# Returns:   0 on success; calls usage (which exits 1) on any invalid input
#######################################
parse_args() {
  if [[ $# -lt 1 ]]; then
    usage
  fi

  IP=$1
  PORT=8000
  TYPE="e2e"
  shift

  # An all-digit argument right after the IP is the port; anything else is
  # left in place for the option loop below.
  if [[ $# -gt 0 && $1 =~ ^[0-9]+$ ]]; then
    PORT=$1
    shift
  fi

  # Parse remaining arguments
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --type)
        # `shift 2` with a single remaining argument fails and leaves the
        # positional parameters untouched, which would make this loop spin
        # forever. Reject a missing value explicitly instead.
        if [[ $# -lt 2 ]]; then
          echo "Error: --type requires a value"
          usage
        fi
        TYPE=$2
        shift 2
        ;;
      *)
        usage
        ;;
    esac
  done

  if [[ "$TYPE" != "e2e" && "$TYPE" != "custom_completions" && "$TYPE" != "warmup" ]]; then
    echo "Error: --type must be 'e2e', 'custom_completions', or 'warmup'"
    usage
  fi
}

parse_args "$@"
MODEL="deepseek-ai/DeepSeek-R1"
ARTIFACT_DIR="/benchmarks/"

#######################################
# Run one streaming `genai-perf profile` against /v1/chat/completions using
# synthetic prompts with zero length variance.
# Globals:   MODEL, IP, PORT, ARTIFACT_DIR (read)
# Arguments: $1 - mean synthetic input tokens
#            $2 - output tokens (also sent as max_tokens / min_tokens)
#            $3 - concurrency (also used for --max-threads)
#            $4 - request count
#            $5 - optional warmup request count; the flag is omitted when empty
# Returns:   exit status of genai-perf
#######################################
run_chat_profile() {
  local isl=$1 osl=$2 concurrency=$3 request_count=$4 warmup_count=${5:-}
  local -a warmup_args=()
  if [[ -n "$warmup_count" ]]; then
    warmup_args=(--warmup-request-count "$warmup_count")
  fi

  # The dataset is sized at 12x the concurrency for every chat run.
  genai-perf profile \
    --model "${MODEL}" \
    --tokenizer "${MODEL}" \
    --endpoint-type chat \
    --endpoint /v1/chat/completions \
    --streaming \
    --url "${IP}:${PORT}" \
    --synthetic-input-tokens-mean "${isl}" \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean "${osl}" \
    --output-tokens-stddev 0 \
    --extra-inputs "max_tokens:${osl}" \
    --extra-inputs "min_tokens:${osl}" \
    --extra-inputs ignore_eos:true \
    --extra-inputs '{"nvext":{"ignore_eos":true}}' \
    --concurrency "${concurrency}" \
    --request-count "${request_count}" \
    "${warmup_args[@]}" \
    --num-dataset-entries "$((concurrency * 12))" \
    --random-seed 100 \
    --artifact-dir "${ARTIFACT_DIR}" \
    -- \
    -v \
    --max-threads "${concurrency}" \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream'
}

#######################################
# E2E chat completions sweep: ISL 8000 / OSL 256 across a fixed list of
# concurrency levels, issuing 10 requests per unit of concurrency.
#######################################
run_e2e() {
  local -r isl=8000 osl=256
  local -a levels=(1 2 4 16 64 256 512 1024 2048 4096 8192)
  local concurrency

  for concurrency in "${levels[@]}"; do
    echo "Run e2e concurrency: $concurrency"
    run_chat_profile "$isl" "$osl" "$concurrency" "$((concurrency * 10))"
  done
}

#######################################
# Warmup: three phases of short chat runs. Each run issues `concurrency`
# requests plus `concurrency` warmup requests, then pauses for 5 seconds.
#######################################
run_warmup() {
  echo "Starting warmup phases..."

  # Phase configurations: "ISL OSL CONCURRENCY_LIST"
  local -a phases=(
    "500 100 1,2,4,8"
    "2000 100 1,2,4,8"
    "4000 256 1,2,8,64"
  )
  local idx phase_num isl osl concurrency_list concurrency
  local -a levels

  for idx in "${!phases[@]}"; do
    phase_num=$((idx + 1))
    read -r isl osl concurrency_list <<< "${phases[idx]}"
    echo "Phase $phase_num: ISL=$isl, OSL=$osl"

    # Convert comma-separated list to array
    IFS=',' read -ra levels <<< "$concurrency_list"
    for concurrency in "${levels[@]}"; do
      echo "Run warmup phase $phase_num, concurrency: $concurrency, ISL: $isl, OSL: $osl"
      run_chat_profile "$isl" "$osl" "$concurrency" "$concurrency" "$concurrency"
      echo "Sleeping for 5 seconds..."
      sleep 5
    done
    echo "Phase $phase_num complete"
  done
}

#######################################
# Custom completions: replay prompts from data.jsonl (relative to the current
# directory) against the completions endpoint at a single concurrency level.
# Globals:   MODEL, IP, PORT, ARTIFACT_DIR (read)
#######################################
run_custom_completions() {
  local -r osl=5 input_file=data.jsonl
  local -a levels=(8192)
  local concurrency

  for concurrency in "${levels[@]}"; do
    echo "Run custom_completions concurrency: $concurrency"
    # NOTE(review): this branch passes --warmup-requests while the warmup
    # branch passes --warmup-request-count; kept as-is, confirm against the
    # installed genai-perf version.
    genai-perf profile \
      --model "${MODEL}" \
      --tokenizer "${MODEL}" \
      --endpoint-type completions \
      --streaming \
      --url "${IP}:${PORT}" \
      --input-file "${input_file}" \
      --extra-inputs "max_tokens:${osl}" \
      --extra-inputs "min_tokens:${osl}" \
      --extra-inputs ignore_eos:true \
      --concurrency "${concurrency}" \
      --request-count "${concurrency}" \
      --random-seed 100 \
      --artifact-dir "${ARTIFACT_DIR}" \
      --warmup-requests 10 \
      -- \
      -v \
      --max-threads 256 \
      -H 'Authorization: Bearer NOT USED' \
      -H 'Accept: text/event-stream'
  done
}

# Any TYPE other than e2e / warmup falls through to custom completions.
case "$TYPE" in
  e2e) run_e2e ;;
  warmup) run_warmup ;;
  *) run_custom_completions ;;
esac
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import random
import numpy as np
from sglang.bench_serving import sample_random_requests
from transformers import AutoTokenizer, PreTrainedTokenizerBase
"""
Helper script that uses SGLang's random request generator to sample ShareGPT data
and then converts it to a jsonl file that can be used by GenAI perf for benchmarking
Example usage:
python3 generate_bench_data.py --model deepseek-ai/DeepSeek-R1 --output data.jsonl
"""
def main():
    """Sample prompts with SGLang's random request generator and dump them as JSONL.

    Parses the command line, seeds both `random` and `numpy.random` with
    --random-seed, loads the tokenizer named by --model, calls
    `sample_random_requests`, and writes one JSON object per line to --output
    with the keys "text" and "output_length".
    """
    cli = argparse.ArgumentParser(
        description="Use sglang.sample_random_requests to generate token-based JSONL for GenAI-Perf"
    )
    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = [
        (
            "--dataset-path",
            dict(type=str, default="", help="Path or URL to ShareGPT JSON"),
        ),
        (
            "--output",
            dict(type=str, required=True, help="Output JSONL filename"),
        ),
        (
            "--model",
            dict(
                type=str,
                required=True,
                help="Model identifier for payloads and tokenizer name",
            ),
        ),
        (
            "--num-prompts",
            dict(type=int, default=8192, help="Total number of samples"),
        ),
        (
            "--input-len",
            dict(type=int, default=4096, help="Target input token length"),
        ),
        (
            "--output-len",
            dict(type=int, default=5, help="Target output token length"),
        ),
        (
            "--range-ratio",
            dict(type=float, default=1.0, help="Sampling length range ratio"),
        ),
        (
            "--random-seed",
            dict(type=int, default=1, help="Random seed for reproducibility"),
        ),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    # Seed both RNGs with the same user-supplied value.
    seed = opts.random_seed
    random.seed(seed)
    np.random.seed(seed)

    tok: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(
        opts.model, trust_remote_code=True
    )
    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # this is what SGL uses in their benchmarking
    # https://github.com/sgl-project/sglang/blob/b783c1cb829ec451639d1a3ce68380fb7a7be4a3/python/sglang/bench_one_batch_server.py#L131
    # We return text instead of returning raw tokens as GenAI Perf expects text during benchmarking
    sampled = sample_random_requests(
        input_len=opts.input_len,
        output_len=opts.output_len,
        num_prompts=opts.num_prompts,
        range_ratio=opts.range_ratio,
        tokenizer=tok,
        dataset_path=opts.dataset_path,
        random_sample=True,
        return_text=True,
    )

    # genai-perf expects one {"text", "output_length"} object per line
    with open(opts.output, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps({"text": req.prompt, "output_length": req.output_len}) + "\n"
            for req in sampled
        )


if __name__ == "__main__":
    main()
...@@ -56,13 +56,7 @@ docker run \ ...@@ -56,13 +56,7 @@ docker run \
dynamo-wideep-gb200:latest dynamo-wideep-gb200:latest
``` ```
3. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. 3. Run the ingress and prefill worker
```bash
./utils/gen_env_vars.sh
```
4. Run the ingress and prefill worker
```bash ```bash
# run ingress # run ingress
...@@ -115,7 +109,7 @@ python3 -m dynamo.sglang \ ...@@ -115,7 +109,7 @@ python3 -m dynamo.sglang \
--log-level debug --log-level debug
``` ```
5. Run the decode worker on the head decode node 4. Run the decode worker on the head decode node
```bash ```bash
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \
......
...@@ -41,13 +41,7 @@ docker run \ ...@@ -41,13 +41,7 @@ docker run \
In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory. In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory.
3. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. 3. Run the ingress and prefill worker
```bash
./utils/gen_env_vars.sh
```
4. Run the ingress and prefill worker
```bash ```bash
# run ingress # run ingress
...@@ -87,7 +81,7 @@ python3 -m dynamo.sglang \ ...@@ -87,7 +81,7 @@ python3 -m dynamo.sglang \
On the other prefill nodes (since this example has 4 total prefill nodes), run the same command but change `--node-rank` to 1, 2, and 3 On the other prefill nodes (since this example has 4 total prefill nodes), run the same command but change `--node-rank` to 1, 2, and 3
5. Run the decode worker on the head decode node 4. Run the decode worker on the head decode node
```bash ```bash
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
......
...@@ -18,12 +18,7 @@ docker build -f container/Dockerfile.sglang-wideep . -t dynamo-wideep --no-cache ...@@ -18,12 +18,7 @@ docker build -f container/Dockerfile.sglang-wideep . -t dynamo-wideep --no-cache
You can use a specific tag from the [lmsys dockerhub](https://hub.docker.com/r/lmsysorg/sglang/tags) by adding `--build-arg SGLANG_IMAGE_TAG=<tag>` to the build command. You can use a specific tag from the [lmsys dockerhub](https://hub.docker.com/r/lmsysorg/sglang/tags) by adding `--build-arg SGLANG_IMAGE_TAG=<tag>` to the build command.
**Step 1**: Use the provided helper script to generate commands to start NATS/ETCD on your head prefill node. This script will also give you environment variables to export on each other node. You will need the IP addresses of your head prefill and head decode node to run this script. **Step 1**: Ensure that your configuration file has the required arguments. Here's an example configuration that runs prefill and the model in TP16:
```bash
./utils/gen_env_vars.sh
```
**Step 2**: Ensure that your configuration file has the required arguments. Here's an example configuration that runs prefill and the model in TP16:
Node 1: Run HTTP ingress, processor, and 8 shards of the prefill worker Node 1: Run HTTP ingress, processor, and 8 shards of the prefill worker
```bash ```bash
...@@ -104,7 +99,7 @@ python3 -m dynamo.sglang \ ...@@ -104,7 +99,7 @@ python3 -m dynamo.sglang \
--mem-fraction-static 0.82 --mem-fraction-static 0.82
``` ```
**Step 3**: Run inference **Step 2**: Run inference
SGLang typically requires a warmup period to ensure the DeepGEMM kernels are loaded. We recommend running a few warmup requests and ensuring that the DeepGEMM kernels load in. SGLang typically requires a warmup period to ensure the DeepGEMM kernels are loaded. We recommend running a few warmup requests and ensuring that the DeepGEMM kernels load in.
```bash ```bash
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Helper script to generate environment variables for each node during a multinode SGLang deployment
# Print the step-by-step instructions before prompting.
printf '%s\n' \
  "=== USAGE ===" \
  "1. Run this script: ./gen_env_vars.sh" \
  "2. Enter the IP addresses when prompted" \
  "3. Copy the commands for the head prefill node and run them" \
  "4. Copy the commands for all other nodes and run them on each node" \
  "5. Proceed with starting your prefill and decode workers" \
  ""

# Prompt for IP addresses (-r keeps any backslash in the typed input literal)
read -r -p "Enter HEAD_PREFILL_NODE IP: " HEAD_PREFILL_NODE
read -r -p "Enter HEAD_DECODE_NODE IP: " HEAD_DECODE_NODE

# Validate inputs: both values must be non-empty; diagnostics go to stderr
if [[ -z "$HEAD_PREFILL_NODE" || -z "$HEAD_DECODE_NODE" ]]; then
  echo "Error: Both IP addresses are required" >&2
  exit 1
fi

# Commands for the head prefill node: start nats-server and etcd in the
# background, then export both head-node IPs.
printf '%s\n' \
  "=== HEAD PREFILL NODE ($HEAD_PREFILL_NODE) ===" \
  "Run all of these commands on the head prefill node:" \
  "" \
  "nats-server -js &" \
  "etcd --listen-client-urls http://0.0.0.0:2379 \\" \
  " --advertise-client-urls http://0.0.0.0:2379 \\" \
  " --listen-peer-urls http://0.0.0.0:2380 \\" \
  " --initial-cluster default=http://$HEAD_PREFILL_NODE:2380 &" \
  "export HEAD_PREFILL_NODE_IP=$HEAD_PREFILL_NODE" \
  "export HEAD_DECODE_NODE_IP=$HEAD_DECODE_NODE" \
  ""

# Commands for every other node: point NATS and etcd at the head prefill
# node and export the same two head-node IPs.
printf '%s\n' \
  "=== ALL OTHER NODES ===" \
  "Run these commands on all other nodes (prefill and decode):" \
  "" \
  "# Export environment variables" \
  "export NATS_SERVER=nats://$HEAD_PREFILL_NODE:4222" \
  "export ETCD_ENDPOINTS=http://$HEAD_PREFILL_NODE:2379" \
  "export HEAD_PREFILL_NODE_IP=$HEAD_PREFILL_NODE" \
  "export HEAD_DECODE_NODE_IP=$HEAD_DECODE_NODE"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment