refactor: remove sglang config and util files from components (#3467)

Signed-off-by: Anant Sharma <anants@nvidia.com>

refactor: remove sglang config and util files from components (#3467)
Signed-off-by: Anant Sharma <anants@nvidia.com>
03bdced9 · Anant Sharma · GitHub · 51f65757 · 51f65757 · 51f65757
Unverified Commit 03bdced9 authored Oct 09, 2025 by Anant Sharma Committed by GitHub Oct 09, 2025
7 changed files
--- a/components/backends/sglang/benchmarks/bench.sh
+++ b/components/backends/sglang/benchmarks/bench.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-usage() {
-    echo "Usage: $0 <ip> [port] [--type e2e|custom_completions|warmup]"
-    echo "  ip: server IP address"
-    echo "  port: server port (defaults to 8000)"
-    echo "  --type: endpoint type - 'e2e' for chat completions, 'custom_completions' for completions, 'warmup' for warmup phases"
-    exit 1
-}
-if [ $# -lt 1 ]; then
-    usage
-fi
-IP=$1
-PORT=8000
-TYPE="e2e"
-# Check if second argument is a port number or an option
-if [[ $# -gt 1 && $2 =~ ^[0-9]+$ ]]; then
-    PORT=$2
-    shift 2
-else
-    shift 1
-fi
-# Parse remaining arguments
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --type)
-            TYPE="$2"
-            shift 2
-            ;;
-        *)
-            usage
-            ;;
-    esac
-done
-if [[ "$TYPE" != "e2e" && "$TYPE" != "custom_completions" && "$TYPE" != "warmup" ]]; then
-    echo "Error: --type must be 'e2e', 'custom_completions', or 'warmup'"
-    usage
-fi
-MODEL="deepseek-ai/DeepSeek-R1"
-ARTIFACT_DIR="/benchmarks/"
-if [[ "$TYPE" == "e2e" ]]; then
-    # E2E chat completions configuration
-    ISL=8000
-    OSL=256
-    CONCURRENCY_ARRAY=(1 2 4 16 64 256 512 1024 2048 4096 8192)
-    for concurrency in "${CONCURRENCY_ARRAY[@]}"; do
-        echo "Run e2e concurrency: $concurrency"
-        genai-perf profile \
-            --model ${MODEL} \
-            --tokenizer ${MODEL} \
-            --endpoint-type chat \
-            --endpoint /v1/chat/completions \
-            --streaming \
-            --url ${IP}:${PORT} \
-            --synthetic-input-tokens-mean ${ISL} \
-            --synthetic-input-tokens-stddev 0 \
-            --output-tokens-mean ${OSL} \
-            --output-tokens-stddev 0 \
-            --extra-inputs max_tokens:${OSL} \
-            --extra-inputs min_tokens:${OSL} \
-            --extra-inputs ignore_eos:true \
-            --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
-            --concurrency ${concurrency} \
-            --request-count $(($concurrency*10)) \
-            --num-dataset-entries $(($concurrency*12)) \
-            --random-seed 100 \
-            --artifact-dir ${ARTIFACT_DIR} \
-            -- \
-            -v \
-            --max-threads ${concurrency} \
-            -H 'Authorization: Bearer NOT USED' \
-            -H 'Accept: text/event-stream'
-    done
-elif [[ "$TYPE" == "warmup" ]]; then
-    echo "Starting warmup phases..."
-    # Phase configurations: "ISL OSL CONCURRENCY_LIST"
-    PHASES=(
-        "500 100 1,2,4,8"
-        "2000 100 1,2,4,8"
-        "4000 256 1,2,8,64"
-    )
-    for i in "${!PHASES[@]}"; do
-        phase_num=$((i + 1))
-        phase_config=(${PHASES[$i]})
-        ISL=${phase_config[0]}
-        OSL=${phase_config[1]}
-        concurrency_list=${phase_config[2]}
-        echo "Phase $phase_num: ISL=$ISL, OSL=$OSL"
-        # Convert comma-separated list to array
-        IFS=',' read -ra CONCURRENCY_ARRAY <<< "$concurrency_list"
-        for concurrency in "${CONCURRENCY_ARRAY[@]}"; do
-            echo "Run warmup phase $phase_num, concurrency: $concurrency, ISL: $ISL, OSL: $OSL"
-            genai-perf profile \
-                --model ${MODEL} \
-                --tokenizer ${MODEL} \
-                --endpoint-type chat \
-                --endpoint /v1/chat/completions \
-                --streaming \
-                --url ${IP}:${PORT} \
-                --synthetic-input-tokens-mean ${ISL} \
-                --synthetic-input-tokens-stddev 0 \
-                --output-tokens-mean ${OSL} \
-                --output-tokens-stddev 0 \
-                --extra-inputs max_tokens:${OSL} \
-                --extra-inputs min_tokens:${OSL} \
-                --extra-inputs ignore_eos:true \
-                --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
-                --concurrency ${concurrency} \
-                --request-count $(($concurrency)) \
-                --warmup-request-count $(($concurrency)) \
-                --num-dataset-entries $(($concurrency*12)) \
-                --random-seed 100 \
-                --artifact-dir ${ARTIFACT_DIR} \
-                -- \
-                -v \
-                --max-threads ${concurrency} \
-                -H 'Authorization: Bearer NOT USED' \
-                -H 'Accept: text/event-stream'
-            echo "Sleeping for 5 seconds..."
-            sleep 5
-        done
-        echo "Phase $phase_num complete"
-    done
-else
-    # Custom completions configuration
-    OSL=5
-    INPUT_FILE=data.jsonl
-    CONCURRENCY_ARRAY=(8192)
-    for concurrency in "${CONCURRENCY_ARRAY[@]}"; do
-        echo "Run custom_completions concurrency: $concurrency"
-        genai-perf profile \
-            --model ${MODEL} \
-            --tokenizer ${MODEL} \
-            --endpoint-type completions \
-            --streaming \
-            --url ${IP}:${PORT} \
-            --input-file ${INPUT_FILE} \
-            --extra-inputs max_tokens:${OSL} \
-            --extra-inputs min_tokens:${OSL} \
-            --extra-inputs ignore_eos:true \
-            --concurrency ${concurrency} \
-            --request-count ${concurrency} \
-            --random-seed 100 \
-            --artifact-dir ${ARTIFACT_DIR} \
-            --warmup-requests 10 \
-            -- \
-            -v \
-            --max-threads 256 \
-            -H 'Authorization: Bearer NOT USED' \
-            -H 'Accept: text/event-stream'
-    done
-fi
\ No newline at end of file
--- a/components/backends/sglang/benchmarks/generate_bench_data.py
+++ b/components/backends/sglang/benchmarks/generate_bench_data.py
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import argparse
-import json
-import random
-import numpy as np
-from sglang.bench_serving import sample_random_requests
-from transformers import AutoTokenizer, PreTrainedTokenizerBase
-"""
-Helper script that uses SGLang's random request generator to sample ShareGPT data
-and then converts it to a jsonl file that can be used by GenAI perf for benchmarking
-Example usage:
-python3 generate_bench_data.py --model deepseek-ai/DeepSeek-R1 --output data.jsonl
-"""
-def main():
-    parser = argparse.ArgumentParser(
-        description="Use sglang.sample_random_requests to generate token-based JSONL for GenAI-Perf"
-    )
-    parser.add_argument(
-        "--dataset-path", type=str, default="", help="Path or URL to ShareGPT JSON"
-    )
-    parser.add_argument(
-        "--output", type=str, required=True, help="Output JSONL filename"
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        required=True,
-        help="Model identifier for payloads and tokenizer name",
-    )
-    parser.add_argument(
-        "--num-prompts", type=int, default=8192, help="Total number of samples"
-    )
-    parser.add_argument(
-        "--input-len", type=int, default=4096, help="Target input token length"
-    )
-    parser.add_argument(
-        "--output-len", type=int, default=5, help="Target output token length"
-    )
-    parser.add_argument(
-        "--range-ratio", type=float, default=1.0, help="Sampling length range ratio"
-    )
-    parser.add_argument(
-        "--random-seed", type=int, default=1, help="Random seed for reproducibility"
-    )
-    args = parser.parse_args()
-    random.seed(args.random_seed)
-    np.random.seed(args.random_seed)
-    tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(
-        args.model, trust_remote_code=True
-    )
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    # this is what SGL uses in their benchmarking
-    # https://github.com/sgl-project/sglang/blob/b783c1cb829ec451639d1a3ce68380fb7a7be4a3/python/sglang/bench_one_batch_server.py#L131
-    # We return text instead of returning raw tokens as GenAI Perf expects text during benchmarking
-    samples = sample_random_requests(
-        input_len=args.input_len,
-        output_len=args.output_len,
-        num_prompts=args.num_prompts,
-        range_ratio=args.range_ratio,
-        tokenizer=tokenizer,
-        dataset_path=args.dataset_path,
-        random_sample=True,
-        return_text=True,
-    )
-    with open(args.output, "w", encoding="utf-8") as fout:
-        for row in samples:
-            # genai-perf expects this format
-            payload = {
-                "text": row.prompt,
-                "output_length": row.output_len,
-            }
-            fout.write(json.dumps(payload) + "\n")
-if __name__ == "__main__":
-    main()
--- a/components/backends/sglang/docs/dsr1-wideep-gb200.md
+++ b/components/backends/sglang/docs/dsr1-wideep-gb200.md
@@ -56,13 +56,7 @@ docker run \
    dynamo-wideep-gb200:latest
 ```
-3. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier.
+3. Run the ingress and prefill worker
-```bash
-./utils/gen_env_vars.sh
-```
-4. Run the ingress and prefill worker
 ```bash
 # run ingress
@@ -115,7 +109,7 @@ python3 -m dynamo.sglang \
  --log-level debug
 ```
-5. Run the decode worker on the head decode node
+4. Run the decode worker on the head decode node
 ```bash
 SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \

--- a/components/backends/sglang/docs/dsr1-wideep-h100.md
+++ b/components/backends/sglang/docs/dsr1-wideep-h100.md
@@ -41,13 +41,7 @@ docker run \
 In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory.
-3. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier.
+3. Run the ingress and prefill worker
-```bash
-./utils/gen_env_vars.sh
-```
-4. Run the ingress and prefill worker
 ```bash
 # run ingress
@@ -87,7 +81,7 @@ python3 -m dynamo.sglang \
 On the other prefill node (since this example has 4 total prefill nodes), run the same command but change `--node-rank` to 1, 2, and 3
-5. Run the decode worker on the head decode node
+4. Run the decode worker on the head decode node
 ```bash
 python3 -m dynamo.sglang \

--- a/components/backends/sglang/docs/multinode-examples.md
+++ b/components/backends/sglang/docs/multinode-examples.md
@@ -18,12 +18,7 @@ docker build -f container/Dockerfile.sglang-wideep . -t dynamo-wideep --no-cache
 You can use a specific tag from the [lmsys dockerhub](https://hub.docker.com/r/lmsysorg/sglang/tags) by adding `--build-arg SGLANG_IMAGE_TAG=<tag>` to the build command.
-**Step 1**: Use the provided helper script to generate commands to start NATS/ETCD on your head prefill node. This script will also give you environment variables to export on each other node. You will need the IP addresses of your head prefill and head decode node to run this script.
+**Step 1**: Ensure that your configuration file has the required arguments. Here's an example configuration that runs prefill and the model in TP16:
-```bash
-./utils/gen_env_vars.sh
-```
-**Step 2**: Ensure that your configuration file has the required arguments. Here's an example configuration that runs prefill and the model in TP16:
 Node 1: Run HTTP ingress, processor, and 8 shards of the prefill worker
 ```bash
@@ -104,7 +99,7 @@ python3 -m dynamo.sglang \
  --mem-fraction-static 0.82
 ```
-**Step 3**: Run inference
+**Step 2**: Run inference
 SGLang typically requires a warmup period to ensure the DeepGEMM kernels are loaded. We recommend running a few warmup requests and ensuring that the DeepGEMM kernels load in.
 ```bash

--- a/components/backends/sglang/utils/gen_env_vars.sh
+++ b/components/backends/sglang/utils/gen_env_vars.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# Helper script to generate environment variables for each node during a multinode SGLang deployment
-echo "=== USAGE ==="
-echo "1. Run this script: ./gen_env_vars.sh"
-echo "2. Enter the IP addresses when prompted"
-echo "3. Copy the commands for the head prefill node and run them"
-echo "4. Copy the commands for all other nodes and run them on each node"
-echo "5. Proceed with starting your prefill and decode workers"
-echo ""
-# Prompt for IP addresses
-read -p "Enter HEAD_PREFILL_NODE IP: " HEAD_PREFILL_NODE
-read -p "Enter HEAD_DECODE_NODE IP: " HEAD_DECODE_NODE
-# Validate inputs
-if [ -z "$HEAD_PREFILL_NODE" ] || [ -z "$HEAD_DECODE_NODE" ]; then
-    echo "Error: Both IP addresses are required"
-    exit 1
-fi
-echo "=== HEAD PREFILL NODE ($HEAD_PREFILL_NODE) ==="
-echo "Run all of these commands on the head prefill node:"
-echo ""
-echo "nats-server -js &"
-echo "etcd --listen-client-urls http://0.0.0.0:2379 \\"
-echo "     --advertise-client-urls http://0.0.0.0:2379 \\"
-echo "     --listen-peer-urls http://0.0.0.0:2380 \\"
-echo "     --initial-cluster default=http://$HEAD_PREFILL_NODE:2380 &"
-echo "export HEAD_PREFILL_NODE_IP=$HEAD_PREFILL_NODE"
-echo "export HEAD_DECODE_NODE_IP=$HEAD_DECODE_NODE"
-echo ""
-echo "=== ALL OTHER NODES ==="
-echo "Run these commands on all other nodes (prefill and decode):"
-echo ""
-echo "# Export environment variables"
-echo "export NATS_SERVER=nats://$HEAD_PREFILL_NODE:4222"
-echo "export ETCD_ENDPOINTS=http://$HEAD_PREFILL_NODE:2379"
-echo "export HEAD_PREFILL_NODE_IP=$HEAD_PREFILL_NODE"
-echo "export HEAD_DECODE_NODE_IP=$HEAD_DECODE_NODE"
--- a/components/backends/sglang/configs/deepseek_r1/wideep/deepep.json
+++ b/components/backends/sglang/configs/deepseek_r1/wideep/deepep.json