"lib/vscode:/vscode.git/clone" did not exist on "3d4997057f5db3cf71ed1b4e4d6fe6cf620fa4a8"
Unverified commit 4cdc49c2, authored by Keiven C, committed by GitHub
Browse files

refactor: split build_gpu_mem_args into engine-specific functions (#7916)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent b1c18bb1
......@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values
......@@ -55,7 +55,7 @@ if [ "$ENABLE_OTEL" = true ]; then
TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
fi
GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving" "$MODEL" "$HTTP_PORT"
......
......@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values
......@@ -40,7 +40,7 @@ while [[ $# -gt 0 ]]; do
esac
done
GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --no-curl "Launching Embedding Worker" "$MODEL" "$HTTP_PORT"
......
......@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Parse command line arguments
......@@ -54,7 +54,7 @@ fi
MODEL="Qwen/Qwen3-0.6B"
GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
......
......@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Parse command line arguments
......@@ -48,7 +48,7 @@ fi
MODEL="Qwen/Qwen3-0.6B"
GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
DISAGG_BOOTSTRAP_PORT="${DYN_DISAGG_BOOTSTRAP_PORT:-12345}"
......
......@@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
#
# Disaggregated prefill/decode on a SINGLE GPU.
# Per-worker VRAM is controlled via build_gpu_mem_args (see gpu_utils.sh).
# Per-worker VRAM is controlled via build_sglang_gpu_mem_args (see gpu_utils.sh).
# Override individual knobs (CONTEXT_LENGTH, MAX_RUNNING_REQUESTS) via env vars.
#
# Measured reference (Qwen/Qwen3-0.6B, --context-length 4096, RTX 6000 Ada 48 GiB):
......@@ -25,7 +25,7 @@ MODEL="Qwen/Qwen3-0.6B"
CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
MAX_RUNNING_REQUESTS="${MAX_RUNNING_REQUESTS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args sglang --workers-per-gpu 2)
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
......
......@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values
......@@ -99,7 +99,7 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
GPU_MEM_ARGS=$(build_gpu_mem_args sglang --workers-per-gpu 3)
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
ENCODE_EXTRA_ARGS=""
PREFILL_EXTRA_ARGS=""
......
......@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values
......@@ -94,7 +94,7 @@ fi
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_WORKER_GPU_MEM=${DYN_WORKER_GPU_MEM:-0.9}
GPU_MEM_ARGS=$(build_gpu_mem_args sglang --workers-per-gpu 2)
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
ENCODE_EXTRA_ARGS=""
WORKER_EXTRA_ARGS=""
......
......@@ -4,13 +4,13 @@
#
# Disaggregated prefill/decode on a SINGLE GPU.
# Per-worker VRAM is controlled via env vars (MAX_SEQ_LEN, MAX_CONCURRENT_SEQS).
# TODO: unify with build_gpu_mem_args once trtllm --override-engine-args JSON
# TODO: unify with build_trtllm_override_args_with_mem once trtllm --override-engine-args JSON
# merging is supported.
#
# NOTE — trtllm fraction semantics differ from vllm/sglang:
# vllm/sglang: fraction of TOTAL VRAM (weights + KV + activations all inside)
# trtllm: fraction of FREE VRAM (KV cache only, after model load)
# build_gpu_mem_args handles this — see gpu_utils.sh / gpu_utils.md.
# build_vllm_gpu_mem_args / build_sglang_gpu_mem_args handle this — see gpu_utils.sh / gpu_utils.md.
#
# Measured reference (Qwen/Qwen3-0.6B, --max-seq-len 4096, RTX 6000 Ada 48 GiB):
# estimate (from gpu_utils.sh) : ~8.0 GiB per worker (~16.0 GiB total)
......@@ -30,7 +30,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_SEQ_LEN="${MAX_SEQ_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# TODO: unify with build_gpu_mem_args once trtllm --override-engine-args JSON
# TODO: unify with build_trtllm_override_args_with_mem once trtllm --override-engine-args JSON
# merging is supported.
GPU_MEM_FRACTION="${GPU_MEM_FRACTION:-}"
......
......@@ -8,7 +8,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # gpu_gb_to_total_fraction
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_vllm_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default model
......@@ -33,7 +33,7 @@ done
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
......
......@@ -24,7 +24,7 @@ python -m dynamo.frontend \
# run workers with KVBM enabled
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
# TODO: use build_gpu_mem_args to measure VRAM instead of hardcoded fractions
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of hardcoded fractions
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56001 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56002 \
CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
......
......@@ -17,7 +17,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"
......
......@@ -27,7 +27,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"
......
......@@ -97,7 +97,7 @@ case "$MODEL_NAME" in
MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
esac
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
# Start vLLM worker with vision model
# --enforce-eager: Quick deployment (remove for production)
......
......@@ -50,7 +50,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"
......
......@@ -29,7 +29,7 @@ python -m dynamo.frontend \
#
# If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
#
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
......
......@@ -23,7 +23,7 @@ python -m dynamo.frontend \
#
# If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
......
......@@ -21,7 +21,7 @@ python -m dynamo.frontend --http-port="$HTTP_PORT" &
# 2. Speculative Main Worker
# ---------------------------
# This runs the main model with EAGLE as the draft model for speculative decoding
# TODO: use build_gpu_mem_args to measure VRAM instead of hardcoded fractions
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of hardcoded fractions
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm \
--model "$MODEL" \
......
......@@ -18,7 +18,7 @@ print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_P
python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model "$MODEL" \
......
......@@ -79,7 +79,7 @@ python -m dynamo.frontend &
EXTRA_ARGS=""
# GPU assignments (override via environment variables)
# TODO: use build_gpu_mem_args to measure VRAM instead of hardcoded fractions
# TODO: use build_vllm_gpu_mem_args to measure VRAM instead of hardcoded fractions
# In single-GPU mode both workers share the same GPU.
if [[ "$SINGLE_GPU" == "true" ]]; then
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
......
......@@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
#
# Disaggregated prefill/decode on a SINGLE GPU.
# Per-worker VRAM is controlled via build_gpu_mem_args (see gpu_utils.sh).
# Per-worker VRAM is controlled via build_vllm_gpu_mem_args (see gpu_utils.sh).
# Override individual knobs (MAX_MODEL_LEN, MAX_CONCURRENT_SEQS) via env vars.
#
# Measured reference (Qwen/Qwen3-0.6B, --max-model-len 4096, RTX 6000 Ada 48 GiB):
......@@ -25,7 +25,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm --workers-per-gpu 2)
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment