Unverified commit 4cdc49c2, authored by Keiven C, committed by GitHub
Browse files

refactor: split build_gpu_mem_args into engine-specific functions (#7916)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent b1c18bb1
...@@ -9,7 +9,7 @@ set -e ...@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values # Default values
...@@ -55,7 +55,7 @@ if [ "$ENABLE_OTEL" = true ]; then ...@@ -55,7 +55,7 @@ if [ "$ENABLE_OTEL" = true ]; then
TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317) TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
fi fi
GPU_MEM_ARGS=$(build_gpu_mem_args sglang) GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated Serving" "$MODEL" "$HTTP_PORT"
......
...@@ -9,7 +9,7 @@ set -e ...@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values # Default values
...@@ -40,7 +40,7 @@ while [[ $# -gt 0 ]]; do ...@@ -40,7 +40,7 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
GPU_MEM_ARGS=$(build_gpu_mem_args sglang) GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --no-curl "Launching Embedding Worker" "$MODEL" "$HTTP_PORT" print_launch_banner --no-curl "Launching Embedding Worker" "$MODEL" "$HTTP_PORT"
......
...@@ -9,7 +9,7 @@ set -e ...@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Parse command line arguments # Parse command line arguments
...@@ -54,7 +54,7 @@ fi ...@@ -54,7 +54,7 @@ fi
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
GPU_MEM_ARGS=$(build_gpu_mem_args sglang) GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
......
...@@ -9,7 +9,7 @@ set -e ...@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Parse command line arguments # Parse command line arguments
...@@ -48,7 +48,7 @@ fi ...@@ -48,7 +48,7 @@ fi
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
GPU_MEM_ARGS=$(build_gpu_mem_args sglang) GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
DISAGG_BOOTSTRAP_PORT="${DYN_DISAGG_BOOTSTRAP_PORT:-12345}" DISAGG_BOOTSTRAP_PORT="${DYN_DISAGG_BOOTSTRAP_PORT:-12345}"
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
# Disaggregated prefill/decode on a SINGLE GPU. # Disaggregated prefill/decode on a SINGLE GPU.
# Per-worker VRAM is controlled via build_gpu_mem_args (see gpu_utils.sh). # Per-worker VRAM is controlled via build_sglang_gpu_mem_args (see gpu_utils.sh).
# Override individual knobs (CONTEXT_LENGTH, MAX_RUNNING_REQUESTS) via env vars. # Override individual knobs (CONTEXT_LENGTH, MAX_RUNNING_REQUESTS) via env vars.
# #
# Measured reference (Qwen/Qwen3-0.6B, --context-length 4096, RTX 6000 Ada 48 GiB): # Measured reference (Qwen/Qwen3-0.6B, --context-length 4096, RTX 6000 Ada 48 GiB):
...@@ -25,7 +25,7 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -25,7 +25,7 @@ MODEL="Qwen/Qwen3-0.6B"
CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}" CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
MAX_RUNNING_REQUESTS="${MAX_RUNNING_REQUESTS:-2}" MAX_RUNNING_REQUESTS="${MAX_RUNNING_REQUESTS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args sglang --workers-per-gpu 2) GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/launch_utils.sh"
......
...@@ -9,7 +9,7 @@ set -e ...@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values # Default values
...@@ -99,7 +99,7 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9} ...@@ -99,7 +99,7 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9} DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9} DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
GPU_MEM_ARGS=$(build_gpu_mem_args sglang --workers-per-gpu 3) GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
ENCODE_EXTRA_ARGS="" ENCODE_EXTRA_ARGS=""
PREFILL_EXTRA_ARGS="" PREFILL_EXTRA_ARGS=""
......
...@@ -9,7 +9,7 @@ set -e ...@@ -9,7 +9,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_sglang_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values # Default values
...@@ -94,7 +94,7 @@ fi ...@@ -94,7 +94,7 @@ fi
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9} DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_WORKER_GPU_MEM=${DYN_WORKER_GPU_MEM:-0.9} DYN_WORKER_GPU_MEM=${DYN_WORKER_GPU_MEM:-0.9}
GPU_MEM_ARGS=$(build_gpu_mem_args sglang --workers-per-gpu 2) GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
ENCODE_EXTRA_ARGS="" ENCODE_EXTRA_ARGS=""
WORKER_EXTRA_ARGS="" WORKER_EXTRA_ARGS=""
......
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
# #
# Disaggregated prefill/decode on a SINGLE GPU. # Disaggregated prefill/decode on a SINGLE GPU.
# Per-worker VRAM is controlled via env vars (MAX_SEQ_LEN, MAX_CONCURRENT_SEQS). # Per-worker VRAM is controlled via env vars (MAX_SEQ_LEN, MAX_CONCURRENT_SEQS).
# TODO: unify with build_gpu_mem_args once trtllm --override-engine-args JSON # TODO: unify with build_trtllm_override_args_with_mem once trtllm --override-engine-args JSON
# merging is supported. # merging is supported.
# #
# NOTE — trtllm fraction semantics differ from vllm/sglang: # NOTE — trtllm fraction semantics differ from vllm/sglang:
# vllm/sglang: fraction of TOTAL VRAM (weights + KV + activations all inside) # vllm/sglang: fraction of TOTAL VRAM (weights + KV + activations all inside)
# trtllm: fraction of FREE VRAM (KV cache only, after model load) # trtllm: fraction of FREE VRAM (KV cache only, after model load)
# build_gpu_mem_args handles this — see gpu_utils.sh / gpu_utils.md. # build_vllm_gpu_mem_args / build_sglang_gpu_mem_args handle this — see gpu_utils.sh / gpu_utils.md.
# #
# Measured reference (Qwen/Qwen3-0.6B, --max-seq-len 4096, RTX 6000 Ada 48 GiB): # Measured reference (Qwen/Qwen3-0.6B, --max-seq-len 4096, RTX 6000 Ada 48 GiB):
# estimate (from gpu_utils.sh) : ~8.0 GiB per worker (~16.0 GiB total) # estimate (from gpu_utils.sh) : ~8.0 GiB per worker (~16.0 GiB total)
...@@ -30,7 +30,7 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -30,7 +30,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_SEQ_LEN="${MAX_SEQ_LEN:-4096}" MAX_SEQ_LEN="${MAX_SEQ_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# TODO: unify with build_gpu_mem_args once trtllm --override-engine-args JSON # TODO: unify with build_trtllm_override_args_with_mem once trtllm --override-engine-args JSON
# merging is supported. # merging is supported.
GPU_MEM_FRACTION="${GPU_MEM_FRACTION:-}" GPU_MEM_FRACTION="${GPU_MEM_FRACTION:-}"
......
...@@ -8,7 +8,7 @@ set -e ...@@ -8,7 +8,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # gpu_gb_to_total_fraction source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_vllm_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default model # Default model
...@@ -33,7 +33,7 @@ done ...@@ -33,7 +33,7 @@ done
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated Serving (1 GPU)" "$MODEL" "$HTTP_PORT"
......
...@@ -24,7 +24,7 @@ python -m dynamo.frontend \ ...@@ -24,7 +24,7 @@ python -m dynamo.frontend \
# run workers with KVBM enabled # run workers with KVBM enabled
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts # Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
# TODO: use build_gpu_mem_args to measure VRAM instead of hardcoded fractions # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of hardcoded fractions
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56001 \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=56001 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56002 \ DYN_KVBM_LEADER_ZMQ_ACK_PORT=56002 \
CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
......
...@@ -17,7 +17,7 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -17,7 +17,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated Serving + LMCache (1 GPU)" "$MODEL" "$HTTP_PORT"
......
...@@ -27,7 +27,7 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -27,7 +27,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated + LMCache + Multiproc (1 GPU)" "$MODEL" "$HTTP_PORT"
......
...@@ -97,7 +97,7 @@ case "$MODEL_NAME" in ...@@ -97,7 +97,7 @@ case "$MODEL_NAME" in
MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;; MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
esac esac
GPU_MEM_ARGS=$(build_gpu_mem_args vllm) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
# Start vLLM worker with vision model # Start vLLM worker with vision model
# --enforce-eager: Quick deployment (remove for production) # --enforce-eager: Quick deployment (remove for production)
......
...@@ -50,7 +50,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" ...@@ -50,7 +50,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
export DYN_REQUEST_PLANE=$REQUEST_PLANE export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE" echo "Using request plane mode: $REQUEST_PLANE"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT" print_launch_banner "Launching Aggregated Serving + Request Planes (1 GPU)" "$MODEL" "$HTTP_PORT"
......
...@@ -29,7 +29,7 @@ python -m dynamo.frontend \ ...@@ -29,7 +29,7 @@ python -m dynamo.frontend \
# #
# If multiple workers are launched, they must not share the same system/metrics port. # If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set. # Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
# #
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
......
...@@ -23,7 +23,7 @@ python -m dynamo.frontend \ ...@@ -23,7 +23,7 @@ python -m dynamo.frontend \
# #
# If multiple workers are launched, they must not share the same system/metrics port. # If multiple workers are launched, they must not share the same system/metrics port.
# Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set. # Use DYN_SYSTEM_PORT{1,2} so tests/launchers can provide a simple numbered port set.
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
......
...@@ -21,7 +21,7 @@ python -m dynamo.frontend --http-port="$HTTP_PORT" & ...@@ -21,7 +21,7 @@ python -m dynamo.frontend --http-port="$HTTP_PORT" &
# 2. Speculative Main Worker # 2. Speculative Main Worker
# --------------------------- # ---------------------------
# This runs the main model with EAGLE as the draft model for speculative decoding # This runs the main model with EAGLE as the draft model for speculative decoding
# TODO: use build_gpu_mem_args to measure VRAM instead of hardcoded fractions # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of hardcoded fractions
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm \
--model "$MODEL" \ --model "$MODEL" \
......
...@@ -18,7 +18,7 @@ print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_P ...@@ -18,7 +18,7 @@ print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_P
python -m dynamo.frontend & python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
# TODO: use build_gpu_mem_args to measure VRAM instead of relying on vLLM defaults # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of relying on vLLM defaults
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model "$MODEL" \ --model "$MODEL" \
......
...@@ -79,7 +79,7 @@ python -m dynamo.frontend & ...@@ -79,7 +79,7 @@ python -m dynamo.frontend &
EXTRA_ARGS="" EXTRA_ARGS=""
# GPU assignments (override via environment variables) # GPU assignments (override via environment variables)
# TODO: use build_gpu_mem_args to measure VRAM instead of hardcoded fractions # TODO: use build_vllm_gpu_mem_args to measure VRAM instead of hardcoded fractions
# In single-GPU mode both workers share the same GPU. # In single-GPU mode both workers share the same GPU.
if [[ "$SINGLE_GPU" == "true" ]]; then if [[ "$SINGLE_GPU" == "true" ]]; then
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0} DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
# Disaggregated prefill/decode on a SINGLE GPU. # Disaggregated prefill/decode on a SINGLE GPU.
# Per-worker VRAM is controlled via build_gpu_mem_args (see gpu_utils.sh). # Per-worker VRAM is controlled via build_vllm_gpu_mem_args (see gpu_utils.sh).
# Override individual knobs (MAX_MODEL_LEN, MAX_CONCURRENT_SEQS) via env vars. # Override individual knobs (MAX_MODEL_LEN, MAX_CONCURRENT_SEQS) via env vars.
# #
# Measured reference (Qwen/Qwen3-0.6B, --max-model-len 4096, RTX 6000 Ada 48 GiB): # Measured reference (Qwen/Qwen3-0.6B, --max-model-len 4096, RTX 6000 Ada 48 GiB):
...@@ -25,7 +25,7 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -25,7 +25,7 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
GPU_MEM_ARGS=$(build_gpu_mem_args vllm --workers-per-gpu 2) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
source "$SCRIPT_DIR/../../../common/launch_utils.sh" source "$SCRIPT_DIR/../../../common/launch_utils.sh"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment