Unverified commit 6ed8ba0a, authored by Keiven C, committed by GitHub
Browse files

feat: GPU VRAM profiler + profiled test markers for 1 GPU SGLang (part 3) (#7508)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent a04a9401
......@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values
MODEL="Qwen/Qwen3-0.6B"
......@@ -54,6 +55,8 @@ if [ "$ENABLE_OTEL" = true ]; then
TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
fi
GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated Serving" "$MODEL" "$HTTP_PORT"
......@@ -72,6 +75,7 @@ python3 -m dynamo.sglang \
--trust-remote-code \
--skip-tokenizer-init \
--enable-metrics \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
"${TRACE_ARGS[@]}" \
"${EXTRA_ARGS[@]}" &
......
......@@ -9,14 +9,24 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values
MODEL="Qwen/Qwen3-Embedding-4B"
# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model-path)
MODEL="$2"
shift 2
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --model-path <name> Specify model (default: $MODEL)"
echo " -h, --help Show this help message"
echo ""
echo "Note: System metrics are enabled by default on port 8081 (worker)"
......@@ -24,14 +34,14 @@ while [[ $# -gt 0 ]]; do
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
EXTRA_ARGS+=("$1")
shift
;;
esac
done
MODEL="Qwen/Qwen3-Embedding-4B"
GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL" 2>/dev/null || true)
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --no-curl "Launching Embedding Worker" "$MODEL" "$HTTP_PORT"
......@@ -52,13 +62,15 @@ python3 -m dynamo.frontend &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \
--embedding-worker \
--model-path Qwen/Qwen3-Embedding-4B \
--served-model-name Qwen/Qwen3-Embedding-4B \
--model-path "$MODEL" \
--served-model-name "$MODEL" \
--page-size 16 \
--tp 1 \
--trust-remote-code \
--use-sglang-tokenizer \
--enable-metrics &
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
--enable-metrics \
"${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Parse command line arguments
ENABLE_OTEL=false
......@@ -52,6 +53,9 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
MODEL="Qwen/Qwen3-0.6B"
GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Aggregated + KV Routing (2 GPUs)" "$MODEL" "$HTTP_PORT"
......@@ -75,22 +79,24 @@ fi
OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER1:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--model-path "$MODEL" \
--served-model-name "$MODEL" \
--page-size 16 \
--tp 1 \
--trust-remote-code \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
"${KV_EVENTS_ARGS_1[@]}" \
--enable-metrics \
"${TRACE_ARGS[@]}" &
OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--model-path "$MODEL" \
--served-model-name "$MODEL" \
--page-size 16 \
--tp 1 \
--trust-remote-code \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
"${KV_EVENTS_ARGS_2[@]}" \
--enable-metrics \
"${TRACE_ARGS[@]}" &
......
......@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Parse command line arguments
ENABLE_OTEL=false
......@@ -46,6 +47,9 @@ if [ "$ENABLE_OTEL" = true ]; then
fi
MODEL="Qwen/Qwen3-0.6B"
GPU_MEM_FRACTION=$(build_gpu_mem_args sglang --model "$MODEL")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated Serving (2 GPUs)" "$MODEL" "$HTTP_PORT"
......@@ -61,8 +65,8 @@ python3 -m dynamo.frontend &
# harnesses can set one simple pair for disaggregated deployments.
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--model-path "$MODEL" \
--served-model-name "$MODEL" \
--page-size 16 \
--tp 1 \
--trust-remote-code \
......@@ -71,14 +75,15 @@ python3 -m dynamo.sglang \
--host 0.0.0.0 \
--port 40000 \
--disaggregation-transfer-backend nixl \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
--enable-metrics \
"${TRACE_ARGS[@]}" &
# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--model-path "$MODEL" \
--served-model-name "$MODEL" \
--page-size 16 \
--tp 1 \
--trust-remote-code \
......@@ -86,6 +91,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--disaggregation-bootstrap-port 12345 \
--host 0.0.0.0 \
--disaggregation-transfer-backend nixl \
${GPU_MEM_FRACTION:+--mem-fraction-static "$GPU_MEM_FRACTION"} \
--enable-metrics \
"${TRACE_ARGS[@]}" &
......
......@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
......@@ -86,6 +87,14 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
# Profiler override: scale prefill/decode fractions proportionally.
# Encode worker has no --mem-fraction-static in single-gpu mode, so it's unaffected.
if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" && "$SINGLE_GPU" == "true" ]]; then
_TOTAL_FRAC=$(awk -v p="$DYN_PREFILL_GPU_MEM" -v d="$DYN_DECODE_GPU_MEM" 'BEGIN { printf "%.4f", p + d }')
DYN_PREFILL_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v p="$DYN_PREFILL_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * p / t }')
DYN_DECODE_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v d="$DYN_DECODE_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * d / t }')
fi
ENCODE_EXTRA_ARGS=""
PREFILL_EXTRA_ARGS=""
DECODE_EXTRA_ARGS=""
......
......@@ -9,7 +9,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh" # build_gpu_mem_args
source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait_any_exit
# Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
......@@ -78,6 +79,14 @@ DYN_WORKER_GPU=${DYN_WORKER_GPU:-1}
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_WORKER_GPU_MEM=${DYN_WORKER_GPU_MEM:-0.9}
# Profiler override: split _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE between workers
# preserving the ratio set by the env vars.
if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" && "$SINGLE_GPU" == "true" ]]; then
_TOTAL_FRAC=$(awk -v e="$DYN_ENCODE_GPU_MEM" -v w="$DYN_WORKER_GPU_MEM" 'BEGIN { printf "%.4f", e + w }')
DYN_ENCODE_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v e="$DYN_ENCODE_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * e / t }')
DYN_WORKER_GPU_MEM=$(awk -v o="$_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE" -v w="$DYN_WORKER_GPU_MEM" -v t="$_TOTAL_FRAC" 'BEGIN { printf "%.2f", o * w / t }')
fi
ENCODE_EXTRA_ARGS=""
WORKER_EXTRA_ARGS=""
......
......@@ -76,7 +76,7 @@ kv_cache_total = kv_cache_per_token * max_model_len * max_concurrent_seqs
overhead ≈ engine-dependent (auto-computed by estimate_worker_vram):
vllm: 1.2 + 1.0 * sqrt(params_b) GiB (0.6B≈2.0, 8B≈4.0)
sglang: 2.5 + 1.5 * sqrt(params_b) GiB (0.6B≈3.7, 8B≈6.7)
sglang: 1.5 + 1.0 * sqrt(params_b) GiB (0.6B≈2.3, 8B≈4.3)
trtllm: 2.0 + 1.2 * sqrt(params_b) GiB (0.6B≈2.9, 8B≈5.4)
```
......@@ -104,11 +104,27 @@ This is slightly different from vLLM (which includes activations in the budget).
sglang recommends keeping 5-8 GiB free for activations and overhead. If you
see OOM errors, decrease `--mem-fraction-static` by 0.01-0.05 increments.
### How `--context-length` works
### How `--context-length` and `--max-running-requests` work
Equivalent to vLLM's `--max-model-len`. Defaults to the model's native context
window. Reducing it shrinks the per-request KV cache requirement and allows more
concurrent sequences.
Unlike vLLM (where `--max-model-len` directly affects KV cache sizing), sglang's
`--context-length` and `--max-running-requests` do **not** affect KV cache
allocation. The KV cache pool is sized entirely from `--mem-fraction-static`:
```
kv_cache_pool = total_vram * mem_fraction_static - model_weights
```
Profiling confirmed this: changing `--context-length` from 512 to 40960 produced
identical `max_total_num_tokens` values (269,136 on a 48 GiB GPU at fraction 0.95).
These flags only affect **request scheduling**:
- `--context-length` caps the per-request token usage from the KV pool
- `--max-running-requests` limits concurrent request slots (allocated from
memory outside the `--mem-fraction-static` budget)
Setting `--max-running-requests` too high at high fractions can cause OOM because
the request slot pool competes for the small amount of memory left after KV cache
allocation.
### Estimating total GPU usage
......@@ -117,9 +133,9 @@ total_vram ≈ model_weights + kv_cache_pool + activations_and_overhead
kv_cache_pool = total_vram * mem_fraction_static - model_weights
activations_and_overhead ≈ 1-8 GiB (depends on model size, batch size, seq len;
~1-2 GiB for small models like 0.6B,
~5-8 GiB for larger models like 8B+ with CUDA graphs)
activations_and_overhead ≈ 1-2 GiB for small models (0.6B-4B)
~3-5 GiB for larger models (7B+)
(CUDA context, graphs, request pools — allocated outside mem_fraction_static)
```
---
......
......@@ -182,6 +182,11 @@ get_model_params() {
# MHA (not GQA): num_key_value_heads == num_attention_heads == 32
deepseek-ai/deepseek-llm-7b-base)
pb=6.9; wb=2; layers=30; kvh=32; hd=128 ;;
# https://huggingface.co/Qwen/Qwen3-Embedding-4B/raw/main/config.json
# params_b from model.safetensors.index.json metadata.total_size / 2 / 1e9
# head_dim = hidden_size(2560) / num_attention_heads(32) = 80
Qwen/Qwen3-Embedding-4B)
pb=4.0; wb=2; layers=36; kvh=8; hd=80 ;;
# https://huggingface.co/llava-hf/llava-1.5-7b-hf/raw/main/config.json (text_config)
# MHA: num_key_value_heads == num_attention_heads == 32
llava-hf/llava-1.5-7b-hf)
......@@ -216,9 +221,13 @@ get_model_params() {
#
# Per-engine constants (calibrated from measurements on RTX 6000 Ada 48 GiB):
# vllm: base=1.2, scale=1.0 → 0.6B≈2.0, 8B≈4.0, 30B≈6.7
# sglang: base=2.5, scale=1.5 → 0.6B≈3.7, 8B≈6.7, 30B≈10.8
# sglang: base=1.5, scale=1.0 → 0.6B≈2.3, 8B≈4.3, 30B≈7.0
# trtllm: base=2.0, scale=1.2 → 0.6B≈2.9, 8B≈5.4, 30B≈8.6
#
# sglang overhead was re-calibrated via profile_pytest.py bisection on
# RTX 6000 Ada 48 GiB. Observed CUDA overhead (outside --mem-fraction-static):
# Qwen3-0.6B: ~1.8 GiB. Previous coefficients (2.5, 1.5) over-estimated by ~2x.
#
# If the 4th argument is a number, it's used directly (backward compatible).
# If omitted, defaults to 2.0 (backward compatible).
#
......@@ -241,7 +250,7 @@ estimate_worker_vram() {
local overhead
case "$engine_or_overhead" in
vllm) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 1.2 + 1.0 * sqrt(p) }') ;;
sglang) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 2.5 + 1.5 * sqrt(p) }') ;;
sglang) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 1.5 + 1.0 * sqrt(p) }') ;;
trtllm) overhead=$(awk -v p="$pb" 'BEGIN { printf "%.1f", 2.0 + 1.2 * sqrt(p) }') ;;
*) overhead="$engine_or_overhead" ;;
esac
......
......@@ -456,11 +456,13 @@ The profiler sets the `_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE` environment variable
| Engine | CLI flag | Launch script support |
|---------|----------------------------------|-----------------------|
| vLLM | `--gpu-memory-utilization` | Implemented in `agg.sh`, `disagg.sh`, etc. |
| SGLang | `--mem-fraction-static` | Not yet implemented (TODO) |
| vLLM | `--gpu-memory-utilization` | Implemented in `agg.sh`, `disagg.sh`, etc. via `build_gpu_mem_args` |
| SGLang | `--mem-fraction-static` | Implemented in `agg.sh`, `agg_embed.sh`, `disagg.sh`, `agg_router.sh`, `disagg_same_gpu.sh` via `build_gpu_mem_args`. Multimodal scripts (`multimodal_epd.sh`, `multimodal_disagg.sh`) split the override proportionally between workers. |
| TRT-LLM | `--free-gpu-memory-fraction` | Not yet implemented (has its own `DYN_TRTLLM_FREE_GPU_MEMORY_FRACTION`, TODO: unify) |
Scripts that already hard-code their own memory fraction (e.g. `agg_multimodal.sh` with 0.85) have a TODO to honor `_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE` in the future. If the profiler detects constant VRAM across all probes (meaning the env var is ignored), it prints a warning and skips marker recommendations.
**Note on sglang:** Unlike vLLM (where `--max-model-len` affects KV cache sizing), sglang's `--mem-fraction-static` is the sole knob for KV cache allocation. `--context-length` and `--max-running-requests` only affect request scheduling, not memory allocation. See `examples/common/gpu_utils.md` for details.
If the profiler detects constant VRAM across all probes (meaning the env var is ignored), it prints a warning and skips marker recommendations.
### Usage
......
......@@ -45,7 +45,10 @@ sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
# SGLang test configurations
# NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
# TODO: Now that these tests use dynamic ports and each config has a max_vram_gib marker,
# optimize the runtime by bin-packing multiple engine deployments in parallel on the same GPU.
# A future collector/launcher can sum max_vram_gib values to decide how many tests fit
# concurrently without exceeding available VRAM.
sglang_configs = {
"aggregated": SGLangConfig(
# Uses backend agg.sh (with metrics enabled) for testing standard
......@@ -55,8 +58,9 @@ sglang_configs = {
script_name="agg.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.max_vram_gib(6.1), # observed peak 5.6 GiB (+10% safety)
pytest.mark.timeout(240), # profiled 34.4s on A6000
pytest.mark.pre_merge,
pytest.mark.timeout(240), # 3x measured time (39s) + download time (120s)
],
model="Qwen/Qwen3-0.6B",
env={},
......@@ -76,7 +80,7 @@ sglang_configs = {
marks=[
pytest.mark.gpu_2,
pytest.mark.pre_merge,
],
], # TODO(gpu_2): profile max_vram, timeout, add markers (separate PR)
model="Qwen/Qwen3-0.6B",
env={},
frontend_port=DefaultPort.FRONTEND.value,
......@@ -96,8 +100,10 @@ sglang_configs = {
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.skip(reason="unstable"),
# TODO: profile to get max_vram and timeout (currently skipped)
],
model="Qwen/Qwen3-0.6B",
delayed_start=30,
env={},
frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[
......@@ -126,7 +132,7 @@ sglang_configs = {
marks=[
pytest.mark.gpu_2,
pytest.mark.pre_merge,
],
], # TODO(gpu_2): profile max_vram, timeout, add markers (separate PR)
model="Qwen/Qwen3-0.6B",
env={
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
......@@ -154,9 +160,9 @@ sglang_configs = {
script_name="template_verifier.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.timeout(240), # profiled 11.7s on A6000 (no GPU model load)
pytest.mark.pre_merge,
pytest.mark.nightly,
pytest.mark.timeout(240), # 3x measured time (20s) + download time (180s)
],
model="Qwen/Qwen3-0.6B",
env={},
......@@ -167,13 +173,21 @@ sglang_configs = {
)
],
),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements.
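# NOTE: multimodal_epd.sh sets --mem-fraction-static from the DYN_ENCODE_GPU_MEM
# / DYN_WORKER_GPU_MEM env vars. When run with --single-gpu (as this test does),
# _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE is split between the encode and worker
# processes in the ratio of those env vars; otherwise the override has no effect.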
# Regardless of fraction overrides, the workers combined consistently use ~23.6 GiB.
"multimodal_e_pd_qwen": SGLangConfig(
# E/P/D architecture: Encode, Prefill, Decode workers all on GPU 0
name="multimodal_e_pd_qwen",
directory=sglang_dir,
script_name="multimodal_epd.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
marks=[
pytest.mark.gpu_1,
pytest.mark.max_vram_gib(13.3), # observed peak 12.1 GiB (+10% safety)
pytest.mark.timeout(360), # profiled 31.0s on A6000
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=360,
......@@ -212,8 +226,9 @@ sglang_configs = {
script_name="multimodal_disagg.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.max_vram_gib(17.7), # observed peak 16.1 GiB (+10% safety)
pytest.mark.timeout(360), # profiled 36.0s on A6000
pytest.mark.pre_merge,
pytest.mark.timeout(360),
],
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
......@@ -246,9 +261,10 @@ sglang_configs = {
script_name="agg.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.max_vram_gib(21.0), # observed peak 19.1 GiB (+10% safety)
pytest.mark.timeout(300), # profiled 41.3s on A6000
pytest.mark.pre_merge,
pytest.mark.nightly,
pytest.mark.timeout(300),
],
model="Qwen/Qwen2.5-VL-7B-Instruct",
script_args=[
......@@ -284,9 +300,10 @@ sglang_configs = {
script_name="agg_embed.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.max_vram_gib(12.1), # observed peak 11.0 GiB (+10% safety)
pytest.mark.timeout(270), # profiled 25.5s on A6000
pytest.mark.pre_merge,
pytest.mark.nightly,
pytest.mark.timeout(270), # 3x measured time (29s) + download time (180s)
],
model="Qwen/Qwen3-Embedding-4B",
delayed_start=0,
......@@ -321,10 +338,9 @@ sglang_configs = {
script_name="agg.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.max_vram_gib(16.2), # observed peak 14.8 GiB (+10% safety)
pytest.mark.timeout(420), # profiled 73s on A6000
pytest.mark.post_merge,
pytest.mark.timeout(
420
), # Total test timeout: 2x measured average (79.36s) + download time (240s) for 7B model
],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=[
......@@ -346,6 +362,7 @@ sglang_configs = {
pytest.mark.post_merge,
pytest.mark.timeout(240),
pytest.mark.skip(reason="DYN-2261"),
# TODO: profile to get max_vram (currently skipped)
],
model="Qwen/Qwen3-0.6B",
env={"DYN_ENABLE_ANTHROPIC_API": "1"},
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment