Unverified Commit e3e728a8 authored by Keiven C, committed by GitHub
Browse files

fix: disagg_same_gpu - profiling markers, GPU pinning, and memory args (#7996)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 47c4bd46
......@@ -166,8 +166,11 @@ async def init_prefill(
# Use pre-created engine if provided (snapshot mode)
if snapshot_engine is not None:
engine = snapshot_engine
load_time = 0.0
else:
start_time = time.time()
engine = sgl.Engine(server_args=server_args)
load_time = time.time() - start_time
generate_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}"
......@@ -179,6 +182,8 @@ async def init_prefill(
engine, config, generate_endpoint
)
publisher.component_gauges.set_model_load_time(load_time)
if server_args.node_rank >= 1:
await handle_non_leader_node(engine, publisher, metrics_task)
return
......
......@@ -24,8 +24,13 @@ MODEL="Qwen/Qwen3-0.6B"
# ---- Tunable (override via env vars) ----
CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
MAX_RUNNING_REQUESTS="${MAX_RUNNING_REQUESTS:-2}"
MAX_TOTAL_TOKENS="${MAX_TOTAL_TOKENS:-25000}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
if [[ -z "$GPU_MEM_ARGS" ]]; then
GPU_MEM_ARGS="--max-total-tokens $MAX_TOTAL_TOKENS"
fi
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
......@@ -35,14 +40,15 @@ HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated (same GPU)" "$MODEL" "$HTTP_PORT" \
"Workers: 2 (prefill + decode, fraction is per worker)"
# run ingress with KV router mode for disaggregated setup
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
python3 -m dynamo.frontend &
# NOTE: Each worker picks a random NCCL port (get_free_port) for torch.distributed.
# This has a TOCTOU race — the port can be grabbed before init_process_group binds it,
# causing sporadic EADDRINUSE. Pass --nccl-port <unique_port> per worker to avoid this.
# run prefill worker with metrics on port 8081
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.sglang \
--model-path "$MODEL" \
......@@ -63,16 +69,15 @@ python3 -m dynamo.sglang \
--max-running-requests "$MAX_RUNNING_REQUESTS" \
--enable-metrics &
# Wait for prefill worker to initialize before starting decode worker
# This prevents both workers from competing for GPU memory simultaneously, which can cause OOM.
# The prefill worker needs time to:
# 1. Load model weights and allocate its memory fraction
# 2. Initialize KV cache with --delete-ckpt-after-loading to free checkpoint memory
# 3. Register with NATS service discovery so decode worker can find it
echo "Waiting for prefill worker to initialize..."
sleep 5
# Wait for prefill worker to initialize before starting decode worker.
# Both workers share one GPU with --delete-ckpt-after-loading; without this
# wait they compete for GPU memory during model loading and the scheduler OOMs.
# || true: don't let set -e kill the script on timeout (wait_for_ready returns 1).
PREFILL_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
wait_for_ready "http://localhost:${PREFILL_SYSTEM_PORT}/health" 45 || true
# run decode worker with metrics on port 8082
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.sglang \
--model-path "$MODEL" \
......
......@@ -3,20 +3,14 @@
# SPDX-License-Identifier: Apache-2.0
#
# Disaggregated prefill/decode on a SINGLE GPU.
# Per-worker VRAM is controlled via env vars (MAX_SEQ_LEN, MAX_CONCURRENT_SEQS).
# TODO: unify with build_trtllm_override_args_with_mem once trtllm --override-engine-args JSON
# merging is supported.
# Per-worker VRAM is controlled via absolute KV token caps (not fractions).
# Profiler overrides (_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS) are handled via
# build_trtllm_override_args_with_mem; standalone runs use MAX_TOTAL_TOKENS.
#
# NOTE — trtllm fraction semantics differ from vllm/sglang:
# vllm/sglang: fraction of TOTAL VRAM (weights + KV + activations all inside)
# trtllm: fraction of FREE VRAM (KV cache only, after model load)
# build_vllm_gpu_mem_args / build_sglang_gpu_mem_args handle this — see gpu_utils.sh / gpu_utils.md.
#
# Measured reference (Qwen/Qwen3-0.6B, --max-seq-len 4096, RTX 6000 Ada 48 GiB):
# estimate (from gpu_utils.sh) : ~8.0 GiB per worker (~16.0 GiB total)
# actual (nvidia-smi) : ~7.4 GiB per worker (~14.8 GiB total)
# fraction per worker (free) : 0.05
# Overestimating is intentional -- better to pad than OOM.
# Measured reference (Qwen/Qwen3-0.6B, RTX 6000 Ada 48 GiB):
# peak VRAM (nvidia-smi) : ~6.6 GiB total (both workers)
# default MAX_TOTAL_TOKENS : 25000 per worker
# min tokens (profiled) : 256 per worker
set -e
trap 'echo Cleaning up...; kill 0' EXIT
......@@ -29,10 +23,7 @@ MODEL="Qwen/Qwen3-0.6B"
# ---- Tunable (override via env vars) ----
MAX_SEQ_LEN="${MAX_SEQ_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# TODO: unify with build_trtllm_override_args_with_mem once trtllm --override-engine-args JSON
# merging is supported.
GPU_MEM_FRACTION="${GPU_MEM_FRACTION:-}"
MAX_TOTAL_TOKENS="${MAX_TOTAL_TOKENS:-25000}"
# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
......@@ -67,20 +58,35 @@ while [[ $# -gt 0 ]]; do
done
# Build --override-engine-args JSON.
# Always override free_gpu_memory_fraction so the script controls KV cache size,
# matching how vllm (--gpu-memory-utilization) and sglang (--mem-fraction-static)
# pass memory parameters from the launch script.
OVERRIDE_PAIRS=""
if [[ -n "$GPU_MEM_FRACTION" ]]; then
OVERRIDE_PAIRS="\"kv_cache_config\": {\"free_gpu_memory_fraction\": ${GPU_MEM_FRACTION}}"
fi
#
# KV cache control (always absolute caps, never fractions):
# 1. Profiler env var (_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS or
# _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES) via build_trtllm_override_args_with_mem.
# 2. MAX_TOTAL_TOKENS env var (default 25000) for standalone runs.
# Collect non-memory override pairs (otel, etc.)
NON_MEM_PAIRS=""
if [ "$ENABLE_OTEL" = true ]; then
export DYN_LOGGING_JSONL=true
export OTEL_EXPORT_ENABLED=1
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
OVERRIDE_PAIRS="${OVERRIDE_PAIRS}, \"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\""
NON_MEM_PAIRS="\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\""
fi
if [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS:-}" ]] || [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES:-}" ]]; then
# Profiler provides absolute cap
BASE_JSON=""
[[ -n "$NON_MEM_PAIRS" ]] && BASE_JSON="{${NON_MEM_PAIRS}}"
FINAL_JSON=$(build_trtllm_override_args_with_mem ${BASE_JSON:+--merge-with-json "$BASE_JSON"})
OVERRIDE_ARGS=(--override-engine-args "$FINAL_JSON")
else
# No profiler — use absolute token cap from MAX_TOTAL_TOKENS
OVERRIDE_PAIRS="\"kv_cache_config\": {\"max_tokens\": ${MAX_TOTAL_TOKENS}}"
if [[ -n "$NON_MEM_PAIRS" ]]; then
OVERRIDE_PAIRS="${OVERRIDE_PAIRS}, $NON_MEM_PAIRS"
fi
OVERRIDE_ARGS=(--override-engine-args "{${OVERRIDE_PAIRS}}")
fi
OVERRIDE_ARGS=(--override-engine-args "{${OVERRIDE_PAIRS}}")
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner "Launching Disaggregated on Same GPU (1 GPU)" "$MODEL" "$HTTP_PORT" \
......@@ -104,6 +110,13 @@ python3 -m dynamo.trtllm \
--disaggregation-mode prefill \
"${OVERRIDE_ARGS[@]}" &
# Wait for prefill worker to load model and allocate KV cache before starting
# decode. Both workers share one GPU; without this wait they compete for GPU
# memory during model loading, which can cause OOM.
# || true: don't let set -e kill the script on timeout (wait_for_ready returns 1).
PREFILL_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
wait_for_ready "http://localhost:${PREFILL_SYSTEM_PORT}/health" 45 || true
# run decode worker (shares GPU with prefill)
OTEL_SERVICE_NAME=dynamo-worker-decode \
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
......
......@@ -24,8 +24,20 @@ MODEL="Qwen/Qwen3-0.6B"
# ---- Tunable (override via env vars) ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# Inherit GPU from parent (profiler/test harness sets CUDA_VISIBLE_DEVICES);
# default to GPU 0 for standalone use.
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
# Per-worker KV cache byte cap (deterministic, GPU-size independent).
# Profiled safe value: 1_023_525_000 bytes (~976 MiB, 2x over min 512 MiB).
# --gpu-memory-utilization 0.01 prevents vLLM's startup free-memory check from
# rejecting the launch when a co-resident worker already holds VRAM.
# The profiler/parallel runner overrides via _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES.
DEFAULT_KV_CACHE_BYTES="${DEFAULT_KV_CACHE_BYTES:-1023525000}"
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
if [[ -z "$GPU_MEM_ARGS" ]]; then
GPU_MEM_ARGS="--kv-cache-memory-bytes $DEFAULT_KV_CACHE_BYTES --gpu-memory-utilization 0.01"
fi
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
......@@ -41,8 +53,8 @@ python3 -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
# For disaggregated deployments we standardize on DYN_SYSTEM_PORT1/2 instead of
# *_PREFILL/*_DECODE env names so test harnesses can set one simple pair.
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \
--model "$MODEL" \
--enforce-eager \
......@@ -51,19 +63,17 @@ python3 -m dynamo.vllm \
$GPU_MEM_ARGS \
--max-model-len "$MAX_MODEL_LEN" &
# Wait for decode worker to initialize before starting prefill worker
# This prevents both workers from competing for GPU memory simultaneously, which can cause OOM.
# The decode worker needs time to:
# 1. Load model weights and allocate its memory fraction
# 2. Initialize KV cache
# 3. Register with NATS service discovery so prefill worker can find it
echo "Waiting for decode worker to initialize..."
sleep 10
# Wait for decode worker to initialize before starting prefill worker.
# Both workers share one GPU; without this wait they compete for GPU memory
# during model loading and the scheduler OOMs.
# || true: don't let set -e kill the script on timeout (wait_for_ready returns 1).
DECODE_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}"
wait_for_ready "http://localhost:${DECODE_SYSTEM_PORT}/health" 45 || true
# run prefill worker with metrics on port 8082
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \
--model "$MODEL" \
--enforce-eager \
......
......@@ -183,6 +183,34 @@ CURL_EOF
echo "=========================================="
}
# wait_for_ready <url> [timeout_seconds]
#
# Repeatedly probes an HTTP endpoint with curl until a probe succeeds or the
# timeout elapses. A probe succeeds when `curl -sf` exits 0, i.e. the server
# answered with a non-error HTTP status within 2 seconds.
# Intended for sequencing worker startup: block until one worker has finished
# loading before launching the next (e.g. disaggregated same-GPU deployments,
# where loading models concurrently causes OOM).
#
# Args:
#   url              HTTP URL to probe (e.g. http://localhost:8081/health)
#   timeout_seconds  Upper bound on the wait, in seconds (default: 30)
#
# Returns 0 once the endpoint is ready, 1 if the timeout is reached (a
# warning is written to stderr in that case).
wait_for_ready() {
    local _endpoint="$1"
    local _limit="${2:-30}"
    # SECONDS is bash's built-in counter of seconds since shell start.
    local _began=$SECONDS

    echo "Polling $_endpoint (timeout: ${_limit}s)..."
    while true; do
        # Give up once the elapsed time reaches the limit.
        if (( SECONDS - _began >= _limit )); then
            break
        fi
        # -s: quiet, -f: non-zero exit on HTTP errors, --max-time: cap each probe.
        if curl -sf --max-time 2 "$_endpoint" > /dev/null 2>&1; then
            echo "Ready after $(( SECONDS - _began ))s"
            return 0
        fi
        sleep 1
    done

    echo "WARNING: $_endpoint not ready after ${_limit}s" >&2
    return 1
}
# print_curl_footer
#
# Prints a custom curl example wrapped in the standard framing (matching
......
......@@ -552,6 +552,14 @@ The profiler automatically detects the engine type and uses the appropriate bina
**Requirement (TRT-LLM):** The launch script must honor `_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS` (and optionally `_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES`). This is handled by `build_trtllm_override_args_with_mem` in `gpu_utils.sh` (returns JSON for `--override-engine-args`). Note: this is a separate function from `build_vllm_gpu_mem_args` / `build_sglang_gpu_mem_args` because TRT-LLM requires JSON merging.
**Requirement (all engines):** Do not hardcode `CUDA_VISIBLE_DEVICES` in launch scripts. The profiler and parallel test runner set `CUDA_VISIBLE_DEVICES` to pin each test to a specific GPU. A script that overrides this (e.g. `CUDA_VISIBLE_DEVICES=0`) will ignore the assignment and land on the wrong GPU. Instead, inherit from the environment with a default:
```bash
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
```
Then pass the variable to each worker: `CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python3 -m dynamo.vllm ...`. For multi-GPU scripts that assign distinct GPUs per worker, use named env vars with defaults (e.g. `PREFILL_CUDA_VISIBLE_DEVICES="${PREFILL_CUDA_VISIBLE_DEVICES:-0}"`).
### Engine-specific mapping
Launch scripts call engine-specific functions from `examples/common/gpu_utils.sh` which check env var overrides and return the appropriate CLI flags:
......
......@@ -145,6 +145,13 @@ async def test_deployment(
framework = deployment_target.framework
profile = deployment_target.profile
# NIXL_ERR_BACKEND: vCluster CI nodes lack RDMA/UCX for inter-pod KV
# transfer. Prefill workers crash in NixlWrapper.create_backend.
if framework == "vllm" and profile in ("disagg", "disagg_router"):
pytest.skip(
"NIXL_ERR_BACKEND: CI cluster lacks RDMA/UCX for inter-pod KV transfer"
)
model = next((s.model for s in deployment_spec.services if s.model), None)
if not model:
pytest.fail(
......
......@@ -32,6 +32,11 @@ from tests.utils.payload_builder import (
logger = logging.getLogger(__name__)
def _is_cuda13() -> bool:
    """Return True when the CUDA_VERSION environment variable starts with "13".

    An unset variable is treated as an empty string, so the result is False.
    """
    return os.environ.get("CUDA_VERSION", "").startswith("13")
@dataclass
class SGLangConfig(EngineConfig):
"""Configuration for SGLang test scenarios"""
......@@ -106,29 +111,37 @@ sglang_configs = {
script_name="disagg_same_gpu.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.profiled_vram_gib(9.9), # actual profiled peak with kv-tokens
pytest.mark.requested_sglang_kv_tokens(
37472
), # KV cache cap (2x safety over min=18736)
# Local repro took ~289s wall time with worker readiness reaching
# "ready" at ~176s on a warm-cache RTX 6000 Ada.
pytest.mark.timeout(420),
pytest.mark.pre_merge,
pytest.mark.skip(reason="unstable"),
# TODO: profile to get max_vram and timeout (currently skipped)
pytest.mark.skipif(
_is_cuda13(),
reason="torch-memory-saver preload .so links libcudart.so.12, missing in cuda13 images",
),
],
model="Qwen/Qwen3-0.6B",
delayed_start=30,
delayed_start=10,
health_check_workers=True,
env={},
frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[
chat_payload_default(),
completion_payload_default(),
# Validate dynamo_component_* and sglang:* metrics from prefill worker
# (DefaultPort.SYSTEM1)
# Disagg workers expose fewer sglang:* metrics (~14 vs ~25 for aggregated)
# because each only runs half the scheduler pipeline.
metric_payload_default(
min_num_requests=6,
backend="sglang",
backend="sglang_disagg",
port=DefaultPort.SYSTEM1.value,
),
# Validate dynamo_component_* and sglang:* metrics from decode worker
# (DefaultPort.SYSTEM2)
metric_payload_default(
min_num_requests=6,
backend="sglang",
backend="sglang_disagg",
port=DefaultPort.SYSTEM2.value,
),
],
......
......@@ -117,16 +117,19 @@ trtllm_configs = {
directory=trtllm_dir,
script_name="disagg_same_gpu.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.gpu_1, # 1 GPU(s) used, peak 6.6 GiB
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.skip(reason="unstable"),
pytest.mark.timeout(
480
), # 3x measured time (103.66s) + download time (150s)
pytest.mark.profiled_vram_gib(6.6), # actual nvidia-smi peak 6.6 GiB
pytest.mark.requested_trtllm_kv_tokens(
512
), # KV cache cap (2x safety over min=256)
pytest.mark.timeout(432), # ~6x profiled wall time 72s
],
model="Qwen/Qwen3-0.6B",
frontend_port=DefaultPort.FRONTEND.value,
delayed_start=10,
health_check_workers=True,
request_payloads=[
chat_payload_default(),
completion_payload_default(),
......
......@@ -306,6 +306,29 @@ vllm_configs = {
completion_payload_default(),
],
),
"disaggregated_same_gpu": VLLMConfig(
name="disaggregated_same_gpu",
directory=vllm_dir,
script_name="disagg_same_gpu.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.profiled_vram_gib(7.3), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
1_023_525_000
), # KV cache cap (2x safety over min=511_762_432)
pytest.mark.timeout(300), # ~6x observed 50s
# post_merge: cumulative sequential test time exceeds 35-min job budget.
# Move back to pre_merge once GPU tests run in parallel.
pytest.mark.post_merge,
],
model="Qwen/Qwen3-0.6B",
delayed_start=10,
health_check_workers=True,
request_payloads=[
chat_payload_default(),
completion_payload_default(),
],
),
"deepep": VLLMConfig(
name="deepep",
directory=vllm_dir,
......
......@@ -16,6 +16,7 @@ from tests.utils.payloads import BasePayload, check_health_generate, check_model
logger = logging.getLogger(__name__)
FRONTEND_PORT = (
DefaultPort.FRONTEND.value
) # Do NOT use this in tests! Use allocate_port() instead.
......@@ -49,6 +50,7 @@ class EngineConfig:
frontend_port: int = DefaultPort.FRONTEND.value
timeout: int = 600
delayed_start: int = 0
health_check_workers: bool = False
env: Dict[str, str] = field(default_factory=dict)
stragglers: list[str] = field(default_factory=list)
......@@ -169,14 +171,7 @@ class EngineProcess(ManagedProcess):
if extra_env:
env.update(extra_env)
return cls(
command=command,
env=env,
timeout=config.timeout,
display_output=True,
working_dir=config.directory,
health_check_ports=[],
health_check_urls=[
frontend_checks = [
(
f"http://localhost:{config.frontend_port}/v1/models",
check_models_api,
......@@ -185,8 +180,38 @@ class EngineProcess(ManagedProcess):
f"http://localhost:{config.frontend_port}/health",
check_health_generate,
),
],
delayed_start=config.delayed_start,
]
# For disagg-same-gpu deployments, health-check each worker's
# system port so we wait for ALL workers to be ready, not just the
# first one to register with the frontend. Worker liveness checks
# run FIRST so the frontend has time to discover newly-registered
# workers before the frontend endpoint checks run.
#
# NOTE: DYN_SYSTEM_PORT* env vars are injected by the dynamic port
# fixtures for ALL tests, so we gate on health_check_workers (only
# set by same-gpu disagg configs) to avoid health-checking ports
# that don't serve /health in regular multi-GPU tests.
delayed = config.delayed_start
worker_checks: list[tuple] = []
if config.health_check_workers:
for key, val in sorted(env.items()):
if key.startswith("DYN_SYSTEM_PORT") and val.isdigit():
worker_checks.append((f"http://localhost:{val}/health", None))
if worker_checks:
delayed = 0
health_urls = worker_checks + frontend_checks
return cls(
command=command,
env=env,
timeout=config.timeout,
display_output=True,
working_dir=config.directory,
health_check_ports=[],
health_check_urls=health_urls,
delayed_start=delayed,
# Must stay False: command[0] is "bash", so True would kill every
# bash process system-wide. Stale cleanup relies on stragglers list
# and process-group termination in __exit__ instead.
......
......@@ -1175,6 +1175,29 @@ class SGLangMetricsPayload(MetricsPayload):
return checks
@dataclass
class SGLangDisaggMetricsPayload(SGLangMetricsPayload):
    """Metrics validation for SGLang disaggregated (prefill/decode) workers.

    A disaggregated worker runs only half of the scheduler pipeline, so it
    exposes fewer sglang:* metrics than an aggregated worker.
    Observed: ~14 unique sglang:* metrics vs ~25 for aggregated.
    """

    def _get_backend_specific_checks(self) -> list[MetricCheck]:
        """Return the parent's checks with the "sglang:*" check relaxed.

        Every check named "sglang:*" gets its validator and messages replaced
        so that it passes with at least 10 unique values. The check objects
        returned by the parent are modified in place.
        """
        backend_checks = super()._get_backend_specific_checks()
        wildcard_checks = [
            entry for entry in backend_checks if entry.name == "sglang:*"
        ]
        for entry in wildcard_checks:
            # Count distinct values only; duplicates do not help reach the minimum.
            entry.validator = lambda value: len(set(value)) >= 10
            entry.error_msg = lambda name, value: (
                f"Expected at least 10 unique sglang:* metrics, but found only {len(set(value))}"
            )
            entry.success_msg = lambda name, value: (
                f"SUCCESS: Found {len(set(value))} unique sglang:* metrics (minimum required: 10)"
            )
        return backend_checks
@dataclass
class TRTLLMMetricsPayload(MetricsPayload):
"""Metrics validation for TensorRT-LLM backend"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment