Unverified commit 3e015595, authored by Keiven C, committed by GitHub
Browse files

feat: TensorRT-LLM and SGLang metrics validation (part 2/3 + 3/3) (#3842)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 2f89c176
......@@ -16,10 +16,13 @@ trap cleanup EXIT INT TERM
python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$!
# run worker
# run worker with metrics enabled
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code
--trust-remote-code \
--skip-tokenizer-init \
--enable-metrics
......@@ -58,6 +58,8 @@ class DynamoSglangPublisher:
# Set default values (can be overridden later if needed)
self.request_total_slots = 1024
self.dp_rank = 0
# TODO: Get actual GPU blocks from SGLang engine instead of hardcoded value
# This hardcoded value causes dynamo_component_kvstats_total_blocks to be incorrect.
self.num_gpu_block = 1024
# ZMQ setup for receiving scheduler metrics
......@@ -96,6 +98,7 @@ class DynamoSglangPublisher:
)
kv_stats = KvStats(
kv_active_blocks=0,
# TODO: Set num_gpu_block from the SGLang engine's actual GPU block count instead of the hardcoded value
kv_total_blocks=self.num_gpu_block,
gpu_cache_usage_perc=0.0,
gpu_prefix_cache_hit_rate=0.0,
......
......@@ -4,6 +4,7 @@
"""Common base classes and utilities for engine tests (vLLM, TRT-LLM, etc.)"""
import logging
import os
from collections.abc import Mapping
from typing import Any, Dict, Optional
......@@ -13,7 +14,20 @@ from tests.utils.client import send_request
from tests.utils.engine_process import EngineConfig, EngineProcess
DEFAULT_TIMEOUT = 10
SERVE_TEST_DIR = "/workspace/tests/serve"
# Resolve WORKSPACE_DIR; the first rule that matches wins:
#   1. the current working directory, when it contains a Cargo.toml
#      (it is then treated as the workspace root)
#   2. the WORKSPACE_DIR environment variable, when set and non-empty
#   3. /workspace, when that path exists
#   4. the current working directory, as a last resort
_workspace_dir = os.environ.get("WORKSPACE_DIR")
if os.path.exists(os.path.join(os.getcwd(), "Cargo.toml")):
    WORKSPACE_DIR = os.getcwd()
elif _workspace_dir:
    WORKSPACE_DIR = _workspace_dir
elif os.path.exists("/workspace"):
    WORKSPACE_DIR = "/workspace"
else:
    WORKSPACE_DIR = os.getcwd()

# Directory holding the serve tests, derived from the resolved workspace root.
SERVE_TEST_DIR = os.path.join(WORKSPACE_DIR, "tests", "serve")
def run_serve_deployment(
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Setup cleanup trap
# cleanup: stop the background process whose PID is stored in DYNAMO_PID.
# Intended to be used as a signal/exit trap handler, so it must tolerate
# being invoked before DYNAMO_PID has been assigned.
cleanup() {
    echo "Cleaning up background processes..."
    # Only signal/wait when a PID was actually recorded: with an empty operand
    # `kill` fails and a bare `wait` would wait on every child of this shell.
    if [ -n "${DYNAMO_PID:-}" ]; then
        # Errors are ignored on purpose: the process may already have exited.
        kill "$DYNAMO_PID" 2>/dev/null || true
        wait "$DYNAMO_PID" 2>/dev/null || true
    fi
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Ingress: start the HTTP frontend on port 8000 in the background and record
# its PID in DYNAMO_PID so the cleanup handler can terminate it.
python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$!

# Worker: run the SGLang worker in the foreground. The same model identifier
# is passed as both the model path and the served model name.
model_id="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
python3 -m dynamo.sglang \
    --model-path "$model_id" \
    --served-model-name "$model_id" \
    --page-size 16 \
    --tp 1 \
    --trust-remote-code \
    --skip-tokenizer-init
......@@ -10,7 +10,7 @@ from transformers import AutoTokenizer
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker
SERVE_TEST_DIR = "/workspace/tests/serve"
SERVE_TEST_DIR = "/workspace/tests/serve" # do not import from tests.serve.common because on CI, PYTHONPATH is not set and it'll fail
class TemplateVerificationHandler:
......
......@@ -19,6 +19,7 @@ from tests.utils.payload_builder import (
completion_payload_default,
embedding_payload,
embedding_payload_default,
metric_payload_default,
)
logger = logging.getLogger(__name__)
......@@ -35,17 +36,19 @@ sglang_dir = os.environ.get("SGLANG_DIR", "/workspace/components/backends/sglang
sglang_configs = {
"aggregated": SGLangConfig(
# Uses backend agg.sh (with metrics enabled) for testing standard
# aggregated deployment with metrics collection
name="aggregated",
directory=SERVE_TEST_DIR,
script_name="sglang_agg.sh",
directory=sglang_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1],
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
model="Qwen/Qwen3-0.6B",
env={},
models_port=8000,
request_payloads=[
chat_payload_default(),
completion_payload_default(),
# TODO: Add metric_payload_default(min_num_requests=N, backend="sglang")
metric_payload_default(min_num_requests=6, backend="sglang"),
],
),
"disaggregated": SGLangConfig(
......@@ -83,8 +86,10 @@ sglang_configs = {
# marker 'CUSTOM_TEMPLATE_ACTIVE|' is applied to user messages.
# The backend (launch/template_verifier.*) checks for this marker
# and returns "Successfully Applied Chat Template" if found.
# Uses SERVE_TEST_DIR (not sglang_dir) because template_verifier.sh/.py
# are test-specific mock scripts in tests/serve/launch/
name="template_verification",
directory=SERVE_TEST_DIR,
directory=SERVE_TEST_DIR, # special directory for test-specific scripts
script_name="template_verifier.sh",
marks=[pytest.mark.gpu_1],
model="Qwen/Qwen3-0.6B",
......
......@@ -7,9 +7,17 @@ from dataclasses import dataclass, field
import pytest
from tests.serve.common import params_with_model_mark, run_serve_deployment
from tests.serve.common import (
WORKSPACE_DIR,
params_with_model_mark,
run_serve_deployment,
)
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import chat_payload_default, completion_payload_default
from tests.utils.payload_builder import (
chat_payload_default,
completion_payload_default,
metric_payload_default,
)
logger = logging.getLogger(__name__)
......@@ -21,21 +29,23 @@ class TRTLLMConfig(EngineConfig):
stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
trtllm_dir = os.environ.get("TRTLLM_DIR", "/workspace/components/backends/trtllm")
trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
WORKSPACE_DIR, "components", "backends", "trtllm"
)
# trtllm test configurations
trtllm_configs = {
"aggregated": TRTLLMConfig(
name="aggregated",
directory=trtllm_dir,
script_name="agg.sh",
script_name="agg_metrics.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm_marker],
model="Qwen/Qwen3-0.6B",
models_port=8000,
request_payloads=[
chat_payload_default(),
completion_payload_default(),
# TODO: Add metric_payload_default(min_num_requests=N, backend="trtllm")
metric_payload_default(min_num_requests=6, backend="trtllm"),
],
),
"disaggregated": TRTLLMConfig(
......
......@@ -7,7 +7,11 @@ from dataclasses import dataclass, field
import pytest
from tests.serve.common import params_with_model_mark, run_serve_deployment
from tests.serve.common import (
WORKSPACE_DIR,
params_with_model_mark,
run_serve_deployment,
)
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
chat_payload,
......@@ -26,7 +30,9 @@ class VLLMConfig(EngineConfig):
stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
vllm_dir = os.environ.get("VLLM_DIR", "/workspace/components/backends/vllm")
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
WORKSPACE_DIR, "components", "backends", "vllm"
)
# vLLM test configurations
vllm_configs = {
......@@ -100,7 +106,7 @@ vllm_configs = {
),
"multimodal_agg_llava": VLLMConfig(
name="multimodal_agg_llava",
directory="/workspace/examples/multimodal",
directory=os.path.join(WORKSPACE_DIR, "examples", "multimodal"),
script_name="agg.sh",
marks=[pytest.mark.gpu_2],
model="llava-hf/llava-1.5-7b-hf",
......@@ -124,7 +130,7 @@ vllm_configs = {
),
"multimodal_agg_qwen": VLLMConfig(
name="multimodal_agg_qwen",
directory="/workspace/examples/multimodal",
directory=os.path.join(WORKSPACE_DIR, "examples", "multimodal"),
script_name="agg.sh",
marks=[pytest.mark.gpu_2],
model="Qwen/Qwen2.5-VL-7B-Instruct",
......@@ -149,7 +155,7 @@ vllm_configs = {
),
"multimodal_video_agg": VLLMConfig(
name="multimodal_video_agg",
directory="/workspace/examples/multimodal",
directory=os.path.join(WORKSPACE_DIR, "examples", "multimodal"),
script_name="video_agg.sh",
marks=[pytest.mark.gpu_2],
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
......
......@@ -268,8 +268,9 @@ class MetricsPayload(BasePayload):
MetricCheck(
name=f"{prefix}_{prometheus_names.kvstats.TOTAL_BLOCKS}",
pattern=metric_pattern,
validator=lambda value: int(float(value)) > 0,
error_msg=lambda name, value: f"{name} should be > 0, but got {value}",
validator=lambda value: int(float(value))
>= 0, # Allow 0 for SGLang (hardcoded issue in components/src/dynamo/sglang/publisher.py:70)
error_msg=lambda name, value: f"{name} should be >= 0, but got {value}",
success_msg=lambda name, value: f"SUCCESS: Found {name} = {value}",
),
]
......@@ -288,7 +289,32 @@ class MetricsPayload(BasePayload):
multiline=True,
)
)
# TODO: Add sglang:* and trtllm:* metrics checks (similar to vllm above)
elif backend == "sglang":
metrics_to_check.append(
MetricCheck(
# Check: Minimum count of unique sglang:* metrics
name="sglang:*",
pattern=lambda name: r"^sglang:\w+",
validator=lambda value: len(set(value))
>= 20, # 80% of typical ~25 sglang metrics (excluding _bucket) as of 2025-10-22 (but will grow)
error_msg=lambda name, value: f"Expected at least 20 unique sglang:* metrics, but found only {len(set(value))}",
success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique sglang:* metrics (minimum required: 20)",
multiline=True,
)
)
elif backend == "trtllm":
metrics_to_check.append(
MetricCheck(
# Check: Minimum count of unique trtllm:* metrics
name="trtllm:*",
pattern=lambda name: r"^trtllm:\w+",
validator=lambda value: len(set(value))
>= 4, # 80% of typical ~5 trtllm metrics (excluding _bucket) as of 2025-10-22 (but will grow)
error_msg=lambda name, value: f"Expected at least 4 unique trtllm:* metrics, but found only {len(set(value))}",
success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique trtllm:* metrics (minimum required: 4)",
multiline=True,
)
)
# Check all metrics
for metric in metrics_to_check:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment