Unverified commit 3e015595, authored by Keiven C and committed by GitHub
Browse files

feat: TensorRT-LLM and SGLang metrics validation (part 2/3 + 3/3) (#3842)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 2f89c176
...@@ -16,10 +16,13 @@ trap cleanup EXIT INT TERM ...@@ -16,10 +16,13 @@ trap cleanup EXIT INT TERM
python3 -m dynamo.frontend --http-port=8000 & python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker with metrics enabled
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code --trust-remote-code \
--skip-tokenizer-init \
--enable-metrics
...@@ -58,6 +58,8 @@ class DynamoSglangPublisher: ...@@ -58,6 +58,8 @@ class DynamoSglangPublisher:
# Set default values (can be overridden later if needed) # Set default values (can be overridden later if needed)
self.request_total_slots = 1024 self.request_total_slots = 1024
self.dp_rank = 0 self.dp_rank = 0
# TODO: Get actual GPU blocks from SGLang engine instead of hardcoded value
# This hardcoded value causes dynamo_component_kvstats_total_blocks to be incorrect.
self.num_gpu_block = 1024 self.num_gpu_block = 1024
# ZMQ setup for receiving scheduler metrics # ZMQ setup for receiving scheduler metrics
...@@ -96,6 +98,7 @@ class DynamoSglangPublisher: ...@@ -96,6 +98,7 @@ class DynamoSglangPublisher:
) )
kv_stats = KvStats( kv_stats = KvStats(
kv_active_blocks=0, kv_active_blocks=0,
# TODO: set num_gpu_block from the SGLang engine's actual GPU block count instead of a hardcoded value
kv_total_blocks=self.num_gpu_block, kv_total_blocks=self.num_gpu_block,
gpu_cache_usage_perc=0.0, gpu_cache_usage_perc=0.0,
gpu_prefix_cache_hit_rate=0.0, gpu_prefix_cache_hit_rate=0.0,
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
"""Common base classes and utilities for engine tests (vLLM, TRT-LLM, etc.)""" """Common base classes and utilities for engine tests (vLLM, TRT-LLM, etc.)"""
import logging import logging
import os
from collections.abc import Mapping from collections.abc import Mapping
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
...@@ -13,7 +14,20 @@ from tests.utils.client import send_request ...@@ -13,7 +14,20 @@ from tests.utils.client import send_request
from tests.utils.engine_process import EngineConfig, EngineProcess from tests.utils.engine_process import EngineConfig, EngineProcess
DEFAULT_TIMEOUT = 10 DEFAULT_TIMEOUT = 10
SERVE_TEST_DIR = "/workspace/tests/serve"
# Determine WORKSPACE_DIR. Precedence, first match wins:
#   1. the current working directory, when it contains a Cargo.toml
#      (presumably the marker for the repository root -- confirm)
#   2. the WORKSPACE_DIR environment variable, when set and non-empty
#   3. /workspace, when that path exists
#   4. the current working directory as a last resort
if os.path.exists(os.path.join(os.getcwd(), "Cargo.toml")):
    WORKSPACE_DIR = os.getcwd()
else:
    _workspace_dir = os.environ.get("WORKSPACE_DIR")
    if _workspace_dir:
        WORKSPACE_DIR = _workspace_dir
    elif os.path.exists("/workspace"):
        WORKSPACE_DIR = "/workspace"
    else:
        WORKSPACE_DIR = os.getcwd()

# Location of the serve tests, derived from the resolved workspace directory
# instead of a hardcoded absolute path.
SERVE_TEST_DIR = os.path.join(WORKSPACE_DIR, "tests/serve")
def run_serve_deployment( def run_serve_deployment(
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Starts the dynamo frontend in the background on HTTP port 8000, then runs a
# dynamo.sglang worker in the foreground. The background frontend is stopped
# by the cleanup trap when the script exits or is interrupted.

# Initialised empty so cleanup can tell whether the frontend was ever started.
DYNAMO_PID=""

# Setup cleanup trap
cleanup() {
    echo "Cleaning up background processes..."
    # Only signal/wait when a PID was recorded: with an empty operand `kill`
    # has nothing to signal and a bare `wait` would block on every child.
    if [[ -n "${DYNAMO_PID}" ]]; then
        kill "${DYNAMO_PID}" 2>/dev/null || true
        wait "${DYNAMO_PID}" 2>/dev/null || true
    fi
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM

# run ingress
python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$!

# run worker (foreground; the script lives as long as the worker does)
python3 -m dynamo.sglang \
    --model-path "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
    --served-model-name "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
    --page-size 16 \
    --tp 1 \
    --trust-remote-code \
    --skip-tokenizer-init
...@@ -10,7 +10,7 @@ from transformers import AutoTokenizer ...@@ -10,7 +10,7 @@ from transformers import AutoTokenizer
from dynamo.llm import ModelInput, ModelType, register_llm from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
SERVE_TEST_DIR = "/workspace/tests/serve" SERVE_TEST_DIR = "/workspace/tests/serve" # do not import from tests.serve.common because on CI, PYTHONPATH is not set and it'll fail
class TemplateVerificationHandler: class TemplateVerificationHandler:
......
...@@ -19,6 +19,7 @@ from tests.utils.payload_builder import ( ...@@ -19,6 +19,7 @@ from tests.utils.payload_builder import (
completion_payload_default, completion_payload_default,
embedding_payload, embedding_payload,
embedding_payload_default, embedding_payload_default,
metric_payload_default,
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -35,17 +36,19 @@ sglang_dir = os.environ.get("SGLANG_DIR", "/workspace/components/backends/sglang ...@@ -35,17 +36,19 @@ sglang_dir = os.environ.get("SGLANG_DIR", "/workspace/components/backends/sglang
sglang_configs = { sglang_configs = {
"aggregated": SGLangConfig( "aggregated": SGLangConfig(
# Uses backend agg.sh (with metrics enabled) for testing standard
# aggregated deployment with metrics collection
name="aggregated", name="aggregated",
directory=SERVE_TEST_DIR, directory=sglang_dir,
script_name="sglang_agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_1], marks=[pytest.mark.gpu_1],
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", model="Qwen/Qwen3-0.6B",
env={}, env={},
models_port=8000, models_port=8000,
request_payloads=[ request_payloads=[
chat_payload_default(), chat_payload_default(),
completion_payload_default(), completion_payload_default(),
# TODO: Add metric_payload_default(min_num_requests=N, backend="sglang") metric_payload_default(min_num_requests=6, backend="sglang"),
], ],
), ),
"disaggregated": SGLangConfig( "disaggregated": SGLangConfig(
...@@ -83,8 +86,10 @@ sglang_configs = { ...@@ -83,8 +86,10 @@ sglang_configs = {
# marker 'CUSTOM_TEMPLATE_ACTIVE|' is applied to user messages. # marker 'CUSTOM_TEMPLATE_ACTIVE|' is applied to user messages.
# The backend (launch/template_verifier.*) checks for this marker # The backend (launch/template_verifier.*) checks for this marker
# and returns "Successfully Applied Chat Template" if found. # and returns "Successfully Applied Chat Template" if found.
# Uses SERVE_TEST_DIR (not sglang_dir) because template_verifier.sh/.py
# are test-specific mock scripts in tests/serve/launch/
name="template_verification", name="template_verification",
directory=SERVE_TEST_DIR, directory=SERVE_TEST_DIR, # special directory for test-specific scripts
script_name="template_verifier.sh", script_name="template_verifier.sh",
marks=[pytest.mark.gpu_1], marks=[pytest.mark.gpu_1],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
......
...@@ -7,9 +7,17 @@ from dataclasses import dataclass, field ...@@ -7,9 +7,17 @@ from dataclasses import dataclass, field
import pytest import pytest
from tests.serve.common import params_with_model_mark, run_serve_deployment from tests.serve.common import (
WORKSPACE_DIR,
params_with_model_mark,
run_serve_deployment,
)
from tests.utils.engine_process import EngineConfig from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import chat_payload_default, completion_payload_default from tests.utils.payload_builder import (
chat_payload_default,
completion_payload_default,
metric_payload_default,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -21,21 +29,23 @@ class TRTLLMConfig(EngineConfig): ...@@ -21,21 +29,23 @@ class TRTLLMConfig(EngineConfig):
stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"]) stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
trtllm_dir = os.environ.get("TRTLLM_DIR", "/workspace/components/backends/trtllm") trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
WORKSPACE_DIR, "components", "backends", "trtllm"
)
# trtllm test configurations # trtllm test configurations
trtllm_configs = { trtllm_configs = {
"aggregated": TRTLLMConfig( "aggregated": TRTLLMConfig(
name="aggregated", name="aggregated",
directory=trtllm_dir, directory=trtllm_dir,
script_name="agg.sh", script_name="agg_metrics.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm_marker], marks=[pytest.mark.gpu_1, pytest.mark.trtllm_marker],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
models_port=8000, models_port=8000,
request_payloads=[ request_payloads=[
chat_payload_default(), chat_payload_default(),
completion_payload_default(), completion_payload_default(),
# TODO: Add metric_payload_default(min_num_requests=N, backend="trtllm") metric_payload_default(min_num_requests=6, backend="trtllm"),
], ],
), ),
"disaggregated": TRTLLMConfig( "disaggregated": TRTLLMConfig(
......
...@@ -7,7 +7,11 @@ from dataclasses import dataclass, field ...@@ -7,7 +7,11 @@ from dataclasses import dataclass, field
import pytest import pytest
from tests.serve.common import params_with_model_mark, run_serve_deployment from tests.serve.common import (
WORKSPACE_DIR,
params_with_model_mark,
run_serve_deployment,
)
from tests.utils.engine_process import EngineConfig from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import ( from tests.utils.payload_builder import (
chat_payload, chat_payload,
...@@ -26,7 +30,9 @@ class VLLMConfig(EngineConfig): ...@@ -26,7 +30,9 @@ class VLLMConfig(EngineConfig):
stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"]) stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
vllm_dir = os.environ.get("VLLM_DIR", "/workspace/components/backends/vllm") vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
WORKSPACE_DIR, "components", "backends", "vllm"
)
# vLLM test configurations # vLLM test configurations
vllm_configs = { vllm_configs = {
...@@ -100,7 +106,7 @@ vllm_configs = { ...@@ -100,7 +106,7 @@ vllm_configs = {
), ),
"multimodal_agg_llava": VLLMConfig( "multimodal_agg_llava": VLLMConfig(
name="multimodal_agg_llava", name="multimodal_agg_llava",
directory="/workspace/examples/multimodal", directory=os.path.join(WORKSPACE_DIR, "examples", "multimodal"),
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_2], marks=[pytest.mark.gpu_2],
model="llava-hf/llava-1.5-7b-hf", model="llava-hf/llava-1.5-7b-hf",
...@@ -124,7 +130,7 @@ vllm_configs = { ...@@ -124,7 +130,7 @@ vllm_configs = {
), ),
"multimodal_agg_qwen": VLLMConfig( "multimodal_agg_qwen": VLLMConfig(
name="multimodal_agg_qwen", name="multimodal_agg_qwen",
directory="/workspace/examples/multimodal", directory=os.path.join(WORKSPACE_DIR, "examples", "multimodal"),
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_2], marks=[pytest.mark.gpu_2],
model="Qwen/Qwen2.5-VL-7B-Instruct", model="Qwen/Qwen2.5-VL-7B-Instruct",
...@@ -149,7 +155,7 @@ vllm_configs = { ...@@ -149,7 +155,7 @@ vllm_configs = {
), ),
"multimodal_video_agg": VLLMConfig( "multimodal_video_agg": VLLMConfig(
name="multimodal_video_agg", name="multimodal_video_agg",
directory="/workspace/examples/multimodal", directory=os.path.join(WORKSPACE_DIR, "examples", "multimodal"),
script_name="video_agg.sh", script_name="video_agg.sh",
marks=[pytest.mark.gpu_2], marks=[pytest.mark.gpu_2],
model="llava-hf/LLaVA-NeXT-Video-7B-hf", model="llava-hf/LLaVA-NeXT-Video-7B-hf",
......
...@@ -268,8 +268,9 @@ class MetricsPayload(BasePayload): ...@@ -268,8 +268,9 @@ class MetricsPayload(BasePayload):
MetricCheck( MetricCheck(
name=f"{prefix}_{prometheus_names.kvstats.TOTAL_BLOCKS}", name=f"{prefix}_{prometheus_names.kvstats.TOTAL_BLOCKS}",
pattern=metric_pattern, pattern=metric_pattern,
validator=lambda value: int(float(value)) > 0, validator=lambda value: int(float(value))
error_msg=lambda name, value: f"{name} should be > 0, but got {value}", >= 0, # Allow 0 for SGLang (hardcoded issue in components/src/dynamo/sglang/publisher.py:70)
error_msg=lambda name, value: f"{name} should be >= 0, but got {value}",
success_msg=lambda name, value: f"SUCCESS: Found {name} = {value}", success_msg=lambda name, value: f"SUCCESS: Found {name} = {value}",
), ),
] ]
...@@ -288,7 +289,32 @@ class MetricsPayload(BasePayload): ...@@ -288,7 +289,32 @@ class MetricsPayload(BasePayload):
multiline=True, multiline=True,
) )
) )
# TODO: Add sglang:* and trtllm:* metrics checks (similar to vllm above) elif backend == "sglang":
metrics_to_check.append(
MetricCheck(
# Check: Minimum count of unique sglang:* metrics
name="sglang:*",
pattern=lambda name: r"^sglang:\w+",
validator=lambda value: len(set(value))
>= 20, # 80% of typical ~25 sglang metrics (excluding _bucket) as of 2025-10-22 (but will grow)
error_msg=lambda name, value: f"Expected at least 20 unique sglang:* metrics, but found only {len(set(value))}",
success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique sglang:* metrics (minimum required: 20)",
multiline=True,
)
)
elif backend == "trtllm":
metrics_to_check.append(
MetricCheck(
# Check: Minimum count of unique trtllm:* metrics
name="trtllm:*",
pattern=lambda name: r"^trtllm:\w+",
validator=lambda value: len(set(value))
>= 4, # 80% of typical ~5 trtllm metrics (excluding _bucket) as of 2025-10-22 (but will grow)
error_msg=lambda name, value: f"Expected at least 4 unique trtllm:* metrics, but found only {len(set(value))}",
success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique trtllm:* metrics (minimum required: 4)",
multiline=True,
)
)
# Check all metrics # Check all metrics
for metric in metrics_to_check: for metric in metrics_to_check:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment