Unverified commit da40db40, authored by Tanmay Verma, committed by GitHub
Browse files

feat: add unified backend architecture with DynamoBackend (#8003)


Signed-off-by: Tanmay Verma <tanmayv@nvidia.com>
parent f3b181a9
......@@ -20,6 +20,7 @@ export MODALITY=${MODALITY:-"text"}
ENABLE_OTEL=false
USE_UNIFIED=false
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
......@@ -27,10 +28,15 @@ while [[ $# -gt 0 ]]; do
ENABLE_OTEL=true
shift
;;
--unified)
USE_UNIFIED=true
shift
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --enable-otel Enable OpenTelemetry tracing"
echo " --unified Use unified_main entry point (Worker)"
echo " -h, --help Show this help message"
echo ""
echo "Any additional options are passed through to dynamo.trtllm."
......@@ -71,8 +77,12 @@ python3 -m dynamo.frontend &
# run worker
# Additional command line args can be passed
WORKER_MODULE="dynamo.trtllm"
if [ "$USE_UNIFIED" = true ]; then
WORKER_MODULE="dynamo.trtllm.unified_main"
fi
OTEL_SERVICE_NAME=dynamo-worker \
python3 -m dynamo.trtllm \
python3 -m "$WORKER_MODULE" \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \
......
......@@ -13,6 +13,7 @@ source "$SCRIPT_DIR/../../../common/launch_utils.sh" # print_launch_banner, wait
# Default model
MODEL="Qwen/Qwen3-0.6B"
USE_UNIFIED=false
# Parse command line arguments
EXTRA_ARGS=()
......@@ -22,6 +23,20 @@ while [[ $# -gt 0 ]]; do
MODEL="$2"
shift 2
;;
--unified)
USE_UNIFIED=true
shift
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --model <name> Specify model (default: $MODEL)"
echo " --unified Use unified_main entry point (Worker)"
echo " -h, --help Show this help message"
echo ""
echo "Any additional options are passed through to dynamo.vllm."
exit 0
;;
*)
EXTRA_ARGS+=("$1")
shift
......@@ -47,8 +62,12 @@ python -m dynamo.frontend &
# run worker
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
WORKER_MODULE="dynamo.vllm"
if [ "$USE_UNIFIED" = true ]; then
WORKER_MODULE="dynamo.vllm.unified_main"
fi
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \
python -m "$WORKER_MODULE" --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \
$GPU_MEM_ARGS \
......
......@@ -55,7 +55,7 @@ pub enum FinishReason {
#[serde(rename = "error")]
Error(String),
#[serde(rename = "cancelled")]
#[serde(rename = "cancelled", alias = "abort")]
Cancelled,
#[serde(rename = "content_filter")]
......@@ -83,7 +83,7 @@ impl std::str::FromStr for FinishReason {
"eos" => Ok(FinishReason::EoS),
"length" => Ok(FinishReason::Length),
"stop" => Ok(FinishReason::Stop),
"cancelled" => Ok(FinishReason::Cancelled),
"cancelled" | "abort" => Ok(FinishReason::Cancelled),
s if s.starts_with("error: ") => Ok(FinishReason::Error(s[7..].to_string())),
_ => Err(anyhow::anyhow!("Invalid FinishReason variant: '{}'", s)),
}
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import dataclasses
import logging
import os
import pytest
from tests.serve.common import (
WORKSPACE_DIR,
params_with_model_mark,
run_serve_deployment,
)
from tests.utils.constants import DefaultPort
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import chat_payload_default, completion_payload_default
logger = logging.getLogger(__name__)

# Directory passed to every sample-backend config below; the launch script
# named in each config (e.g. "agg.sh") is expected to live here.
sample_dir = os.path.join(WORKSPACE_DIR, "examples/backends/sample")

# Deployment scenarios for the sample backend, keyed by scenario name.
# Each EngineConfig bundles the launch script and its arguments, the model
# under test, the pytest marks to attach to the generated test case, and the
# request payloads to send.
# NOTE(review): mark and payload semantics are defined in tests.utils /
# tests.serve.common — confirm there before changing values.
sample_configs = {
    "aggregated": EngineConfig(
        name="aggregated",
        model="Qwen/Qwen3-0.6B",
        directory=sample_dir,
        script_name="agg.sh",
        script_args=["--model-name", "Qwen/Qwen3-0.6B"],
        frontend_port=DefaultPort.FRONTEND.value,
        marks=[
            pytest.mark.gpu_0,
            pytest.mark.timeout(300),
            pytest.mark.pre_merge,
        ],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
}
@pytest.fixture(params=params_with_model_mark(sample_configs))
def sample_config_test(request):
    """Return the sample config selected by the current fixture parameter.

    The fixture is parametrized with whatever ``params_with_model_mark``
    derives from ``sample_configs``; ``request.param`` is used as the key
    into that same dictionary.
    """
    config_key = request.param
    return sample_configs[config_key]
@pytest.mark.e2e
def test_sample_deployment(
    sample_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
):
    """Test sample backend deployment using the unified Worker.

    The parametrized config is copied with its ``frontend_port`` replaced by
    the port from ``dynamo_dynamic_ports``, then handed to
    ``run_serve_deployment`` together with the pytest ``request`` object.

    ``runtime_services_dynamic_ports`` and ``predownload_models`` are not
    referenced in the body; presumably they are requested for their setup
    side effects — verify against their fixture definitions.
    """
    # Copy rather than mutate: the config object is shared module-level state.
    frontend_port = dynamo_dynamic_ports.frontend_port
    deployment_config = dataclasses.replace(
        sample_config_test,
        frontend_port=frontend_port,
    )
    run_serve_deployment(deployment_config, request, ports=dynamo_dynamic_ports)
......@@ -89,6 +89,26 @@ sglang_configs = {
metric_payload_default(min_num_requests=6, backend="sglang"),
],
),
"aggregated_unified": SGLangConfig(
name="aggregated_unified",
directory=sglang_dir,
script_name="agg.sh",
script_args=["--unified"],
marks=[
pytest.mark.gpu_1,
pytest.mark.profiled_vram_gib(3.7),
pytest.mark.requested_sglang_kv_tokens(96),
pytest.mark.timeout(195),
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-0.6B",
env={},
frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[
chat_payload_default(),
completion_payload_default(),
],
),
"disaggregated": SGLangConfig(
name="disaggregated",
directory=sglang_dir,
......
......@@ -100,6 +100,27 @@ trtllm_configs = {
metric_payload_default(min_num_requests=6, backend="trtllm"),
],
),
"aggregated_unified": TRTLLMConfig(
name="aggregated_unified",
directory=trtllm_dir,
script_name="agg.sh",
script_args=["--unified"],
marks=[
pytest.mark.gpu_1,
pytest.mark.trtllm,
pytest.mark.profiled_vram_gib(3.9),
pytest.mark.requested_trtllm_kv_tokens(2592),
pytest.mark.timeout(300),
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-0.6B",
frontend_port=DefaultPort.FRONTEND.value,
delayed_start=5,
request_payloads=[
chat_payload_default(),
completion_payload_default(),
],
),
"disaggregated": TRTLLMConfig(
name="disaggregated",
directory=trtllm_dir,
......
......@@ -104,6 +104,24 @@ vllm_configs = {
metric_payload_default(min_num_requests=6, backend="vllm"),
],
),
"aggregated_unified": VLLMConfig(
name="aggregated_unified",
directory=vllm_dir,
script_name="agg.sh",
script_args=["--unified"],
marks=[
pytest.mark.gpu_1,
pytest.mark.profiled_vram_gib(3.8),
pytest.mark.requested_vllm_kv_cache_bytes(1_119_388_000),
pytest.mark.timeout(360),
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(),
completion_payload_default(),
],
),
"aggregated_logprobs": VLLMConfig(
name="aggregated_logprobs",
directory=vllm_dir,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment