test_trtllm.py

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import dataclasses
import logging
import os
from dataclasses import dataclass, field

import pytest

from tests.serve.common import (
    SERVE_TEST_DIR,
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
from tests.utils.constants import DefaultPort
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    TEXT_PROMPT,
    chat_payload,
    chat_payload_default,
    completion_payload,
    completion_payload_default,
    metric_payload_default,
    multimodal_payload_default,
)
from tests.utils.payloads import ImageGenerationPayload, VideoGenerationPayload

logger = logging.getLogger(__name__)


@dataclass
class TRTLLMConfig(EngineConfig):
    """Configuration for trtllm test scenarios"""

    stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])


trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/trtllm"
)

# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.9 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),  # actual nvidia-smi peak 3.9 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(
                300
            ),  # 3x measured time (44.66s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="trtllm"),
        ],
    ),
    "aggregated_unified": TRTLLMConfig(
        name="aggregated_unified",
        directory=trtllm_dir,
        script_name="agg.sh",
        script_args=["--unified"],
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.9),
            pytest.mark.requested_trtllm_kv_tokens(2592),
            pytest.mark.timeout(300),
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/OPS-4450"
            ),
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 6.6 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(6.6),  # actual nvidia-smi peak 6.6 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                512
            ),  # KV cache cap (2x safety over min=256)
            pytest.mark.timeout(432),  # ~6x profiled wall time 72s
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=10,
        health_check_workers=True,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(
                port=DefaultPort.SYSTEM1.value, min_num_requests=6, backend="trtllm"
            ),
            metric_payload_default(
                port=DefaultPort.SYSTEM2.value, min_num_requests=6, backend="trtllm"
            ),
        ],
    ),
    "aggregated_logprobs": TRTLLMConfig(
        name="aggregated_logprobs",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 3.8 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.profiled_vram_gib(3.8),  # actual nvidia-smi peak 3.8 GiB
            pytest.mark.requested_trtllm_kv_tokens(
                2592
            ),  # KV cache cap (2x safety over min=1296)
            pytest.mark.timeout(300),  # 3x measured time (~44s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        delayed_start=5,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "disaggregated_logprobs": TRTLLMConfig(
        name="disaggregated_logprobs",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
        ],
    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            pytest.mark.timeout(
                300
            ),  # 3x measured time (37.91s) + download time (180s)
        ],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen2-VL-7B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
    "aggregated_multimodal_router": TRTLLMConfig(
        name="aggregated_multimodal_router",
        directory=trtllm_dir,
        script_name="agg_multimodal_router.sh",
        marks=[
            pytest.mark.skip(
                reason="Nightly CI failure: https://linear.app/nvidia/issue/DYN-2608"
            ),
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
    ),
    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
        directory=trtllm_dir,
        script_name="epd_multimodal_image_and_embeddings.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
            "DECODE_CUDA_VISIBLE_DEVICES": "0",
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # Test Encoder with Aggregated PD worker on same GPU
    # Make this pre-merge after TRTLLM #5938603 is fixed
    "e_pd_multimodal": TRTLLMConfig(
        name="e_pd_multimodal",
        directory=trtllm_dir,
        script_name="disagg_e_pd.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
    # (.safetensors file) are sent via file:// URL instead of a raw image URL.
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.safetensors
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
    "raw_embeddings_epd": TRTLLMConfig(
        name="raw_embeddings_epd",
        directory=SERVE_TEST_DIR,
        script_name="agg_raw_embeddings_llava.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(
                900
            ),  # Embeddings generation (~60s) + model load (~120s) + inference
        ],
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=600,
        # Embeddings generation + worker startup takes longer than normal
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
                image_url="file:///tmp/llava_embeddings.safetensors",
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
        ],
        env={
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
            "PD_CUDA_VISIBLE_DEVICES": "1",
        },
    ),
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
    # to fit within CI GPU memory constraints.
    "video_diffusion": TRTLLMConfig(
        name="video_diffusion",
        directory=trtllm_dir,
        script_name="agg_video_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "272",
            "--default-width",
            "480",
            "--default-num-frames",
            "17",
        ],
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 17.1 GiB
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(17.1),  # actual nvidia-smi peak 17.1 GiB
            pytest.mark.requested_trtllm_vram_gib(17.1),
            pytest.mark.timeout(
                600
            ),  # Video generation is slow even at small resolution
        ],
        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=5,
        request_payloads=[
            VideoGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "480x272",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "num_frames": 17,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                timeout=300,
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
    # TensorRT-LLM image diffusion test using Flux.1-dev model.
    # Validates the end-to-end image generation pipeline (frontend → worker → /v1/images/generations).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
    # --disable-torch-compile, and small default resolution (256x256)
    # to fit within CI GPU memory constraints.
    "image_diffusion": TRTLLMConfig(
        name="image_diffusion",
        directory=trtllm_dir,
        script_name="agg_image_diffusion.sh",
        script_args=[
            "--skip-warmup",
            "--disable-torch-compile",
            "--default-height",
            "256",
            "--default-width",
            "256",
            "--default-num-images-per-prompt",
            "1",
        ],
        marks=[
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 20.0 GiB
            pytest.mark.trtllm,
            pytest.mark.pre_merge,
            # Diffusion models don't use KV cache, so requested_trtllm_kv_tokens
            # doesn't apply.  requested_trtllm_vram_gib maps to
            # KvCacheConfig.max_gpu_total_bytes which has no effect on the
            # diffusion engine itself, but the parallel scheduler requires one
            # of the KV/VRAM markers to accept the test.  We set it to the
            # profiled peak so the scheduler's VRAM budget is accurate.
            pytest.mark.profiled_vram_gib(
                20.0
            ),  # actual nvidia-smi peak 20.0 GiB [gluo FIXME] reprofil as new model is used
            pytest.mark.requested_trtllm_vram_gib(20.0),
            pytest.mark.timeout(
                600
            ),  # Image generation is slow even at small resolution
        ],
        model="black-forest-labs/FLUX.2-klein-4B",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=300,
        delayed_start=5,
        request_payloads=[
            ImageGenerationPayload(
                body={
                    "prompt": "A golden retriever running on a beach",
                    "size": "256x256",
                    "response_format": "url",
                    "nvext": {
                        "num_inference_steps": 10,
                        "guidance_scale": 5.0,
                        "seed": 42,
                    },
                },
                repeat_count=1,
                expected_response=[],
                expected_log=[],
            ),
        ],
    ),
    # Aggregated multimodal with --frontend-decoding enabled.
    # Verifies image URL inference works when images are decoded by the Rust
    # MediaDecoder in the frontend instead of the Python backend.
    "aggregated_multimodal_frontend_decoding": TRTLLMConfig(
        name="aggregated_multimodal_frontend_decoding",
        directory=trtllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
            pytest.mark.pre_merge,
            pytest.mark.timeout(900),
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=60,
        request_payloads=[
            multimodal_payload_default(
                text="Describe what you see in this image.",
                expected_response=["mountain", "rock", "trees", "road"],
            )
        ],
        env={
            "AGG_ENGINE_ARGS": "/workspace/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml",
            "DYN_TRTLLM_FRONTEND_DECODING": "true",
        },
    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.post_merge,
            pytest.mark.skip(reason="DIS-1566"),
            pytest.mark.timeout(
                480
            ),  # 3x measured time (83.85s) + download time (210s) for 7B model
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
        ],
    ),
}


@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Fixture that provides different trtllm test configurations"""
    return trtllm_configs[request.param]


@pytest.mark.trtllm
@pytest.mark.e2e
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_deployment(
    trtllm_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """
    Test dynamo deployments with different configurations.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"
    # Use per-test ports so tests can run safely under pytest-xdist.
    config = dataclasses.replace(
        trtllm_config_test, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    # Non-port env stays here; ports are wired by run_serve_deployment(ports=...).
    config.env.update(
        {
            "MODEL_PATH": config.model,
            "SERVED_MODEL_NAME": config.model,
        }
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)


# TODO make this a normal guy
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.pre_merge
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    monkeypatch,
):
    """
    Run a single aggregated chat-completions test using Qwen 0.6B with the
    test logits processor enabled, and expect "Hello world" in the response.
    """

    # Enable HelloWorld logits processor only for this test
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    base = trtllm_configs["aggregated"]
    config = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=base.directory,
        script_name=base.script_name,  # agg.sh
        marks=[],  # not used by this direct test
        request_payloads=[
            chat_payload_default(expected_response=["Hello world!"]),
        ],
        model="Qwen/Qwen3-0.6B",
        delayed_start=base.delayed_start,
        timeout=base.timeout,
    )

    config = dataclasses.replace(
        config, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    config.env.update(
        {
            "MODEL_PATH": config.model,
            "SERVED_MODEL_NAME": config.model,
        }
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)