# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
6
from dataclasses import dataclass, field
7
8
9

import pytest

10
11
12
13
14
from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
15
from tests.utils.engine_process import EngineConfig
16
17
18
19
from tests.utils.payload_builder import (
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
20
    multimodal_payload_default,
21
)
22
23
24
25
26

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclass
class TRTLLMConfig(EngineConfig):
    """Configuration for trtllm test scenarios.

    Extends ``EngineConfig`` with a TensorRT-LLM specific default for
    ``stragglers``; every other field is inherited unchanged.
    """

    # Defaults to a fresh one-element list per instance (default_factory), so
    # configs never share the same list object.
    # NOTE(review): presumably these are process names the test harness cleans
    # up after a run -- confirm against tests.utils.engine_process.EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["TRTLLM:EngineCore"])
31

32

33
# Directory holding the trtllm launch scripts. The TRTLLM_DIR environment
# variable wins when it is set to a non-empty value; otherwise fall back to
# the examples directory under WORKSPACE_DIR.
trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/trtllm"
)
36

37
38
39
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
40
41
42
# Maps a scenario name to the TRTLLMConfig describing it. Each key matches the
# ``name`` of its config; the parametrized fixture below looks configs up by
# these keys.
trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            # 3x measured time (44.66s) + download time (150s)
            pytest.mark.timeout(300),
        ],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="trtllm"),
        ],
    ),
    "disaggregated": TRTLLMConfig(
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            # 3x measured time (103.66s) + download time (150s)
            pytest.mark.timeout(480),
        ],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # Metrics are checked on two separate ports in this scenario.
            metric_payload_default(port=8081, min_num_requests=6, backend="trtllm"),
            metric_payload_default(port=8082, min_num_requests=6, backend="trtllm"),
        ],
    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.trtllm,
            # 3x measured time (37.91s) + download time (180s)
            pytest.mark.timeout(300),
        ],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
            chat_payload_default(
                # Regexes expected in the logs; the DYN_LOG levels set in
                # ``env`` below target the kv_router publisher and scheduler.
                expected_log=[
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    "disaggregated_router": TRTLLMConfig(
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_multimodal": TRTLLMConfig(
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.multimodal],
        model="Qwen/Qwen2-VL-7B-Instruct",
        models_port=8000,
        timeout=900,
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            # 3x measured time (83.85s) + download time (210s) for 7B model
            pytest.mark.timeout(480),
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        env={
            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
        },
        request_payloads=[
            completion_payload_default(),
        ],
    ),
}


Alec's avatar
Alec committed
167
@pytest.fixture(params=params_with_model_mark(trtllm_configs))
def trtllm_config_test(request):
    """Fixture that provides different trtllm test configurations.

    Parametrized from ``params_with_model_mark(trtllm_configs)``; each
    ``request.param`` is used as a key into ``trtllm_configs`` and the
    matching config object is returned.
    """
    return trtllm_configs[request.param]
171
172


173
@pytest.mark.trtllm
@pytest.mark.e2e
def test_deployment(trtllm_config_test, request, runtime_services, predownload_models):
    """
    Test dynamo deployments with different configurations.

    Runs once per config supplied by the ``trtllm_config_test`` fixture and
    hands it to ``run_serve_deployment``. ``runtime_services`` and
    ``predownload_models`` are not read in the body; they are requested only
    so pytest sets those fixtures up before the test runs.
    """
    config = trtllm_config_test
    # Export the model under test so the launch environment sees the same
    # model name the config declares.
    extra_env = {"MODEL_PATH": config.model, "SERVED_MODEL_NAME": config.model}
    run_serve_deployment(config, request, extra_env=extra_env)
182
183


184
# TODO: turn this one-off test into a regular test case like the ones above
185
186
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.timeout(660)  # 3x measured time (159.68s) + download time (180s)
def test_chat_only_aggregated_with_test_logits_processor(
    request, runtime_services, predownload_models, monkeypatch
):
    """
    Run a single aggregated chat-completions test using Qwen 0.6B with the
    test logits processor enabled, and expect "Hello world!" in the response.

    The config is built from the "aggregated" entry of ``trtllm_configs``
    (same directory, launch script, delayed_start and timeout) but sends only
    one chat payload. ``runtime_services`` and ``predownload_models`` are not
    read in the body; they are requested only so pytest sets them up first.
    """

    # Enable HelloWorld logits processor only for this test; monkeypatch
    # restores the environment when the test finishes.
    monkeypatch.setenv("DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR", "1")

    base = trtllm_configs["aggregated"]
    config = TRTLLMConfig(
        name="aggregated_qwen_chatonly",
        directory=base.directory,
        script_name=base.script_name,  # same script as "aggregated" (agg_metrics.sh)
        marks=[],  # not used by this direct test
        request_payloads=[
            chat_payload_default(expected_response=["Hello world!"]),
        ],
        model="Qwen/Qwen3-0.6B",
        delayed_start=base.delayed_start,
        timeout=base.timeout,
    )

    run_serve_deployment(config, request)