# test_helpers_thorough.py

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Unit tests for thorough.py's _pick_thorough_best_config helper.

Benchmarking helpers (_benchmark_prefill_candidates, _benchmark_decode_candidates)
require live K8s deployments and are covered by the mocked end-to-end tests
in test_profile_sla_dgdr.py.
"""

import sys
from pathlib import Path
from unittest.mock import patch

import pandas as pd
import pytest

# Put the directory three levels above this file at the front of sys.path
# before the guarded imports below run (presumably so the in-repo `dynamo`
# package is importable — confirm against the repository layout).
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

try:
    from dynamo.profiler.thorough import _pick_thorough_best_config
    from dynamo.profiler.utils.aic_dataframe import build_decode_row, build_prefill_row
    from dynamo.profiler.utils.dgdr_v1beta1_types import (
        DynamoGraphDeploymentRequestSpec,
        HardwareSpec,
        SLASpec,
        WorkloadSpec,
    )
except ImportError as e:
    pytest.skip(f"Skip (missing dependency): {e}", allow_module_level=True)


# ---------------------------------------------------------------------------
# Shared fixtures
# ---------------------------------------------------------------------------


def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec:
    """Build a DGDR spec from baseline test values.

    Any keyword passed in ``overrides`` replaces the baseline field of the
    same name (or is added if the baseline does not define it) before the
    spec object is constructed.
    """
    defaults = {
        "model": "Qwen/Qwen3-32B",
        "backend": "trtllm",
        "image": "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest",
        "hardware": HardwareSpec(gpuSku="h200_sxm", totalGpus=8, numGpusPerNode=8),
        "workload": WorkloadSpec(isl=4000, osl=1000),
        "sla": SLASpec(ttft=2000.0, itl=50.0),
    }
    # Later entries win, so caller-supplied overrides take precedence.
    return DynamoGraphDeploymentRequestSpec(**{**defaults, **overrides})


def _stub_dfs():
    """Return a ``(prefill_df, decode_df)`` pair of single-row DataFrames.

    The rows come from build_prefill_row / build_decode_row rather than
    hand-written dicts, so the frames carry every column expected by
    _build_disagg_summary_dict (called via build_disagg_df_from_static in
    the load_match / default paths).
    """
    # Parallelism and platform settings are identical for both rows.
    common = {
        "tp": 1,
        "pp": 1,
        "dp": 1,
        "moe_tp": 1,
        "moe_ep": 1,
        "backend": "trtllm",
        "system": "h200_sxm",
    }
    prefill_df = pd.DataFrame(
        [
            build_prefill_row(
                model="Qwen/Qwen3-32B",
                isl=4000,
                osl=1000,
                ttft=50.0,
                **common,
            )
        ]
    )
    decode_df = pd.DataFrame(
        [
            build_decode_row(
                tpot=10.0,
                thpt_per_gpu=100.0,
                num_request=1,
                num_gpus=1,
                osl=1000,
                **common,
            )
        ]
    )
    return prefill_df, decode_df


def _mock_result():
    return {
        "best_config_df": pd.DataFrame(),
        "best_latencies": {"ttft": 0.0, "tpot": 0.0, "request_latency": 0.0},
    }


# ---------------------------------------------------------------------------
# _pick_thorough_best_config
# ---------------------------------------------------------------------------


class TestPickThoroughBestConfig:
    """Checks which picker _pick_thorough_best_config calls, and with what.

    Each test swaps one ``dynamo.profiler.thorough.pick_*`` function for a
    mock, runs the helper with fixed TTFT/TPOT targets of 2000.0 / 50.0, and
    inspects the arguments that reached the mock.
    """

    @staticmethod
    def _invoke(patch_target, picking_mode, request_latency, total_gpus, dgdr, canned):
        """Run the helper under test with ``patch_target`` mocked out.

        Args:
            patch_target: Dotted path of the picker function to replace.
            picking_mode: Mode string handed to the helper.
            request_latency: Request-latency target, or None.
            total_gpus: GPU count handed to the helper.
            dgdr: Spec object handed to the helper.
            canned: Value the mocked picker returns.

        Returns:
            ``(picker_mock, returned, prefill_df, decode_df)`` — the mock, the
            helper's return value, and the exact frames that were passed in.
        """
        prefill_df, decode_df = _stub_dfs()
        with patch(patch_target, return_value=canned) as picker_mock:
            returned = _pick_thorough_best_config(
                prefill_df,
                decode_df,
                picking_mode,
                2000.0,
                50.0,
                request_latency,
                total_gpus,
                dgdr,
            )
        return picker_mock, returned, prefill_df, decode_df

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_autoscale_calls_pick_autoscale(self):
        """autoscale forwards both frames and the ttft/tpot targets to pick_autoscale."""
        canned = _mock_result()
        picker, returned, prefill_df, decode_df = self._invoke(
            "dynamo.profiler.thorough.pick_autoscale",
            "autoscale",
            None,
            8,
            _make_dgdr(),
            canned,
        )

        picker.assert_called_once_with(prefill_df, decode_df, 2000.0, 50.0)
        # The picker's result must be handed back untouched.
        assert returned is canned

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_load_match_uses_request_latency_when_set(self):
        """load_match with a request latency forwards it instead of target_tpot."""
        rated_workload = WorkloadSpec(isl=4000, osl=1000, requestRate=5.0)
        picker, *_ = self._invoke(
            "dynamo.profiler.thorough.pick_load_match",
            "load_match",
            35000.0,
            8,
            _make_dgdr(workload=rated_workload),
            _mock_result(),
        )

        forwarded = picker.call_args.kwargs
        assert forwarded["target_request_latency"] == 35000.0
        assert "target_tpot" not in forwarded
        assert forwarded["target_request_rate"] == 5.0
        assert forwarded["max_total_gpus"] == 8

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_load_match_falls_back_to_target_tpot(self):
        """load_match without a request latency forwards target_tpot only."""
        picker, *_ = self._invoke(
            "dynamo.profiler.thorough.pick_load_match",
            "load_match",
            None,
            8,
            _make_dgdr(),
            _mock_result(),
        )

        forwarded = picker.call_args.kwargs
        assert forwarded["target_tpot"] == 50.0
        assert "target_request_latency" not in forwarded

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_default_uses_request_latency_when_set(self):
        """default mode with a request latency forwards it along with GPU count and serving mode."""
        picker, *_ = self._invoke(
            "dynamo.profiler.thorough.pick_default",
            "default",
            35000.0,
            8,
            _make_dgdr(),
            _mock_result(),
        )

        forwarded = picker.call_args.kwargs
        assert forwarded["target_request_latency"] == 35000.0
        assert forwarded["total_gpus"] == 8
        assert forwarded["serving_mode"] == "disagg"

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_default_falls_back_to_target_tpot(self):
        """default mode without a request latency forwards target_tpot only."""
        picker, *_ = self._invoke(
            "dynamo.profiler.thorough.pick_default",
            "default",
            None,
            8,
            _make_dgdr(),
            _mock_result(),
        )

        forwarded = picker.call_args.kwargs
        assert forwarded["target_tpot"] == 50.0
        assert "target_request_latency" not in forwarded

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_load_match_omits_workload_kwargs_when_no_workload(self):
        """With no requestRate and a GPU count of 0, the rate and GPU-cap kwargs are absent."""
        # Baseline spec: the workload carries neither requestRate nor concurrency.
        picker, *_ = self._invoke(
            "dynamo.profiler.thorough.pick_load_match",
            "load_match",
            None,
            0,
            _make_dgdr(),
            _mock_result(),
        )

        forwarded = picker.call_args.kwargs
        assert "target_request_rate" not in forwarded
        assert "max_total_gpus" not in forwarded