profile_prefill.py

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
from typing import Callable, Optional

import numpy as np

from benchmarks.profiler.utils.aiperf import benchmark_prefill
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.plot import plot_prefill_interpolation

# Module-level logging setup: records at INFO and above are emitted through a
# dedicated stream handler using a "timestamp - logger name - level - message"
# layout with second-resolution timestamps.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)


def _profile_prefill_helper(
    work_dir,
    num_gpus,
    max_context_length: int,
    interpolation_granularity: int,
    get_ttft: Callable[[int], Optional[float]],
) -> None:
    """Sweep input sequence lengths (ISL), record prefill TTFT, then save and plot.

    The sweep starts at ISL 100 and stops before ``max_context_length - 512``,
    using roughly ``interpolation_granularity`` evenly spaced steps. For every
    ISL where ``get_ttft`` yields a usable value, the TTFT and the derived
    per-GPU throughput (``isl / ttft / num_gpus * 1000``) are recorded.

    When more than two data points were collected, the raw arrays are written
    to ``{work_dir}/raw_data.npz`` (keys ``prefill_isl``, ``prefill_ttft``,
    ``prefill_thpt_per_gpu``) and handed to ``plot_prefill_interpolation``.
    Otherwise only a warning is logged.

    Args:
        work_dir: Directory prefix used for ``raw_data.npz`` and passed on to
            the plotting function.
        num_gpus: Divisor used to normalize throughput per GPU.
        max_context_length: Upper bound of the sweep before the 512-token
            headroom is subtracted.
        interpolation_granularity: Target number of steps across the sweep;
            must be positive.
        get_ttft: Callback mapping an ISL to a TTFT, or ``None`` when no
            measurement is available for that ISL. The ``* 1000`` factor in
            the throughput formula suggests the TTFT is in milliseconds --
            TODO confirm against the callers.

    Raises:
        ValueError: If the usable context length is not greater than 100, or
            if ``interpolation_granularity`` is not positive.
    """
    max_context_length -= 512  # leave some room for chat template and system prompt
    if max_context_length <= 100:
        error_message = (
            f"max_context_length {max_context_length} is too small to profile prefill"
        )
        logger.error(error_message)
        raise ValueError(error_message)

    if interpolation_granularity <= 0:
        # Zero would divide by zero below; a negative value would yield an
        # empty sweep. Reject both up front with a clear message.
        error_message = (
            f"interpolation_granularity {interpolation_granularity} must be positive"
        )
        logger.error(error_message)
        raise ValueError(error_message)

    # Floor division yields 0 when the usable span is smaller than the
    # requested granularity, and range() rejects a zero step; clamp to 1 so
    # the sweep degrades to visiting every ISL in the span instead of crashing.
    step = max(1, (max_context_length - 100) // interpolation_granularity)

    prefill_isl = []
    prefill_ttft = []
    prefill_thpt_per_gpu = []
    for isl in range(100, max_context_length, step):
        ttft = get_ttft(isl)
        if ttft is None:
            continue
        if ttft <= 0:
            # A non-positive TTFT cannot be turned into a throughput figure
            # (division by zero / negative rate), so drop the sample.
            logger.warning(f"Skipping ISL {isl}: non-positive TTFT {ttft}")
            continue
        prefill_isl.append(isl)
        prefill_ttft.append(ttft)
        prefill_thpt_per_gpu.append(isl / ttft / num_gpus * 1000)

    # Persist and plot the samples; any curve fitting is left to
    # plot_prefill_interpolation, which needs at least 3 points.
    if len(prefill_isl) > 2:
        logger.info("Interpolating prefill TTFT and throughput vs ISL...")

        # Convert to numpy arrays for easier manipulation
        prefill_isl_np = np.array(prefill_isl)
        prefill_ttft_np = np.array(prefill_ttft)
        prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)

        save_path = f"{work_dir}/raw_data.npz"
        np.savez(
            save_path,
            prefill_isl=prefill_isl_np,
            prefill_ttft=prefill_ttft_np,
            prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
        )

        # Call the plotting function
        plot_prefill_interpolation(
            prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
        )
    else:
        logger.warning(
            "Not enough data points to perform interpolation (need at least 3 points)"
        )


def profile_prefill(
    work_dir,
    model_name,
    tokenizer,
    url,
    num_gpus,
    max_context_length,
    interpolation_granularity,
):
    """Profile prefill by benchmarking a live endpoint at each swept ISL.

    For every input sequence length chosen by ``_profile_prefill_helper``,
    ``benchmark_prefill`` is invoked with ``{work_dir}/aiperf_isl{isl}`` as
    its artifact directory and ``url`` as ``base_url``. The average
    time-to-first-token from the benchmark result is used as the TTFT sample;
    an ISL whose benchmark returns ``None`` contributes no sample.

    Returns:
        Whatever ``_profile_prefill_helper`` returns.
    """

    def measure_ttft(isl):
        # One artifact directory per ISL so runs do not share a path.
        artifact_dir = f"{work_dir}/aiperf_isl{isl}"
        result = benchmark_prefill(
            isl, artifact_dir, model_name, tokenizer, base_url=url
        )
        if result is None:
            return None
        return result["time_to_first_token"]["avg"]

    return _profile_prefill_helper(
        work_dir,
        num_gpus,
        max_context_length,
        interpolation_granularity,
        measure_ttft,
    )


def profile_prefill_aiconfigurator(
    work_dir,
    num_gpus,
    max_context_length,
    interpolation_granularity,
    ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
    **model_config_kwargs,
):
    """Profile prefill using estimated rather than measured TTFT values.

    For every input sequence length chosen by ``_profile_prefill_helper``,
    the estimator's ``estimate_prefill_perf`` is called with the ISL and
    ``model_config_kwargs``; the ``"context_latency"`` entry of its result is
    logged (formatted as milliseconds) and used as the TTFT sample.

    Returns:
        Whatever ``_profile_prefill_helper`` returns.
    """

    def estimate_ttft(isl):
        estimate = ai_configurator_perf_estimator.estimate_prefill_perf(
            isl, **model_config_kwargs
        )
        ttft = estimate["context_latency"]
        logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms")
        return ttft

    return _profile_prefill_helper(
        work_dir,
        num_gpus,
        max_context_length,
        interpolation_granularity,
        estimate_ttft,
    )