profile_prefill.py 3.94 KB
Newer Older
1
2
3
4
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
5
from typing import Callable, Optional
6
7

import numpy as np
8

9
from benchmarks.profiler.utils.aiperf import get_prefill_ttft
10
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
11
from benchmarks.profiler.utils.plot import plot_prefill_interpolation
12
13
14
15
16
17
18
19
20
21
22
23

# Module-level logger: emits INFO and above through a dedicated stream handler
# (StreamHandler() with no argument writes to sys.stderr) using a timestamped
# "time - logger name - level - message" layout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

logger.addHandler(console_handler)


24
def _profile_prefill_helper(
    work_dir,
    num_gpus,
    max_context_length,
    interpolation_granularity,
    get_ttft: Callable[[int], Optional[float]],
    attention_dp_size: int = 1,
) -> None:
    """Sweep input sequence lengths (ISL), collect prefill TTFT, then save and plot.

    ISLs are sampled starting at 100 and stepping up to (but excluding)
    ``max_context_length - 512``. For each ISL, ``get_ttft(isl)`` is called;
    samples for which it returns ``None`` are skipped. When more than two
    samples were collected, the raw arrays are written to
    ``{work_dir}/raw_data.npz`` and handed to ``plot_prefill_interpolation``.

    Args:
        work_dir: Directory prefix under which ``raw_data.npz`` is written and
            which is forwarded to the plotting function.
        num_gpus: Divisor used to normalise throughput per GPU.
        max_context_length: Upper bound on context length; 512 tokens are
            reserved before the sweep range is computed.
        interpolation_granularity: Number of intervals the sweep range is
            divided into; must be an integer >= 1.
        get_ttft: Callable returning the TTFT for a given ISL, or ``None`` if
            no measurement is available. The ``* 1000`` factor below suggests
            the value is in milliseconds -- TODO confirm against the callers.
        attention_dp_size: Multiplier applied to the per-GPU throughput.

    Raises:
        ValueError: If the usable context length (after reserving 512 tokens)
            is not greater than 100, or if ``interpolation_granularity`` is
            smaller than 1.
    """
    prefill_isl = []
    prefill_ttft = []
    prefill_thpt_per_gpu = []

    max_context_length -= 512  # leave some room for chat template and system prompt
    if max_context_length <= 100:
        error_message = (
            f"max_context_length {max_context_length} is too small to profile prefill"
        )
        logger.error(error_message)
        raise ValueError(error_message)

    # A zero granularity would divide by zero and a negative one would yield an
    # empty sweep; reject both up front with a clear error.
    if interpolation_granularity < 1:
        error_message = (
            f"interpolation_granularity {interpolation_granularity} must be at least 1"
        )
        logger.error(error_message)
        raise ValueError(error_message)

    # When the usable span is smaller than the granularity the integer division
    # yields 0, which range() rejects; clamp the step to 1 so the sweep still runs.
    isl_step = max(1, (max_context_length - 100) // interpolation_granularity)

    for isl in range(100, max_context_length, isl_step):
        ttft = get_ttft(isl)
        if ttft is None:
            continue
        if ttft <= 0:
            # A non-positive TTFT cannot be turned into a throughput figure.
            logger.warning(
                f"Skipping ISL {isl}: non-positive TTFT {ttft} cannot be used"
            )
            continue

        prefill_isl.append(isl)
        prefill_ttft.append(ttft)
        prefill_thpt_per_gpu.append(isl / ttft / num_gpus * 1000 * attention_dp_size)

    # The samples are only saved and plotted when there are at least 3 of them;
    # the fitting itself is presumably done inside plot_prefill_interpolation
    # (quadratic y=ax^2+bx+c per the original comment) -- not visible here.
    if len(prefill_isl) > 2:
        logger.info("Interpolating prefill TTFT and throughput vs ISL...")

        # Convert to numpy arrays for easier manipulation
        prefill_isl_np = np.array(prefill_isl)
        prefill_ttft_np = np.array(prefill_ttft)
        prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)

        save_path = f"{work_dir}/raw_data.npz"
        np.savez(
            save_path,
            prefill_isl=prefill_isl_np,
            prefill_ttft=prefill_ttft_np,
            prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
        )

        # Call the plotting function
        plot_prefill_interpolation(
            prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
        )
    else:
        logger.warning(
            "Not enough data points to perform interpolation (need at least 3 points)"
        )

    return
82
83
84
85
86
87
88
89
90
91


def profile_prefill(
    work_dir,
    model_name,
    tokenizer,
    url,
    num_gpus,
    max_context_length,
    interpolation_granularity,
    attention_dp_size: int = 1,
):
    """Profile prefill by obtaining TTFT from ``get_prefill_ttft`` for each ISL.

    Builds a per-ISL TTFT callback around ``get_prefill_ttft`` (imported from
    ``benchmarks.profiler.utils.aiperf``) and delegates the sweep, saving and
    plotting to ``_profile_prefill_helper``.

    Args:
        work_dir: Base directory; each ISL uses ``{work_dir}/aiperf_isl{isl}``
            as its artifact directory.
        model_name: Forwarded positionally to ``get_prefill_ttft``.
        tokenizer: Forwarded positionally to ``get_prefill_ttft``.
        url: Forwarded to ``get_prefill_ttft`` as ``base_url``.
        num_gpus: Forwarded to the helper.
        max_context_length: Forwarded to the helper.
        interpolation_granularity: Forwarded to the helper.
        attention_dp_size: Forwarded both to ``get_prefill_ttft`` and to the
            helper.

    Returns:
        Whatever ``_profile_prefill_helper`` returns.
    """

    def measure_ttft(isl):
        # Every ISL gets its own artifact directory under work_dir.
        return get_prefill_ttft(
            isl,
            f"{work_dir}/aiperf_isl{isl}",
            model_name,
            tokenizer,
            base_url=url,
            attention_dp_size=attention_dp_size,
        )

    return _profile_prefill_helper(
        work_dir=work_dir,
        num_gpus=num_gpus,
        max_context_length=max_context_length,
        interpolation_granularity=interpolation_granularity,
        get_ttft=measure_ttft,
        attention_dp_size=attention_dp_size,
    )


def profile_prefill_aiconfigurator(
    work_dir,
    num_gpus,
    max_context_length,
    interpolation_granularity,
    ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
    **model_config_kwargs,
):
    """Profile prefill using TTFT estimates from an ``AIConfiguratorPerfEstimator``.

    For each ISL the estimator's ``estimate_prefill_perf`` is called with
    ``model_config_kwargs`` and the ``"context_latency"`` entry of its result
    is used as the TTFT. The sweep, saving and plotting are delegated to
    ``_profile_prefill_helper``; ``attention_dp_size`` is not passed, so the
    helper's default applies.

    Args:
        work_dir: Forwarded to the helper.
        num_gpus: Forwarded to the helper.
        max_context_length: Forwarded to the helper.
        interpolation_granularity: Forwarded to the helper.
        ai_configurator_perf_estimator: Estimator queried once per ISL.
        **model_config_kwargs: Extra keyword arguments passed through to
            ``estimate_prefill_perf``.

    Returns:
        Whatever ``_profile_prefill_helper`` returns.
    """

    def estimate_ttft(isl):
        # Pull the prefill latency straight out of the estimator's result.
        ttft = ai_configurator_perf_estimator.estimate_prefill_perf(
            isl, **model_config_kwargs
        )["context_latency"]
        logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms")
        return ttft

    return _profile_prefill_helper(
        work_dir=work_dir,
        num_gpus=num_gpus,
        max_context_length=max_context_length,
        interpolation_granularity=interpolation_granularity,
        get_ttft=estimate_ttft,
    )