profile_decode.py 4.78 KB
Newer Older
1
2
3
4
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
5
from typing import Callable, Optional, Tuple
6
7

import numpy as np
8

9
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
10
11
from benchmarks.profiler.utils.genai_perf import benchmark_decode
from benchmarks.profiler.utils.plot import plot_decode_3d_surface
12
13
14
15
16
17
18
19
20
21
22
23

# Console logging for this module: INFO and above, with a timestamp formatted
# to the second in front of the logger name, level and message.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)


24
def _profile_decode_helper(
    work_dir,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
    get_itl_and_thpt_per_gpu: Callable[
        [int, int, int], Tuple[Optional[float], Optional[float]]
    ],
):
    """Sweep decode operating points and record ITL / per-GPU throughput.

    For a grid of input sequence lengths (ISL) and concurrent request counts,
    call ``get_itl_and_thpt_per_gpu(isl, osl, num_request)`` and collect the
    returned values as data points over (KV usage, context length). The points
    are written to ``{work_dir}/raw_data.npz`` and handed to
    ``plot_decode_3d_surface``.

    Args:
        work_dir: Directory that receives ``raw_data.npz``; also forwarded to
            ``plot_decode_3d_surface``.
        num_gpus: Not used by this helper; kept so the signature matches its
            callers.
        max_kv_tokens: KV-cache token budget. Bounds the concurrency at each
            ISL and normalises the recorded KV usage.
        max_context_length: Upper bound on context length; ISL is swept from
            100 up to (but excluding) ``max_context_length - osl``.
        interpolation_granularity: Target number of steps along each sweep
            axis (ISL and concurrency).
        get_itl_and_thpt_per_gpu: Callback returning ``(itl, thpt_per_gpu)``
            for one ``(isl, osl, num_request)`` point. A point is dropped when
            either value is ``None``.

    Returns:
        None. Results are delivered through the saved file and the plot call.
    """
    x_kv_usage = []
    y_context_length = []
    z_itl = []
    z_thpt_per_gpu = []

    osl = 500  # not too large to reduce ITL variance, not too small to have stable measurement

    # Clamp the ISL step to at least 1. Without the clamp a span smaller than
    # the granularity yields step 0 (range() raises ValueError), and a
    # negative span yields a negative step that walks ISL downwards from 100
    # into zero/negative lengths. With the clamp both cases degrade to a
    # short or empty sweep.
    isl_stop = max_context_length - osl
    isl_step = max(1, isl_stop // interpolation_granularity)

    for isl in range(100, isl_stop, isl_step):
        max_concurrency = max_kv_tokens // (isl + osl)

        if max_concurrency == 0:
            # ISL only grows from here, so no later point can fit either.
            logger.warning(
                f"max_kv_tokens {max_kv_tokens} is too small for"
                f" isl {isl} + osl {osl}, skipping."
            )
            break
        elif max_concurrency < interpolation_granularity:
            # Fewer feasible concurrency levels than requested steps:
            # sweep every level from 1 to max_concurrency inclusive.
            logger.warning(
                f"max_concurrency {max_concurrency} is too small for"
                f" interpolation granularity {interpolation_granularity}."
                f" max_kv_tokens {max_kv_tokens}, isl {isl}, osl {osl}"
            )
            sweep_num_request = range(1, max_concurrency + 1)
        else:
            # Step is >= 1 here because max_concurrency >= granularity.
            sweep_num_request = range(
                1,
                max_concurrency,
                max_concurrency // interpolation_granularity,
            )

        for num_request in sweep_num_request:
            itl, thpt_per_gpu = get_itl_and_thpt_per_gpu(isl, osl, num_request)

            if itl is not None and thpt_per_gpu is not None:
                # isl + osl / 2 is used as the per-request context length
                # (presumably the mean over the generation -- confirm).
                x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens)
                y_context_length.append(isl + osl / 2)
                z_itl.append(itl)
                z_thpt_per_gpu.append(thpt_per_gpu)

    # Save the data points to a .npz file
    save_path = f"{work_dir}/raw_data.npz"
    np.savez(
        save_path,
        x_kv_usage=np.array(x_kv_usage),
        y_context_length=np.array(y_context_length),
        z_itl=np.array(z_itl),
        z_thpt_per_gpu=np.array(z_thpt_per_gpu),
        max_kv_tokens=np.array([max_kv_tokens]),
    )
    logger.info(f"Saved data points to {save_path}")

    # Plot 3D surface
    plot_decode_3d_surface(
        x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
    )

    return
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159


def profile_decode(
    work_dir,
    model_name,
    tokenizer,
    url,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
):
    """Profile decode performance by running ``benchmark_decode`` per point.

    Builds a measurement callback around ``benchmark_decode`` and passes it to
    ``_profile_decode_helper``, which drives the sweep.

    Args:
        work_dir: Root directory for per-point benchmark artifacts; also
            forwarded to the helper.
        model_name: Forwarded to ``benchmark_decode``.
        tokenizer: Forwarded to ``benchmark_decode``.
        url: Passed to ``benchmark_decode`` as ``base_url``.
        num_gpus: Divisor applied to the average output token throughput to
            obtain a per-GPU figure; also forwarded to the helper.
        max_kv_tokens: Forwarded to the helper.
        max_context_length: Forwarded to the helper.
        interpolation_granularity: Forwarded to the helper.

    Returns:
        Whatever ``_profile_decode_helper`` returns.
    """

    def measure_point(isl, osl, num_request):
        # Each (isl, osl, concurrency) point gets its own artifact directory.
        artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
        result = benchmark_decode(
            isl, osl, num_request, artifact_dir, model_name, tokenizer, base_url=url
        )

        # No result for this point: report it as missing so it is dropped.
        if result is None:
            return None, None

        avg_itl = result["inter_token_latency"]["avg"]
        avg_thpt = result["output_token_throughput"]["avg"]
        return avg_itl, avg_thpt / num_gpus

    return _profile_decode_helper(
        work_dir=work_dir,
        num_gpus=num_gpus,
        max_kv_tokens=max_kv_tokens,
        max_context_length=max_context_length,
        interpolation_granularity=interpolation_granularity,
        get_itl_and_thpt_per_gpu=measure_point,
    )


def profile_decode_aiconfigurator(
    work_dir,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
    ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
    **model_config_kwargs,
):
    """Profile decode performance using estimates instead of benchmark runs.

    Builds a callback around ``ai_configurator_perf_estimator.estimate_perf``
    (called with ``mode="decode"``) and passes it to
    ``_profile_decode_helper``, which drives the sweep.

    Args:
        work_dir: Forwarded to the helper.
        num_gpus: Forwarded to the helper.
        max_kv_tokens: Forwarded to the helper.
        max_context_length: Forwarded to the helper.
        interpolation_granularity: Forwarded to the helper.
        ai_configurator_perf_estimator: Estimator whose ``estimate_perf``
            result is indexed by ``"tpot"`` and ``"tokens/s/gpu"``.
        **model_config_kwargs: Extra keyword arguments passed through to every
            ``estimate_perf`` call.

    Returns:
        Whatever ``_profile_decode_helper`` returns.
    """

    def estimate_point(isl, osl, num_request):
        estimate = ai_configurator_perf_estimator.estimate_perf(
            isl, osl, num_request, mode="decode", **model_config_kwargs
        )
        # "tpot" fills the ITL slot of the (itl, thpt_per_gpu) pair.
        tpot = estimate["tpot"]
        thpt_per_gpu = estimate["tokens/s/gpu"]
        return tpot, thpt_per_gpu

    return _profile_decode_helper(
        work_dir=work_dir,
        num_gpus=num_gpus,
        max_kv_tokens=max_kv_tokens,
        max_context_length=max_context_length,
        interpolation_granularity=interpolation_granularity,
        get_itl_and_thpt_per_gpu=estimate_point,
    )