profile_decode.py 5.39 KB
Newer Older
1
2
3
4
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
5
from typing import Callable, Optional, Tuple
6
7

import numpy as np
8

9
from benchmarks.profiler.utils.aiperf import benchmark_decode
10
from benchmarks.profiler.utils.defaults import DECODE_MAX_CONCURRENCY
11
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
12
from benchmarks.profiler.utils.plot import plot_decode_3d_surface
13
14
15
16
17
18
19
20
21
22
23
24

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def get_num_request_range(attn_dp_size, engine_max_concurrency, granularity):
    # for MoE models with attn-dp, we want the num_request to be a multiple of attn_dp_size
    # so that we can make sure the request is sent to the same dp rank as the warmup request
    # this is guaranteed because the dp scheduler is scheduling round-robin

    max_concurrency = min(engine_max_concurrency, DECODE_MAX_CONCURRENCY)
    conc_per_dp = max_concurrency // attn_dp_size
    if conc_per_dp < granularity:
        ans = list(range(attn_dp_size, conc_per_dp * attn_dp_size + 1, attn_dp_size))
    else:
        step = (conc_per_dp - 1) * attn_dp_size / (granularity - 1)
        ans = [attn_dp_size + int(i * step) * attn_dp_size for i in range(granularity)]
    return ans


40
def _profile_decode_helper(
41
42
43
44
45
    work_dir,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
46
47
48
    get_itl_and_thpt_per_gpu: Callable[
        [int, int, int], Tuple[Optional[float], Optional[float]]
    ],
49
    attention_dp_size,
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
):
    """interpolate ITL - Active_KV_Cache - Decode_Context_Length"""
    x_kv_usage = []
    y_context_length = []
    z_itl = []
    z_thpt_per_gpu = []

    osl = 500  # not too large to reduce ITL variance, not too small to have stable measurement

    for isl in range(
        100,
        max_context_length - osl,
        (max_context_length - osl) // interpolation_granularity,
    ):
        max_concurrency = max_kv_tokens // (isl + osl)
65
66
67
68
69
70
71
        if max_concurrency == 0:
            logger.warning(
                f"max_kv_tokens {max_kv_tokens} is too small for"
                f" isl {isl} + osl {osl}, skipping."
            )
            break
        else:
72
73
            sweep_num_request = get_num_request_range(
                attention_dp_size, max_concurrency, interpolation_granularity
74
            )
75
        for num_request in sweep_num_request:
76
77
78
            itl, thpt_per_gpu = get_itl_and_thpt_per_gpu(isl, osl, num_request)

            if itl is not None and thpt_per_gpu is not None:
79
80
81
                x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens)
                y_context_length.append(isl + osl / 2)
                z_itl.append(itl)
82
                z_thpt_per_gpu.append(thpt_per_gpu)
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

    # Save the data points to a .npz file
    save_path = f"{work_dir}/raw_data.npz"
    np.savez(
        save_path,
        x_kv_usage=np.array(x_kv_usage),
        y_context_length=np.array(y_context_length),
        z_itl=np.array(z_itl),
        z_thpt_per_gpu=np.array(z_thpt_per_gpu),
        max_kv_tokens=np.array([max_kv_tokens]),
    )
    logger.info(f"Saved data points to {save_path}")

    # Plot 3D surface
    plot_decode_3d_surface(
        x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
    )

    return
102
103
104
105
106
107
108
109
110
111
112


def profile_decode(
    work_dir,
    model_name,
    tokenizer,
    url,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
113
    attention_dp_size,
114
115
):
    def get_itl_and_thpt_per_gpu(isl, osl, num_request):
116
117
        ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}_osl{osl}_n{num_request}"
        aiperf_result = benchmark_decode(
118
119
120
            isl,
            osl,
            num_request,
121
            ai_perf_artifact_dir,
122
123
124
125
            model_name,
            tokenizer,
            base_url=url,
        )
126
127
128
129
130
        if aiperf_result is not None:
            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
            thpt_per_gpu = (
                aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus
            )
131
132
133
134
135
136
137
138
139
140
            return itl, thpt_per_gpu
        return None, None

    return _profile_decode_helper(
        work_dir,
        num_gpus,
        max_kv_tokens,
        max_context_length,
        interpolation_granularity,
        get_itl_and_thpt_per_gpu,
141
        attention_dp_size,
142
143
144
145
146
147
148
149
150
151
    )


def profile_decode_aiconfigurator(
    work_dir,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
    ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
152
    attention_dp_size,
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
    **model_config_kwargs,
):
    def get_itl_and_thpt_per_gpu(isl, osl, num_request):
        perf_dict = ai_configurator_perf_estimator.estimate_perf(
            isl,
            osl,
            num_request,
            mode="decode",
            **model_config_kwargs,
        )
        return perf_dict["tpot"], perf_dict["tokens/s/gpu"]

    return _profile_decode_helper(
        work_dir,
        num_gpus,
        max_kv_tokens,
        max_context_length,
        interpolation_granularity,
        get_itl_and_thpt_per_gpu,
172
        attention_dp_size,
173
    )