# aiperf.py 10.1 KB
# Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
import random
import subprocess
21
from typing import Optional, Tuple
22
23
24
25
26
27
28
29
30
31
32
33

# Module-level logger that writes INFO-and-above records to a stream handler
# using a timestamped "time - name - level - message" layout.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)


34
def _get_common_aiperf_cmd(
    artifact_dir,
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    base_url="http://localhost:8000",
    warmup_request_count: int = 3,
) -> list:
    """Build the argument list shared by every ``aiperf profile`` invocation.

    Args:
        artifact_dir: Passed through unchanged as the ``--artifact-dir`` value.
        seed: Value for ``--random-seed`` (stringified).
        model: Value for ``--model``.
        tokenizer: Value for ``--tokenizer``.
        base_url: Value for ``--url``.
        warmup_request_count: Value for ``--warmup-request-count`` (stringified).

    Returns:
        A list suitable for ``subprocess`` execution. Callers append their
        workload-specific flags to it.
    """
    return [
        "aiperf",
        "profile",
        "--model",
        model,
        "--tokenizer",
        tokenizer,
        "--endpoint-type",
        "chat",
        "--endpoint",
        "/v1/chat/completions",
        "--streaming",
        "--url",
        base_url,
        # ignore_eos is sent twice: once as a plain key:value pair and once
        # nested under "nvext".
        "--extra-inputs",
        "ignore_eos:true",
        "--extra-inputs",
        '{"nvext":{"ignore_eos":true}}',
        "--warmup-request-count",
        str(warmup_request_count),
        "--artifact-dir",
        artifact_dir,
        "--random-seed",
        str(seed),
        "--request-timeout-seconds",
        "1800",
    ]


71
def get_prefill_aiperf_cmd(
    isl,
    artifact_dir,
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    osl=5,
    base_url="http://localhost:8000",
    concurrency: int = 1,
    request_count: int = 1,
    warmup_request_count: int = 3,
) -> list:
    """Build the ``aiperf profile`` command used for prefill benchmarking.

    The common arguments come from ``_get_common_aiperf_cmd``; this function
    appends the synthetic-workload flags.

    Args:
        isl: Input sequence length, used as the synthetic input token mean
            (stddev is fixed to 0).
        artifact_dir: Forwarded as the ``--artifact-dir`` value.
        seed: Forwarded as the random seed.
        model: Forwarded as the model name.
        tokenizer: Forwarded as the tokenizer name.
        osl: Output sequence length. Used as the output token mean (stddev 0)
            and as both ``max_tokens`` and ``min_tokens`` extra inputs.
        base_url: Forwarded as the server URL.
        concurrency: Value for ``--concurrency``.
        request_count: Value for ``--request-count``.
        warmup_request_count: Forwarded as the warmup request count.

    Returns:
        The full command as a list of arguments.
    """
    common_args = _get_common_aiperf_cmd(
        artifact_dir,
        seed,
        model,
        tokenizer,
        base_url,
        warmup_request_count=warmup_request_count,
    )
    return common_args + [
        "--synthetic-input-tokens-mean",
        str(isl),
        "--synthetic-input-tokens-stddev",
        "0",
        "--output-tokens-mean",
        str(osl),
        "--output-tokens-stddev",
        "0",
        "--extra-inputs",
        f"max_tokens:{osl}",
        "--extra-inputs",
        f"min_tokens:{osl}",
        "--concurrency",
        str(concurrency),
        "--request-count",
        str(request_count),
    ]


110
def get_decode_aiperf_cmd(
    isl,
    osl,
    artifact_dir,
    num_request,
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    base_url="http://localhost:8000",
) -> list:
    """Build the ``aiperf profile`` command used for decode benchmarking.

    The common arguments come from ``_get_common_aiperf_cmd`` (with its
    default warmup request count); this function appends the
    synthetic-workload flags.

    Args:
        isl: Input sequence length, used as the synthetic input token mean
            (stddev is fixed to 0).
        osl: Output sequence length. Used as the output token mean (stddev 0)
            and as both ``max_tokens`` and ``min_tokens`` extra inputs.
        artifact_dir: Forwarded as the ``--artifact-dir`` value.
        num_request: Used for ``--concurrency``, ``--num-dataset-entries`` and
            ``--request-count`` alike.
        seed: Forwarded as the random seed.
        model: Forwarded as the model name.
        tokenizer: Forwarded as the tokenizer name.
        base_url: Forwarded as the server URL.

    Returns:
        The full command as a list of arguments.
    """
    common_args = _get_common_aiperf_cmd(
        artifact_dir,
        seed,
        model,
        tokenizer,
        base_url,
    )
    return common_args + [
        "--synthetic-input-tokens-mean",
        str(isl),
        "--synthetic-input-tokens-stddev",
        "0",
        "--output-tokens-mean",
        str(osl),
        "--output-tokens-stddev",
        "0",
        "--extra-inputs",
        f"max_tokens:{osl}",
        "--extra-inputs",
        f"min_tokens:{osl}",
        "--concurrency",
        str(num_request),
        "--num-dataset-entries",
        str(num_request),
        "--request-count",
        str(num_request),
    ]


148
def get_aiperf_result(artifact_dir: str) -> dict:
    """Load the first ``profile_export_aiperf.json`` found under a directory.

    The directory tree is walked with ``os.walk`` and the first match in walk
    order is parsed.

    Args:
        artifact_dir: Root directory to search.

    Returns:
        The parsed JSON content of the export file.

    Raises:
        FileNotFoundError: If no ``profile_export_aiperf.json`` exists
            anywhere under ``artifact_dir``.
    """
    export_name = "profile_export_aiperf.json"
    for root, _, files in os.walk(artifact_dir):
        if export_name in files:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(os.path.join(root, export_name), "r", encoding="utf-8") as f:
                return json.load(f)
    raise FileNotFoundError(
        f"profile_export_aiperf.json not found in {artifact_dir}"
    )


162
def benchmark_prefill(
    isl,
    aiperf_artifact_dir,
    model_name,
    tokenizer,
    base_url="http://localhost:8000",
    concurrency: int = 1,
    request_count: int = 1,
    warmup_request_count: int = 3,
) -> Optional[dict]:
    """Run one prefill benchmark with aiperf and return its parsed result.

    Args:
        isl: Input sequence length for the synthetic prompts.
        aiperf_artifact_dir: Directory aiperf writes to and from which the
            result JSON is read back.
        model_name: Model name passed to aiperf.
        tokenizer: Tokenizer name passed to aiperf.
        base_url: URL of the server under test.
        concurrency: Concurrency passed to aiperf.
        request_count: Request count passed to aiperf.
        warmup_request_count: Warmup request count passed to aiperf.

    Returns:
        The parsed aiperf result on success, or ``None`` when the aiperf
        process exits with a non-zero return code.

    Raises:
        FileNotFoundError: Propagated from ``get_aiperf_result`` if aiperf
            exits successfully but no result file is found.
    """
    logger.info(f"Running aiperf with isl {isl}")
    aiperf_cmd = get_prefill_aiperf_cmd(
        isl,
        aiperf_artifact_dir,
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
        concurrency=concurrency,
        request_count=request_count,
        warmup_request_count=warmup_request_count,
    )
    logger.debug(f"aiperf cmd: {aiperf_cmd}")

    completed = subprocess.run(
        aiperf_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if completed.returncode != 0:
        logger.error(f"AIPerf failed with error code: {completed.returncode}")
        logger.error(f"stderr: {completed.stderr}")
        return None

    logger.info("AIperf profiling completed successfully")
    logger.debug(completed.stdout)
    return get_aiperf_result(aiperf_artifact_dir)


203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
def get_prefill_ttft(
    isl: int,
    aiperf_artifact_dir: str,
    model_name: str,
    tokenizer: str,
    base_url: str = "http://localhost:8000",
    attention_dp_size: int = 1,
    attn_dp_num_req_ratio: int = 4,
) -> Optional[float]:
    """
    Benchmark prefill and return the time to first token; None on failure.

    Without attention data parallelism (attention_dp_size <= 1) the average
    TTFT reported by AIPerf is returned as-is.

    With attention_dp_size > 1 (DEP), a single burst of
    attention_dp_size * attn_dp_num_req_ratio concurrent requests is sent and
    the result is (max TTFT across the burst) / attn_dp_num_req_ratio.
    The ratio defaults to 4 rounds to absorb the error caused by the first
    batch being launched too early, before enough requests have arrived.
    """
    uses_dep = attention_dp_size > 1

    if not uses_dep:
        # Plain path: a default benchmark run, read AIPerf's average TTFT.
        result = benchmark_prefill(
            isl,
            aiperf_artifact_dir,
            model_name,
            tokenizer,
            base_url=base_url,
        )
        try:
            return float(result["time_to_first_token"]["avg"])
        except (KeyError, TypeError, ValueError):
            logger.warning("Failed to extract TTFT from AIPerf result")
            return None

    # DEP path: issue every request of the burst at once.
    burst_size = attention_dp_size * attn_dp_num_req_ratio
    logger.info(
        f"DEP prefill measurement: isl={isl}, attn_dp={attention_dp_size}, attn_dp_num_req_ratio={attn_dp_num_req_ratio}, "
        f"total_concurrency={burst_size}"
    )
    # Warmup is left at benchmark_prefill's default.
    result = benchmark_prefill(
        isl,
        aiperf_artifact_dir,
        model_name,
        tokenizer,
        base_url=base_url,
        concurrency=burst_size,
        request_count=burst_size,
    )
    try:
        slowest_ttft = float(result["time_to_first_token"]["max"])
        return slowest_ttft / float(attn_dp_num_req_ratio)
    except (KeyError, TypeError, ValueError):
        logger.warning(
            "Failed to extract max TTFT from AIPerf result for DEP prefill"
        )
        return None


def get_decode_itl_and_thpt_per_gpu(
    isl: int,
    osl: int,
    num_request: int,
    aiperf_artifact_dir: str,
    model_name: str,
    tokenizer: str,
    base_url: str = "http://localhost:8000",
    num_gpus: int = 1,
) -> Tuple[Optional[float], Optional[float]]:
    """
    Benchmark decode and return (average ITL, output throughput per GPU).

    The per-GPU figure is AIPerf's average output token throughput divided by
    num_gpus, with the divisor clamped to at least 1.
    A failed benchmark or a result missing the expected metrics yields
    (None, None).
    """
    failure = (None, None)

    result = benchmark_decode(
        isl,
        osl,
        num_request,
        aiperf_artifact_dir,
        model_name,
        tokenizer,
        base_url=base_url,
    )
    if result is None:
        return failure

    try:
        mean_itl = float(result["inter_token_latency"]["avg"])
        total_thpt = float(result["output_token_throughput"]["avg"])
        gpu_divisor = max(num_gpus, 1)
        return mean_itl, total_thpt / gpu_divisor
    except (KeyError, TypeError, ValueError):
        logger.warning("Failed to extract decode metrics from AIPerf result")
        return failure


295
296
297
298
def benchmark_decode(
    isl,
    osl,
    num_request,
    aiperf_artifact_dir,
    model_name,
    tokenizer,
    base_url="http://localhost:8000",
) -> Optional[dict]:
    """Run a decode benchmark with aiperf and return its parsed result.

    aiperf is launched twice with the same random seed: first a warm-up run
    whose artifacts go to ``f"{aiperf_artifact_dir}_warmup"``, then the
    measured run whose artifacts go to ``aiperf_artifact_dir``.

    Args:
        isl: Input sequence length for the synthetic prompts.
        osl: Output sequence length.
        num_request: Number of requests (also used as the concurrency).
        aiperf_artifact_dir: Directory for the measured run's artifacts.
        model_name: Model name passed to aiperf.
        tokenizer: Tokenizer name passed to aiperf.
        base_url: URL of the server under test.

    Returns:
        The parsed aiperf result of the measured run, or ``None`` when that
        run exits with a non-zero return code. A failing warm-up run is only
        logged and does not abort the benchmark.

    Raises:
        FileNotFoundError: Propagated from ``get_aiperf_result`` if the
            measured run exits successfully but no result file is found.
    """
    logger.info(f"Profiling decode with num_request {num_request}...")

    # First warm up the engine by pre-computing all prefill tokens.
    # The same random seed is used for both runs so the prompts are the same.
    seed = random.randint(0, 1000000)

    warmup_cmd = get_decode_aiperf_cmd(
        isl,
        osl,
        f"{aiperf_artifact_dir}_warmup",
        num_request,
        seed=seed,
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
    )
    warmup = subprocess.run(
        warmup_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if warmup.returncode != 0:
        # Best-effort step: report the failure but still attempt the real run.
        logger.warning(
            f"AIPerf warmup run failed with error code: {warmup.returncode}"
        )

    # Then send out the real requests; hopefully this skips all prefill
    # computation.
    aiperf_cmd = get_decode_aiperf_cmd(
        isl,
        osl,
        aiperf_artifact_dir,
        num_request,
        seed=seed,
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
    )
    completed = subprocess.run(
        aiperf_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if completed.returncode != 0:
        logger.error(f"AIPerf failed with error code: {completed.returncode}")
        logger.error(f"stderr: {completed.stderr}")
        return None

    logger.info("AIperf profiling completed successfully")
    logger.debug(completed.stdout)
    return get_aiperf_result(aiperf_artifact_dir)