#!/usr/bin/env python3

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import argparse
import json
import logging
import os
import subprocess
from typing import Dict, List, Optional

import matplotlib

matplotlib.use("Agg")  # Use non-interactive backend
import matplotlib.pyplot as plt

# Module-level logger: INFO and above, timestamped, emitted through one
# stream handler attached directly to this module's logger.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(console_handler)


def get_aiperf_cmd(
    model,
    tokenizer,
    prefix_ratio,
    isl,
    osl,
    requests,
    concurrency,
    seed,
    num_prefix_prompts,
    artifact_dir,
    url="http://localhost:8888",
    use_expected_osl=False,
) -> List[str]:
    """Build the ``aiperf profile`` argument list for one prefix ratio.

    The input sequence length is split into a prefix part
    (``--prefix-prompt-length``) and a synthetic part
    (``--synthetic-input-tokens-mean``).

    Args:
        model: Value passed to ``--model``.
        tokenizer: Value passed to ``--tokenizer``.
        prefix_ratio: Fraction of ``isl`` assigned to the prefix prompt.
        isl: Total input sequence length to split.
        osl: Value passed to ``--output-tokens-mean``; also sent as
            ``expected_output_tokens`` when ``use_expected_osl`` is set.
        requests: Used for both ``--request-count`` and
            ``--num-dataset-entries``.
        concurrency: Value passed to ``--concurrency``.
        seed: Value passed to ``--random-seed``.
        num_prefix_prompts: Value passed to ``--num-prefix-prompts``.
        artifact_dir: Value passed to ``--artifact-dir``.
        url: Value passed to ``--url``.
        use_expected_osl: When true, add ``expected_output_tokens`` to the
            ``nvext`` extra-inputs payload.

    Returns:
        The command as a list of strings, ready for ``subprocess``.
    """
    # Truncate the prefix share, then hand the remainder to the synthetic part.
    # Computing both sides independently with int() can lose a token to float
    # rounding (e.g. isl=100, ratio=0.29 -> 28 + 71), so the two lengths would
    # no longer add up to isl.
    prefix_length = int(isl * prefix_ratio)
    synthetic_input_length = int(isl - prefix_length)

    # Build nvext JSON with optional expected_output_tokens
    nvext_dict = {"ignore_eos": True}
    if use_expected_osl:
        nvext_dict["expected_output_tokens"] = osl
    nvext_json = json.dumps({"nvext": nvext_dict})

    return [
        "aiperf",
        "profile",
        "--model",
        model,
        "--tokenizer",
        tokenizer,
        "--endpoint-type",
        "chat",
        "--endpoint",
        "v1/chat/completions",
        "--streaming",
        "--url",
        url,
        "--synthetic-input-tokens-mean",
        str(synthetic_input_length),
        "--synthetic-input-tokens-stddev",
        str(round(synthetic_input_length / 4)),
        "--output-tokens-mean",
        str(osl),
        "--output-tokens-stddev",
        str(round(osl / 4)),
        "--extra-inputs",
        "ignore_eos:true",
        "--extra-inputs",
        nvext_json,
        "--concurrency",
        str(concurrency),
        "--request-count",
        str(requests),
        "--num-dataset-entries",
        str(requests),
        "--random-seed",
        str(seed),
        "--prefix-prompt-length",
        str(prefix_length),
        "--num-prefix-prompts",
        str(num_prefix_prompts),
        "--artifact-dir",
        artifact_dir,
        "--dataset-sampling-strategy",
        "shuffle",
        "-H",
        "Authorization: Bearer NOT USED",
        "-H",
        "Accept: text/event-stream",
    ]


def get_aiperf_result(artifact_dir: str) -> dict:
    """Load the aiperf JSON export found under ``artifact_dir``.

    The directory tree is walked and the first ``profile_export_aiperf.json``
    encountered is parsed.

    Args:
        artifact_dir: Root directory to search.

    Returns:
        The parsed JSON content of the export file.

    Raises:
        FileNotFoundError: If no export file exists anywhere under
            ``artifact_dir``.
    """
    export_name = "profile_export_aiperf.json"

    json_file_path = next(
        (
            os.path.join(root, export_name)
            for root, _, files in os.walk(artifact_dir)
            if export_name in files
        ),
        None,
    )
    if json_file_path is None:
        raise FileNotFoundError(f"{export_name} not found in {artifact_dir}")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def run_benchmark_single_url(
    model,
    tokenizer,
    prefix_ratio,
    isl,
    osl,
    requests,
    concurrency,
    seed,
    num_prefix_prompts,
    artifact_dir,
    url,
    use_expected_osl=False,
) -> Optional[Dict]:
    """Run one aiperf benchmark against a single URL and load its results.

    The command is built by ``get_aiperf_cmd`` and run to completion; the
    results are then read from ``artifact_dir`` by ``get_aiperf_result``.

    Args:
        model, tokenizer, prefix_ratio, isl, osl, requests, concurrency, seed,
        num_prefix_prompts, artifact_dir, url, use_expected_osl: Forwarded
            unchanged to ``get_aiperf_cmd``.

    Returns:
        The parsed aiperf results, or ``None`` if aiperf exited with a
        non-zero status or its results file could not be read.
    """
    aiperf_cmd = get_aiperf_cmd(
        model,
        tokenizer,
        prefix_ratio,
        isl,
        osl,
        requests,
        concurrency,
        seed,
        num_prefix_prompts,
        artifact_dir,
        url=url,
        use_expected_osl=use_expected_osl,
    )

    logger.info(f"Running command for URL {url}: {' '.join(aiperf_cmd)}")

    try:
        # Output is not captured, so aiperf writes straight to the terminal.
        subprocess.run(aiperf_cmd, check=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"AIPerf failed for URL {url} with error code: {e.returncode}")
        return None

    logger.info(f"AIPerf profiling completed successfully for URL {url}")

    # A missing or malformed results file is reported as a failed run (None),
    # matching the multi-URL path, instead of aborting the whole sweep.
    try:
        return get_aiperf_result(artifact_dir)
    except (OSError, ValueError) as e:
        logger.error(f"Failed to get results for URL {url}: {e}")
        return None


def aggregate_results(results: List[Optional[Dict]]) -> Optional[Dict]:
    """Average per-URL percentile metrics into a single result.

    ``None`` entries (failed runs) are ignored. For each of
    ``time_to_first_token`` and ``inter_token_latency``, the ``p25``, ``p50``
    and ``p75`` values are averaged across the remaining results.

    Args:
        results: Per-URL result dicts, with ``None`` for runs that failed.

    Returns:
        A dict with the two metric keys, each mapping ``p25``/``p50``/``p75``
        to the mean across URLs, or ``None`` if there is nothing to aggregate.
    """
    if not results:
        return None

    valid_results = [r for r in results if r is not None]
    if not valid_results:
        return None

    count = len(valid_results)
    return {
        metric: {
            percentile: sum(r[metric][percentile] for r in valid_results) / count
            for percentile in ("p25", "p50", "p75")
        }
        for metric in ("time_to_first_token", "inter_token_latency")
    }


def run_benchmark(
    model,
    tokenizer,
    prefix_ratio,
    isl,
    osl,
    requests,
    concurrency,
    seed,
    num_prefix_prompts,
    output_dir,
    urls,
    use_expected_osl=False,
) -> Optional[Dict]:
    """Run the aiperf benchmark for one prefix ratio against one or more URLs.

    With a single URL the work is delegated to ``run_benchmark_single_url``.
    With several URLs, ``requests`` and ``concurrency`` are divided among
    them, one aiperf process per URL is launched in parallel, and the per-URL
    results are combined by ``aggregate_results``.

    Args:
        model, tokenizer, prefix_ratio, isl, osl, seed, num_prefix_prompts,
        use_expected_osl: Forwarded to the aiperf command builder.
        requests: Total request count, split across URLs.
        concurrency: Total concurrency, split across URLs (at least 1 each).
        output_dir: Directory under which per-run artifact directories are
            created.
        urls: A single URL string or a list of URLs.

    Returns:
        The (aggregated) result dict, or ``None`` if no run produced results.

    Raises:
        ValueError: If ``urls`` is empty.
    """
    logger.info(
        f"Running benchmark with prefix_ratio={prefix_ratio}, seed={seed}, URLs={urls}"
    )

    # Accept a bare string for backward compatibility with single-URL callers.
    if isinstance(urls, str):
        urls = [urls]
    if not urls:
        raise ValueError("At least one URL is required")

    run_name = f"prefix_ratio_{prefix_ratio}_seed_{seed}"

    if len(urls) == 1:
        artifact_dir = os.path.join(output_dir, run_name)
        os.makedirs(artifact_dir, exist_ok=True)

        return run_benchmark_single_url(
            model,
            tokenizer,
            prefix_ratio,
            isl,
            osl,
            requests,
            concurrency,
            seed,
            num_prefix_prompts,
            artifact_dir,
            urls[0],
            use_expected_osl,
        )

    # Multiple URLs: split requests and concurrency. The remainders go to the
    # first few URLs so the totals match what the caller asked for.
    num_urls = len(urls)
    base_requests, extra_requests = divmod(requests, num_urls)
    base_concurrency, extra_concurrency = divmod(concurrency, num_urls)

    # Launch parallel processes
    processes = []
    for i, url in enumerate(urls):
        url_requests = base_requests + (1 if i < extra_requests else 0)
        if url_requests <= 0:
            # Fewer requests than URLs: there is nothing to send to this one.
            logger.warning(f"Skipping URL {url}: no requests assigned")
            continue
        url_concurrency = max(
            1, base_concurrency + (1 if i < extra_concurrency else 0)
        )

        artifact_dir = os.path.join(output_dir, f"{run_name}_url_{i}")
        os.makedirs(artifact_dir, exist_ok=True)

        aiperf_cmd = get_aiperf_cmd(
            model,
            tokenizer,
            prefix_ratio,
            isl,
            osl,
            url_requests,
            url_concurrency,
            seed,
            num_prefix_prompts,
            artifact_dir,
            url,
            use_expected_osl,
        )

        logger.info(f"Launching process for URL {url}: {' '.join(aiperf_cmd)}")

        # Run process without capturing output - let it stream to terminal
        process = subprocess.Popen(aiperf_cmd)
        processes.append((process, url, artifact_dir))

    # Wait for all processes to complete and collect results
    results: List[Optional[Dict]] = []
    for process, url, artifact_dir in processes:
        return_code = process.wait()

        if return_code != 0:
            logger.error(f"AIPerf failed for URL {url} with error code: {return_code}")
            results.append(None)
            continue

        logger.info(f"AIPerf completed successfully for URL {url}")
        try:
            results.append(get_aiperf_result(artifact_dir))
        except (OSError, ValueError) as e:
            # Missing or unparsable results file: count this URL as failed.
            logger.error(f"Failed to get results for URL {url}: {e}")
            results.append(None)

    # Aggregate results
    return aggregate_results(results)


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the prefix-ratio sweep."""
    parser = argparse.ArgumentParser(
        description="Benchmark prefix ratios and plot results"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        help="Model name",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        # None means "use --model"; resolved in main() so the help text holds.
        default=None,
        help="Tokenizer name (defaults to model)",
    )
    parser.add_argument(
        "--url",
        type=str,
        nargs="+",  # Accept multiple URLs
        default=["http://localhost:8000"],
        help="Server URL(s). Can specify multiple URLs for parallel benchmarking",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="kv_router",
        help="Output directory for results",
    )
    parser.add_argument("--num-prefix-prompts", type=int, default=20)
    parser.add_argument("--isl", type=int, default=14000, help="Input sequence length")
    parser.add_argument("--osl", type=int, default=200, help="Output sequence length")
    parser.add_argument("--requests", type=int, default=200, help="Number of requests")
    parser.add_argument("--concurrency", type=int, default=20, help="Concurrency level")
    parser.add_argument("--seed", type=int, default=0, help="Initial random seed")
    parser.add_argument(
        "--prefix-ratios",
        type=float,
        nargs="+",
        default=[0.1, 0.3, 0.5, 0.7, 0.9],
        help="List of prefix ratios to test",
    )
    parser.add_argument(
        "--use-expected-osl",
        action="store_true",
        help="Pass expected_output_tokens to nvext for router tracking",
    )
    return parser


def _plot_percentile_panel(
    position, prefix_ratios, p25, p50, p75, color, line_format, ylabel, title
):
    """Draw one subplot: shaded p25-p75 band, p50 line, and a label per point."""
    plt.subplot(1, 2, position)
    plt.fill_between(
        prefix_ratios,
        p25,
        p75,
        alpha=0.3,
        color=color,
        label="p25-p75",
    )
    plt.plot(
        prefix_ratios,
        p50,
        line_format,
        linewidth=2,
        markersize=8,
        label="p50",
    )
    plt.xlabel("Prefix Ratio")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.legend()
    for ratio, value in zip(prefix_ratios, p50):
        plt.annotate(
            f"{value:.1f}ms",
            (ratio, value),
            textcoords="offset points",
            xytext=(0, 10),
            ha="center",
        )


def main():
    """Run the prefix-ratio sweep, then plot and save the collected metrics.

    For each requested prefix ratio a benchmark is run with a seed that
    increases by one per ratio. Ratios whose benchmark returns no result are
    left out. If at least one ratio succeeded, a two-panel plot and a JSON
    summary are written into the output directory; otherwise an error is
    logged.
    """
    args = _build_arg_parser().parse_args()

    # Honour the documented "--tokenizer defaults to --model" behaviour.
    tokenizer = args.tokenizer or args.model

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Store results: one list per percentile for each metric.
    percentiles = ("p25", "p50", "p75")
    prefix_ratios = []
    ttft_values = {p: [] for p in percentiles}
    itl_values = {p: [] for p in percentiles}

    # Run benchmarks for each prefix ratio. The seed advances for every ratio,
    # whether or not the run succeeds.
    for offset, prefix_ratio in enumerate(args.prefix_ratios):
        result = run_benchmark(
            args.model,
            tokenizer,
            prefix_ratio,
            args.isl,
            args.osl,
            args.requests,
            args.concurrency,
            args.seed + offset,
            args.num_prefix_prompts,
            args.output_dir,
            args.url,
            args.use_expected_osl,
        )
        if result is None:
            continue

        ttft = result["time_to_first_token"]
        itl = result["inter_token_latency"]

        prefix_ratios.append(prefix_ratio)
        for p in percentiles:
            ttft_values[p].append(ttft[p])
            itl_values[p].append(itl[p])

        logger.info(
            f"Prefix ratio {prefix_ratio}: TTFT p50={ttft['p50']:.2f}ms (p25={ttft['p25']:.2f}, p75={ttft['p75']:.2f}), "
            f"ITL p50={itl['p50']:.2f}ms (p25={itl['p25']:.2f}, p75={itl['p75']:.2f})"
        )

    if not prefix_ratios:
        logger.error("No successful benchmark results to plot")
        return

    # Create plots
    figure = plt.figure(figsize=(12, 5))
    _plot_percentile_panel(
        1,
        prefix_ratios,
        ttft_values["p25"],
        ttft_values["p50"],
        ttft_values["p75"],
        color="blue",
        line_format="bo-",
        ylabel="Time to First Token (ms)",
        title="TTFT vs Prefix Ratio",
    )
    _plot_percentile_panel(
        2,
        prefix_ratios,
        itl_values["p25"],
        itl_values["p50"],
        itl_values["p75"],
        color="red",
        line_format="ro-",
        ylabel="Inter-Token Latency (ms)",
        title="ITL vs Prefix Ratio",
    )
    plt.tight_layout()

    # Save plot
    plot_path = os.path.join(args.output_dir, "prefix_ratio_performance.png")
    plt.savefig(plot_path, dpi=300, bbox_inches="tight")
    plt.close(figure)  # release the figure once it is on disk
    logger.info(f"Performance plot saved to {plot_path}")

    # Save results to JSON (key names and order match the plotted series)
    results_data = {"prefix_ratios": prefix_ratios}
    results_data.update({f"ttft_{p}_values": ttft_values[p] for p in percentiles})
    results_data.update({f"itl_{p}_values": itl_values[p] for p in percentiles})
    results_data["config"] = {
        "model": args.model,
        "tokenizer": tokenizer,
        "isl": args.isl,
        "osl": args.osl,
        "requests": args.requests,
        "concurrency": args.concurrency,
        "initial_seed": args.seed,
    }

    results_path = os.path.join(args.output_dir, "results_summary.json")
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results_data, f, indent=2)
    logger.info(f"Results summary saved to {results_path}")

# Script entry point: run the sweep only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()