aiperf.py 6.53 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
import random
import subprocess

# Module-level logging: records at INFO and above are emitted through a
# dedicated stream handler using a
# "<timestamp> - <logger name> - <level> - <message>" layout.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)


33
def _get_common_aiperf_cmd(
    artifact_dir,
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    base_url="http://localhost:8000",
):
    """Build the ``aiperf profile`` arguments shared by all benchmark runs.

    The returned list targets the streaming chat endpoint
    ``/v1/chat/completions`` at ``base_url``, requests 3 warm-up requests,
    and passes ``ignore_eos`` both as a top-level extra input and nested
    under ``nvext``.

    Args:
        artifact_dir: Value passed to ``--artifact-dir``.
        seed: Value passed to ``--random-seed`` (converted with ``str``).
        model: Value passed to ``--model``.
        tokenizer: Value passed to ``--tokenizer``.
        base_url: Value passed to ``--url``.

    Returns:
        A new list of command-line arguments starting with ``"aiperf"``.
        Callers append their workload-specific flags to it.
    """
    return [
        "aiperf",
        "profile",
        "--model",
        model,
        "--tokenizer",
        tokenizer,
        "--endpoint-type",
        "chat",
        "--endpoint",
        "/v1/chat/completions",
        "--streaming",
        "--url",
        base_url,
        "--extra-inputs",
        "ignore_eos:true",
        "--extra-inputs",
        '{"nvext":{"ignore_eos":true}}',
        "--warmup-request-count",
        "3",
        "--artifact-dir",
        artifact_dir,
        "--random-seed",
        str(seed),
    ]


67
def get_prefill_aiperf_cmd(
    isl,
    artifact_dir,
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    osl=5,
    base_url="http://localhost:8000",
):
    """Build the aiperf command for a single-request prefill measurement.

    Extends the common aiperf arguments with fixed-length synthetic input
    (standard deviation 0), a fixed output length pinned through both
    ``max_tokens`` and ``min_tokens``, concurrency 1 and a request count
    of 1.

    Args:
        isl: Value used for ``--synthetic-input-tokens-mean``.
        artifact_dir: Value passed to ``--artifact-dir``.
        seed: Value passed to ``--random-seed``.
        model: Value passed to ``--model``.
        tokenizer: Value passed to ``--tokenizer``.
        osl: Value used for ``--output-tokens-mean`` and for the
            ``max_tokens`` / ``min_tokens`` extra inputs.
        base_url: Value passed to ``--url``.

    Returns:
        The full argument list as a list of strings.
    """
    # Forward by keyword so a change in the helper's parameter order cannot
    # silently swap model/tokenizer/url.
    common_args = _get_common_aiperf_cmd(
        artifact_dir,
        seed=seed,
        model=model,
        tokenizer=tokenizer,
        base_url=base_url,
    )
    return common_args + [
        "--synthetic-input-tokens-mean",
        str(isl),
        "--synthetic-input-tokens-stddev",
        "0",
        "--output-tokens-mean",
        str(osl),
        "--output-tokens-stddev",
        "0",
        "--extra-inputs",
        f"max_tokens:{osl}",
        "--extra-inputs",
        f"min_tokens:{osl}",
        "--concurrency",
        "1",
        "--request-count",
        "1",
    ]


102
def get_decode_aiperf_cmd(
    isl,
    osl,
    artifact_dir,
    num_request,
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    base_url="http://localhost:8000",
):
    """Build the aiperf command for a decode measurement.

    Extends the common aiperf arguments with fixed-length synthetic input
    and output (standard deviation 0, output length pinned through both
    ``max_tokens`` and ``min_tokens``). ``num_request`` is used for the
    concurrency, the number of dataset entries and the request count alike.

    Args:
        isl: Value used for ``--synthetic-input-tokens-mean``.
        osl: Value used for ``--output-tokens-mean`` and for the
            ``max_tokens`` / ``min_tokens`` extra inputs.
        artifact_dir: Value passed to ``--artifact-dir``.
        num_request: Value used for ``--concurrency``,
            ``--num-dataset-entries`` and ``--request-count``.
        seed: Value passed to ``--random-seed``.
        model: Value passed to ``--model``.
        tokenizer: Value passed to ``--tokenizer``.
        base_url: Value passed to ``--url``.

    Returns:
        The full argument list as a list of strings.
    """
    # Forward by keyword so a change in the helper's parameter order cannot
    # silently swap model/tokenizer/url.
    common_args = _get_common_aiperf_cmd(
        artifact_dir,
        seed=seed,
        model=model,
        tokenizer=tokenizer,
        base_url=base_url,
    )
    return common_args + [
        "--synthetic-input-tokens-mean",
        str(isl),
        "--synthetic-input-tokens-stddev",
        "0",
        "--output-tokens-mean",
        str(osl),
        "--output-tokens-stddev",
        "0",
        "--extra-inputs",
        f"max_tokens:{osl}",
        "--extra-inputs",
        f"min_tokens:{osl}",
        "--concurrency",
        str(num_request),
        "--num-dataset-entries",
        str(num_request),
        "--request-count",
        str(num_request),
    ]


140
def get_aiperf_result(artifact_dir: str) -> dict:
    """Load the aiperf profile export found under ``artifact_dir``.

    Walks ``artifact_dir`` recursively and parses the first
    ``profile_export_aiperf.json`` encountered.

    Args:
        artifact_dir: Directory to search.

    Returns:
        The parsed JSON content of the export file.

    Raises:
        FileNotFoundError: If no ``profile_export_aiperf.json`` exists
            anywhere under ``artifact_dir``.
    """
    export_name = "profile_export_aiperf.json"
    for root, _, files in os.walk(artifact_dir):
        if export_name in files:
            # Decode explicitly as UTF-8 so parsing does not depend on the
            # platform's locale encoding.
            with open(os.path.join(root, export_name), "r", encoding="utf-8") as f:
                return json.load(f)
    raise FileNotFoundError(f"{export_name} not found in {artifact_dir}")


154
def benchmark_prefill(
    isl,
    aiperf_artifact_dir,
    model_name,
    tokenizer,
    base_url="http://localhost:8000",
):
    """Run one aiperf prefill benchmark and return its exported result.

    Builds the command with ``get_prefill_aiperf_cmd``, runs it as a
    subprocess with stdout/stderr captured, and on success loads the
    result from ``aiperf_artifact_dir`` via ``get_aiperf_result``.

    Args:
        isl: Input length forwarded to ``get_prefill_aiperf_cmd``.
        aiperf_artifact_dir: Directory aiperf writes artifacts to and the
            result is read back from.
        model_name: Model name forwarded as ``model``.
        tokenizer: Tokenizer name forwarded as ``tokenizer``.
        base_url: Base URL of the server under test.

    Returns:
        The parsed aiperf result, or ``None`` if aiperf exited with a
        non-zero return code (the code and stderr are logged).

    Raises:
        FileNotFoundError: Propagated from ``get_aiperf_result`` when
            aiperf succeeds but no export file is found.
    """
    logger.info(f"Running aiperf with isl {isl}")
    aiperf_cmd = get_prefill_aiperf_cmd(
        isl,
        aiperf_artifact_dir,
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
    )
    # Route the command through the module logger like every other message
    # in this file instead of a bare print.
    logger.info(f"aiperf cmd: {aiperf_cmd}")

    completed = subprocess.run(
        aiperf_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if completed.returncode != 0:
        logger.error(f"AIPerf failed with error code: {completed.returncode}")
        logger.error(f"stderr: {completed.stderr}")
        return None

    logger.info("AIperf profiling completed successfully")
    logger.info(completed.stdout)
    return get_aiperf_result(aiperf_artifact_dir)


189
190
191
192
def benchmark_decode(
    isl,
    osl,
    num_request,
    aiperf_artifact_dir,
    model_name,
    tokenizer,
    base_url="http://localhost:8000",
):
    """Run an aiperf decode benchmark (warm-up pass, then measured pass).

    Two aiperf runs are launched with the same randomly drawn seed. The
    first writes to ``f"{aiperf_artifact_dir}_warmup"`` and its output is
    discarded; the second writes to ``aiperf_artifact_dir`` and provides
    the returned result.

    Args:
        isl: Input length forwarded to ``get_decode_aiperf_cmd``.
        osl: Output length forwarded to ``get_decode_aiperf_cmd``.
        num_request: Request count forwarded to ``get_decode_aiperf_cmd``.
        aiperf_artifact_dir: Artifact directory of the measured run; the
            warm-up run uses the same path with a ``_warmup`` suffix.
        model_name: Model name forwarded as ``model``.
        tokenizer: Tokenizer name forwarded as ``tokenizer``.
        base_url: Base URL of the server under test.

    Returns:
        The parsed aiperf result of the measured run, or ``None`` if that
        run exited with a non-zero return code (the code and stderr are
        logged).

    Raises:
        FileNotFoundError: Propagated from ``get_aiperf_result`` when the
            measured run succeeds but no export file is found.
    """
    logger.info(f"Profiling decode with num_request {num_request}...")

    # first warm-up the engine by pre-computing all prefill tokens
    # we use the same random seed to make sure the prompt is the same
    seed = random.randint(0, 1000000)

    warmup_cmd = get_decode_aiperf_cmd(
        isl,
        osl,
        f"{aiperf_artifact_dir}_warmup",
        num_request,
        seed=seed,
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
    )
    warmup = subprocess.run(
        warmup_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if warmup.returncode != 0:
        # The warm-up stays best-effort, but a failure here means the
        # measured run below may include prefill work, so surface it.
        logger.warning(
            f"AIPerf warmup failed with error code: {warmup.returncode}"
        )
        logger.warning(f"stderr: {warmup.stderr}")

    # then send out the real requests, hopefully, this will skip all prefill computation
    aiperf_cmd = get_decode_aiperf_cmd(
        isl,
        osl,
        aiperf_artifact_dir,
        num_request,
        seed=seed,
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
    )
    completed = subprocess.run(
        aiperf_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if completed.returncode != 0:
        logger.error(f"AIPerf failed with error code: {completed.returncode}")
        logger.error(f"stderr: {completed.stderr}")
        return None

    logger.info("AIperf profiling completed successfully")
    logger.info(completed.stdout)
    return get_aiperf_result(aiperf_artifact_dir)