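"""Benchmark vLLM prefill/decode throughput with randomly generated token prompts.

Example invocation (the model path below is illustrative):

    python bench_vllm.py --model meta-llama/Llama-2-7b-hf \
        --batch-size 1,4,16 --input-len 256 --output-len 128 --tp 1
"""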
import argparse
import itertools
import json
import random
import time

import torch

from vllm.engine.llm_engine import LLMEngine
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import TokensPrompt
from vllm.sampling_params import SamplingParams

# import os
# import logging
# os.environ["VLLM_LOGGING_LEVEL"] = "ERROR"
# logging.getLogger("vllm").setLevel(logging.ERROR)


def parse_list(value: str):
    """Parse an int-list argument: a single int or a list of ints.

    Examples:
        "1" -> [1]
        "[1,2,4]" -> [1, 2, 4]
        "1,2,4" -> [1, 2, 4]
    """
    value = value.strip()
    # Try to parse as JSON list first
    if value.startswith("[") and value.endswith("]"):
        try:
            result = json.loads(value)
            if isinstance(result, list):
                return [int(x) for x in result]
            return [int(result)]
        except (json.JSONDecodeError, ValueError):
            pass

    # Try to parse as comma-separated values
    if "," in value:
        try:
            return [int(x.strip()) for x in value.split(",")]
        except ValueError:
            pass

    # Try to parse as a single integer
    try:
        return [int(value)]
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected an int or a list of ints, got: {value}"
        )


def run_one_case(
    engine: LLMEngine,
    batch_size: int,
    input_len: int,
    output_len: int,
    vocab_size: int,
):
    # ------------------------------------------------------------
    # 1. Random input token IDs
    # ------------------------------------------------------------
    input_ids_list = [
        [random.randint(0, vocab_size - 1) for _ in range(input_len)]
        for _ in range(batch_size)
    ]

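    # ignore_eos makes every request emit exactly `output_len` tokens, so the
    # token counts used for throughput below are deterministic.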
    sampling_params = SamplingParams(
        max_tokens=output_len,
        ignore_eos=True,
        temperature=1.0,
        top_p=0.9,
        top_k=50,
    )

    request_ids = []
    for i, input_ids in enumerate(input_ids_list):
        rid = f"req_{i}"
        engine.add_request(
            request_id=rid,
            prompt=TokensPrompt(prompt_token_ids=input_ids),
            params=sampling_params,
        )
        request_ids.append(rid)

    # ------------------------------------------------------------
    # 2. Run until first decode token appears for all requests (prefill timing)
    # ------------------------------------------------------------
    t0 = time.perf_counter()
    pre_decode = 0  # decode tokens that land in prefill-dominated steps; excluded from decode throughput
    pending = set(request_ids)
    while pending:
        outputs = engine.step()
        for out in outputs:
            if len(out.outputs[0].token_ids) > 0:
                if out.request_id in pending:
                    pending.remove(out.request_id)
                else:
                    pre_decode += 1
    torch.cuda.synchronize()
    t1 = time.perf_counter()

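    # Prefill throughput counts prompt tokens only; each request's first
    # generated token (and any pre_decode extras) is a byproduct of this phase.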
    prefill_time = t1 - t0
    prefill_tokens = batch_size * input_len

    # ------------------------------------------------------------
    # 3. Decode until all requests finish
    # ------------------------------------------------------------
    decode_start = time.perf_counter()

    while engine.has_unfinished_requests():
        engine.step()

    torch.cuda.synchronize()
    decode_end = time.perf_counter()

    decode_time = decode_end - decode_start
    decode_tokens = (
        batch_size * (output_len - 1) - pre_decode
    )  # exclude the first token per request and prefill-mixed tokens

    return {
        "batch_size": batch_size,
        "input_len": input_len,
        "output_len": output_len,
        "prefill_tput": prefill_tokens / prefill_time,
        "decode_tput": decode_tokens / decode_time,
    }


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True)
    parser.add_argument(
        "--batch-size",
        type=parse_list,
        default=[1],
        help=(
            "number of prompts in a batch (int or list, e.g. '1', '1,2,4', '[1,2,4]')"
        ),
    )
    parser.add_argument(
        "--input-len",
        type=parse_list,
        default=[256],
        help="input sequence length(s)",
    )
    parser.add_argument(
        "--output-len",
        type=parse_list,
        default=[256],
        help="output sequence length(s)",
    )
    parser.add_argument(
        "--tensor-parallel-size",
        "--tp",
        type=int,
        default=1,
        help="total rank for tensor parallel",
    )
    parser.add_argument("--dtype", type=str, default="auto")
    parser.add_argument("--num-iters-warmup", type=int, default=2)
    args = parser.parse_args()

    # ------------------------------------------------------------
    # Engine init (TP supported here)
    # ------------------------------------------------------------
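    # Chunked prefill is disabled so the prefill and decode phases timed in
    # run_one_case stay separable.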
    engine_args = EngineArgs(
        model=args.model,
        dtype=args.dtype,
        tensor_parallel_size=args.tensor_parallel_size,
        trust_remote_code=True,
        distributed_executor_backend="mp",
        enable_chunked_prefill=False,
    )

    engine = LLMEngine.from_engine_args(engine_args)
    vocab_size = engine.model_config.get_vocab_size()

    # ------------------------------------------------------------
    # Sweep all combinations
    # ------------------------------------------------------------
    print("\n=== Running benchmark ===")
    results = []

    try:
        for bs, il, ol in itertools.product(
            args.batch_size, args.input_len, args.output_len
        ):
            # Warmup
            for _ in range(args.num_iters_warmup):
                run_one_case(
                    engine,
                    batch_size=bs,
                    input_len=il,
                    output_len=ol,
                    vocab_size=vocab_size,
                )

            res = run_one_case(
                engine,
                batch_size=bs,
                input_len=il,
                output_len=ol,
                vocab_size=vocab_size,
            )
            results.append(res)

            print(
                f"[TP={args.tensor_parallel_size} | "
                f"bs={bs} in={il} out={ol}] "
                f"prefill={res['prefill_tput']} tok/s | "
                f"decode={res['decode_tput']} tok/s"
            )
    except Exception as e:
        print(f"Error Occured: {e}")
    # ------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------
    print("\n=== Summary ===")
    print("bs   in_len   out_len   prefill_tok/s     decode_tok/s")
    for r in results:
        print(
            f"{r['batch_size']:3d}  "
            f"{r['input_len']:7d}  "
            f"{r['output_len']:8d}  "
            f"{r['prefill_tput']:14.2f}  "
            f"{r['decode_tput']:14.2f}"
        )


if __name__ == "__main__":
    torch.manual_seed(0)
    random.seed(0)
    main()