"""Benchmark the latency of processing a single batch of requests."""
import argparse
import json
import time
from pathlib import Path
from typing import List, Optional

import numpy as np
import torch
from tqdm import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptStrictInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS


def main(args: argparse.Namespace):
    """Run the single-batch latency benchmark described by ``args``.

    Builds an ``LLM`` from the command-line options, creates one batch of
    random prompt token ids, runs the warmup iterations, and then either
    profiles a single run with ``torch.profiler`` (when ``args.profile`` is
    set) or times ``args.num_iters`` runs and prints the mean and the
    10/25/50/75/90th percentile latencies. When ``args.output_json`` is
    set, the timing results are also written to that path as JSON.

    Args:
        args: Parsed command-line options, as produced by the argument
            parser in this script's ``__main__`` section.
    """
    print(args)

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(model=args.model,
              speculative_model=args.speculative_model,
              num_speculative_tokens=args.num_speculative_tokens,
              tokenizer=args.tokenizer,
              quantization=args.quantization,
              tensor_parallel_size=args.tensor_parallel_size,
              trust_remote_code=args.trust_remote_code,
              dtype=args.dtype,
              enforce_eager=args.enforce_eager,
              kv_cache_dtype=args.kv_cache_dtype,
              quantization_param_path=args.quantization_param_path,
              device=args.device,
              ray_workers_use_nsight=args.ray_workers_use_nsight,
              use_v2_block_manager=args.use_v2_block_manager,
              enable_chunked_prefill=args.enable_chunked_prefill,
              download_dir=args.download_dir,
              block_size=args.block_size,
              gpu_memory_utilization=args.gpu_memory_utilization,
              load_format=args.load_format,
              distributed_executor_backend=args.distributed_executor_backend)

    # ignore_eos=True together with max_tokens=args.output_len is used so
    # that every request is asked for the same number of output tokens.
    sampling_params = SamplingParams(
        n=args.n,
        temperature=0.0 if args.use_beam_search else 1.0,
        top_p=1.0,
        use_beam_search=args.use_beam_search,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)

    # Random token ids in [0, 10000), one row of `input_len` ids per request.
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
    dummy_inputs: List[PromptStrictInputs] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]

    def run_to_completion(profile_dir: Optional[str] = None):
        """Run one ``llm.generate`` call over the dummy batch.

        With ``profile_dir`` set, the call runs under ``torch.profiler``
        (CPU and CUDA activities), the trace is handed to the tensorboard
        trace handler for that directory, the key averages are printed,
        and nothing is returned. Otherwise the call is timed with
        ``time.perf_counter`` and the elapsed seconds are returned.
        """
        if profile_dir:
            with torch.profiler.profile(
                    activities=[
                        torch.profiler.ProfilerActivity.CPU,
                        torch.profiler.ProfilerActivity.CUDA,
                    ],
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
                        str(profile_dir))) as p:
                llm.generate(dummy_inputs,
                             sampling_params=sampling_params,
                             use_tqdm=False)
            print(p.key_averages())
        else:
            start_time = time.perf_counter()
            llm.generate(dummy_inputs,
                         sampling_params=sampling_params,
                         use_tqdm=False)
            end_time = time.perf_counter()
            latency = end_time - start_time
            return latency

    print("Warming up...")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        run_to_completion(profile_dir=None)

    if args.profile:
        profile_dir = args.profile_result_dir
        if not profile_dir:
            # No directory given: fall back to a timestamped directory
            # under ./vllm_benchmark_result.
            profile_dir = Path(
                "."
            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
        print(f"Profiling (results will be saved to '{profile_dir}')...")
        run_to_completion(profile_dir=profile_dir)
        # Profiling mode stops here; no latency statistics are collected.
        return

    # Benchmark.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90]
    percentiles = np.percentile(latencies, percentages)
    print(f'Avg latency: {np.mean(latencies)} seconds')
    for percentage, percentile in zip(percentages, percentiles):
        print(f'{percentage}% percentile latency: {percentile} seconds')

    # Output JSON results if specified
    if args.output_json:
        results = {
            "avg_latency": np.mean(latencies),
            "latencies": latencies.tolist(),
            "percentiles": dict(zip(percentages, percentiles.tolist())),
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)

117
118

# Script entry point: declare the benchmark's command-line options, parse
# them, and hand the resulting namespace to main().
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')

    # Model selection and engine construction options.
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
    parser.add_argument('--speculative-model', type=str, default=None)
    parser.add_argument('--num-speculative-tokens', type=int, default=None)
    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
                        choices=[*QUANTIZATION_METHODS, None],
                        default=None)
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)

    # Shape of the synthetic workload.
    parser.add_argument('--input-len', type=int, default=32)
    parser.add_argument('--output-len', type=int, default=128)
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--n',
                        type=int,
                        default=1,
                        help='Number of generated sequences per prompt.')
    parser.add_argument('--use-beam-search', action='store_true')

    # Iteration counts.
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=10,
                        help='Number of iterations to run for warmup.')
    parser.add_argument('--num-iters',
                        type=int,
                        default=30,
                        help='Number of iterations to run.')

    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help='data type for model weights and activations. '
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    parser.add_argument('--enforce-eager',
                        action='store_true',
                        help='enforce eager mode and disable CUDA graph')
    parser.add_argument(
        '--kv-cache-dtype',
        type=str,
        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
        default="auto",
        help='Data type for kv cache storage. If "auto", will use model '
        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
    parser.add_argument(
        '--quantization-param-path',
        type=str,
        default=None,
        help='Path to the JSON file containing the KV cache scaling factors. '
        'This should generally be supplied, when KV cache dtype is FP8. '
        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
        'instead supported for common inference criteria.')

    # Profiling options.
    parser.add_argument(
        '--profile',
        action='store_true',
        help='profile the generation process of a single batch')
    parser.add_argument(
        '--profile-result-dir',
        type=str,
        default=None,
        help=('path to save the pytorch profiler output. Can be visualized '
              'with ui.perfetto.dev or Tensorboard.'))

    # The help text names every accepted choice so that it stays in step
    # with `choices`.
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda", "cpu", "tpu", "xpu"],
        help='device type for vLLM execution, supporting CUDA, CPU, TPU '
        'and XPU.')
    parser.add_argument('--block-size',
                        type=int,
                        default=16,
                        help='block size of key/value cache')
    parser.add_argument(
        '--enable-chunked-prefill',
        action='store_true',
        help='If True, the prefill requests can be chunked based on the '
        'max_num_batched_tokens')
    parser.add_argument('--use-v2-block-manager', action='store_true')
    parser.add_argument(
        "--ray-workers-use-nsight",
        action='store_true',
        help="If specified, use nsight to profile ray workers",
    )
    parser.add_argument('--download-dir',
                        type=str,
                        default=None,
                        help='directory to download and load the weights, '
                        'default to the default cache dir of huggingface')
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the latency results in JSON format.')
    # Adjacent string literals are concatenated without a separator, so each
    # fragment that is followed by another sentence ends with a space.
    parser.add_argument('--gpu-memory-utilization',
                        type=float,
                        default=0.9,
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1. '
                        'If unspecified, will use the default value of 0.9.')
    parser.add_argument(
        '--load-format',
        type=str,
        default=EngineArgs.load_format,
        choices=[
            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
            'bitsandbytes'
        ],
        help='The format of the model weights to load.\n\n'
        '* "auto" will try to load the weights in the safetensors format '
        'and fall back to the pytorch bin format if safetensors format '
        'is not available.\n'
        '* "pt" will load the weights in the pytorch bin format.\n'
        '* "safetensors" will load the weights in the safetensors format.\n'
        '* "npcache" will load the weights in pytorch format and store '
        'a numpy cache to speed up the loading.\n'
        '* "dummy" will initialize the weights with random values, '
        'which is mainly for profiling.\n'
        '* "tensorizer" will load the weights using tensorizer from '
        'CoreWeave. See the Tensorize vLLM Model script in the Examples '
        'section for more information.\n'
        '* "bitsandbytes" will load the weights using bitsandbytes '
        'quantization.\n')
    parser.add_argument(
        '--distributed-executor-backend',
        choices=['ray', 'mp'],
        default=None,
        help='Backend to use for distributed serving. When more than 1 GPU '
        'is used, will be automatically set to "ray" if installed '
        'or "mp" (multiprocessing) otherwise.')
    args = parser.parse_args()
    main(args)