Enable --profile in 'vllm bench throughput' (#24575)

Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>

Enable --profile in 'vllm bench throughput' (#24575)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
ee0bc5e1 · Tomas Ruiz · GitHub · 3d1393f6 · ee0bc5e1
Unverified commit ee0bc5e1, authored Sep 11, 2025 by Tomas Ruiz; committed by GitHub Sep 10, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 34 additions and 4 deletions

Changed file: vllm/benchmarks/throughput.py (+34 -4)

No files found.
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -37,6 +37,7 @@ def run_vllm(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
+    do_profile: bool,
    disable_detokenize: bool = False,
 ) -> tuple[float, Optional[list[RequestOutput]]]:
    from vllm import LLM, SamplingParams
@@ -75,10 +76,14 @@ def run_vllm(
    outputs = None
    if not use_beam_search:
        start = time.perf_counter()
+        if do_profile:
+            llm.start_profile()
        outputs = llm.generate(prompts,
                               sampling_params,
                               lora_request=lora_requests,
                               use_tqdm=True)
+        if do_profile:
+            llm.stop_profile()
        end = time.perf_counter()
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
@@ -88,6 +93,8 @@ def run_vllm(
        for request in requests:
            assert request.expected_output_len == output_len
        start = time.perf_counter()
+        if do_profile:
+            llm.start_profile()
        llm.beam_search(
            prompts,
            BeamSearchParams(
@@ -95,6 +102,8 @@ def run_vllm(
                max_tokens=output_len,
                ignore_eos=True,
            ))
+        if do_profile:
+            llm.stop_profile()
        end = time.perf_counter()
    return end - start, outputs
@@ -103,6 +112,7 @@ def run_vllm_chat(
        requests: list[SampleRequest],
        n: int,
        engine_args: EngineArgs,
+        do_profile: bool,
        disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
    """
    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
@@ -133,7 +143,11 @@ def run_vllm_chat(
                detokenize=not disable_detokenize,
            ))
    start = time.perf_counter()
+    if do_profile:
+        llm.start_profile()
    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
+    if do_profile:
+        llm.stop_profile()
    end = time.perf_counter()
    return end - start, outputs
@@ -142,6 +156,7 @@ async def run_vllm_async(
    requests: list[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
+    do_profile: bool,
    disable_frontend_multiprocessing: bool = False,
    disable_detokenize: bool = False,
 ) -> float:
@@ -185,6 +200,8 @@ async def run_vllm_async(
        generators = []
        start = time.perf_counter()
+        if do_profile:
+            await llm.start_profile()
        for i, (prompt, sp,
                lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
            generator = llm.generate(prompt,
@@ -195,6 +212,8 @@ async def run_vllm_async(
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
            pass
+        if do_profile:
+            await llm.stop_profile()
        end = time.perf_counter()
        return end - start
@@ -543,6 +562,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
                        type=str,
                        default=None,
                        help="Split of the HF dataset.")
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        default=False,
+        help="Use Torch Profiler. The env variable "
+        "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.")
    # prefix repetition dataset
    prefix_repetition_group = parser.add_argument_group(
@@ -600,22 +625,27 @@ def main(args: argparse.Namespace):
                    requests,
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
-                    args.disable_frontend_multiprocessing,
+                    disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
-                    args.disable_detokenize,
+                    disable_detokenize=args.disable_detokenize,
+                    do_profile=args.profile,
                ))
        else:
            elapsed_time, request_outputs = run_vllm(
                requests, args.n, EngineArgs.from_cli_args(args),
-                args.disable_detokenize)
+                disable_detokenize=args.disable_detokenize,
+                do_profile=args.profile)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
+        if args.profile:
+            raise NotImplementedError(
+                "Profiling not implemented yet for backend='hf'.")
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                              args.hf_max_batch_size, args.trust_remote_code,
                              args.disable_detokenize)
    elif args.backend == "vllm-chat":
        elapsed_time, request_outputs = run_vllm_chat(
            requests, args.n, EngineArgs.from_cli_args(args),
-            args.disable_detokenize)
+            disable_detokenize=args.disable_detokenize, do_profile=args.profile)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")