Fix benchmark run error

f246ee95 · zhuwenwen · 10184690 · f246ee95 · f246ee95 · f246ee95
Commit f246ee95 authored Jun 23, 2025 by zhuwenwen
Showing with 13 additions and 7 deletions

benchmarks/benchmark_throughput.py benchmarks/benchmark_throughput.py +4 -4

vllm/engine/arg_utils.py vllm/engine/arg_utils.py +6 -0

vllm/perf/benchmark_throughput.py vllm/perf/benchmark_throughput.py +3 -3

No files found.
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -103,9 +103,11 @@ def run_vllm(
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]
+    use_beam_search = False
    print("Warming up...")
    for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
-        if not args.use_beam_search:
+        if not use_beam_search:
            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
        else:
            llm.beam_search(
@@ -117,8 +119,6 @@ def run_vllm(
                ),
            )
-    use_beam_search = False
    outputs = None
    if not use_beam_search:
        if args.profile:
@@ -790,4 +790,4 @@ if __name__ == "__main__":
    if args.tokenizer is None:
        args.tokenizer = args.model
    validate_args(args)
    main(args)
\ No newline at end of file
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -788,6 +788,12 @@ class EngineArgs:
            default=None,
            help="The configurations for speculative decoding. Should be a "
            "JSON string.")
+        parser.add_argument(
+            '--num-speculative-heads',
+            type=int,
+            default=EngineArgs.num_speculative_heads,
+            help='The number of speculative heads to sample from '
+                 'the draft model in speculative decoding.')
        # Observability arguments
        observability_kwargs = get_kwargs(ObservabilityConfig)

--- a/vllm/perf/benchmark_throughput.py
+++ b/vllm/perf/benchmark_throughput.py
@@ -103,9 +103,11 @@ def run_vllm(
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]
+    use_beam_search = False
    print("Warming up...")
    for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
-        if not args.use_beam_search:
+        if not use_beam_search:
            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
        else:
            llm.beam_search(
@@ -117,8 +119,6 @@ def run_vllm(
                ),
            )
-    use_beam_search = False
    outputs = None
    if not use_beam_search:
        if args.profile: