Unverified Commit b9fb74f3 authored by Xinyuan Tong's avatar Xinyuan Tong Committed by GitHub
Browse files

fix: bench_serving ITL calculation when using spec-decoding (#12064)

parent e15b63a1
@@ -1626,6 +1626,7 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
     backend: str,
+    accept_length: Optional[float] = None,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
@@ -1650,6 +1651,13 @@ def calculate_metrics(
             total_input_vision += input_requests[i].vision_prompt_len
         if output_len > 1:
             tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+        if (
+            accept_length
+            and accept_length > 0
+            and backend in ("sglang-oai", "sglang-oai-chat")
+        ):
+            itls += [v / accept_length for v in outputs[i].itl]
+        else:
-        itls += outputs[i].itl
+            itls += outputs[i].itl
         ttfts.append(outputs[i].ttft)
@@ -1929,6 +1937,7 @@ async def benchmark(
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
         backend=backend,
+        accept_length=accept_length,
     )
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment