Unverified commit b9fb74f3, authored by Xinyuan Tong, committed by GitHub

fix: bench_serving ITL calculation when using spec-decoding (#12064)

parent e15b63a1
@@ -1626,6 +1626,7 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
     backend: str,
+    accept_length: Optional[float] = None,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
@@ -1650,7 +1651,14 @@ def calculate_metrics(
                 total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
-            itls += outputs[i].itl
+            if (
+                accept_length
+                and accept_length > 0
+                and backend in ("sglang-oai", "sglang-oai-chat")
+            ):
+                itls += [v / accept_length for v in outputs[i].itl]
+            else:
+                itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             e2e_latencies.append(outputs[i].latency)
@@ -1929,6 +1937,7 @@ async def benchmark(
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
         backend=backend,
+        accept_length=accept_length,
     )
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))