Unverified Commit ea385ae8 authored by Liangsheng Yin's avatar Liangsheng Yin Committed by GitHub
Browse files

Fix ITL metrics when using openai endpoint with spec (#12156)

parent 9e949e58
......@@ -88,6 +88,7 @@ class RequestFuncOutput:
latency: float = 0.0
ttft: float = 0.0 # Time to first token
itl: List[float] = field(default_factory=list) # List of inter-token latencies
text_chunks: List[str] = field(default_factory=list)
prompt_len: int = 0
error: str = ""
output_len: int = 0
......@@ -258,6 +259,9 @@ async def async_request_openai_completions(
# Decoding phase
else:
output.text_chunks.append(
data["choices"][0]["text"]
)
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
......@@ -574,9 +578,8 @@ async def async_request_sglang_generate(
num_new_tokens = output_len - last_output_len
if num_new_tokens == 0:
continue
adjust_itl = (
timestamp - most_recent_timestamp
) / num_new_tokens
chunk_gap = timestamp - most_recent_timestamp
adjust_itl = chunk_gap / num_new_tokens
output.itl.extend([adjust_itl] * num_new_tokens)
most_recent_timestamp = timestamp
......@@ -1638,6 +1641,14 @@ def calculate_metrics(
tpots: List[float] = []
ttfts: List[float] = []
e2e_latencies: List[float] = []
retokenized_itls: List[float] = []
use_retokenized_itl = (
accept_length is not None
and accept_length > 0
and backend in ("sglang-oai", "sglang-oai-chat")
)
for i in range(len(outputs)):
if outputs[i].success:
output_len = outputs[i].output_len
......@@ -1651,12 +1662,15 @@ def calculate_metrics(
total_input_vision += input_requests[i].vision_prompt_len
if output_len > 1:
tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
if (
accept_length
and accept_length > 0
and backend in ("sglang-oai", "sglang-oai-chat")
):
itls += [v / accept_length for v in outputs[i].itl]
if use_retokenized_itl:
for k, itl in enumerate(outputs[i].itl):
num_tokens = len(
tokenizer.encode(
outputs[i].text_chunks[k], add_special_tokens=False
)
)
adjusted_itl = itl / num_tokens
retokenized_itls.extend([adjusted_itl] * num_tokens)
else:
itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
......@@ -1674,6 +1688,8 @@ def calculate_metrics(
"on the benchmark arguments.",
stacklevel=2,
)
itls = retokenized_itls if use_retokenized_itl else itls
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment