Unverified Commit ea385ae8 authored by Liangsheng Yin, committed by GitHub

Fix ITL metrics when using openai endpoint with spec (#12156)

parent 9e949e58
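With speculative decoding, a single streamed chunk from the OpenAI-compatible endpoint can carry several accepted tokens at once, so counting one inter-token latency per chunk gap (or dividing every gap by the mean accept length) distorts the ITL distribution. This change records the text of every decode chunk and later re-tokenizes it, so each chunk's gap is spread over the number of tokens that chunk actually contained. A minimal sketch of that adjustment, assuming a generic HF-style tokenizer; the helper name split_itl_by_tokens is illustrative only and not part of the patch:

    from typing import List

    def split_itl_by_tokens(chunk_gaps: List[float], text_chunks: List[str], tokenizer) -> List[float]:
        # Spread each streamed chunk's latency over the tokens that chunk carried.
        per_token_itls: List[float] = []
        for gap, text in zip(chunk_gaps, text_chunks):
            num_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if num_tokens == 0:
                continue
            per_token_itls.extend([gap / num_tokens] * num_tokens)
        return per_token_itls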
@@ -88,6 +88,7 @@ class RequestFuncOutput:
     latency: float = 0.0
     ttft: float = 0.0  # Time to first token
     itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+    text_chunks: List[str] = field(default_factory=list)
     prompt_len: int = 0
     error: str = ""
     output_len: int = 0
@@ -258,6 +259,9 @@ async def async_request_openai_completions(
                         # Decoding phase
                         else:
+                            output.text_chunks.append(
+                                data["choices"][0]["text"]
+                            )
                             output.itl.append(timestamp - most_recent_timestamp)

                         most_recent_timestamp = timestamp
@@ -574,9 +578,8 @@ async def async_request_sglang_generate(
                             num_new_tokens = output_len - last_output_len
                             if num_new_tokens == 0:
                                 continue
-                            adjust_itl = (
-                                timestamp - most_recent_timestamp
-                            ) / num_new_tokens
+                            chunk_gap = timestamp - most_recent_timestamp
+                            adjust_itl = chunk_gap / num_new_tokens
                             output.itl.extend([adjust_itl] * num_new_tokens)

                         most_recent_timestamp = timestamp
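For the native sglang generate endpoint the chunk text is not re-tokenized; the gap is simply split evenly over the number of new tokens reported for that chunk. An illustrative calculation with made-up values:

    chunk_gap = 0.030        # 30 ms since the previous streamed chunk
    num_new_tokens = 3       # tokens delivered in this chunk
    adjust_itl = chunk_gap / num_new_tokens
    itl_entries = [adjust_itl] * num_new_tokens  # three entries of ~0.01 s each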
@@ -1638,6 +1641,14 @@ def calculate_metrics(
     tpots: List[float] = []
     ttfts: List[float] = []
     e2e_latencies: List[float] = []
+    retokenized_itls: List[float] = []
+
+    use_retokenized_itl = (
+        accept_length is not None
+        and accept_length > 0
+        and backend in ("sglang-oai", "sglang-oai-chat")
+    )
+
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_len
@@ -1651,12 +1662,15 @@
                 total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
-            if (
-                accept_length
-                and accept_length > 0
-                and backend in ("sglang-oai", "sglang-oai-chat")
-            ):
-                itls += [v / accept_length for v in outputs[i].itl]
+            if use_retokenized_itl:
+                for k, itl in enumerate(outputs[i].itl):
+                    num_tokens = len(
+                        tokenizer.encode(
+                            outputs[i].text_chunks[k], add_special_tokens=False
+                        )
+                    )
+                    adjusted_itl = itl / num_tokens
+                    retokenized_itls.extend([adjusted_itl] * num_tokens)
             else:
                 itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
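With an OpenAI-style backend and speculative decoding enabled (accept_length > 0), each recorded chunk gap is now divided by the token count of that specific chunk rather than by the mean accept length. The count comes from re-encoding the streamed chunk text without special tokens; a quick check with a stand-in tokenizer (gpt2 here is only an example, not what the benchmark uses):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    chunk = " brown fox jumps"
    num_tokens = len(tok.encode(chunk, add_special_tokens=False))
    print(num_tokens)  # likely 3, so this chunk's gap yields 3 equal ITL entries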
@@ -1674,6 +1688,8 @@
         "on the benchmark arguments.",
         stacklevel=2,
     )
+
+    itls = retokenized_itls if use_retokenized_itl else itls
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,