[Bugfix] API stream returning two stops (#3450)

Co-authored-by: Dylan Hawk <dylanwawk@gmail.com>

[Bugfix] API stream returning two stops (#3450)
Co-authored-by: Dylan Hawk <dylanwawk@gmail.com>
0b4997e0 · Dylan Hawk · GitHub · c13ad1b7 · 0b4997e0 · 0b4997e0
Unverified commit 0b4997e0, authored Mar 25, 2024 by Dylan Hawk; committed by GitHub on Mar 25, 2024.
Showing 2 changed files with 25 additions and 27 deletions

tests/entrypoints/test_openai_server.py +12 -0

vllm/entrypoints/openai/serving_completion.py +13 -27

No files found.
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -322,9 +322,15 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
                                             temperature=0.0,
                                             stream=True)
    chunks = []
+    finish_reason_count = 0
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == "length"
+    assert chunk.choices[0].text
    assert chunk.usage == single_usage
    assert "".join(chunks) == single_output
@@ -363,13 +369,19 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
        stream=True,
    )
    chunks = []
+    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
    assert "".join(chunks) == output

--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -266,23 +266,7 @@ class OpenAIServingCompletion(OpenAIServing):
                    previous_texts[i] = output.text
                    previous_num_tokens[i] = len(output.token_ids)
                    finish_reason = output.finish_reason
-                    response_json = CompletionStreamResponse(
-                        id=request_id,
-                        created=created_time,
-                        model=model_name,
-                        choices=[
-                            CompletionResponseStreamChoice(
-                                index=i,
-                                text=delta_text,
-                                logprobs=logprobs,
-                                finish_reason=finish_reason,
-                            )
-                        ]).model_dump_json()
-                    yield f"data: {response_json}\n\n"
                    if output.finish_reason is not None:  # return final usage
-                        logprobs = LogProbs(
-                        ) if request.logprobs is not None else None
                        prompt_tokens = len(res.prompt_token_ids)
                        completion_tokens = len(output.token_ids)
                        final_usage = UsageInfo(
@@ -290,6 +274,8 @@ class OpenAIServingCompletion(OpenAIServing):
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )
+                    else:
+                        final_usage = None
                    response_json = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
@@ -297,13 +283,13 @@ class OpenAIServingCompletion(OpenAIServing):
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
-                                    text="",
+                                text=delta_text,
                                logprobs=logprobs,
-                                    finish_reason=output.finish_reason,
+                                finish_reason=finish_reason,
                            )
                        ],
                        usage=final_usage,
-                        ).model_dump_json()
+                    ).model_dump_json(exclude_unset=True)
                    yield f"data: {response_json}\n\n"
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error