[Fix] Correct OpenAI batch response format (#5554)

3ce2c050 · zifeitong · GitHub · 1c0afa13 · 3ce2c050 · 3ce2c050
Unverified Commit 3ce2c050 authored Jun 15, 2024 by zifeitong Committed by GitHub Jun 15, 2024
Show whitespace changes
Inline Side-by-side

Showing with 25 additions and 5 deletions

vllm/entrypoints/openai/protocol.py vllm/entrypoints/openai/protocol.py +12 -1

vllm/entrypoints/openai/run_batch.py vllm/entrypoints/openai/run_batch.py +13 -4

No files found.
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -672,6 +672,17 @@ class BatchRequestInput(OpenAIBaseModel):
    body: Union[ChatCompletionRequest, ]


+class BatchResponseData(OpenAIBaseModel):
+    # HTTP status code of the response.
+    status_code: int = 200
+
+    # An unique identifier for the API request.
+    request_id: str
+
+    # The body of the response.
+    body: Union[ChatCompletionResponse, ]
+
+
 class BatchRequestOutput(OpenAIBaseModel):
    """
    The per-line object of the batch output and error files
@@ -683,7 +694,7 @@ class BatchRequestOutput(OpenAIBaseModel):
    # inputs.
    custom_id: str

-    response: Optional[ChatCompletionResponse]
+    response: Optional[BatchResponseData]

    # For requests that failed with a non-HTTP error, this will contain more
    # information on the cause of the failure.

--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -10,7 +10,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                              BatchRequestOutput,
-                                              ChatCompletionResponse)
+                                              BatchResponseData,
+                                              ChatCompletionResponse,
+                                              ErrorResponse)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
@@ -77,20 +79,27 @@ async def run_request(chat_serving: OpenAIServingChat,
                      request: BatchRequestInput) -> BatchRequestOutput:
    chat_request = request.body
    chat_response = await chat_serving.create_chat_completion(chat_request)
+
    if isinstance(chat_response, ChatCompletionResponse):
        batch_output = BatchRequestOutput(
            id=f"vllm-{random_uuid()}",
            custom_id=request.custom_id,
-            response=chat_response,
+            response=BatchResponseData(
+                body=chat_response, request_id=f"vllm-batch-{random_uuid()}"),
            error=None,
        )
-    else:
+    elif isinstance(chat_response, ErrorResponse):
        batch_output = BatchRequestOutput(
            id=f"vllm-{random_uuid()}",
            custom_id=request.custom_id,
-            response=None,
+            response=BatchResponseData(
+                status_code=chat_response.code,
+                request_id=f"vllm-batch-{random_uuid()}"),
            error=chat_response,
        )
+    else:
+        raise ValueError("Request must not be sent in stream mode")
+
    return batch_output