[Docs] Update transcriptions API to use openai client with `stream=True` (#20271)

Signed-off-by: NickLucche <nlucches@redhat.com>

[Docs] Update transcriptions API to use openai client with `stream=True` (#20271)
Signed-off-by: NickLucche <nlucches@redhat.com>
314af861 · Nicolò Lucchesi · GitHub · 0e96cc9b · 314af861 · 314af861
Unverified commit 314af861, authored Jul 01, 2025 by Nicolò Lucchesi; committed by GitHub on Jul 01, 2025.
Showing 2 changed files with 31 additions and 37 deletions

examples/online_serving/openai_transcription_client.py (+28 -33)

vllm/entrypoints/openai/protocol.py (+3 -4)

No files found.
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -19,10 +19,8 @@ The script performs:
 """
 import asyncio
-import json
-import httpx
+from openai import AsyncOpenAI, OpenAI
-from openai import OpenAI
 from vllm.assets.audio import AudioAsset
@@ -47,37 +45,30 @@ def sync_openai(audio_path: str, client: OpenAI):
        print("transcription result:", transcription.text)
-async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
+async def stream_openai_response(audio_path: str, client: AsyncOpenAI):
    """
-    Perform streaming transcription using vLLM's raw HTTP streaming API.
+    Perform asynchronous transcription using OpenAI-compatible API.
    """
-    data = {
+    print("\ntranscription result:", end=" ")
-        "language": "en",
-        "stream": True,
-        "model": "openai/whisper-large-v3",
-    }
-    url = base_url + "/audio/transcriptions"
-    headers = {"Authorization": f"Bearer {api_key}"}
-    print("transcription result:", end=" ")
-    # OpenAI Transcription API client does not support streaming.
-    async with httpx.AsyncClient() as client:
    with open(audio_path, "rb") as f:
-            async with client.stream(
+        transcription = await client.audio.transcriptions.create(
-                "POST", url, files={"file": f}, data=data, headers=headers
+            file=f,
-            ) as response:
+            model="openai/whisper-large-v3",
-                async for line in response.aiter_lines():
+            language="en",
-                    # Each line is a JSON object prefixed with 'data: '
+            response_format="json",
-                    if line:
+            temperature=0.0,
-                        if line.startswith("data: "):
+            # Additional sampling params not provided by OpenAI API.
-                            line = line[len("data: ") :]
+            extra_body=dict(
-                        # Last chunk, stream ends
+                seed=420,
-                        if line.strip() == "[DONE]":
+                top_p=0.6,
-                            break
+            ),
-                        # Parse the JSON response
+            stream=True,
-                        chunk = json.loads(line)
+        )
-                        # Extract and print the content
+        async for chunk in transcription:
-                        content = chunk["choices"][0].get("delta", {}).get("content")
+            if chunk.choices:
-                        print(content, end="")
+                content = chunk.choices[0].get("delta", {}).get("content")
+                print(content, end="", flush=True)
    print()  # Final newline after stream ends
@@ -95,7 +86,11 @@ def main():
    sync_openai(mary_had_lamb, client)
    # Run the asynchronous function
-    asyncio.run(stream_openai_response(winning_call, openai_api_base, openai_api_key))
+    client = AsyncOpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    asyncio.run(stream_openai_response(winning_call, client))
 if __name__ == "__main__":

--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1750,12 +1750,11 @@ class TranscriptionRequest(OpenAIBaseModel):
    timestamps incurs additional latency.
    """
-    # --8<-- [start:transcription-extra-params]
    stream: Optional[bool] = False
-    """Custom field not present in the original OpenAI definition. When set,
+    """When set, it will enable output to be streamed in a similar fashion 
-    it will enable output to be streamed in a similar fashion as the Chat
+    as the Chat Completion endpoint.
-    Completion endpoint.
    """
+    # --8<-- [start:transcription-extra-params]
    # Flattened stream option to simplify form data.
    stream_include_usage: Optional[bool] = False
    stream_continuous_usage_stats: Optional[bool] = False