feat: add usage to TranscriptionResponse (text and json response_format) (#23576)

Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>

feat: add usage to TranscriptionResponse (text and json response_format) (#23576)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
ebd5a77b · Guillaume Calmettes · GitHub · 384dd1b0 · ebd5a77b · ebd5a77b
Unverified commit ebd5a77b, authored Aug 26, 2025 by Guillaume Calmettes; committed by GitHub on Aug 26, 2025
3 changed files
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
            language="en",
            response_format="text",
            temperature=0.0)
-        out = json.loads(transcription)['text']
+        out = json.loads(transcription)
-        assert "Mary had a little lamb," in out
+        out_text = out['text']
+        out_usage = out['usage']
+        assert "Mary had a little lamb," in out_text
+        assert out_usage["seconds"] == 16, out_usage["seconds"]
 @pytest.mark.asyncio
@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
        language="en",
        response_format="text",
        temperature=0.0)
-    out = json.loads(transcription)['text']
+    out = json.loads(transcription)
-    counts = out.count("Mary had a little lamb")
+    out_text = out['text']
+    out_usage = out['usage']
+    counts = out_text.count("Mary had a little lamb")
    assert counts == 10, counts
+    assert out_usage["seconds"] == 161, out_usage["seconds"]
 @pytest.mark.asyncio

--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -2232,9 +2232,15 @@ class TranscriptionRequest(OpenAIBaseModel):
 # Transcription response objects
+class TranscriptionUsageAudio(OpenAIBaseModel):
+    type: Literal["duration"] = "duration"
+    seconds: int
 class TranscriptionResponse(OpenAIBaseModel):
    text: str
    """The transcribed text."""
+    usage: TranscriptionUsageAudio
 class TranscriptionWord(OpenAIBaseModel):

--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -200,7 +200,22 @@ class OpenAISpeechToText(OpenAIServing):
            for result_generator in list_result_generator:
                async for op in result_generator:
                    text += op.outputs[0].text
-            return cast(T, response_class(text=text))
+            if self.task_type == "transcribe":
+                # add usage in TranscriptionResponse.
+                usage = {
+                    "type": "duration",
+                    # rounded up as per openAI specs
+                    "seconds": int(math.ceil(duration_s)),
+                }
+                final_response = cast(T, response_class(text=text,
+                                                        usage=usage))
+            else:
+                # no usage in response for translation task
+                final_response = cast(
+                    T, response_class(text=text))  # type: ignore[call-arg]
+            return final_response
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e: