speech_to_text.py 15.8 KB
Newer Older
1
2
3
4
5
6
7
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import io
import math
import time
from collections.abc import AsyncGenerator
8
from functools import cached_property
9
10
11
12
13
from typing import Callable, Literal, Optional, TypeVar, Union, cast

import numpy as np
from fastapi import Request

14
import vllm.envs as envs
15
16
17
18
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import (
19
20
21
22
23
24
25
26
27
28
29
30
    DeltaMessage,
    ErrorResponse,
    RequestResponseMetadata,
    TranscriptionResponse,
    TranscriptionResponseStreamChoice,
    TranscriptionStreamResponse,
    TranslationResponse,
    TranslationResponseStreamChoice,
    TranslationStreamResponse,
    UsageInfo,
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest
31
32
33
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
34
from vllm.model_executor.models import SupportsTranscription
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from vllm.outputs import RequestOutput
from vllm.utils import PlaceholderModule

# librosa is an optional dependency: when it cannot be imported, a
# PlaceholderModule is bound to the same name so this module still imports.
# NOTE(review): presumably the placeholder raises a helpful error on first
# use -- confirm against vllm.utils.PlaceholderModule.
try:
    import librosa
except ImportError:
    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]

# Either of the two non-streaming response models served by this module.
SpeechToTextResponse = Union[TranscriptionResponse, TranslationResponse]
# Generic response type, constrained to the speech-to-text response models.
T = TypeVar("T", bound=SpeechToTextResponse)

# Module-level logger, named after this module.
logger = init_logger(__name__)


class OpenAISpeechToText(OpenAIServing):
    """Base class for speech-to-text operations like transcription and
    translation."""

    def __init__(
        self,
        engine_client: EngineClient,
        model_config: ModelConfig,
        models: OpenAIServingModels,
        *,
        request_logger: Optional[RequestLogger],
        return_tokens_as_token_ids: bool = False,
        task_type: Literal["transcribe", "translate"] = "transcribe",
        log_error_stack: bool = False,
    ):
        """Set up the serving layer for one speech-to-text task.

        Args:
            engine_client: Forwarded to ``OpenAIServing.__init__``.
            model_config: Forwarded to ``OpenAIServing.__init__`` and used to
                look up the model's speech-to-text configuration.
            models: Forwarded to ``OpenAIServing.__init__``.
            request_logger: Forwarded to ``OpenAIServing.__init__``.
            return_tokens_as_token_ids: Forwarded to
                ``OpenAIServing.__init__``.
            task_type: Which task this instance serves; stored on the instance
                and passed to the model's speech-to-text config lookup.
            log_error_stack: Forwarded to ``OpenAIServing.__init__``.
        """
        super().__init__(
            engine_client=engine_client,
            model_config=model_config,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        self.task_type = task_type

        # Sampling parameters that differ from the defaults, as reported by
        # the model config; used as the base for per-request parameters.
        sampling_overrides = self.model_config.get_diff_sampling_param()
        self.default_sampling_params = sampling_overrides

        # Model-specific speech-to-text settings for the selected task.
        self.asr_config = self.model_cls.get_speech_to_text_config(
            model_config, task_type
        )

        # Upper bound (in MiB) on the size of an uploaded audio payload.
        self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB

        if sampling_overrides:
            logger.info(
                "Overwriting default completion sampling param with: %s",
                sampling_overrides,
            )
87

88
    @cached_property
    def model_cls(self) -> type[SupportsTranscription]:
        """Model class for ``self.model_config``, typed as a transcription model.

        The value is computed on first access and cached on the instance. The
        ``cast`` only informs the type checker; no runtime check is performed.
        """
        # Imported locally rather than at module level.
        # NOTE(review): presumably to avoid an import cycle or a heavy import
        # at module load -- confirm.
        from vllm.model_executor.model_loader import get_model_cls

        return cast(type[SupportsTranscription], get_model_cls(self.model_config))
94

95
96
97
98
99
100
    async def _preprocess_speech_to_text(
        self,
        request: SpeechToTextRequest,
        audio_data: bytes,
    ) -> tuple[list[PromptType], float]:
        """Decode the uploaded audio and build one prompt per audio chunk.

        Args:
            request: The incoming speech-to-text request; its ``language``,
                ``to_language`` and ``prompt`` fields are read here.
            audio_data: Raw bytes of the uploaded audio file.

        Returns:
            A tuple ``(prompts, duration)`` where ``prompts`` holds one prompt
            per chunk and ``duration`` is the length of the whole clip as
            reported by ``librosa.get_duration``.

        Raises:
            ValueError: If the payload is larger than
                ``self.max_audio_filesize_mb`` MiB.
        """
        # Validate request
        language = self.model_cls.validate_language(request.language)
        # Only validate to_language when it was provided, to avoid extra
        # logging for Whisper.
        to_language = None
        if request.to_language:
            to_language = self.model_cls.validate_language(request.to_language)

        payload_mb = len(audio_data) / 1024**2
        if payload_mb > self.max_audio_filesize_mb:
            raise ValueError("Maximum file size exceeded.")

        with io.BytesIO(audio_data) as buffer:
            # NOTE resample to model SR here for efficiency. This is also a
            # pre-requisite for chunking, as it assumes Whisper SR.
            y, sr = librosa.load(buffer, sr=self.asr_config.sample_rate)

        duration = librosa.get_duration(y=y, sr=sr)

        # Chunk only when the model allows it and the clip is too long.
        needs_split = (
            self.asr_config.allow_audio_chunking
            and duration > self.asr_config.max_audio_clip_s
        )
        chunks = self._split_audio(y, int(sr)) if needs_split else [y]

        # The model has control over the construction, as long as it
        # returns a valid PromptType.
        prompts = [
            self.model_cls.get_generation_prompt(
                audio=chunk,
                stt_config=self.asr_config,
                model_config=self.model_config,
                language=language,
                task_type=self.task_type,
                request_prompt=request.prompt,
                to_language=to_language,
            )
            for chunk in chunks
        ]
        return prompts, duration

    async def _create_speech_to_text(
        self,
        audio_data: bytes,
        request: SpeechToTextRequest,
        raw_request: Request,
        response_class: type[T],
        stream_generator_method: Callable[..., AsyncGenerator[str, None]],
    ) -> Union[T, AsyncGenerator[str, None], ErrorResponse]:
        """Shared request handler for transcription and translation.

        Args:
            audio_data: Raw bytes of the uploaded audio file.
            request: The speech-to-text request being served.
            raw_request: The underlying FastAPI request; when truthy, the
                request metadata is attached to its ``state``.
            response_class: Response model to build for non-streaming requests.
            stream_generator_method: Callable that produces the streaming
                response when ``request.stream`` is set.

        Returns:
            An error response when validation or preprocessing fails, the
            streaming generator when ``request.stream`` is set, otherwise an
            instance of ``response_class`` holding the concatenated text.

        Raises:
            The engine's ``dead_error`` if the engine client has errored.
        """
        model_error = await self._check_model(request)
        if model_error is not None:
            return model_error

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        if request.response_format not in ("text", "json"):
            return self.create_error_response(
                "Currently only support response_format `text` or `json`"
            )

        request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"
        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            # LoRA adapters are rejected for speech-to-text requests.
            if self._maybe_get_adapters(request):
                return self.create_error_response(
                    f"Currently do not support LoRA for {self.task_type.title()}."
                )

            prompts, duration_s = await self._preprocess_speech_to_text(
                request=request,
                audio_data=audio_data,
            )
        except ValueError as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))

        list_result_generator: Optional[list[AsyncGenerator[RequestOutput, None]]] = (
            None
        )
        try:
            # Unlike most decoder-only models, whisper generation length is not
            # constrained by the size of the input audio, which is mapped to a
            # fixed-size log-mel-spectogram.
            sampling_params = request.to_sampling_params(
                self.model_config.max_model_len, self.default_sampling_params
            )

            # It will not display special tokens like <|startoftranscript|>
            self._log_inputs(
                request_id,
                request.prompt,
                params=sampling_params,
                lora_request=None,
            )

            # One generation stream per audio chunk, all under one request id.
            list_result_generator = [
                self.engine_client.generate(prompt, sampling_params, request_id)
                for prompt in prompts
            ]
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        if request.stream:
            return stream_generator_method(
                request, list_result_generator, request_id, request_metadata, duration_s
            )

        # Non-streaming response.
        try:
            assert list_result_generator is not None
            # Chunks are consumed in order so the text keeps the audio order.
            pieces: list[str] = []
            for result_generator in list_result_generator:
                async for op in result_generator:
                    pieces.append(op.outputs[0].text)
            text = "".join(pieces)

            if self.task_type != "transcribe":
                # no usage in response for translation task
                return cast(T, response_class(text=text))  # type: ignore[call-arg]

            # add usage in TranscriptionResponse.
            usage = {
                "type": "duration",
                # rounded up as per openAI specs
                "seconds": int(math.ceil(duration_s)),
            }
            return cast(T, response_class(text=text, usage=usage))
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

    async def _speech_to_text_stream_generator(
        self,
        request: SpeechToTextRequest,
        list_result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
        chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
        response_stream_choice_class: Union[
            type[TranscriptionResponseStreamChoice],
            type[TranslationResponseStreamChoice],
        ],
        stream_response_class: Union[
            type[TranscriptionStreamResponse], type[TranslationStreamResponse]
        ],
    ) -> AsyncGenerator[str, None]:
        """Stream generation results as ``data: ...`` event strings.

        The result generators are drained one after another. Every engine
        output becomes one JSON chunk; an optional usage-only chunk follows,
        and the stream always ends with a ``[DONE]`` message. Any exception
        raised while streaming is logged and emitted as an error event instead
        of propagating.

        Args:
            request: The request being served; its model name and the stream
                usage options are read here.
            list_result_generator: One engine output stream per audio chunk.
            request_id: Identifier placed on every emitted chunk.
            request_metadata: Receives the aggregate usage once streaming
                completes without error.
            audio_duration_s: Clip duration passed to the model class when
                asking for the number of audio tokens.
            chunk_object_type: Value of the ``object`` field of each chunk.
            response_stream_choice_class: Model used for each streamed choice.
            stream_response_class: Model used for each streamed chunk.

        Yields:
            Event strings, each starting with ``data: `` and terminated by a
            blank line.
        """
        created_time = int(time.time())
        model_name = request.model

        completion_tokens = 0
        num_prompt_tokens = 0

        include_usage = request.stream_include_usage or False
        # Continuous usage is only honoured when usage reporting is enabled.
        include_continuous_usage = (
            include_usage and request.stream_continuous_usage_stats
        ) or False

        def usage_snapshot() -> UsageInfo:
            # Usage built from the counters as they stand when called.
            return UsageInfo(
                prompt_tokens=num_prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=num_prompt_tokens + completion_tokens,
            )

        try:
            for result_generator in list_result_generator:
                async for res in result_generator:
                    # On first result.
                    if res.prompt_token_ids is not None:
                        num_prompt_tokens = len(res.prompt_token_ids)
                        audio_tokens = self.model_cls.get_num_audio_tokens(
                            audio_duration_s, self.asr_config, self.model_config
                        )
                        if audio_tokens:
                            num_prompt_tokens += audio_tokens

                    # We need to do it here, because if there are exceptions in
                    # the result_generator, it needs to be sent as the FIRST
                    # response (by the try...catch).

                    # Just one output (n=1) supported.
                    assert len(res.outputs) == 1
                    output = res.outputs[0]

                    delta_message = DeltaMessage(content=output.text)
                    completion_tokens += len(output.token_ids)

                    # Finish/stop reasons are only passed once the model is
                    # done, so they stay unset on intermediate chunks (chunks
                    # are serialized with exclude_unset=True).
                    choice_fields = {"delta": delta_message}
                    if output.finish_reason is not None:
                        choice_fields["finish_reason"] = output.finish_reason
                        choice_fields["stop_reason"] = output.stop_reason
                    choice_data = response_stream_choice_class(**choice_fields)

                    chunk = stream_response_class(
                        id=request_id,
                        object=chunk_object_type,
                        created=created_time,
                        choices=[choice_data],
                        model=model_name,
                    )

                    # handle usage stats if requested & if continuous
                    if include_continuous_usage:
                        chunk.usage = usage_snapshot()

                    data = chunk.model_dump_json(exclude_unset=True)
                    yield f"data: {data}\n\n"

            # Once the final token is handled, if stream_options.include_usage
            # is sent, send the usage.
            if include_usage:
                final_usage_chunk = stream_response_class(
                    id=request_id,
                    object=chunk_object_type,
                    created=created_time,
                    choices=[],
                    model=model_name,
                    usage=usage_snapshot(),
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=True, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = usage_snapshot()

        except Exception as e:
            # TODO: Use a vllm-specific Validation Error
            logger.exception("Error in %s stream generator.", self.task_type)
            data = self.create_streaming_error_response(str(e))
            yield f"data: {data}\n\n"
        # Send the final done message after all response.n are finished
        yield "data: [DONE]\n\n"

370
371
372
    def _split_audio(
        self, audio_data: np.ndarray, sample_rate: int
    ) -> list[np.ndarray]:
373
374
        chunk_size = sample_rate * self.asr_config.max_audio_clip_s
        overlap_size = sample_rate * self.asr_config.overlap_chunk_second
375
376
377
378
379
380
381
382
383
384
385
        chunks = []
        i = 0
        while i < audio_data.shape[-1]:
            if i + chunk_size >= audio_data.shape[-1]:
                # handle last chunk
                chunks.append(audio_data[..., i:])
                break

            # Find the best split point in the overlap region
            search_start = i + chunk_size - overlap_size
            search_end = min(i + chunk_size, audio_data.shape[-1])
386
            split_point = self._find_split_point(audio_data, search_start, search_end)
387
388
389
390
391
392

            # Extract chunk up to the split point
            chunks.append(audio_data[..., i:split_point])
            i = split_point
        return chunks

393
394
    def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
        """Find the best point to split audio by
395
396
397
398
399
400
401
402
403
404
405
406
407
        looking for silence or low amplitude.
        Args:
            wav: Audio tensor [1, T]
            start_idx: Start index of search region
            end_idx: End index of search region
        Returns:
            Index of best splitting point
        """
        segment = wav[start_idx:end_idx]

        # Calculate RMS energy in small windows
        min_energy = math.inf
        quietest_idx = 0
408
409
410
        min_energy_window = self.asr_config.min_energy_split_window_size
        assert min_energy_window is not None
        for i in range(0, len(segment) - min_energy_window, min_energy_window):
411
412
            window = segment[i : i + min_energy_window]
            energy = (window**2).mean() ** 0.5
413
414
415
416
            if energy < min_energy:
                quietest_idx = i + start_idx
                min_energy = energy
        return quietest_idx