speech_to_text.py 2.64 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
from __future__ import annotations
4

5
6
from dataclasses import dataclass
from typing import TYPE_CHECKING
7
8
9

from vllm.config.utils import config

10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
if TYPE_CHECKING:
    import numpy as np

    from vllm.config.model import ModelConfig


@dataclass
class SpeechToTextParams:
    """Typed bundle of every input that ``get_generation_prompt()`` reads.

    Instances are built by ``TranscriptionRequest.build_stt_params()``, which
    translates API-level request fields into the attributes below.  Because
    models are handed this single object, extending it with further fields
    does not require touching the ``get_generation_prompt`` signature.
    """

    audio: np.ndarray
    """Waveform of one audio chunk, already resampled."""

    stt_config: SpeechToTextConfig
    """Speech-to-text configuration set at the server level."""

    model_config: ModelConfig
    """Configuration of the model."""

    language: str | None = None
    """Language as an ISO 639-1 code (validated / auto-detected)."""

    task_type: str = "transcribe"
    """Either ``"transcribe"`` or ``"translate"``."""

    request_prompt: str = ""
    """Text prompt used to guide the model; optional."""

    to_language: str | None = None
    """Translation target language (model-dependent)."""

47
48
49
50
51
52
53
54
55
56

@config
class SpeechToTextConfig:
    """Configuration for speech-to-text models."""

    sample_rate: float = 16_000
    """Sample rate (Hz) to resample input audio to. Most speech models expect
    16kHz audio input. The input audio will be automatically resampled to this
    rate before processing."""

    max_audio_clip_s: int | None = 30
    """Maximum duration in seconds for a single audio clip without chunking.
    Audio longer than this will be split into smaller chunks if
    `allow_audio_chunking` evaluates to True, otherwise it will be rejected.
    `None` means audio duration can be unlimited and won't be chunked."""

    overlap_chunk_second: int = 1
    """Overlap duration in seconds between consecutive audio chunks when
    splitting long audio. This helps maintain context across chunk boundaries
    and improves transcription quality at split points."""

    min_energy_split_window_size: int | None = 1600
    """Window size in samples for finding low-energy (quiet) regions to split
    audio chunks. The algorithm looks for the quietest moment within this
    window to minimize cutting through speech. Default 1600 samples ≈ 100ms
    at 16kHz. If None, no chunking will be done."""

    @property
    def allow_audio_chunking(self) -> bool:
        """Whether audio chunking is enabled by this configuration.

        Returns:
            True only when both `min_energy_split_window_size` and
            `max_audio_clip_s` are set (i.e. neither is `None`).
        """
        return (
            self.min_energy_split_window_size is not None
            and self.max_audio_clip_s is not None
        )