voxtral.py 2.35 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from math import ceil

import numpy as np
import torch
from mistral_common.tokens.tokenizers.audio import AudioEncoder
from transformers import BatchFeature, ProcessorMixin, TensorType
from transformers.audio_utils import AudioInput

from vllm.tokenizers.mistral import MistralTokenizer


class MistralCommonFeatureExtractor:
    """
    Adapter exposing a HF-style feature-extractor interface on top of
    `mistral_common.tokens.tokenizers.audio.AudioEncoder`.

    The sampling rate, frame rate and streaming flag are all read from the
    wrapped encoder's `audio_config`.
    """

    def __init__(self, audio_encoder: AudioEncoder) -> None:
        # Keep a reference to the encoder; every other member delegates to it.
        self.audio_encoder = audio_encoder

    @property
    def sampling_rate(self):
        """Sampling rate reported by the wrapped encoder's audio config."""
        config = self.audio_encoder.audio_config
        return config.sampling_rate

    @property
    def frame_rate(self):
        """Frame rate reported by the wrapped encoder's audio config."""
        config = self.audio_encoder.audio_config
        return config.frame_rate

    def _prepare_one(self, raw) -> torch.Tensor:
        """Flatten one audio to float32, pad it if needed, return a tensor."""
        samples = np.asarray(raw, dtype=np.float32).ravel()
        if self.audio_encoder.audio_config.is_streaming:
            # Streaming configurations skip the encoder's padding step.
            return torch.tensor(samples)
        padded = self.audio_encoder.pad(samples, self.sampling_rate)
        return torch.tensor(padded)

    def __call__(
        self,
        audios: AudioInput,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Convert one audio, or a list of audios, into a `BatchFeature`.

        Args:
            audios: A single audio or a `list` of audios. Anything that is
                not a `list` is treated as one audio.
            return_tensors: Forwarded to `BatchFeature` as `tensor_type`.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A `BatchFeature` whose `"audio_arrays"` entry holds one flattened
            float32 tensor per input audio, in input order.
        """
        batch = audios if isinstance(audios, list) else [audios]
        tensors = [self._prepare_one(item) for item in batch]
        features = {"audio_arrays": tensors}
        return BatchFeature(features, tensor_type=return_tensors)

    def get_num_audio_tokens(self, audio_length: int) -> int:
        """
        Return `audio_length` divided by the samples-per-frame, rounded up.

        `audio_length` is presumably a sample count at `sampling_rate`
        (TODO confirm against callers).
        """
        samples_per_frame = self.sampling_rate // self.frame_rate
        return ceil(audio_length / samples_per_frame)


class MistralCommonVoxtralProcessor(ProcessorMixin):
    attributes = ["feature_extractor", "tokenizer"]

60
61
62
63
64
    def __init__(
        self,
        tokenizer: MistralTokenizer,
        feature_extractor: MistralCommonFeatureExtractor,
    ) -> None:
65
        self.tokenizer = tokenizer.transformers_tokenizer
66
67
68
69
70

        # Back-compatibility for Transformers v4
        if not hasattr(self.tokenizer, "init_kwargs"):
            self.tokenizer.init_kwargs = {}

71
        self.feature_extractor = feature_extractor
72

73
74
75
        audio_special_ids = self.feature_extractor.audio_encoder.special_ids
        self.audio_token_id = audio_special_ids.audio
        self.begin_audio_token_id = audio_special_ids.begin_audio