from typing import List, Union

import torch

from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.minicpmo import MiniCPMO
from sglang.srt.models.minicpmv import MiniCPMV
from sglang.srt.multimodal.processors.base_processor import (
    BaseMultimodalProcessor,
    MultimodalSpecialTokens,
)


# Compatible with both MiniCPM-O and MiniCPM-V
class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
    models = [MiniCPMV, MiniCPMO]

    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        # Collect special token ids
        tokenizer = self._processor.tokenizer
        self.slice_start_id = getattr(tokenizer, "slice_start_id", None)
        self.slice_end_id = getattr(tokenizer, "slice_end_id", None)
        self.audio_start_id = getattr(tokenizer, "audio_start_id", None)
        self.audio_end_id = getattr(tokenizer, "audio_end_id", None)
        self.im_start_id = getattr(tokenizer, "im_start_id", None)
        self.im_end_id = getattr(tokenizer, "im_end_id", None)
        self.im_token_id = getattr(tokenizer, "unk_id", None)

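        # Placeholder strings that mark image/audio/video inputs in MiniCPM
        # prompts; build() resolves them against the HF processor.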
        self.mm_tokens = MultimodalSpecialTokens(
            image_token="(<image>./</image>)",
            audio_token="(<audio>./</audio>)",
            video_token="(<video>./</video>)",
            image_token_id=self.im_token_id,
        ).build(_processor)

    async def process_mm_data_async(
        self,
        image_data: List[Union[str, bytes]],
        audio_data: List[Union[str, bytes]],
        input_text,
        request_obj,
        **kwargs,
    ):
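        """Load image/audio data, run the HF processor, and package the
        outputs into MultimodalDataItems for MiniCPM-V / MiniCPM-O."""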
        base_output = self.load_mm_data(
            prompt=input_text,
            audio_data=audio_data,
            image_data=image_data,
            multimodal_tokens=self.mm_tokens,
        )
        if base_output is None:
            return None

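        # Run the HF processor on the expanded prompt and the loaded media.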
        res = self.process_mm_data(
            input_text=base_output.input_text,
            images=base_output.images,
            audios=base_output.audios,
        )

        pixel_values = res["pixel_values"]
        tgt_sizes = res["tgt_sizes"]

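        # The processor returns pixel values and target sizes nested per image
        # and per slice; validate types and matching lengths before flattening.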
        if not isinstance(pixel_values, (torch.Tensor, list)):
            raise ValueError(
                f"Incorrect type of pixel values. Got type: {type(pixel_values)}"
            )

        if not isinstance(tgt_sizes, (torch.Tensor, list)):
            raise ValueError(
                f"Incorrect type of target sizes. Got type: {type(tgt_sizes)}"
            )

        if len(pixel_values) != len(tgt_sizes):
            raise ValueError(
                "Inconsistent batch lengths, found: "
                f"{len(pixel_values)} vs. {len(tgt_sizes)}"
            )

        # Flatten the nested structure into flat lists with one tensor per slice.
        pixel_values_flat: List[torch.Tensor] = []
        tgt_sizes_flat: List[torch.Tensor] = []
        for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
            # Each batch entry holds one pixel tensor and one target size per
            # slice of a single image.
            if len(pixel_b) != len(tgt_b):
                raise ValueError(
                    f"Inconsistent N lengths, found: {len(pixel_b)} vs {len(tgt_b)}"
                )
            for pixel_n, tgt_n in zip(pixel_b, tgt_b):
                pixel_values_flat.append(pixel_n)
                tgt_sizes_flat.append(tgt_n)

        pixel_values = pixel_values_flat

        items = []
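        # Find the token-position spans of images and slices by pairing their
        # start/end special token ids, then merge them into one sorted list.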
        input_ids = res["input_ids"].flatten()
        image_offsets = self.get_mm_items_offset_by_pair(
            input_ids=input_ids, mm_start_id=self.im_start_id, mm_end_id=self.im_end_id
        )
        slice_offsets = self.get_mm_items_offset_by_pair(
            input_ids=input_ids,
            mm_start_id=self.slice_start_id,
            mm_end_id=self.slice_end_id,
        )
        image_offsets.extend(slice_offsets)
        image_offsets = sorted(image_offsets)

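        # Bundle all image slices into a single IMAGE item; tgt_size carries
        # the per-slice target sizes used by the model.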
        if len(pixel_values) != 0:
            item = MultimodalDataItem(
                feature=pixel_values,
                offsets=image_offsets,
                model_specific_data={"tgt_size": tgt_sizes_flat},
                modality=Modality.IMAGE,
            )
            items.append(item)

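        # Audio is optional; offsets can only be computed when the tokenizer
        # defines audio start/end token ids.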
        if (
            "audio_features" in res
            and res["audio_features"] is not None
            and len(res["audio_features"]) != 0
        ):
            if self.audio_start_id is not None and self.audio_end_id is not None:
                audio_offsets = self.get_mm_items_offset_by_pair(
                    input_ids=input_ids,
                    mm_start_id=self.audio_start_id,
                    mm_end_id=self.audio_end_id,
                )
            else:
                audio_offsets = None
            item = MultimodalDataItem(
                feature=[res["audio_features"]],
                model_specific_data={"audio_feature_lens": res["audio_feature_lens"]},
                offsets=audio_offsets,
                modality=Modality.AUDIO,
            )
            items.append(item)
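        # Hand back the items plus the special token ids that downstream
        # stages use to interpret the input_ids.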
        return {
            "mm_items": items,
            "input_ids": input_ids.tolist(),
            "audio_start_id": self.audio_start_id,
            "audio_end_id": self.audio_end_id,
            "im_token_id": self.im_token_id,
            "im_start_id": self.im_start_id,
            "im_end_id": self.im_end_id,
            "slice_start_id": self.slice_start_id,
            "slice_end_id": self.slice_end_id,
        }