from typing import List, Union

from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.mllama import MllamaForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
from sglang.srt.utils import load_image


class MllamaImageProcessor(BaseMultimodalProcessor):
    # Model classes this processor is registered for.
    models = [MllamaForConditionalGeneration]

    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)

    async def process_mm_data_async(
        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
    ):
        # The prompt may arrive pre-tokenized as a list of token ids; decode it
        # back to text so the HF processor can process text and images together.
        if isinstance(input_text, list):
            assert len(input_text) and isinstance(input_text[0], int)
            input_text = self._processor.tokenizer.decode(input_text)

        # load_image returns an (image, size) tuple; keep only the image.
        images = [load_image(image)[0] for image in image_data]
        image_inputs = self.process_mm_data(input_text=input_text, images=images)
        # Drop the batch dimension and convert the input_ids tensor to a plain list.
        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
        # Package the pixel values and aspect-ratio metadata as a single
        # image MultimodalDataItem.
        image_inputs["mm_items"] = [
            MultimodalDataItem(
                feature=image_inputs["pixel_values"],
                aspect_ratio_id=image_inputs["aspect_ratio_ids"],
                aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
                modality=Modality.IMAGE,
            )
        ]

        return image_inputs
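

# Minimal usage sketch, not part of the processor itself. Assumptions: the
# model id "meta-llama/Llama-3.2-11B-Vision-Instruct" and the example image
# URL are placeholders, and ServerArgs is built with only model_path set;
# the arguments a real deployment needs may differ.
if __name__ == "__main__":
    import asyncio

    from transformers import AutoConfig, AutoProcessor

    from sglang.srt.server_args import ServerArgs

    model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # assumed model id
    hf_config = AutoConfig.from_pretrained(model_id)
    hf_processor = AutoProcessor.from_pretrained(model_id)
    server_args = ServerArgs(model_path=model_id)  # defaults for everything else

    processor = MllamaImageProcessor(hf_config, server_args, hf_processor)
    result = asyncio.run(
        processor.process_mm_data_async(
            image_data=["https://example.com/cat.png"],  # placeholder image URL
            input_text="<|image|> Describe this picture.",
        )
    )
    print(result["input_ids"][:16], len(result["mm_items"]))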