# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Mapping
from typing import Optional

import torch
import torch.nn as nn
from transformers.activations import GELUActivation

from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalDataDict

from .llava_next import (
    LlavaDummyInputsBuilder,
    LlavaNextMultiModalProcessor,
    LlavaNextProcessingInfo,
)
from .llava_onevision import LlavaOnevisionForConditionalGeneration
from .utils import WeightsMapper


class RVLProcessingInfo(LlavaNextProcessingInfo):
    """Processing info that reads its config and processor from ``self.ctx``."""

    def get_hf_config(self):
        """Return the result of ``self.ctx.get_hf_config()`` unchanged."""
        hf_config = self.ctx.get_hf_config()
        return hf_config

    def get_hf_processor(self, **kwargs: object):
        """Forward every keyword argument to ``self.ctx.get_hf_processor``."""
        hf_processor = self.ctx.get_hf_processor(**kwargs)
        return hf_processor


class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]):
    """Build placeholder text and image inputs for the RVL processor."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Return one ``<image>`` placeholder per requested image.

        Args:
            mm_counts: Requested item count per modality. A missing
                ``"image"`` entry is treated as zero.

        Returns:
            The ``<image>`` token repeated once per image; an empty string
            when no images are requested.
        """
        num_images = mm_counts.get("image", 0)
        image_token = "<image>"

        return image_token * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
    ) -> MultiModalDataDict:
        """Return dummy image data for the requested number of images.

        Args:
            seq_len: Not read by this implementation.
            mm_counts: Requested item count per modality. A missing
                ``"image"`` entry is treated as zero.
            mm_options: Optional per-modality dummy options. Only the
                ``"image"`` entry is looked up; ``None`` or an empty mapping
                means no overrides.

        Returns:
            A dict with a single ``"image"`` key holding the result of
            ``self._get_dummy_images``.
        """
        num_images = mm_counts.get("image", 0)

        # Dummy images use the size reported by the processing info's
        # ``get_image_size_with_most_features``.
        target_width, target_height = self.info.get_image_size_with_most_features()

        image_overrides = mm_options.get("image") if mm_options else None

        return {
            "image": self._get_dummy_images(
                width=target_width,
                height=target_height,
                num_images=num_images,
                overrides=image_overrides,
            ),
        }


class RVLMultiModalProjector(nn.Module):
    """Project image features: LayerNorm -> Linear -> GELU -> Linear."""

    def __init__(self, config):
        """Create the projector layers from the model config.

        Args:
            config: Object exposing ``vision_config.hidden_size`` (input
                width of the norm and first linear layer) and
                ``text_config.hidden_size`` (output width of both linear
                layers).
        """
        super().__init__()
        self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=1e-06)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )
        self.act = GELUActivation()
        self.linear_2 = nn.Linear(
            config.text_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )

    def forward(self, image_feature: torch.Tensor) -> torch.Tensor:
        """Apply ``pre_norm``, ``linear_1``, ``act`` and ``linear_2`` in order.

        Args:
            image_feature: Tensor whose last dimension equals
                ``config.vision_config.hidden_size``.

        Returns:
            Tensor whose last dimension equals
            ``config.text_config.hidden_size``.
        """
        image_feature = self.pre_norm(image_feature)
        hidden_states = self.linear_1(image_feature)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)

        return hidden_states


@MULTIMODAL_REGISTRY.register_processor(
    LlavaNextMultiModalProcessor,
    info=RVLProcessingInfo,
    dummy_inputs=RVLDummyInputsBuilder,
)
class RForConditionalGeneration(LlavaOnevisionForConditionalGeneration):
    """LLaVA-OneVision subclass that uses ``RVLMultiModalProjector``.

    Registered with ``MULTIMODAL_REGISTRY`` using the LLaVA-NeXT multimodal
    processor together with the RVL processing info and dummy-input builder.
    """

    # Prefix rewrites applied to checkpoint weight names.
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            # mapping for new names in checkpoint saved after transformers
            # v4.52
            "model.language_model.": "language_model.model.",
            "model.vision_tower.": "vision_tower.",
            "model.multi_modal_projector.": "multi_modal_projector.",
            "model.image_newline": "image_newline",
            "lm_head.": "language_model.lm_head.",
        }
    )

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        """Initialise the parent model, then install the RVL projector.

        Args:
            vllm_config: vLLM configuration; ``model_config.hf_config`` is
                passed to ``RVLMultiModalProjector``.
            prefix: Forwarded unchanged to the parent constructor.
        """
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        config = vllm_config.model_config.hf_config
        # Assigned after the parent constructor so this projector takes the
        # place of anything the parent stored under the same attribute.
        self.multi_modal_projector = RVLMultiModalProjector(config)