# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268

from transformers import DeepseekV2Config, PretrainedConfig
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


class VisionEncoderConfig(PretrainedConfig):
    """Configuration container for the vision encoder.

    The explicit ``__init__`` parameters are stored verbatim as instance
    attributes; every remaining keyword argument is forwarded to
    ``PretrainedConfig.__init__``.

    ``weight_init``, ``deterministic`` and ``num_recomputing_layers`` only
    have class-level defaults here: they are not explicit ``__init__``
    parameters, so any override has to arrive through ``**kwargs``.

    NOTE(review): the default ``model_name`` contains "patch14" while the
    default ``patch_size`` is 16 -- confirm which value the consuming model
    code actually relies on.
    """

    model_type: str = "vision"

    # Class-level defaults; the first twelve mirror the ``__init__`` defaults.
    model_name: str = "vit_so400m_patch14_siglip_384.webli"
    image_size: int = 384
    patch_size: int = 16
    width: int = 1024
    layers: int = 24
    heads: int = 16
    mlp_ratio: int = 4
    global_pool: str = "map"
    ignore_head: bool = True
    class_token: bool = False
    num_classes: int = 0
    use_checkpoint: bool = False
    # Not accepted explicitly by ``__init__`` (see class docstring).
    weight_init: str = "skip"
    deterministic: bool = False
    num_recomputing_layers: int = 0

    def __init__(
        self,
        model_name: str = "vit_so400m_patch14_siglip_384.webli",
        image_size: int = 384,
        patch_size: int = 16,
        width: int = 1024,
        layers: int = 24,
        heads: int = 16,
        mlp_ratio: int = 4,
        global_pool: str = "map",
        ignore_head: bool = True,
        class_token: bool = False,
        num_classes: int = 0,
        use_checkpoint: bool = False,
        **kwargs,
    ):
        """Store the vision-encoder settings and initialise the base config.

        Args:
            model_name: Identifier of the vision backbone.
            image_size: Stored as ``self.image_size``.
            patch_size: Stored as ``self.patch_size``.
            width: Stored as ``self.width``.
            layers: Stored as ``self.layers``.
            heads: Stored as ``self.heads``.
            mlp_ratio: Stored as ``self.mlp_ratio``.
            global_pool: Stored as ``self.global_pool``.
            ignore_head: Stored as ``self.ignore_head``.
            class_token: Stored as ``self.class_token``.
            num_classes: Stored as ``self.num_classes``.
            use_checkpoint: Stored as ``self.use_checkpoint``.
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
        """
        self.model_name = model_name
        self.image_size = image_size
        self.patch_size = patch_size
        self.width = width
        self.layers = layers
        self.heads = heads
        self.mlp_ratio = mlp_ratio
        self.global_pool = global_pool
        self.ignore_head = ignore_head
        self.class_token = class_token
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint

        # The base initialiser runs last, after the fields above are set.
        super().__init__(**kwargs)


class MlpProjectorConfig(PretrainedConfig):
    """Configuration container for the MLP projector.

    The explicit ``__init__`` parameters are stored verbatim as instance
    attributes; every remaining keyword argument is forwarded to
    ``PretrainedConfig.__init__``.

    ``token_pooling`` only has a class-level default here: it is not an
    explicit ``__init__`` parameter, so any override has to arrive through
    ``**kwargs``.
    """

    model_type = "mlp_projector"

    # Class-level defaults; the first six mirror the ``__init__`` defaults.
    projector_type: str = "downsample_mlp_gelu"
    input_dim: int = 1152
    n_embed: int = 2048
    depth: int = 2
    mlp_ratio: int = 1
    downsample_ratio: int = 2
    # Not accepted explicitly by ``__init__`` (see class docstring).
    token_pooling: bool = False

    def __init__(
        self,
        projector_type: str = "downsample_mlp_gelu",
        input_dim: int = 1152,
        n_embed: int = 2048,
        depth: int = 2,
        mlp_ratio: int = 1,
        downsample_ratio: int = 2,
        **kwargs,
    ):
        """Store the projector settings and initialise the base config.

        Args:
            projector_type: Stored as ``self.projector_type``.
            input_dim: Stored as ``self.input_dim``.
            n_embed: Stored as ``self.n_embed``.
            depth: Stored as ``self.depth``.
            mlp_ratio: Stored as ``self.mlp_ratio``.
            downsample_ratio: Stored as ``self.downsample_ratio``.
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
        """
        self.projector_type = projector_type
        self.input_dim = input_dim
        self.n_embed = n_embed
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.downsample_ratio = downsample_ratio

        # The base initialiser runs last, after the fields above are set.
        super().__init__(**kwargs)


90
91
92
93
94
95
96
97
98
99
100
101
# Pick the text-config class for the installed Transformers major version.
# The presence of ``DeepseekV2Config.validate`` is used as the marker for v5.
if hasattr(DeepseekV2Config, "validate"):
    # Transformers v5
    # Imported inside the branch so it is only required on this code path.
    from huggingface_hub.dataclasses import strict

    @strict
    class DeepseekVLV2TextConfig(DeepseekV2Config):
        # Re-declares the field as optional with a ``None`` default.
        # NOTE(review): presumably needed because some checkpoints carry a
        # null ``kv_lora_rank`` that the stock field would reject under
        # strict validation -- confirm against the upstream config class.
        kv_lora_rank: int | None = None
else:
    # Transformers v4
    # The stock config class is used as-is under the shared alias.
    DeepseekVLV2TextConfig = DeepseekV2Config  # type: ignore[misc]


102
103
class DeepseekVLV2Config(PretrainedConfig):
    """Top-level configuration combining vision, projector and text configs.

    The nested configurations are read from the ``vision_config``,
    ``projector_config`` and ``language_config`` keyword arguments and are
    exposed as ``self.vision_config``, ``self.projector_config`` and
    ``self.text_config`` respectively. ``self.vocab_size`` is copied from
    the text config.

    ``model_type`` is switched to ``"deepseek_ocr"`` or ``"deepseek_ocr2"``
    on the instance when the corresponding OCR architecture name appears in
    ``architectures``.
    """

    model_type = "deepseek_vl_v2"
    architectures: list[str] | None = None

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),)

    def __init__(
        self,
        # NOTE(review): this default ("tile_tag") differs from the class-level
        # default ("2D"). It is kept unchanged so existing callers see the
        # same behaviour -- confirm which value is intended.
        tile_tag: str = "tile_tag",
        global_view_pos: str = "head",
        candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
        **kwargs,
    ):
        """Build the nested configs and initialise the base config.

        Args:
            tile_tag: Stored as ``self.tile_tag``.
            global_view_pos: Stored as ``self.global_view_pos``.
            candidate_resolutions: Stored as ``self.candidate_resolutions``.
            **kwargs: May contain ``architectures``, ``vision_config``,
                ``projector_config`` and ``language_config``; the three
                sub-config entries are removed and everything left over is
                forwarded to ``PretrainedConfig.__init__``.
        """
        # Treat an explicit ``architectures=None`` like a missing entry; the
        # membership tests further down would otherwise raise TypeError.
        if kwargs.get("architectures") is None:
            kwargs["architectures"] = ["DeepseekVLV2ForCausalLM"]

        # ``or {}`` tolerates an explicit ``None`` for a sub-config mapping.
        vision_config = kwargs.pop("vision_config", {}) or {}
        self.vision_config = VisionEncoderConfig(**vision_config)

        projector_config = kwargs.pop("projector_config", {}) or {}
        self.projector_config = MlpProjectorConfig(**projector_config)

        # The input key is ``language_config`` but the attribute is named
        # ``text_config``.
        language_config = kwargs.pop("language_config", {}) or {}
        self.text_config = DeepseekVLV2TextConfig(**language_config)

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        self.vocab_size = self.text_config.vocab_size

        # update model_type for OCR models
        architectures = kwargs["architectures"]
        if "DeepseekOCRForCausalLM" in architectures:
            self.model_type = "deepseek_ocr"
        elif "DeepseekOCR2ForCausalLM" in architectures:
            self.model_type = "deepseek_ocr2"
        super().__init__(**kwargs)