"vllm/model_executor/models/step1.py" did not exist on "eebad39f265606cfe35af4d1e0bea678516648a3"
phi3v.py 25.2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Copyright 2024 The vLLM team.
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
18
from collections.abc import Iterable, Mapping, Sequence
19
from typing import Annotated, Any, Literal, TypeAlias
20

21
import regex as re
22
23
import torch
import torch.nn as nn
24
25
26
27
28
29
from transformers import (
    BatchFeature,
    CLIPVisionConfig,
    PretrainedConfig,
    ProcessorMixin,
)
30

31
from vllm.config import VllmConfig
32
from vllm.config.multimodal import BaseDummyOptions
33
from vllm.logger import init_logger
34
from vllm.model_executor.layers.quantization import QuantizationConfig
35
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
36
from vllm.multimodal import MULTIMODAL_REGISTRY
37
38
39
40
41
42
43
44
45
46
47
from vllm.multimodal.inputs import (
    MultiModalDataDict,
    MultiModalFieldConfig,
    MultiModalKwargsItems,
)
from vllm.multimodal.parse import (
    ImageEmbeddingItems,
    ImageProcessorItems,
    ImageSize,
    MultiModalDataItems,
)
48
49
from vllm.multimodal.processing import BaseDummyInputsBuilder
from vllm.multimodal.processing.processor import (
50
51
52
53
54
55
56
57
    BaseMultiModalProcessor,
    BaseProcessingInfo,
    MultiModalPromptUpdates,
    PlaceholderFeaturesInfo,
    PromptReplacement,
    PromptUpdate,
    ResolvedPromptUpdate,
)
58
from vllm.sequence import IntermediateTensors
59
from vllm.utils.tensor_schema import TensorSchema, TensorShape
60

61
from .clip import CLIPVisionModel
62
63
64
65
66
from .interfaces import (
    MultiModalEmbeddings,
    SupportsMultiModal,
    SupportsPP,
    SupportsQuant,
67
    _require_is_multimodal,
68
69
70
71
72
73
74
75
)
from .utils import (
    AutoWeightsLoader,
    WeightsMapper,
    _merge_multimodal_embeddings,
    init_vllm_registered_model,
    maybe_prefix,
)
76

77
78
logger = init_logger(__name__)

# Token id written at every image placeholder position of the prompt.
# This number cannot be found in the HF config, so it is hard-coded here.
_IMAGE_TOKEN_ID = 32044

# Fixed configuration of the CLIP vision tower used by this model.
CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
    # Input geometry
    image_size=336,
    patch_size=14,
    num_channels=3,
    # Transformer encoder
    hidden_size=1024,
    intermediate_size=4096,
    num_hidden_layers=24,
    num_attention_heads=16,
    hidden_act="quick_gelu",
    dropout=0.0,
    # Projection head
    projection_dim=768,
)


def _init_img_processor(
    hf_config: PretrainedConfig,
    quant_config: QuantizationConfig | None,
    prefix: str = "",
) -> CLIPVisionModel:
    """Build the CLIP vision tower, truncated at the feature layer.

    The feature layer index is read from ``hf_config.img_processor`` under
    the key ``"layer_idx"`` (default ``-2``). Only the layers up to and
    including that one are instantiated.

    Args:
        hf_config: Model config exposing an ``img_processor`` mapping.
        quant_config: Quantization config forwarded to the vision model.
        prefix: Prefix forwarded to the vision model.

    Returns:
        The constructed ``CLIPVisionModel``.
    """
    feature_layer = hf_config.img_processor.get("layer_idx", -2)
    total_layers = CLIP_VIT_LARGE_PATCH14_336_CONFIG.num_hidden_layers

    # A negative index counts back from the last layer; a non-negative one
    # is a zero-based position, so one more layer than the index is needed.
    layers_to_build = (
        total_layers + feature_layer + 1 if feature_layer < 0 else feature_layer + 1
    )

    return CLIPVisionModel(
        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
        quant_config=quant_config,
        num_hidden_layers_override=layers_to_build,
        prefix=prefix,
    )


120
class Phi3VImagePixelInputs(TensorSchema):
    """
    Raw pixel-value image inputs.

    Dimensions:
        - bn: Batch size * number of images
        - p: Number of patches
        - h: Height of each patch
        - w: Width of each patch
    """

    # Discriminator tag. Only "pixel_values" is valid for this schema; the
    # "image_embeds" tag belongs to the embedding-input schema.
    type: Literal["pixel_values"] = "pixel_values"

    # Supports either a stacked tensor or a list of (p, 3, h, w) tensors
    pixel_values: Annotated[
        torch.Tensor | list[torch.Tensor],
        TensorShape(
            "bn", "p", 3, "h", "w", dynamic_dims={"p"}
        ),  # 'p' may vary across items
    ]

    # Stacked tensor with height and width for each image
    image_sizes: Annotated[torch.Tensor | None, TensorShape("bn", 2)]
142
143


144
class Phi3VImageEmbeddingInputs(TensorSchema):
    """
    Precomputed image embedding inputs.

    Dimensions:
        - bn: Batch size * number of images
        - f: Image feature size (e.g., number of tokens per image)
        - h: Hidden size (must match language model backbone)
    """

    # Discriminator tag for this schema.
    type: Literal["image_embeds"] = "image_embeds"

    # Either one stacked tensor or a list of per-image tensors.
    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("bn", "f", "h")]
158
159


160
# Either of the two accepted image input schemas: raw pixel values or
# precomputed image embeddings.
Phi3VImageInputs: TypeAlias = Phi3VImagePixelInputs | Phi3VImageEmbeddingInputs
161
162


163
# adapted from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_embedding_phi3_v.py
164
class Phi3HDImageEmbedding(nn.Module):
    """Phi3 Image embedding with HD transform.

    Image crops are encoded by a truncated CLIP vision tower, every 2x2
    neighbourhood of patch features is merged into the channel dimension,
    learnable separator embeddings are inserted, and the resulting sequence
    is projected to the language model hidden size.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: QuantizationConfig | None,
        prefix: str = "",
    ) -> None:
        """Create the vision tower, separator parameters and projection MLP.

        Args:
            config: Model config exposing ``img_processor`` and
                ``embd_layer`` mappings and ``n_embd`` or ``hidden_size``.
            quant_config: Quantization config forwarded to the vision tower.
            prefix: Prefix under which this module's weights are named.
        """
        super().__init__()

        # The LM width is stored as `n_embd` when present, else `hidden_size`.
        if hasattr(config, "n_embd"):
            hidden_size = config.n_embd
        else:
            hidden_size = config.hidden_size

        self.img_processor = _init_img_processor(
            config,
            quant_config=quant_config,
            prefix=f"{prefix}.img_processor",
        )

        vision_cfg = config.img_processor
        self.image_dim_out = vision_cfg["image_dim_out"]
        self.num_img_tokens = vision_cfg["num_img_tokens"]

        # global_gn and sub_gn for hd transform, serves as line separator
        embd_cfg = config.embd_layer
        self.use_hd_transform = embd_cfg.get("use_hd_transform", False)
        self.with_learnable_separator = embd_cfg.get(
            "with_learnable_separator", False
        )
        self.hd_transform_order = embd_cfg.get("hd_transform_order", "glb_sub")
        # Both the HD transform and the learnable separator must be enabled.
        assert self.use_hd_transform and self.with_learnable_separator

        # Four spatial neighbours are merged into the channel dimension,
        # hence the factor of 4 on the vision feature width.
        merged_dim = self.image_dim_out * 4
        self.glb_GN = nn.Parameter(torch.empty([1, 1, merged_dim]))
        self.sub_GN = nn.Parameter(torch.empty([1, 1, 1, merged_dim]))

        # Two-layer MLP: Linear -> GELU -> Linear.
        self.img_projection = nn.Sequential(
            nn.Linear(merged_dim, hidden_size),
            nn.GELU(),
            nn.Linear(hidden_size, hidden_size),
        )

        self.type_feature = config.img_processor.get("type_feature", "patch")

    def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
        """Encode crops with the vision tower and select the feature tokens.

        With ``type_feature == "patch"`` the first token along dim 1 is
        dropped (presumably the CLS token); with ``"cls_patch"`` all tokens
        are returned.

        Raises:
            NotImplementedError: For any other ``type_feature`` value.
        """
        # NOTE: selecting the vision feature layer is skipped here because
        # it is already done inside the img_processor
        encoded = self.img_processor(img_embeds)

        kind = self.type_feature
        if kind == "cls_patch":
            return encoded
        if kind == "patch":
            return encoded[:, 1:]

        raise NotImplementedError(kind)

    def forward(
        self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor
    ) -> torch.FloatTensor:
        """
        process image and return vision embeddings.

        pixel_values: (num_images, num_crops, c, h, w)
        output: (num_images, num_img_tokens, hidden_size)
        """
        # Unpacking all five dims also checks that the input is 5-D.
        num_images, num_crops, _c, _h, _w = pixel_values.shape

        # Fold the crop axis into the batch axis for the vision tower.
        crop_features = self.get_img_features(pixel_values.flatten(0, 1))
        crop_features = crop_features.reshape(
            num_images, num_crops, -1, self.image_dim_out
        )
        return self.hd_feature_transform(crop_features, image_sizes)

    def hd_feature_transform(self, image_features, image_sizes):
        """
        image_features: (num_images, num_crops+1, 24*24, 1024)

        Returns a list with one projected feature tensor per image.
        """
        assert self.hd_transform_order == "sub_glb", (
            f"hd_transform_order `{self.hd_transform_order}` not implemented"
        )

        # The projection input is moved to the device/dtype of the first
        # linear layer's bias.
        projection = self.img_projection
        if isinstance(projection, nn.Sequential):
            first_linear = projection[0]
        else:  # It's a single nn.Linear layer
            first_linear = projection
        target_device = first_linear.bias.device
        target_dtype = first_linear.bias.dtype

        # global feature can be viewed as a special HD case with num_crops 1x1
        global_feats = image_features[:, 0]  # (num_images, 24*24, 1024)
        global_hd = self.reshape_hd_patches_2x2merge(global_feats, 1, 1)
        global_hd_newline = self.add_image_newline(global_hd)

        separator = self.glb_GN.squeeze(0)

        # Images are handled one at a time because the crop arrangement
        # differs with the image size.
        projected = []
        for idx, (height, width) in enumerate(image_sizes):
            h_crop = height // 336
            w_crop = width // 336
            num_crops = h_crop * w_crop

            # NOTE: real num_crops is padded
            # (num_crops, 24*24, 1024)
            sub_feats = image_features[idx, 1 : 1 + num_crops]
            sub_hd = self.reshape_hd_patches_2x2merge(sub_feats, h_crop, w_crop)
            sub_hd_newline = self.add_image_newline(sub_hd)

            # [sub features, separator, global features]
            sequence = torch.cat(
                [
                    sub_hd_newline.squeeze(0),  # (h_crop*12*(w_crop*12+1), 4096)
                    separator,
                    global_hd_newline[idx],
                ]
            )
            projected.append(projection(sequence.to(target_device, target_dtype)))

        return projected

    def reshape_hd_patches_2x2merge(self, image_features, h_crop, w_crop):
        """
        image_features: (num_images*num_crops, 24*24, 1024)
        output: (num_images, h_crop*12, w_crop*12, 4096)
        where h_crop*w_crop == num_crops
        """
        n_crops_total, seq_len, channels = image_features.shape
        crops_per_image = h_crop * w_crop
        assert (
            seq_len == 576 and channels == 1024 and n_crops_total % crops_per_image == 0
        )
        num_images = n_crops_total // crops_per_image
        side = int(seq_len**0.5)
        half = side // 2

        # Merge each 2x2 neighbourhood of patches into the channel dim.
        grid = image_features.reshape(n_crops_total, side, side, channels)
        grid = grid.reshape(n_crops_total, half, 2, half, 2, channels)
        grid = grid.permute(0, 1, 3, 2, 4, 5)  # N, 12, 12, 2, 2, 1024
        merged = grid.reshape(n_crops_total, -1, 4 * channels)  # N, 144, 4096

        # Arrange the crops of every image on their h_crop x w_crop grid.
        tiled = merged.reshape(
            num_images, h_crop, w_crop, half, half, -1
        )  # n_img, h_crop, w_crop, 12, 12, 4096
        tiled = tiled.permute(0, 1, 3, 2, 4, 5)  # n_img, h_crop, 12, w_crop, 12, 4096
        return tiled.reshape(
            num_images, h_crop * half, w_crop * half, 4 * channels
        )  # n_img, h_crop*12, w_crop*12, 4096

    def add_image_newline(self, image_features_hd):
        """
        image_features_hd: (num_images, h_crop*12, w_crop*12, 4096)
        output: (num_images, (h_crop*12) * (w_crop*12+1), 4096)
        """
        num_images, rows, _cols, hid_dim = image_features_hd.shape

        # One separator embedding is appended at the end of every row.
        row_separators = self.sub_GN.expand(
            num_images, rows, -1, -1
        )  # (n_img, h, 1, hid_dim)
        with_newlines = torch.cat([image_features_hd, row_separators], dim=2)
        return with_newlines.reshape(num_images, -1, hid_dim)
343
344


345
class Phi3VProcessingInfo(BaseProcessingInfo):
    """Processing information for Phi-3-Vision image inputs."""

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        """Return the supported modalities; the image limit is ``None``."""
        limits: dict[str, int | None] = {"image": None}
        return limits

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        processor: ProcessorMixin | None = None,
    ) -> int:
        """Return the number of image tokens for an image of the given size.

        The count is delegated to the HF processor's
        ``calc_num_image_tokens_from_image_size``. When ``processor`` is
        ``None`` the processor is obtained via ``get_hf_processor()``.
        """
        hf_processor = self.get_hf_processor() if processor is None else processor

        return hf_processor.calc_num_image_tokens_from_image_size(  # type: ignore
            width=image_width,
            height=image_height,
        )

    def get_image_size_with_most_features(self) -> ImageSize:
        """Return the image size used as the worst case for feature count."""
        # Result in the max possible feature size (h:w = 16:1)
        # NOTE(review): the raw pixel ratio here is 160:1; the 16:1 presumably
        # describes the resulting crop layout - confirm.
        return ImageSize(width=50, height=8000)

368
369

class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
    """Builds dummy prompts and dummy images for Phi-3-Vision."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Concatenate the first ``mm_counts["image"]`` image tag strings."""
        image_tokens: list[str] = self.info.get_hf_processor().img_tokens  # type: ignore
        image_count = mm_counts.get("image", 0)

        return "".join(image_tokens[:image_count])

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
        mm_processor_kwargs: Mapping[str, object] | None = None,
    ) -> MultiModalDataDict:
        """Create dummy images sized to produce the most image features.

        Args:
            seq_len: Unused by this implementation.
            mm_counts: Number of items per modality; only ``"image"`` is read.
            mm_options: Optional per-modality overrides; only ``"image"`` is
                read.
            mm_processor_kwargs: Unused by this implementation.
        """
        if mm_options:
            image_overrides = mm_options.get("image")
        else:
            image_overrides = None

        target_width, target_height = self.info.get_image_size_with_most_features()

        dummy_images = self._get_dummy_images(
            width=target_width,
            height=target_height,
            num_images=mm_counts.get("image", 0),
            overrides=image_overrides,
        )
        return {"image": dummy_images}


401
class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
    """Multimodal processor that adapts the HF Phi-3-Vision processor."""

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        """Run the HF processor and replace negative placeholder ids.

        The ``input_ids`` tensor of the result is modified in place.
        """
        outputs = super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
            tok_kwargs=tok_kwargs,
        )

        prompt_ids = outputs["input_ids"]
        assert isinstance(prompt_ids, torch.Tensor)

        # Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids,
        # which will cause OverflowError when decoding the prompt_ids.
        # Therefore, we need to do an early replacement here
        negative_positions = prompt_ids < 0
        prompt_ids.masked_fill_(negative_positions, _IMAGE_TOKEN_ID)

        return outputs

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Declare every image-related field as batched over ``"image"``."""
        return {
            "pixel_values": MultiModalFieldConfig.batched("image"),
            "image_sizes": MultiModalFieldConfig.batched("image"),
            "image_embeds": MultiModalFieldConfig.batched("image"),
        }

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, Any],
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> Sequence[PromptUpdate]:
        """Replace each image tag with repeated image placeholder token ids."""
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        image_tokens: list[str] = hf_processor.img_tokens  # type: ignore

        def get_replacement_phi3v(item_idx: int):
            images = mm_items.get_items(
                "image", (ImageEmbeddingItems, ImageProcessorItems)
            )

            # Precomputed embeddings carry their own feature size.
            if isinstance(images, ImageEmbeddingItems):
                return [_IMAGE_TOKEN_ID] * images.get_feature_size(item_idx)

            size = images.get_image_size(item_idx)
            token_count = self.info.get_num_image_tokens(
                image_width=size.width,
                image_height=size.height,
                processor=hf_processor,
            )
            return [_IMAGE_TOKEN_ID] * token_count

        replacement = PromptReplacement(
            modality="image",
            target=image_tokens.__getitem__,
            replacement=get_replacement_phi3v,
        )
        return [replacement]

    def _recompute_cached_prompt_update(
        self,
        cached_update: ResolvedPromptUpdate,
        new_item_idx: int,
    ) -> ResolvedPromptUpdate:
        """Recompute a cached update, re-targeting image updates by index."""
        new_update = super()._recompute_cached_prompt_update(
            cached_update,
            new_item_idx,
        )

        if cached_update.modality != "image":
            return new_update

        # The target tag depends on the item index, so it is looked up again.
        image_tokens: list[str] = self.info.get_hf_processor().img_tokens  # type: ignore
        return new_update.with_target(image_tokens[new_item_idx])

    def _apply_prompt_updates(
        self,
        token_ids: list[int],
        mm_prompt_updates: MultiModalPromptUpdates,
    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
        """Apply prompt updates while mirroring the HF tokenization.

        When there are updates, the prompt is decoded, split around the
        image tags and re-tokenized chunk by chunk before the parent
        implementation is applied.
        """
        # align to hf behavior when there are images
        if len(mm_prompt_updates):
            tokenizer = self.info.get_tokenizer()
            # to decode token_ids to the original text, we need to
            # 1. remove the first bos token
            # 2. remove space after each special token
            #    introduced by the tokenizer
            if len(token_ids) and token_ids[0] == tokenizer.bos_token_id:
                token_ids = token_ids[1:]
            text = tokenizer.decode(token_ids)
            for entry in tokenizer.special_tokens_map.values():
                if isinstance(entry, str):
                    candidates = [entry]
                elif isinstance(entry, list):
                    candidates = entry
                else:
                    continue
                for special_token in candidates:
                    text = text.replace(f"{special_token} ", special_token)

            # perform hf behavior
            # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/64f88b6/processing_phi3_v.py#L407
            pattern = r"<\|image_\d+\|>"
            prompt_chunks = [
                tokenizer(chunk).input_ids for chunk in re.split(pattern, text)
            ]
            image_tags = [
                tokenizer(chunk, add_special_tokens=False).input_ids
                for chunk in re.findall(pattern, text)
            ]
            # Pad with an empty tag so the trailing text chunk is kept by zip.
            if len(prompt_chunks) > len(image_tags):
                image_tags.append([])

            # Interleave: text chunk, image tag, text chunk, image tag, ...
            interleaved = []
            for chunk_ids, tag_ids in zip(prompt_chunks, image_tags):
                interleaved.extend(chunk_ids)
                interleaved.extend(tag_ids)
            token_ids = interleaved

        token_ids, placeholders = super()._apply_prompt_updates(
            token_ids=token_ids,
            mm_prompt_updates=mm_prompt_updates,
        )

        # Keep the behavior in line with HF processor
        if not len(mm_prompt_updates):
            return token_ids, placeholders
        if token_ids[:2] != tokenizer.encode("<s> <|image|>", add_special_tokens=False):
            return token_ids, placeholders

        # Drop the second token and move every placeholder start back by one.
        token_ids = [token_ids[0], *token_ids[2:]]
        shifted = {}
        for modality, infos in placeholders.items():
            shifted[modality] = [
                PlaceholderFeaturesInfo(
                    modality=info.modality,
                    item_idx=info.item_idx,
                    start_idx=info.start_idx - 1,
                    tokens=info.tokens,
                    is_embed=info.is_embed,
                )
                for info in infos
            ]

        return token_ids, shifted
553

554

555
556
557
558
559
560
@MULTIMODAL_REGISTRY.register_processor(
    Phi3VMultiModalProcessor,
    info=Phi3VProcessingInfo,
    dummy_inputs=Phi3VDummyInputsBuilder,
)
class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant):
    """Phi-3-Vision: HD image embedding in front of a Llama language model."""

    # Prefix rewrites applied to checkpoint weight names. The entries are
    # kept in this order, with more specific prefixes before shorter ones.
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.vision_embed_tokens.wte": "embed_tokens",
            "model.vision_embed_tokens.": "vision_embed_tokens.",
            "lm_head.": "language_model.lm_head.",
            "model.": "language_model.model.",
        }
    )

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        """Return the prompt tag of the ``i``-th image.

        Raises:
            ValueError: If ``modality`` does not start with ``"image"``.
        """
        if not modality.startswith("image"):
            raise ValueError("Only image modality is supported")

        return f"<|image_{i}|>"

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the token embedding, the vision embedding and the LM."""
        super().__init__()
        model_config = vllm_config.model_config
        config = model_config.hf_config
        quant_config = vllm_config.quant_config

        self.config = config
        self.multimodal_config = model_config.multimodal_config
        self.image_token_id = _IMAGE_TOKEN_ID

        with self._mark_tower_model(vllm_config, "image"):
            self.embed_tokens = VocabParallelEmbedding(
                config.vocab_size,
                config.hidden_size,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "model.embed_tokens"),
            )
            self.vision_embed_tokens = Phi3HDImageEmbedding(
                config,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "model.vision_embed_tokens"),
            )

        with self._mark_language_model(vllm_config):
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
                # The prefix is empty intentionally because default prefix of
                # LlamaForCausalLM is "model"
                prefix="",
                # We don't directly initialize vLLM's LlamaForCausalLM so we
                # can automatically apply embedding wrapper if this model is
                # initialized as an embedding model
                architectures=["LlamaForCausalLM"],
            )

        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )

    def _parse_and_validate_image_input(
        self, **kwargs: object
    ) -> Phi3VImageInputs | None:
        """Wrap image keyword arguments in the matching input schema.

        Pixel values take precedence over precomputed embeddings. ``None``
        is returned when neither is provided.
        """
        pixel_values = kwargs.pop("pixel_values", None)
        image_sizes = kwargs.pop("image_sizes", None)
        image_embeds = kwargs.pop("image_embeds", None)

        if pixel_values is not None:
            # The patch height and width are bound to the CLIP input size.
            clip_image_size = CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size
            return Phi3VImagePixelInputs(
                type="pixel_values",
                pixel_values=pixel_values,
                image_sizes=image_sizes,
                resolve_bindings={"h": clip_image_size, "w": clip_image_size},
            )

        if image_embeds is not None:
            return Phi3VImageEmbeddingInputs(
                type="image_embeds",
                data=image_embeds,
            )

        return None

    def _process_image_input(
        self,
        image_input: Phi3VImageInputs,
    ) -> torch.Tensor:
        """Return image embeddings, computing them from pixels if needed."""
        # Precomputed embeddings are passed through unchanged.
        if image_input["type"] == "image_embeds":
            return image_input["data"]

        return self.vision_embed_tokens(
            image_input["pixel_values"], image_input["image_sizes"]
        )

    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
        """Return the image embeddings, or an empty list without images."""
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is not None:
            return self._process_image_input(image_input)

        return []

    def embed_input_ids(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: MultiModalEmbeddings | None = None,
        *,
        is_multimodal: torch.Tensor | None = None,
        handle_oov_mm_token: bool = False,
    ) -> torch.Tensor:
        """Embed token ids and merge in the multimodal embeddings, if any."""
        inputs_embeds = self._embed_text_input_ids(
            input_ids,
            self.embed_tokens,
            is_multimodal=is_multimodal,
            handle_oov_mm_token=handle_oov_mm_token,
        )

        has_mm_embeddings = (
            multimodal_embeddings is not None and len(multimodal_embeddings) != 0
        )
        if not has_mm_embeddings:
            return inputs_embeds

        return _merge_multimodal_embeddings(
            inputs_embeds=inputs_embeds,
            multimodal_embeddings=multimodal_embeddings,
            is_multimodal=_require_is_multimodal(is_multimodal),
        )

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ):
        """Run the language model backbone and return its hidden states."""
        # Input embeddings are discarded whenever intermediate tensors
        # are supplied.
        if intermediate_tensors is not None:
            inputs_embeds = None

        return self.language_model.model(
            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
        )

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        """Delegate logit computation to the language model."""
        return self.language_model.compute_logits(hidden_states)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint weights and return the names that were loaded."""
        loaded = AutoWeightsLoader(self).load_weights(
            weights, mapper=self.hf_to_vllm_mapper
        )

        # The HF config doesn't specify whether these are tied,
        # so we detect it this way
        if "embed_tokens.weight" not in loaded:
            self.embed_tokens = self.language_model.model.embed_tokens
            loaded.add("embed_tokens.weight")

        return loaded