"examples/multimodal/configs/agg-phi3v.yaml" did not exist on "f122aa4ec1ce10f10919e608572a7e12f24243aa"
video.py 2.67 KB
Newer Older
1
from functools import lru_cache
2
from typing import Any, Dict, List, Optional, Union
3
4
5
6
7
8

import numpy as np

from vllm.config import ModelConfig
from vllm.inputs.registry import InputContext
from vllm.logger import init_logger
9
from vllm.transformers_utils.processor import get_video_processor
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import is_list_of

from .base import MultiModalData, MultiModalInputs
from .image import ImagePlugin

# Module-level logger, named after this module.
logger = init_logger(__name__)

# Memoized wrappers around the processor/tokenizer loaders. `lru_cache` is
# applied with its default settings, so every positional and keyword argument
# passed to these wrappers must be hashable.
cached_get_video_processor = lru_cache(get_video_processor)
# NOTE(review): not referenced in the visible part of this module; it may be
# imported from elsewhere -- confirm before removing.
cached_get_tokenizer = lru_cache(get_tokenizer)

# Type alias for the video payloads accepted by the video plugin. The string
# forward references are kept as written in the original declaration.
VideoInput = Union[
    "np.ndarray",  # single video input
    List["np.ndarray"],
    # TODO: support more types
    # List[Image.Image], List[List[Image.Image]],
    # "torch.Tensor",
    # List["torch.Tensor"],
    # List[List["np.ndarray"]],
    # List[List["torch.Tensor"]],
]


class VideoPlugin(ImagePlugin):
    """Plugin for video data.

    Accepts a single ``np.ndarray`` or a list of ``np.ndarray`` objects and
    converts it into ``MultiModalInputs`` by running it through the
    HuggingFace video processor loaded for the configured model.
    """

    def get_data_key(self) -> str:
        """Return the string key (``"video"``) associated with this plugin."""
        return "video"

    def _get_hf_video_processor(
        self,
        model_config: ModelConfig,
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Load (or fetch from cache) the video processor for a model.

        Args:
            model_config: Supplies ``model`` (passed as the processor
                identifier) and ``trust_remote_code``.
            mm_processor_kwargs: Extra keyword arguments forwarded to the
                processor loader; ``None`` is treated as "no extra
                arguments".

        Returns:
            Whatever ``cached_get_video_processor`` returns for these
            arguments. The caller in this class treats ``None`` as
            "no processor available".
        """
        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

        # The loader is wrapped in ``lru_cache``, so every value forwarded
        # here (including the values in ``mm_processor_kwargs``) has to be
        # hashable.
        return cached_get_video_processor(
            model_config.model,
            trust_remote_code=model_config.trust_remote_code,
            **mm_processor_kwargs)

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: MultiModalData[object],
        **mm_processor_kwargs,
    ) -> MultiModalInputs:
        """Map raw video data to model inputs.

        Args:
            ctx: Input context; only ``ctx.model_config`` is read here.
            data: A single ``np.ndarray`` or a list of ``np.ndarray``.
            **mm_processor_kwargs: Forwarded to the processor loader only;
                they are not passed to the preprocessing call itself.

        Returns:
            ``MultiModalInputs`` built from the ``.data`` attribute of the
            processor output (requested with ``return_tensors="pt"``).

        Raises:
            RuntimeError: If no video processor could be obtained.
            TypeError: If ``data`` is neither an ``np.ndarray`` nor a list
                of ``np.ndarray``.
            Exception: Any error raised by the processor is logged and
                re-raised unchanged.
        """
        model_config = ctx.model_config

        if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray):
            video_processor = self._get_hf_video_processor(
                model_config,
                mm_processor_kwargs,
            )
            if video_processor is None:
                raise RuntimeError("No HuggingFace processor is available "
                                   "to process the video object")
            try:
                # NOTE: Similar to image; it may be a good idea to filter and
                # pass mm_processor_kwargs here too, but for now we don't to
                # avoid extra complexity if the initializer and preprocess
                # signatures of the processor don't align
                batch_data = video_processor(data, return_tensors="pt").data
            except Exception:
                # Log for context, then let the original exception propagate.
                logger.error("Failed to process video (%s)", data)
                raise

            return MultiModalInputs(batch_data)

        raise TypeError(f"Invalid video type: {type(data)}")

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        """Return the fixed default of 4096; ``ctx`` is not consulted.

        NOTE(review): presumably an upper bound on the number of tokens a
        video may occupy -- confirm against the base plugin's contract.
        """
        return 4096