# video.py
from functools import lru_cache
from typing import List, Union

import numpy as np

from vllm.config import ModelConfig
from vllm.inputs.registry import InputContext
from vllm.logger import init_logger
from vllm.transformers_utils.processor import get_video_processor
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import is_list_of

from .base import MultiModalData, MultiModalInputs
from .image import ImagePlugin

# Module-level logger, named after this module.
logger = init_logger(__name__)

# Memoized loaders: repeated calls with the same (hashable) arguments reuse
# the previously returned object. maxsize=128 is lru_cache's default bound,
# spelled out here for clarity.
cached_get_video_processor = lru_cache(maxsize=128)(get_video_processor)
cached_get_tokenizer = lru_cache(maxsize=128)(get_tokenizer)

# Type alias for the video payloads accepted by this module.
# The members are written as string forward references to numpy's ndarray.
VideoInput = Union[
    "np.ndarray",  # single video input
    List["np.ndarray"],  # several videos; the default input mapper in this
    # file currently rejects this form with NotImplementedError
    # TODO: support more types
    # List[Image.Image], List[List[Image.Image]],
    # "torch.Tensor",
    # List["torch.Tensor"],
    # List[List["np.ndarray"]],
    # List[List["torch.Tensor"]],
]


class VideoPlugin(ImagePlugin):
    """Plugin for video data."""

    def get_data_key(self) -> str:
        """Return the key under which this plugin's data is registered."""
        return "video"

    def _get_hf_video_processor(self, model_config: ModelConfig):
        """Return the HuggingFace video processor for ``model_config.model``.

        The lookup goes through the ``lru_cache``-wrapped
        ``cached_get_video_processor``, so every forwarded argument
        (including the values in ``mm_processor_kwargs``) must be hashable.

        Args:
            model_config: Supplies the model identifier, the
                ``trust_remote_code`` flag and the optional
                ``mm_processor_kwargs`` overrides.

        Returns:
            Whatever ``get_video_processor`` returns for these arguments.
        """
        mm_processor_kwargs = ({} if model_config.mm_processor_kwargs is None
                               else model_config.mm_processor_kwargs)
        # We don't explicitly check kwarg overrides to the HF class
        # since the automodel just takes kwargs, so we can't inspect it
        return cached_get_video_processor(
            model_config.model,
            trust_remote_code=model_config.trust_remote_code,
            **mm_processor_kwargs)

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: MultiModalData[object],
    ) -> MultiModalInputs:
        """Convert raw video data into model inputs via the HF processor.

        Args:
            ctx: Input context; only ``ctx.model_config`` is read here.
            data: The video payload. Only a single ``np.ndarray`` is
                currently handled.

        Returns:
            A ``MultiModalInputs`` built from the ``.data`` of the processor
            output (requested with ``return_tensors="pt"``).

        Raises:
            RuntimeError: If no HuggingFace video processor is available.
            NotImplementedError: If ``data`` is a list of ``np.ndarray``
                (multiple videos per prompt).
            TypeError: If ``data`` is of any other type.
            Exception: Any error raised by the processor is logged and
                re-raised unchanged.
        """
        model_config = ctx.model_config

        # single video input as np.ndarray
        if isinstance(data, np.ndarray):
            video_processor = self._get_hf_video_processor(model_config)
            if video_processor is None:
                raise RuntimeError("No HuggingFace processor is available "
                                   "to process the video object")
            try:
                batch_data = video_processor(data, return_tensors="pt").data
            except Exception:
                # Log for context, then let the original error propagate.
                logger.error("Failed to process video (%s)", data)
                raise

            return MultiModalInputs(batch_data)
        elif is_list_of(data, np.ndarray):
            raise NotImplementedError(
                "Multi video for a prompt is not supported yet")

        raise TypeError(f"Invalid video type: {type(data)}")

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        """Return the fixed default of 4096; ``ctx`` is not consulted."""
        return 4096