"platforms/cuda2/src/kernels/customGBValueN2.cu" did not exist on "9a4e2b0209278f73e097ac33cdad0def870a09b6"
video.py 4.27 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
from dataclasses import dataclass
from functools import lru_cache
6
from typing import Any, ClassVar, Literal
7
8
9
10
11
12

import numpy as np
import numpy.typing as npt
from huggingface_hub import hf_hub_download
from PIL import Image

13
from vllm.multimodal.media.audio import load_audio_pyav
14

15
16
17
18
19
20
21
22
23
from .base import get_cache_dir


@lru_cache
def download_video_asset(filename: str) -> str:
    """
    Resolve a local path for a test video, fetching it from the Hugging Face
    dataset repo ``raushan-testing-hf/videos-test`` when it is not already
    present at ``<cache dir>/video-example-data/<filename>``.

    Results are memoized per ``filename`` by ``lru_cache``.

    Args:
        filename: Name of the video file inside the dataset repo.

    Returns:
        The path of the video file as a string.
    """
    asset_dir = get_cache_dir() / "video-example-data"
    asset_dir.mkdir(parents=True, exist_ok=True)

    local_file = asset_dir / filename
    if local_file.exists():
        return str(local_file)

    # NOTE(review): the download is directed at ``cache_dir``, so the file
    # presumably lands in the hub's cache layout below ``asset_dir`` rather
    # than at ``local_file`` -- confirm whether the shortcut above ever fires.
    return hf_hub_download(
        repo_id="raushan-testing-hf/videos-test",
        filename=filename,
        repo_type="dataset",
        cache_dir=asset_dir,
    )


def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
    """
    Decode a video file into a stacked array of RGB frames.

    Frames are sampled at evenly spaced indices (``np.linspace``) over the
    frame count reported by OpenCV; only the selected frames are decompressed.

    Args:
        path: Path of the video file to read.
        num_frames: Number of frames to sample. Any value ``<= 0`` selects
            every frame reported for the video.

    Returns:
        The sampled frames stacked along a new leading axis, each converted
        from OpenCV's BGR channel order to RGB.

    Raises:
        ValueError: If the video cannot be opened, or if fewer frames than
            requested (or no frames at all) could be decoded.
    """
    import cv2

    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video file {path}")

    frames = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        num_frames = num_frames if num_frames > 0 else total_frames

        # A set gives O(1) membership checks inside the per-frame loop
        # (testing against the NumPy array scans it on every iteration).
        sampled = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        wanted_indices = set(sampled.tolist())

        for idx in range(total_frames):
            ok = cap.grab()  # advance to the next frame without decoding it
            if not ok:
                break
            if idx in wanted_indices:  # only decompress the frames we need
                ret, frame = cap.retrieve()
                if ret:
                    # OpenCV uses BGR format, we need to convert it to RGB
                    # for PIL and transformers compatibility
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always free the decoder handle, even if decoding raised.
        cap.release()

    # Validate before stacking so an empty decode reports the descriptive
    # error below instead of NumPy's generic "need at least one array".
    if not frames or len(frames) < num_frames:
        raise ValueError(
            f"Could not read enough frames from video file {path}"
            f" (expected {num_frames} frames, got {len(frames)})"
        )
    return np.stack(frames)


71
def video_to_pil_images_list(path: str, num_frames: int = -1) -> list[Image.Image]:
    """
    Decode a video with ``video_to_ndarrays`` and wrap each frame as a PIL image.

    Args:
        path: Path of the video file to read.
        num_frames: Number of frames to sample; forwarded unchanged to
            ``video_to_ndarrays``.

    Returns:
        One ``PIL.Image.Image`` per decoded frame, in decode order.
    """
    images = []
    for frame_array in video_to_ndarrays(path, num_frames):
        images.append(Image.fromarray(frame_array))
    return images
74
75


76
def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
    """
    Build the metadata dictionary describing a (possibly subsampled) video.

    Args:
        path: Path of the video file to inspect.
        num_frames: Number of frames the caller intends to use. Any value
            ``<= 0`` or larger than the frame count reported by OpenCV is
            replaced by that frame count.

    Returns:
        A dict with the keys ``total_num_frames``, ``fps``, ``duration``,
        ``video_backend``, ``frames_indices`` and ``do_sample_frames``.

    Raises:
        ValueError: If the video cannot be opened.
    """
    import cv2

    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video file {path}")

    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
    finally:
        # Only the two properties above are needed; free the handle right away.
        cap.release()

    duration = total_frames / fps if fps > 0 else 0

    # Treat every non-positive request as "all frames", matching how
    # video_to_ndarrays interprets num_frames.
    if num_frames <= 0 or num_frames > total_frames:
        num_frames = total_frames

    metadata = {
        "total_num_frames": num_frames,
        # NOTE(review): this is duration divided by frame count (seconds per
        # frame), not frames per second. Kept as-is because downstream
        # consumers may rely on it -- confirm the intended semantics.
        # The guard avoids ZeroDivisionError for videos reporting no frames.
        "fps": duration / num_frames if num_frames > 0 else 0,
        "duration": duration,
        "video_backend": "opencv",
        "frames_indices": list(range(num_frames)),
        # extra field used to control hf processor's video
        # sampling behavior
        "do_sample_frames": num_frames == total_frames,
    }
    return metadata


103
104
105
# Closed set of asset names accepted by VideoAsset.name.
VideoAssetName = Literal["baby_reading"]


106
107
@dataclass(frozen=True)
class VideoAsset:
    """
    Handle to a named test video, exposing it in several decoded forms.

    Each property resolves the file through ``download_video_asset`` and
    decodes on access using ``num_frames`` as the sampling count.
    """

    # Logical asset name; must be a key of _NAME_TO_FILE.
    name: VideoAssetName
    # Frame count forwarded to the decoding helpers (default -1).
    num_frames: int = -1

    # Maps each asset name to its file name in the dataset repo.
    _NAME_TO_FILE: ClassVar[dict[VideoAssetName, str]] = {
        "baby_reading": "sample_demo_1.mp4",
    }

    @property
    def filename(self) -> str:
        """File name registered for this asset's ``name``."""
        name_to_file = self._NAME_TO_FILE
        return name_to_file[self.name]

    @property
    def video_path(self) -> str:
        """Local path of the video file, as returned by ``download_video_asset``."""
        asset_file = self.filename
        return download_video_asset(asset_file)

    @property
    def pil_images(self) -> list[Image.Image]:
        """Frames of the video as a list of PIL images."""
        return video_to_pil_images_list(self.video_path, self.num_frames)

    @property
    def np_ndarrays(self) -> npt.NDArray:
        """Frames of the video as returned by ``video_to_ndarrays``."""
        return video_to_ndarrays(self.video_path, self.num_frames)

    @property
    def metadata(self) -> dict[str, Any]:
        """Metadata dictionary produced by ``video_get_metadata``."""
        return video_get_metadata(self.video_path, self.num_frames)

    def get_audio(self, sampling_rate: float | None = None) -> npt.NDArray:
        """
        Read the audio data of the video asset (used in Qwen2.5-Omni examples).

        ``sampling_rate`` is passed to ``load_audio_pyav`` as ``sr``; the first
        element of that call's result is returned.

        See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
        """
        loaded = load_audio_pyav(self.video_path, sr=sampling_rate)
        return loaded[0]