# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math
from abc import abstractmethod
from io import BytesIO
from typing import Any, ClassVar, Literal, NamedTuple, cast

import numpy as np
import numpy.typing as npt

from vllm.logger import init_logger
from vllm.utils.import_utils import PlaceholderModule
from vllm.utils.registry import ExtensionManager

try:
    import cv2
    import cv2.videoio_registry as vr
except ImportError:
    cv2 = PlaceholderModule("cv2")
    vr = PlaceholderModule("cv2").placeholder_attr("videoio_registry")

try:
    import av
except ImportError:
    av = PlaceholderModule("av")  # type: ignore[assignment]

logger = init_logger(__name__)


def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
    """Resize every frame of a 4-D frame stack to ``size``.

    Args:
        frames: Array unpacked as ``(num_frames, height, width, channels)``.
        size: Target ``(height, width)``.

    Returns:
        A new array of shape ``(num_frames, size[0], size[1], channels)``
        with the same dtype as ``frames``.
    """
    num_frames, _, _, channels = frames.shape
    new_height, new_width = size
    resized_frames = np.empty(
        (num_frames, new_height, new_width, channels), dtype=frames.dtype
    )
    for i, frame in enumerate(frames):
        # cv2.resize takes the destination size as (width, height).
        resized_frames[i] = cv2.resize(frame, (new_width, new_height))
    return resized_frames


def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """Scale the height and width of every frame by ``size_factor``.

    The scaled dimensions are truncated to integers before resizing.
    """
    _, height, width, _ = frames.shape
    new_height = int(height * size_factor)
    new_width = int(width * size_factor)
    return resize_video(frames, (new_height, new_width))


def sample_frames_from_video(frames: npt.NDArray, num_frames: int) -> npt.NDArray:
    """Pick ``num_frames`` evenly spaced frames along the first axis.

    ``num_frames == -1`` returns ``frames`` unchanged.
    """
    total_frames = frames.shape[0]
    if num_frames == -1:
        return frames

    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    sampled_frames = frames[frame_indices, ...]
    return sampled_frames


class VideoTargetMetadata(NamedTuple):
    """Metadata represents target video."""

    num_frames: int
    fps: float
    max_duration: float


class VideoSourceMetadata(NamedTuple):
    """Metadata represents source video."""

    total_frames_num: int
    original_fps: float
    duration: float


class VideoLoader:
    """Base interface for video loaders registered in the loader registry."""

    @classmethod
    def compute_frames_index_to_sample(
        cls,
        source: VideoSourceMetadata,
        target: VideoTargetMetadata,
        **kwargs,
    ) -> list[int]:
        """Return the list of frame indices to sample from the video."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def load_bytes(
        cls,
        data: bytes,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Load video frames from bytes and return (frames_array, metadata_dict)."""
        raise NotImplementedError

    @classmethod
    def create_hf_metadata(
        cls,
        source: VideoSourceMetadata,
        valid_frame_indices: list[int],
        video_backend: str,
    ) -> dict[str, Any]:
        """Build the metadata dict returned alongside the decoded frames.

        ``do_sample_frames`` is ``True`` only when the number of loaded
        frames equals the source frame count.
        """
        return {
            "total_num_frames": source.total_frames_num,
            "fps": source.original_fps,
            "duration": source.duration,
            "video_backend": video_backend,
            "frames_indices": valid_frame_indices,
            "do_sample_frames": len(valid_frame_indices) == source.total_frames_num,
        }


VIDEO_LOADER_REGISTRY = ExtensionManager()


class OpenCVVideoBackendMixin:
    """OpenCV helpers: opening a capture, reading metadata, reading frames."""

    @staticmethod
    def get_cv2_video_api():
        """Return the first usable stream-buffered OpenCV backend, or ``None``.

        Plugin (non built-in) backends are skipped when their reported
        ABI/API version is older than ABI 1 / API 2.
        """
        api_pref = None
        for backend in vr.getStreamBufferedBackends():
            if not vr.hasBackend(backend):
                continue
            if not vr.isBackendBuiltIn(backend):
                _, abi, api = vr.getStreamBufferedBackendPluginVersion(backend)
                if abi < 1 or (abi == 1 and api < 2):
                    continue
            api_pref = backend
            break
        return api_pref

    @classmethod
    def open_video_capture(cls, data: bytes) -> "cv2.VideoCapture":
        """Open an in-memory video with OpenCV.

        Raises:
            ValueError: If the capture cannot be opened.
        """
        backend = cls.get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        if not cap.isOpened():
            raise ValueError("Could not open video stream")
        return cap

    @staticmethod
    def get_video_metadata(cap: "cv2.VideoCapture") -> VideoSourceMetadata:
        """Read frame count and FPS from ``cap`` and derive the duration.

        The duration is ``0`` when the reported FPS is not positive.
        """
        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames_num / original_fps if original_fps > 0 else 0
        return VideoSourceMetadata(
            total_frames_num=total_frames_num,
            original_fps=original_fps,
            duration=duration,
        )

    @classmethod
    def _can_use_for_recovery(
        cls,
        idx: int,
        failed_frames: list[int],
        next_target_map: dict[int, int],
        total_frames: int,
    ) -> bool:
        """Check if current frame can recover the oldest failed frame."""
        if not failed_frames:
            return False
        oldest_failed = failed_frames[0]
        limit = next_target_map.get(oldest_failed, total_frames)
        return idx < limit

    @classmethod
    def _read_frames_with_recovery(
        cls,
        cap: "cv2.VideoCapture",
        frame_indices: list[int],
        total_frames: int,
    ) -> tuple[npt.NDArray, list[int], dict[int, int]]:
        """
        Read frames with dynamic window forward-scan recovery.

        When a target frame fails to load, the next successfully grabbed
        frame (before the next target frame) will be used to recover it.

        Args:
            cap: OpenCV VideoCapture object
            frame_indices: Sorted list of target frame indices to load
            total_frames: Total number of frames in the video

        Returns:
            Tuple of (frames_array, valid_frame_indices, recovered_map)
            - frames_array: Array of loaded frames
            - valid_frame_indices: List of frame indices that were loaded
            - recovered_map: Dict mapping recovered_idx -> source_idx
        """
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        assert width > 0 and height > 0, (
            f"Invalid video frame size: width={width}, height={height}"
        )

        frame_idx_set = set(frame_indices)
        # -1 makes the scan loop a no-op when there is nothing to load.
        max_frame_idx = frame_indices[-1] if frame_indices else -1

        # Build map: target_idx -> next_target_idx (for recovery window)
        next_target_map: dict[int, int] = dict(zip(frame_indices, frame_indices[1:]))
        if frame_indices:
            next_target_map[frame_indices[-1]] = total_frames

        frames_list: list[npt.NDArray] = []
        valid_frame_indices: list[int] = []
        # Failed targets still inside their recovery window, oldest first.
        failed_frames_idx: list[int] = []
        # Failed targets whose recovery window closed without a replacement.
        unrecoverable_idx: list[int] = []
        recovered_map: dict[int, int] = {}

        for idx in range(max_frame_idx + 1):
            # Evict failures whose window has closed; otherwise a stale head
            # entry would block recovery of every later failed frame.
            while failed_frames_idx and idx >= next_target_map.get(
                failed_frames_idx[0], total_frames
            ):
                unrecoverable_idx.append(failed_frames_idx.pop(0))

            is_target_frame = idx in frame_idx_set

            # Attempt to grab the current frame
            ok = cap.grab()
            if not ok:
                if is_target_frame:
                    logger.warning(
                        "Failed to grab frame %d during video loading.",
                        idx,
                    )
                    failed_frames_idx.append(idx)
                continue

            # Check if we should retrieve: target frame OR can recover a failed one
            can_recover = cls._can_use_for_recovery(
                idx, failed_frames_idx, next_target_map, total_frames
            )

            if is_target_frame or can_recover:
                ret, frame = cap.retrieve()
                if ret and frame is not None and frame.size > 0:
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames_list.append(rgb_frame)
                    valid_frame_indices.append(idx)

                    if can_recover:
                        recovered_idx = failed_frames_idx.pop(0)
                        recovered_map[recovered_idx] = idx
                        logger.info(
                            "Recovered frame %d using frame %d (delay: %d)",
                            recovered_idx,
                            idx,
                            idx - recovered_idx,
                        )
                elif is_target_frame:
                    logger.warning(
                        "Failed to retrieve frame %d during video loading.",
                        idx,
                    )
                    failed_frames_idx.append(idx)

        # Log any remaining failed frames
        for failed_idx in unrecoverable_idx + failed_frames_idx:
            logger.warning(
                "Frame %d could not be recovered (end of video).",
                failed_idx,
            )

        # Stack frames
        if frames_list:
            frames = np.stack(frames_list)
        else:
            frames = np.empty((0, height, width, 3), dtype=np.uint8)

        return frames, valid_frame_indices, recovered_map

    @classmethod
    def _read_frames_no_recovery(
        cls,
        cap,
        frame_indices: set[int],
        max_frame_idx: int,
    ) -> tuple[npt.NDArray, list[int]]:
        """Sequentially grab frames ``0..max_frame_idx`` and keep the targets.

        Frames that fail to grab or retrieve are skipped with a warning.

        Returns:
            Tuple of (frames_array, valid_frame_indices); the array is
            truncated to the number of frames actually loaded.
        """
        num_expected_frames = len(frame_indices)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8)

        i = 0
        valid_frame_indices = []
        for idx in range(max_frame_idx + 1):
            ok = cap.grab()
            if not ok:
                # Frame is broken/unreadable, log warning
                if idx in frame_indices:
                    logger.warning(
                        "Failed to grab frame %d during video loading. "
                        "This frame will be skipped.",
                        idx,
                    )
                continue
            if idx in frame_indices:
                ret, frame = cap.retrieve()
                if ret:
                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    valid_frame_indices.append(idx)
                    i += 1
                else:
                    # retrieve() failed even though grab() succeeded
                    logger.warning(
                        "Failed to retrieve frame %d during video loading. "
                        "This frame will be skipped.",
                        idx,
                    )

        valid_num_frames = len(valid_frame_indices)
        if valid_num_frames < num_expected_frames:
            logger.warning(
                "Video loading completed with %d broken/unreadable frames. "
                "Expected %d frames but only loaded %d frames.",
                num_expected_frames - valid_num_frames,
                num_expected_frames,
                valid_num_frames,
            )

        return frames[:valid_num_frames], valid_frame_indices

    @classmethod
    def read_frames(
        cls,
        cap: "cv2.VideoCapture",
        frame_idx: list[int],
        total_frames_num: int,
        *,
        frame_recovery: bool = False,
    ) -> tuple[npt.NDArray, list[int]]:
        """Read the frames listed in ``frame_idx`` from ``cap``.

        Args:
            cap: OpenCV VideoCapture object.
            frame_idx: Target frame indices to load.
            total_frames_num: Total number of frames in the video.
            frame_recovery: Use the forward-scan recovery reader instead of
                the plain sequential reader.

        Returns:
            Tuple of (frames_array, valid_frame_indices).
        """
        if frame_recovery:
            num_frames_to_sample = len(frame_idx)
            frames, valid_frame_indices, recovered_map = cls._read_frames_with_recovery(
                cap, frame_idx, total_frames_num
            )
            if recovered_map:
                logger.info(
                    "Frame recovery: %d frames recovered using forward scan.",
                    len(recovered_map),
                )
        else:
            frame_idx_set = set(frame_idx)
            num_frames_to_sample = len(frame_idx_set)
            # default=-1 yields an empty scan instead of raising when no
            # frame indices were requested.
            frames, valid_frame_indices = cls._read_frames_no_recovery(
                cap, frame_idx_set, max(frame_idx, default=-1)
            )

        valid_num_frames = len(valid_frame_indices)
        if valid_num_frames < num_frames_to_sample:
            logger.warning(
                "Video loading completed with %d broken/unreadable frames. "
                "Expected to sample %d frames but only loaded %d frames.",
                num_frames_to_sample - valid_num_frames,
                num_frames_to_sample,
                valid_num_frames,
            )

        return frames, valid_frame_indices


class PyAVVideoBackendMixin:
    """PyAV (in-process FFmpeg bindings) codec utilities.

    Reads stream metadata and decodes target frames via per-frame
    ``container.seek()``. The seek releases the GIL between frames and
    scales with the number of sampled frames rather than the video length,
    enabling concurrent decoding under serving load.
    """

    @staticmethod
    def get_metadata(
        container: "av.container.InputContainer",
    ) -> VideoSourceMetadata:
        """Read frame count, FPS and duration from the first video stream.

        When the stream reports no frame count, it is estimated as
        ``int(duration * fps)`` if both values are positive.

        Raises:
            ValueError: If the container has no video stream.
        """
        if not container.streams.video:
            raise ValueError("No video streams found in container")

        stream = container.streams.video[0]
        total_frames = stream.frames or 0
        fps = float(stream.average_rate) if stream.average_rate else 0.0
        duration = float(stream.duration * stream.time_base) if stream.duration else 0.0
        if total_frames == 0 and duration > 0 and fps > 0:
            total_frames = int(duration * fps)
        return VideoSourceMetadata(total_frames, fps, duration)

    @staticmethod
    def decode_frames(
        container: "av.container.InputContainer",
        frame_indices: list[int],
        fps: float,
        duration: float,
    ) -> tuple[npt.NDArray, list[int]]:
        """Decode target frames via per-frame seek + keyframe decode."""
        stream = container.streams.video[0]
        # SLICE parallelizes within a single frame without the
        # one-frame-per-thread latency penalty of FRAME threading.
        stream.thread_type = "SLICE"
        time_base = stream.time_base

        frames_list: list[npt.NDArray] = []
        valid_indices: list[int] = []

        frame_interval = 1.0 / fps if fps > 0 else 0.1
        # Clamp seek targets to one frame interval before the reported end.
        max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf")

        for idx in frame_indices:
            ts = min(idx / fps, max_ts) if fps > 0 else 0.0
            pts = int(ts / time_base)
            container.seek(pts, stream=stream)
            frame = next(container.decode(video=0), None)
            if frame is not None:
                frames_list.append(frame.to_ndarray(format="rgb24"))
                valid_indices.append(idx)

        if not frames_list:
            return np.empty((0,), dtype=np.uint8), valid_indices
        return np.stack(frames_list), valid_indices


@VIDEO_LOADER_REGISTRY.register("opencv")
class VideoBackend(VideoLoader, OpenCVVideoBackendMixin, PyAVVideoBackendMixin):
    """Uniform-sampling video backend.

    Samples ``num_frames`` uniformly across the video (or one frame every
    ``1/fps`` seconds, whichever produces fewer frames). The decoding codec
    is selected via the ``backend`` kwarg (``"opencv"`` or ``"pyav"``), which
    can be passed through ``--media-io-kwargs``. Defaults to ``"opencv"``.
    """

    _sampling_suffix: ClassVar[str] = ""

    @classmethod
    def compute_frames_index_to_sample(
        cls,
        source: VideoSourceMetadata,
        target: VideoTargetMetadata,
        **kwargs,
    ) -> list[int]:
        """Return uniformly spaced frame indices for the requested target.

        At least one index is always returned.
        """
        total_frames_num = source.total_frames_num
        duration = source.duration
        num_frames = target.num_frames
        fps = target.fps

        # resample video to target num_frames and fps
        # - the minimum of the two will be used
        num_frames_to_sample = total_frames_num
        if num_frames > 0:
            num_frames_to_sample = min(num_frames, total_frames_num)
        if fps > 0:
            num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps))
        num_frames_to_sample = max(1, num_frames_to_sample)

        if num_frames_to_sample == total_frames_num:
            return list(range(num_frames_to_sample))
        return np.linspace(
            0, total_frames_num - 1, num_frames_to_sample, dtype=int
        ).tolist()

    @classmethod
    def _prepare_source(cls, source: VideoSourceMetadata) -> VideoSourceMetadata:
        """Sampling-algorithm-specific metadata adjustment hook."""
        return source

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: int = -1,
        max_duration: int = 300,
        frame_recovery: bool = False,
        *,
        backend: Literal["opencv", "pyav"] = "opencv",
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Load sampled frames from raw video bytes.

        Args:
            data: Raw video bytes.
            num_frames: Target number of frames to sample (``-1`` for all).
            fps: Target FPS for sampling (``-1`` for original).
            max_duration: Maximum duration in seconds — only used by the
                dynamic subclass; ignored here.
            frame_recovery: Enable forward-scan recovery for failed frames.
                Only honored by the OpenCV codec.
            backend: Decoding codec — ``"opencv"`` or ``"pyav"``.

        Returns:
            Tuple of ``(frames_array, metadata_dict)``.

        Raises:
            ValueError: If ``backend`` is not a known codec.
        """
        target = VideoTargetMetadata(
            num_frames=num_frames, fps=fps, max_duration=max_duration
        )

        if backend == "opencv":
            cap = cls.open_video_capture(data)
            try:
                source = cls._prepare_source(cls.get_video_metadata(cap))
                frame_idx = cls.compute_frames_index_to_sample(
                    source=source, target=target, **kwargs
                )
                frames, valid = cls.read_frames(
                    cap,
                    frame_idx,
                    total_frames_num=source.total_frames_num,
                    frame_recovery=frame_recovery,
                )
            finally:
                # Free the decoder even when sampling or decoding raises.
                cap.release()
        elif backend == "pyav":
            assert not frame_recovery, (
                "frame_recovery is only available for `opencv` backend"
            )
            with av.open(BytesIO(data)) as container:
                source = cls._prepare_source(cls.get_metadata(container))
                frame_idx = cls.compute_frames_index_to_sample(
                    source=source, target=target, **kwargs
                )
                frames, valid = cls.decode_frames(
                    container, frame_idx, source.original_fps, source.duration
                )
        else:
            raise ValueError(
                f"Unknown video codec backend {backend!r}; "
                "valid options: 'opencv', 'pyav'."
            )

        if len(valid) < len(frame_idx):
            logger.warning(
                "%s video loading: expected %d frames but got %d.",
                backend,
                len(frame_idx),
                len(valid),
            )

        return frames, cls.create_hf_metadata(
            source=source,
            video_backend=f"{backend}{cls._sampling_suffix}",
            valid_frame_indices=valid,
        )


@VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
class DynamicVideoBackend(VideoBackend):
    """Duration-aware dynamic-sampling video backend.

    Samples at ``fps`` up to ``max_duration`` seconds, falling back to
    uniform sampling across the full duration when the video is longer than
    ``max_duration``. Codec is selectable the same way as
    :class:`VideoBackend`.
    """

    _sampling_suffix: ClassVar[str] = "_dynamic"

    @classmethod
    def _prepare_source(cls, source: VideoSourceMetadata) -> VideoSourceMetadata:
        """Fill in a missing duration from the frame count and FPS."""
        # Estimate duration from frame count and fps when the container
        # does not report it (common for WebM/streaming inputs).
        if source.duration:
            return source

        if source.original_fps > 0:
            max_frame_idx = source.total_frames_num - 1
            duration = round(max_frame_idx / source.original_fps) + 1
        else:
            duration = 0
        return VideoSourceMetadata(
            source.total_frames_num, source.original_fps, duration
        )

    @classmethod
    def compute_frames_index_to_sample(
        cls,
        source: VideoSourceMetadata,
        target: VideoTargetMetadata,
        **kwargs,
    ) -> list[int]:
        """Return sorted, de-duplicated frame indices for dynamic sampling.

        The result may be empty when ``floor(duration * fps)`` is zero.
        """
        total_frames_num = source.total_frames_num
        duration = source.duration
        original_fps = source.original_fps
        max_duration = target.max_duration
        fps = target.fps
        max_frame_idx = source.total_frames_num - 1

        # Refer to:
        # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
        frame_indices_list: list[int]
        if duration <= max_duration:
            n = int(math.floor(duration * fps))
            frame_indices_list = sorted(
                {
                    min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
                    for i in range(n)
                }
            )
        else:
            num_samples = int(max_duration * fps)
            if num_samples >= total_frames_num:
                frame_indices_list = list(range(total_frames_num))
            else:
                target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
                frame_indices_list = sorted(
                    {
                        min(max_frame_idx, int(math.ceil(t * original_fps)))
                        for t in target_seconds
                    }
                )
        return frame_indices_list

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: int = 2,
        max_duration: int = 300,
        frame_recovery: bool = False,
        *,
        backend: Literal["opencv", "pyav"] = "opencv",
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Same as :meth:`VideoBackend.load_bytes`, with ``fps`` defaulting to 2."""
        return super().load_bytes(
            data,
            num_frames=num_frames,
            fps=fps,
            max_duration=max_duration,
            frame_recovery=frame_recovery,
            backend=backend,
            **kwargs,
        )


@VIDEO_LOADER_REGISTRY.register("molmo2")
class Molmo2VideoBackend(VideoLoader, OpenCVVideoBackendMixin):
    """Video loader registered as ``"molmo2"``; decodes with OpenCV."""

    @classmethod
    def get_candidate_target_fps(
        cls,
        video_fps: float,
        sampling_fps: float,
        max_fps: float = 8.0,
    ) -> list[float]:
        """
        Return the subset of `video_fps` factors that remain multiples of
        `sampling_fps`.

        All three arguments are truncated to integers before use, and
        candidates above `max_fps` are excluded.

        Examples:
            >>> Molmo2VideoBackend.get_candidate_target_fps(video_fps=6, sampling_fps=2)
            [2.0, 6.0]
            >>> Molmo2VideoBackend.get_candidate_target_fps(video_fps=5, sampling_fps=1)
            [1.0, 5.0]
            >>> Molmo2VideoBackend.get_candidate_target_fps(video_fps=2, sampling_fps=2)
            [2.0]
            >>> Molmo2VideoBackend.get_candidate_target_fps(video_fps=5, sampling_fps=2)
            Traceback (most recent call last):
                ...
            ValueError: sampling_fps=2 must divide video_fps=5.

        Raises:
            ValueError: If `sampling_fps` is ``None``, if either rate is not
                positive, or if `sampling_fps` does not divide `video_fps`.
        """
        # Must run before int(): int(None) would raise TypeError instead.
        if sampling_fps is None:
            raise ValueError("sampling_fps must be provided")

        video_fps = int(video_fps)
        sampling_fps = int(sampling_fps)
        max_fps = int(max_fps)

        if video_fps <= 0 or sampling_fps <= 0:
            raise ValueError(
                "video_fps and sampling_fps must be positive "
                f"(got {video_fps}, {sampling_fps})"
            )
        if video_fps % sampling_fps != 0:
            raise ValueError(
                f"sampling_fps={sampling_fps} must divide video_fps={video_fps}."
            )

        candidates = []
        for candidate in range(sampling_fps, video_fps + 1, sampling_fps):
            if candidate > max_fps:
                break
            if video_fps % candidate == 0:
                candidates.append(float(candidate))
        return candidates

    @classmethod
    def get_target_fps(
        cls,
        video_fps: float,
        max_frames: int,
        total_frames: int,
        frame_sample_mode: str,
        candidate_target_fps: list[float],
    ) -> float | None:
        """
        Get the target fps that best spans the video and has the most frames
        sampled.

        Returns ``None`` when no candidate is selected.
        """
        num_frames_sampled = 0
        selected_target_fps = None
        for target_fps in candidate_target_fps:
            step_size = max(int(video_fps / target_fps), 1)
            num_frames_sampled_at_fps = int(total_frames / step_size)
            if num_frames_sampled == 0:
                if (
                    "uniform" in frame_sample_mode
                    and num_frames_sampled_at_fps > max_frames
                ):
                    break
                selected_target_fps = target_fps
                num_frames_sampled = num_frames_sampled_at_fps
            else:
                # the candidate sampling fps increases so frame count can't decrease
                assert num_frames_sampled <= num_frames_sampled_at_fps
                if num_frames_sampled_at_fps > max_frames:
                    # choose the sampling fps that spans the video
                    continue
                elif num_frames_sampled_at_fps > num_frames_sampled:
                    # both are less than max_frames; choose the one with higher
                    # density of frames sampled
                    selected_target_fps = target_fps
                    num_frames_sampled = num_frames_sampled_at_fps
        return selected_target_fps

    @classmethod
    def get_frame_times_and_chosen_fps(
        cls,
        selected_target_fps: float | None,
        total_frames: int,
        max_frames: int,
        video_fps: float,
    ) -> tuple[float | None, npt.NDArray]:
        """Return ``(selected_target_fps, frame_indices)``.

        With no selected fps, ``max_frames`` indices are spread uniformly
        over the video; otherwise every ``step_size``-th frame is taken and
        the result is truncated to ``max_frames`` entries.
        """
        if selected_target_fps is None:
            frame_indices = np.linspace(
                0, total_frames, max_frames, endpoint=False, dtype=int
            )
        else:
            step_size = max(int(video_fps / selected_target_fps), 1)
            frame_indices = np.arange(0, total_frames, step_size)
        if len(frame_indices) > max_frames:
            frame_indices = frame_indices[:max_frames]
        return selected_target_fps, frame_indices

    @classmethod
    def sample_times(
        cls,
        duration: float,
        max_frames: int,
        frame_sample_mode: str,
        max_fps: int | None,
        candidate_target_fps: list[float] | None = None,
        **kwargs,
    ) -> npt.NDArray:
        """Return the timestamps (seconds) to sample for the given mode.

        Raises:
            NotImplementedError: For any mode other than ``"fps"`` or
                ``"uniform_last_frame"``.
        """
        if frame_sample_mode == "fps":
            assert candidate_target_fps is not None
            # Try larger and larger FPSs until we hit one that can't span the video
            sampling_fps = candidate_target_fps[0]
            for candidate_fps in candidate_target_fps[1:]:
                if max_frames / candidate_fps < duration:
                    break
                sampling_fps = candidate_fps
            times = np.arange(0, max_frames) / sampling_fps
            times = times[times < duration]
            return times
        elif frame_sample_mode == "uniform_last_frame":
            if max_fps is not None:
                max_duration = (
                    max_frames - 1
                ) / max_fps  # -1 to include the last frame
                if max_duration < duration:
                    times = np.linspace(
                        0, duration, num=max_frames, endpoint=True, dtype=np.float64
                    )
                else:
                    times = np.arange(0.0, stop=duration, step=1 / max_fps)
                    times = np.concatenate([times, [duration]], axis=0)
                    assert len(times) <= max_frames
            else:
                times = np.linspace(
                    0, duration, num=max_frames, endpoint=True, dtype=np.float64
                )
            return times
        else:
            raise NotImplementedError(frame_sample_mode)

    @classmethod
    def compute_frames_index_to_sample(
        cls,
        source: VideoSourceMetadata,
        target: VideoTargetMetadata,
        **kwargs,
    ) -> list[int]:
        """Return frame indices for the ``frame_sample_mode`` given in kwargs.

        Reads ``max_fps`` and ``frame_sample_mode`` from ``kwargs``. With no
        ``frame_sample_mode``, every frame index is returned.

        Raises:
            NotImplementedError: For an unsupported ``frame_sample_mode``.
        """
        max_fps = kwargs.get("max_fps")
        frame_sample_mode = kwargs.get("frame_sample_mode")

        if frame_sample_mode is None:
            return list(range(0, source.total_frames_num))
        if frame_sample_mode not in {"uniform_last_frame", "fps"}:
            raise NotImplementedError(
                f"Unsupported frame_sample_mode: {frame_sample_mode}"
            )

        duration = source.duration
        video_fps = source.original_fps
        total_num_frames = source.total_frames_num
        num_frames = target.num_frames
        sampling_fps = target.fps

        if frame_sample_mode == "uniform_last_frame" and max_fps is not None:
            if total_num_frames <= 2:
                indices = np.arange(total_num_frames).astype(int)
            elif duration > (num_frames - 1) / max_fps:  # -1 to include the last frame
                # uniform fallback
                indices = np.linspace(
                    0,
                    total_num_frames - 1,
                    num=min(num_frames, total_num_frames),
                    endpoint=True,
                ).astype(int)
            else:
                float_indices = np.arange(
                    0.0,
                    stop=total_num_frames - 1,
                    step=float(video_fps / max_fps),
                )
                # Always end on the last frame of the video.
                if np.round(float_indices[-1]) != total_num_frames - 1:
                    float_indices = np.concatenate(
                        [float_indices, [total_num_frames - 1]], axis=0
                    )
                indices = np.round(float_indices).astype(int)
                assert indices[-1] < total_num_frames
                assert len(float_indices) <= num_frames
        elif frame_sample_mode == "uniform_last_frame":
            indices = np.linspace(
                0,
                total_num_frames - 1,
                num=min(num_frames, total_num_frames),
                endpoint=True,
            ).astype(int)
        elif frame_sample_mode == "fps":
            candidate_target_fps = cls.get_candidate_target_fps(video_fps, sampling_fps)
            selected_target_fps = cls.get_target_fps(
                video_fps,
                num_frames,
                total_num_frames,
                frame_sample_mode,
                candidate_target_fps,
            )
            _, indices = cls.get_frame_times_and_chosen_fps(
                selected_target_fps,
                total_num_frames,
                num_frames,
                video_fps,
            )

        return indices.tolist()

    @classmethod
    def load_bytes_opencv(
        cls,
        data: bytes,
        frame_sample_mode: str | None = None,
        num_frames: int = -1,
        max_fps: int = 2,
        sampling_fps: int = 2,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode sampled frames with OpenCV.

        Returns:
            Tuple of (frames_array, metadata_dict).
        """
        cap = cls.open_video_capture(data)
        try:
            source = OpenCVVideoBackendMixin.get_video_metadata(cap)
            target = VideoTargetMetadata(
                num_frames=num_frames,
                fps=sampling_fps,
                max_duration=source.duration,
            )
            frame_idx = cls.compute_frames_index_to_sample(
                source=source,
                target=target,
                frame_sample_mode=frame_sample_mode,
                max_fps=max_fps,
            )
            frames, valid_frame_indices = cls.read_frames(
                cap,
                frame_idx,
                total_frames_num=source.total_frames_num,
                frame_recovery=frame_recovery,
            )
        finally:
            # Free the decoder even when sampling or decoding raises.
            cap.release()

        metadata = cls.create_hf_metadata(
            source=source,
            video_backend="opencv",
            valid_frame_indices=valid_frame_indices,
        )
        return frames, metadata

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Load frames, taking sampling options from ``kwargs``.

        ``frame_sample_mode`` (default ``None``), ``max_fps`` (default 2)
        and ``sampling_fps`` (default 2) are popped from ``kwargs``; the
        remaining kwargs are forwarded to :meth:`load_bytes_opencv`.
        """
        frame_sample_mode = cast(str | None, kwargs.pop("frame_sample_mode", None))
        max_fps = cast(int, kwargs.pop("max_fps", 2))
        sampling_fps = cast(int, kwargs.pop("sampling_fps", 2))
        out = cls.load_bytes_opencv(
            data,
            frame_sample_mode,
            num_frames,
            max_fps,
            sampling_fps,
            **kwargs,
        )
        return out


@VIDEO_LOADER_REGISTRY.register("nemotron_vl")
class NemotronVLVideoBackend(VideoBackend):
    """:class:`VideoBackend` that also returns the raw video bytes in metadata."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: int = -1,
        max_duration: int = 300,
        frame_recovery: bool = False,
        *,
        backend: Literal["opencv", "pyav"] = "opencv",
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Load frames and add ``original_video_bytes`` to the metadata."""
        frames, metadata = super().load_bytes(
            data,
            num_frames=num_frames,
            fps=fps,
            max_duration=max_duration,
            frame_recovery=frame_recovery,
            backend=backend,
            **kwargs,
        )
        metadata = dict(metadata)
        metadata["original_video_bytes"] = data
        return frames, metadata


@VIDEO_LOADER_REGISTRY.register("openpangu")
class OpenCVDynamicOpenPanguVideoBackend(VideoLoader, OpenCVVideoBackendMixin):
    """Video loader registered as ``"openpangu"``; decodes with OpenCV."""

    @classmethod
    def compute_frames_index_to_sample(
        cls,
        source: VideoSourceMetadata,
        target: VideoTargetMetadata,
        **kwargs,
    ) -> list[int]:
        """Return frame indices sampled at evenly spaced timestamps.

        ``target.num_frames`` is the maximum number of frames to sample; a
        non-positive value means no cap. ``target.fps`` of ``-1`` disables
        the FPS limit.

        Raises:
            ValueError: If ``target.fps`` is neither ``-1`` nor positive.
        """
        total_frames_num = source.total_frames_num
        original_fps = source.original_fps
        num_frames = target.num_frames
        fps = target.fps

        # The timestamp of the rightmost frame, cannot be used to calculate frame 0.
        if total_frames_num >= 1 and original_fps > 0:
            total_duration = (total_frames_num - 1) / original_fps
        else:
            total_duration = 0

        # `fps` is the FPS parameter passed in for sampling,
        # -1 indicates that sampling can be performed directly without FPS limitation.
        if fps > 0:
            # Num_frames is the maximum number of frames to sample.
            # If fewer frames are available at this sampling fps, cap
            # num_frames and shorten the sampled duration to match.
            max_frames_at_fps = int(total_duration * fps) + 1
            # A non-positive num_frames means "no cap"; without this branch it
            # would reach np.linspace as a negative sample count and raise.
            if num_frames <= 0 or num_frames >= max_frames_at_fps:
                num_frames = max_frames_at_fps
                # Timestamp of the rightmost sampled frame at the target fps.
                total_duration = min(total_duration, (num_frames - 1) / fps)
        elif fps != -1:
            raise ValueError(
                f"requires dataset fps is -1 or greater than 0 but got {fps}"
            )
        elif num_frames <= 0:
            # No FPS limit and no frame cap: sample every frame.
            num_frames = total_frames_num

        sample_frame_timestamps = np.linspace(
            0, total_duration, num_frames, dtype=float
        )
        frames_indices = [
            min(total_frames_num - 1, round(t * original_fps))
            for t in sample_frame_timestamps
        ]
        return frames_indices

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: int = 2,
        max_duration: int = 300,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video frames with dynamic sampling based on duration.

        Args:
            data: Raw video bytes
            num_frames: Maximum number of frames to sample; a non-positive
                value means no cap (default: -1)
            fps: Target FPS for sampling (default: 2)
            max_duration: Stored on the target metadata; not read by this
                backend's sampler (default: 300s)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)
        """
        cap = cls.open_video_capture(data)
        try:
            source = OpenCVVideoBackendMixin.get_video_metadata(cap)
            target = VideoTargetMetadata(
                num_frames=num_frames,
                fps=fps,
                max_duration=max_duration,
            )
            frame_indices_list = cls.compute_frames_index_to_sample(
                source=source,
                target=target,
            )
            frames, valid_frame_indices = cls.read_frames(
                cap,
                frame_indices_list,
                total_frames_num=source.total_frames_num,
                frame_recovery=frame_recovery,
            )
        finally:
            # Free the decoder even when sampling or decoding raises.
            cap.release()

        # Use transformers.video_utils.VideoMetadata format
        metadata = cls.create_hf_metadata(
            source=source,
            video_backend="opencv_dynamic",
            valid_frame_indices=valid_frame_indices,
        )
        return frames, metadata