Unverified Commit 6f2c71be authored by Jaseel Muhammad's avatar Jaseel Muhammad Committed by GitHub
Browse files

[Multimodal] Add PyAV video backend for concurrent video decoding (#39986)


Signed-off-by: default avatarJaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
Signed-off-by: default avatarIsotr0py <2037008807@qq.com>
Co-authored-by: default avatarIsotr0py <2037008807@qq.com>
Co-authored-by: default avatarIsotr0py <mozf@mail2.sysu.edu.cn>
parent 2463f00f
...@@ -6,7 +6,7 @@ import pytest ...@@ -6,7 +6,7 @@ import pytest
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import batched_tensors_equal from vllm.multimodal.inputs import batched_tensors_equal
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend from vllm.multimodal.video import DynamicVideoBackend, VideoBackend
from ...utils import build_model_context from ...utils import build_model_context
...@@ -70,9 +70,11 @@ def test_processor_override( ...@@ -70,9 +70,11 @@ def test_processor_override(
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"]) @pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2]) @pytest.mark.parametrize("fps", [2])
@pytest.mark.parametrize("backend", ["opencv", "pyav"])
def test_video_loader_consistency( def test_video_loader_consistency(
model_id: str, model_id: str,
fps: int, fps: int,
backend: str,
): ):
""" """
Ensure dynamic video loader (pre-sampled by loader) and normal video Ensure dynamic video loader (pre-sampled by loader) and normal video
...@@ -93,9 +95,11 @@ def test_video_loader_consistency( ...@@ -93,9 +95,11 @@ def test_video_loader_consistency(
with open(video_path, "rb") as f: with open(video_path, "rb") as f:
video_bytes = f.read() video_bytes = f.read()
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes) static_video, static_metadata = VideoBackend.load_bytes(
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes( video_bytes, backend=backend
video_bytes, fps=fps )
dynamic_video, dynamic_metadata = DynamicVideoBackend.load_bytes(
video_bytes, fps=fps, backend=backend
) )
# pre-sampled loader shouldn't read all frames # pre-sampled loader shouldn't read all frames
......
...@@ -71,7 +71,9 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch): ...@@ -71,7 +71,9 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
video_data = f.read() video_data = f.read()
loader = VIDEO_LOADER_REGISTRY.load("opencv") loader = VIDEO_LOADER_REGISTRY.load("opencv")
frames, metadata = loader.load_bytes(video_data, num_frames=-1) frames, metadata = loader.load_bytes(
video_data, num_frames=-1, backend="opencv"
)
# Verify metadata consistency: # Verify metadata consistency:
# frames_indices must match actual loaded frames # frames_indices must match actual loaded frames
...@@ -158,12 +160,12 @@ def test_video_recovery_simulated_failures(monkeypatch: pytest.MonkeyPatch): ...@@ -158,12 +160,12 @@ def test_video_recovery_simulated_failures(monkeypatch: pytest.MonkeyPatch):
# Test WITHOUT recovery - should have fewer frames due to failures # Test WITHOUT recovery - should have fewer frames due to failures
frames_no_recovery, meta_no = loader.load_bytes( frames_no_recovery, meta_no = loader.load_bytes(
video_data, num_frames=8, frame_recovery=False video_data, num_frames=8, frame_recovery=False, backend="opencv"
) )
# Test WITH recovery - should recover using next valid frames # Test WITH recovery - should recover using next valid frames
frames_with_recovery, meta_yes = loader.load_bytes( frames_with_recovery, meta_yes = loader.load_bytes(
video_data, num_frames=8, frame_recovery=True video_data, num_frames=8, frame_recovery=True, backend="opencv"
) )
# With recovery should have MORE frames than without # With recovery should have MORE frames than without
...@@ -214,12 +216,12 @@ def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch): ...@@ -214,12 +216,12 @@ def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):
# Test without recovery - frame 17 will be skipped # Test without recovery - frame 17 will be skipped
frames_no_recovery, meta_no_recovery = loader.load_bytes( frames_no_recovery, meta_no_recovery = loader.load_bytes(
video_data, num_frames=8, frame_recovery=False video_data, num_frames=8, frame_recovery=False, backend="opencv"
) )
# Test with recovery - frame 18 should fill in for frame 17 # Test with recovery - frame 18 should fill in for frame 17
frames_with_recovery, meta_with_recovery = loader.load_bytes( frames_with_recovery, meta_with_recovery = loader.load_bytes(
video_data, num_frames=8, frame_recovery=True video_data, num_frames=8, frame_recovery=True, backend="opencv"
) )
# Verify metadata consistency for both modes # Verify metadata consistency for both modes
...@@ -271,12 +273,16 @@ def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch): ...@@ -271,12 +273,16 @@ def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):
# Test without recovery # Test without recovery
frames_no_recovery, meta_no = loader.load_bytes( frames_no_recovery, meta_no = loader.load_bytes(
video_data, fps=2, max_duration=10, frame_recovery=False video_data,
fps=2,
max_duration=10,
frame_recovery=False,
backend="opencv",
) )
# Test with frame_recovery enabled # Test with frame_recovery enabled
frames_with_recovery, meta_with = loader.load_bytes( frames_with_recovery, meta_with = loader.load_bytes(
video_data, fps=2, max_duration=10, frame_recovery=True video_data, fps=2, max_duration=10, frame_recovery=True, backend="opencv"
) )
# Verify basic properties # Verify basic properties
...@@ -310,27 +316,81 @@ def dummy_video_path(tmp_path): ...@@ -310,27 +316,81 @@ def dummy_video_path(tmp_path):
return video_path return video_path
# ============================================================================
# PyAV Backend Tests
# ============================================================================
def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyPatch):
"""Test that the pyav codec backend can load frames from a valid video."""
with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
with open(dummy_video_path, "rb") as f:
video_data = f.read()
loader = VIDEO_LOADER_REGISTRY.load("opencv")
frames, metadata = loader.load_bytes(video_data, num_frames=8, backend="pyav")
assert frames.ndim == 4
assert frames.shape[3] == 3 # RGB
assert frames.shape[0] == 8
assert frames.shape[0] == len(metadata["frames_indices"])
assert metadata["video_backend"] == "pyav"
assert "total_num_frames" in metadata
assert "fps" in metadata
assert "duration" in metadata
def test_pyav_dynamic_backend_loads_frames(
dummy_video_path, monkeypatch: pytest.MonkeyPatch
):
"""Test that the pyav codec with dynamic sampling can load frames."""
with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
with open(dummy_video_path, "rb") as f:
video_data = f.read()
loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")
frames, metadata = loader.load_bytes(
video_data, fps=2, max_duration=10, backend="pyav"
)
assert frames.ndim == 4
assert frames.shape[3] == 3 # RGB
assert frames.shape[0] > 0
assert frames.shape[0] == len(metadata["frames_indices"])
assert metadata["video_backend"] == "pyav_dynamic"
@pytest.mark.parametrize( @pytest.mark.parametrize(
"backend, kwargs, expected_num_frames", "loader_key, kwargs, expected_num_frames",
[ [
# opencv: num_frames directly controls count # uniform sampling + opencv codec
pytest.param("opencv", {"num_frames": 32}, 32, id="opencv-num_frames"),
pytest.param("opencv", {"fps": 2}, 120, id="opencv-fps"),
pytest.param( pytest.param(
"opencv", "opencv",
{"num_frames": 500, "fps": 2}, {"num_frames": 32, "backend": "opencv"},
32,
id="opencv-num_frames",
),
pytest.param("opencv", {"fps": 2, "backend": "opencv"}, 120, id="opencv-fps"),
pytest.param(
"opencv",
{"num_frames": 500, "fps": 2, "backend": "opencv"},
120, 120,
id="opencv-num_frames_wins_fps", id="opencv-num_frames_wins_fps",
), ),
# dynamic sampling + opencv codec
pytest.param( pytest.param(
"opencv_dynamic", "opencv_dynamic",
{"fps": 1, "max_duration": 60}, {"fps": 1, "max_duration": 60, "backend": "opencv"},
60, 60,
id="opencv_dynamic-within_max_duration", id="opencv_dynamic-within_max_duration",
), ),
pytest.param( pytest.param(
"opencv_dynamic", "opencv_dynamic",
{"fps": 2, "max_duration": 30}, {"fps": 2, "max_duration": 30, "backend": "opencv"},
60, 60,
id="opencv_dynamic-exceeds_max_duration", id="opencv_dynamic-exceeds_max_duration",
), ),
...@@ -349,18 +409,45 @@ def dummy_video_path(tmp_path): ...@@ -349,18 +409,45 @@ def dummy_video_path(tmp_path):
119, 119,
id="molmo2-fps", id="molmo2-fps",
), ),
# uniform sampling + pyav codec (same frame counts as opencv)
pytest.param(
"opencv",
{"num_frames": 32, "backend": "pyav"},
32,
id="pyav-num_frames",
),
pytest.param("opencv", {"fps": 2, "backend": "pyav"}, 120, id="pyav-fps"),
pytest.param(
"opencv",
{"num_frames": 500, "fps": 2, "backend": "pyav"},
120,
id="pyav-num_frames_wins_fps",
),
# dynamic sampling + pyav codec
pytest.param(
"opencv_dynamic",
{"fps": 1, "max_duration": 60, "backend": "pyav"},
60,
id="pyav_dynamic-within_max_duration",
),
pytest.param(
"opencv_dynamic",
{"fps": 2, "max_duration": 30, "backend": "pyav"},
60,
id="pyav_dynamic-exceeds_max_duration",
),
], ],
) )
def test_video_loader_frames_sampling( def test_video_loader_frames_sampling(
dummy_video_path, dummy_video_path,
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
backend: str, loader_key: str,
kwargs: dict, kwargs: dict,
expected_num_frames: int, expected_num_frames: int,
): ):
"""Test video loader frames sampling functionality.""" """Test video loader frames sampling functionality."""
monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", backend) monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", loader_key)
loader = VIDEO_LOADER_REGISTRY.load(backend) loader = VIDEO_LOADER_REGISTRY.load(loader_key)
with open(dummy_video_path, "rb") as f: with open(dummy_video_path, "rb") as f:
long_video_bytes = f.read() long_video_bytes = f.read()
......
...@@ -829,9 +829,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -829,9 +829,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int( "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25") os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
), ),
# Backend for Video IO # Backend for Video IO — selects the frame-sampling algorithm.
# - "opencv": Default backend that uses OpenCV stream buffered backend. # - "opencv": uniform sampling.
# - "identity": Returns raw video bytes for model processor to handle. # - "opencv_dynamic": duration-aware dynamic sampling.
# - "identity": returns raw video bytes for model processor to handle.
# #
# Custom backend implementations can be registered # Custom backend implementations can be registered
# via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import math import math
from abc import abstractmethod from abc import abstractmethod
from io import BytesIO from io import BytesIO
from typing import Any, NamedTuple, cast from typing import Any, ClassVar, Literal, NamedTuple, cast
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
...@@ -19,6 +19,11 @@ except ImportError: ...@@ -19,6 +19,11 @@ except ImportError:
cv2 = PlaceholderModule("cv2") cv2 = PlaceholderModule("cv2")
vr = PlaceholderModule("cv2").placeholder_attr("videoio_registry") vr = PlaceholderModule("cv2").placeholder_attr("videoio_registry")
try:
import av
except ImportError:
av = PlaceholderModule("av") # type: ignore[assignment]
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -355,8 +360,75 @@ class OpenCVVideoBackendMixin: ...@@ -355,8 +360,75 @@ class OpenCVVideoBackendMixin:
return frames, valid_frame_indices return frames, valid_frame_indices
class PyAVVideoBackendMixin:
"""PyAV (in-process FFmpeg bindings) codec utilities.
Reads stream metadata and decodes target frames via per-frame
``container.seek()``. The seek releases the GIL between frames and
scales with the number of sampled frames rather than the video
length, enabling concurrent decoding under serving load.
"""
@staticmethod
def get_metadata(
container: "av.container.InputContainer",
) -> VideoSourceMetadata:
if not container.streams.video:
raise ValueError("No video streams found in container")
stream = container.streams.video[0]
total_frames = stream.frames or 0
fps = float(stream.average_rate) if stream.average_rate else 0.0
duration = float(stream.duration * stream.time_base) if stream.duration else 0.0
if total_frames == 0 and duration > 0 and fps > 0:
total_frames = int(duration * fps)
return VideoSourceMetadata(total_frames, fps, duration)
@staticmethod
def decode_frames(
container: "av.container.InputContainer",
frame_indices: list[int],
fps: float,
duration: float,
) -> tuple[npt.NDArray, list[int]]:
"""Decode target frames via per-frame seek + keyframe decode."""
stream = container.streams.video[0]
# SLICE parallelizes within a single frame without the
# one-frame-per-thread latency penalty of FRAME threading.
stream.thread_type = "SLICE"
time_base = stream.time_base
frames_list: list[npt.NDArray] = []
valid_indices: list[int] = []
frame_interval = 1.0 / fps if fps > 0 else 0.1
max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf")
for idx in frame_indices:
ts = min(idx / fps, max_ts) if fps > 0 else 0.0
pts = int(ts / time_base)
container.seek(pts, stream=stream)
frame = next(container.decode(video=0), None)
if frame is not None:
frames_list.append(frame.to_ndarray(format="rgb24"))
valid_indices.append(idx)
if not frames_list:
return np.empty((0,), dtype=np.uint8), valid_indices
return np.stack(frames_list), valid_indices
@VIDEO_LOADER_REGISTRY.register("opencv") @VIDEO_LOADER_REGISTRY.register("opencv")
class OpenCVVideoBackend(VideoLoader, OpenCVVideoBackendMixin): class VideoBackend(VideoLoader, OpenCVVideoBackendMixin, PyAVVideoBackendMixin):
"""Uniform-sampling video backend.
Samples ``num_frames`` uniformly across the video (or one frame every
``1/fps`` seconds, whichever produces fewer frames). The decoding codec
is selected via the ``backend`` kwarg (``"opencv"`` or ``"pyav"``),
which can be passed through ``--media-io-kwargs``. Defaults to
``"pyav"`` for concurrent decoding.
"""
_sampling_suffix: ClassVar[str] = ""
@classmethod @classmethod
def compute_frames_index_to_sample( def compute_frames_index_to_sample(
cls, cls,
...@@ -366,7 +438,6 @@ class OpenCVVideoBackend(VideoLoader, OpenCVVideoBackendMixin): ...@@ -366,7 +438,6 @@ class OpenCVVideoBackend(VideoLoader, OpenCVVideoBackendMixin):
) -> list[int]: ) -> list[int]:
total_frames_num = source.total_frames_num total_frames_num = source.total_frames_num
duration = source.duration duration = source.duration
num_frames = target.num_frames num_frames = target.num_frames
fps = target.fps fps = target.fps
# resample video to target num_frames and fps # resample video to target num_frames and fps
...@@ -376,16 +447,18 @@ class OpenCVVideoBackend(VideoLoader, OpenCVVideoBackendMixin): ...@@ -376,16 +447,18 @@ class OpenCVVideoBackend(VideoLoader, OpenCVVideoBackendMixin):
num_frames_to_sample = min(num_frames, total_frames_num) num_frames_to_sample = min(num_frames, total_frames_num)
if fps > 0: if fps > 0:
num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps)) num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps))
num_frames_to_sample = max(1, num_frames_to_sample) # at least one sample num_frames_to_sample = max(1, num_frames_to_sample)
if num_frames_to_sample == total_frames_num: if num_frames_to_sample == total_frames_num:
frame_idx = list(range(0, num_frames_to_sample)) return list(range(num_frames_to_sample))
else: return np.linspace(
uniform_sampled_frames = np.linspace(
0, total_frames_num - 1, num_frames_to_sample, dtype=int 0, total_frames_num - 1, num_frames_to_sample, dtype=int
) ).tolist()
frame_idx = uniform_sampled_frames.tolist()
return frame_idx @classmethod
def _prepare_source(cls, source: VideoSourceMetadata) -> VideoSourceMetadata:
"""Sampling-algorithm-specific metadata adjustment hook."""
return source
@classmethod @classmethod
def load_bytes( def load_bytes(
...@@ -395,55 +468,101 @@ class OpenCVVideoBackend(VideoLoader, OpenCVVideoBackendMixin): ...@@ -395,55 +468,101 @@ class OpenCVVideoBackend(VideoLoader, OpenCVVideoBackendMixin):
fps: int = -1, fps: int = -1,
max_duration: int = 300, max_duration: int = 300,
frame_recovery: bool = False, frame_recovery: bool = False,
*,
backend: Literal["opencv", "pyav"] = "opencv",
**kwargs, **kwargs,
) -> tuple[npt.NDArray, dict[str, Any]]: ) -> tuple[npt.NDArray, dict[str, Any]]:
""" """Load sampled frames from raw video bytes.
Load video frames from bytes.
Args: Args:
data: Raw video bytes data: Raw video bytes.
num_frames: Target number of frames to sample (-1 for all) num_frames: Target number of frames to sample (``-1`` for all).
fps: Target FPS for sampling (-1 for original) fps: Target FPS for sampling (``-1`` for original).
max_duration: Maximum duration (unused in base backend) max_duration: Maximum duration in seconds — only used by the
frame_recovery: Enable forward-scan recovery for failed frames dynamic subclass; ignored here.
frame_recovery: Enable forward-scan recovery for failed frames.
Only honored by the OpenCV codec.
backend: Decoding codec — ``"opencv"`` or ``"pyav"`` .
Returns: Returns:
Tuple of (frames_array, metadata_dict) Tuple of ``(frames_array, metadata_dict)``.
""" """
cap = cls.open_video_capture(data)
source = OpenCVVideoBackendMixin.get_video_metadata(cap)
target = VideoTargetMetadata( target = VideoTargetMetadata(
num_frames=num_frames, num_frames=num_frames, fps=fps, max_duration=max_duration
fps=fps,
max_duration=max_duration,
) )
# resample video to target num_frames and fps if backend == "opencv":
# - the minimum of the two will be used cap = cls.open_video_capture(data)
source = cls._prepare_source(cls.get_video_metadata(cap))
frame_idx = cls.compute_frames_index_to_sample( frame_idx = cls.compute_frames_index_to_sample(
source=source, source=source, target=target, **kwargs
target=target,
) )
frames, valid = cls.read_frames(
frames, valid_frame_indices = cls.read_frames(
cap, cap,
frame_idx, frame_idx,
total_frames_num=source.total_frames_num, total_frames_num=source.total_frames_num,
frame_recovery=frame_recovery, frame_recovery=frame_recovery,
) )
elif backend == "pyav":
assert not frame_recovery, (
"frame_recovery is only available for `opencv` backend"
)
with av.open(BytesIO(data)) as container:
source = cls._prepare_source(cls.get_metadata(container))
frame_idx = cls.compute_frames_index_to_sample(
source=source, target=target, **kwargs
)
frames, valid = cls.decode_frames(
container, frame_idx, source.original_fps, source.duration
)
else:
raise ValueError(
f"Unknown video codec backend {backend!r}; "
"valid options: 'opencv', 'pyav'."
)
metadata = cls.create_hf_metadata( if len(valid) < len(frame_idx):
source=source, logger.warning(
video_backend="opencv", "%s video loading: expected %d frames but got %d.",
valid_frame_indices=valid_frame_indices, backend,
len(frame_idx),
len(valid),
) )
return frames, metadata return frames, cls.create_hf_metadata(
source=source,
video_backend=f"{backend}{cls._sampling_suffix}",
valid_frame_indices=valid,
)
@VIDEO_LOADER_REGISTRY.register("opencv_dynamic") @VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
class OpenCVDynamicVideoBackend(VideoLoader, OpenCVVideoBackendMixin): class DynamicVideoBackend(VideoBackend):
"""Duration-aware dynamic-sampling video backend.
Samples at ``fps`` up to ``max_duration`` seconds, falling back to
uniform sampling across the full duration when the video is longer
than ``max_duration``. Codec is selectable the same way as
:class:`VideoBackend`.
"""
_sampling_suffix: ClassVar[str] = "_dynamic"
@classmethod
def _prepare_source(cls, source: VideoSourceMetadata) -> VideoSourceMetadata:
# Estimate duration from frame count and fps when the container
# does not report it (common for WebM/streaming inputs).
if source.duration:
return source
if source.original_fps > 0:
max_frame_idx = source.total_frames_num - 1
duration = round(max_frame_idx / source.original_fps) + 1
else:
duration = 0
return VideoSourceMetadata(
source.total_frames_num, source.original_fps, duration
)
@classmethod @classmethod
def compute_frames_index_to_sample( def compute_frames_index_to_sample(
cls, cls,
...@@ -456,8 +575,8 @@ class OpenCVDynamicVideoBackend(VideoLoader, OpenCVVideoBackendMixin): ...@@ -456,8 +575,8 @@ class OpenCVDynamicVideoBackend(VideoLoader, OpenCVVideoBackendMixin):
original_fps = source.original_fps original_fps = source.original_fps
max_duration = target.max_duration max_duration = target.max_duration
fps = target.fps fps = target.fps
max_frame_idx = source.total_frames_num - 1 max_frame_idx = source.total_frames_num - 1
# Refer to: # Refer to:
# https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140 # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
frame_indices_list: list[int] frame_indices_list: list[int]
...@@ -491,62 +610,20 @@ class OpenCVDynamicVideoBackend(VideoLoader, OpenCVVideoBackendMixin): ...@@ -491,62 +610,20 @@ class OpenCVDynamicVideoBackend(VideoLoader, OpenCVVideoBackendMixin):
fps: int = 2, fps: int = 2,
max_duration: int = 300, max_duration: int = 300,
frame_recovery: bool = False, frame_recovery: bool = False,
*,
backend: Literal["opencv", "pyav"] = "opencv",
**kwargs, **kwargs,
) -> tuple[npt.NDArray, dict[str, Any]]: ) -> tuple[npt.NDArray, dict[str, Any]]:
""" return super().load_bytes(
Load video frames with dynamic sampling based on duration. data,
Args:
data: Raw video bytes
num_frames: Not used in dynamic backend
fps: Target FPS for sampling (default: 2)
max_duration: Maximum video duration to process (default: 300s)
frame_recovery: Enable forward-scan recovery for failed frames
Returns:
Tuple of (frames_array, metadata_dict)
"""
cap = cls.open_video_capture(data)
orig_source = OpenCVVideoBackendMixin.get_video_metadata(cap)
max_frame_idx = orig_source.total_frames_num - 1
duration = (
orig_source.duration or round(max_frame_idx / orig_source.original_fps) + 1
)
# recompute source metadata with adjusted duration to ensure correct
# sampling indices computation
source = VideoSourceMetadata(
total_frames_num=orig_source.total_frames_num,
original_fps=orig_source.original_fps,
duration=duration,
)
target = VideoTargetMetadata(
num_frames=num_frames, num_frames=num_frames,
fps=fps, fps=fps,
max_duration=max_duration, max_duration=max_duration,
)
frame_indices_list = cls.compute_frames_index_to_sample(
source=source,
target=target,
)
frames, valid_frame_indices = cls.read_frames(
cap,
frame_indices_list,
total_frames_num=source.total_frames_num,
frame_recovery=frame_recovery, frame_recovery=frame_recovery,
backend=backend,
**kwargs,
) )
metadata = cls.create_hf_metadata(
source=source,
video_backend="opencv_dynamic",
valid_frame_indices=valid_frame_indices,
)
return frames, metadata
@VIDEO_LOADER_REGISTRY.register("molmo2") @VIDEO_LOADER_REGISTRY.register("molmo2")
class Molmo2VideoBackend(VideoLoader, OpenCVVideoBackendMixin): class Molmo2VideoBackend(VideoLoader, OpenCVVideoBackendMixin):
...@@ -835,7 +912,7 @@ class Molmo2VideoBackend(VideoLoader, OpenCVVideoBackendMixin): ...@@ -835,7 +912,7 @@ class Molmo2VideoBackend(VideoLoader, OpenCVVideoBackendMixin):
@VIDEO_LOADER_REGISTRY.register("nemotron_vl") @VIDEO_LOADER_REGISTRY.register("nemotron_vl")
class NemotronVLVideoBackend(OpenCVVideoBackend): class NemotronVLVideoBackend(VideoBackend):
@classmethod @classmethod
def load_bytes( def load_bytes(
cls, cls,
...@@ -844,14 +921,17 @@ class NemotronVLVideoBackend(OpenCVVideoBackend): ...@@ -844,14 +921,17 @@ class NemotronVLVideoBackend(OpenCVVideoBackend):
fps: int = -1, fps: int = -1,
max_duration: int = 300, max_duration: int = 300,
frame_recovery: bool = False, frame_recovery: bool = False,
*,
backend: Literal["opencv", "pyav"] = "opencv",
**kwargs, **kwargs,
) -> tuple[npt.NDArray, dict[str, Any]]: ) -> tuple[npt.NDArray, dict[str, Any]]:
frames, metadata = OpenCVVideoBackend.load_bytes( frames, metadata = super().load_bytes(
data, data,
num_frames=num_frames, num_frames=num_frames,
fps=fps, fps=fps,
max_duration=max_duration, max_duration=max_duration,
frame_recovery=frame_recovery, frame_recovery=frame_recovery,
backend=backend,
**kwargs, **kwargs,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment