Pyav backend for VideoReader API (#6598)

* Test: add backend parameter * VideoReader object now works on backend * Frame reading now passes * Keyframe seek now passes * Pyav backend now supports metadata * changes in test to reflect GPU decoder change * Linter? * Test GPU output * Addressing Joao's comments * lint * lint * Revert "Test GPU output" This reverts commit f62e955d7dc81bcb23b40d58ea75413b9b62e76d. * lint? * lint * lint * Address issues in build? * hopefully doc fix * Arrgh * arrgh * fix typos * fix input options * remove read from memory option in pyav * skip read from mem test for gpu and pyab be * fix test * remove unused import * Hack to get reading from memory work with pyav * patch audio test Co-authored-by: Bruno Korbar <bkorbar@quansight.com> Co-authored-by: Joao Gomes <jdsgomes@fb.com>

Pyav backend for VideoReader API (#6598)
* Test: add backend parameter * VideoReader object now works on backend * Frame reading now passes * Keyframe seek now passes * Pyav backend now supports metadata * changes in test to reflect GPU decoder change * Linter? * Test GPU output * Addressing Joao's comments * lint * lint * Revert "Test GPU output" This reverts commit f62e955d7dc81bcb23b40d58ea75413b9b62e76d. * lint? * lint * lint * Address issues in build? * hopefully doc fix * Arrgh * arrgh * fix typos * fix input options * remove read from memory option in pyav * skip read from mem test for gpu and pyab be * fix test * remove unused import * Hack to get reading from memory work with pyav * patch audio test Co-authored-by: Bruno Korbar <bkorbar@quansight.com> Co-authored-by: Joao Gomes <jdsgomes@fb.com>
2e833520 · Bruno Korbar · GitHub · 70faba91 · 2e833520 · 2e833520
Unverified Commit 2e833520 authored Oct 31, 2022 by Bruno Korbar Committed by GitHub Oct 31, 2022
6 changed files
--- a/test/test_video_gpu_decoder.py
+++ b/test/test_video_gpu_decoder.py
@@ -3,7 +3,9 @@ import os
 import pytest
 import torch
-from torchvision.io import _HAS_GPU_VIDEO_DECODER, VideoReader
+import torchvision
+from torchvision import _HAS_GPU_VIDEO_DECODER
+from torchvision.io import VideoReader
 try:
    import av
@@ -29,8 +31,9 @@ class TestVideoGPUDecoder:
        ],
    )
    def test_frame_reading(self, video_file):
+        torchvision.set_video_backend("cuda")
        full_path = os.path.join(VIDEO_DIR, video_file)
-        decoder = VideoReader(full_path, device="cuda")
+        decoder = VideoReader(full_path)
        with av.open(full_path) as container:
            for av_frame in container.decode(container.streams.video[0]):
                av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray())
@@ -54,7 +57,8 @@ class TestVideoGPUDecoder:
        ],
    )
    def test_seek_reading(self, keyframes, full_path, duration):
-        decoder = VideoReader(full_path, device="cuda")
+        torchvision.set_video_backend("cuda")
+        decoder = VideoReader(full_path)
        time = duration / 2
        decoder.seek(time, keyframes_only=keyframes)
        with av.open(full_path) as container:
@@ -79,8 +83,9 @@ class TestVideoGPUDecoder:
        ],
    )
    def test_metadata(self, video_file):
+        torchvision.set_video_backend("cuda")
        full_path = os.path.join(VIDEO_DIR, video_file)
-        decoder = VideoReader(full_path, device="cuda")
+        decoder = VideoReader(full_path)
        video_metadata = decoder.get_metadata()["video"]
        with av.open(full_path) as container:
            video = container.streams.video[0]

--- a/test/test_videoapi.py
+++ b/test/test_videoapi.py
@@ -53,7 +53,9 @@ test_videos = {
 class TestVideoApi:
    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_frame_reading(self, test_video):
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    def test_frame_reading(self, test_video, backend):
+        torchvision.set_video_backend(backend)
        full_path = os.path.join(VIDEO_DIR, test_video)
        with av.open(full_path) as av_reader:
            if av_reader.streams.video:
@@ -117,50 +119,60 @@ class TestVideoApi:
    @pytest.mark.parametrize("stream", ["video", "audio"])
    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_frame_reading_mem_vs_file(self, test_video, stream):
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    def test_frame_reading_mem_vs_file(self, test_video, stream, backend):
+        torchvision.set_video_backend(backend)
        full_path = os.path.join(VIDEO_DIR, test_video)
-        # Test video reading from file vs from memory
+        reader = VideoReader(full_path)
-        vr_frames, vr_frames_mem = [], []
+        reader_md = reader.get_metadata()
-        vr_pts, vr_pts_mem = [], []
-        # get vr frames
+        if stream in reader_md:
-        video_reader = VideoReader(full_path, stream)
+            # Test video reading from file vs from memory
-        for vr_frame in video_reader:
+            vr_frames, vr_frames_mem = [], []
-            vr_frames.append(vr_frame["data"])
+            vr_pts, vr_pts_mem = [], []
-            vr_pts.append(vr_frame["pts"])
+            # get vr frames
+            video_reader = VideoReader(full_path, stream)
-        # get vr frames = read from memory
+            for vr_frame in video_reader:
-        f = open(full_path, "rb")
+                vr_frames.append(vr_frame["data"])
-        fbytes = f.read()
+                vr_pts.append(vr_frame["pts"])
-        f.close()
-        video_reader_from_mem = VideoReader(fbytes, stream)
+            # get vr frames = read from memory
+            f = open(full_path, "rb")
-        for vr_frame_from_mem in video_reader_from_mem:
+            fbytes = f.read()
-            vr_frames_mem.append(vr_frame_from_mem["data"])
+            f.close()
-            vr_pts_mem.append(vr_frame_from_mem["pts"])
+            video_reader_from_mem = VideoReader(fbytes, stream)
-        # same number of frames
+            for vr_frame_from_mem in video_reader_from_mem:
-        assert len(vr_frames) == len(vr_frames_mem)
+                vr_frames_mem.append(vr_frame_from_mem["data"])
-        assert len(vr_pts) == len(vr_pts_mem)
+                vr_pts_mem.append(vr_frame_from_mem["pts"])
-        # compare the frames and ptss
+            # same number of frames
-        for i in range(len(vr_frames)):
+            assert len(vr_frames) == len(vr_frames_mem)
-            assert vr_pts[i] == vr_pts_mem[i]
+            assert len(vr_pts) == len(vr_pts_mem)
-            mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
-            # on average the difference is very small and caused
+            # compare the frames and ptss
-            # by decoding (around 1%)
+            for i in range(len(vr_frames)):
-            # TODO: asses empirically how to set this? atm it's 1%
+                assert vr_pts[i] == vr_pts_mem[i]
-            # averaged over all frames
+                mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
-            assert mean_delta.item() < 2.55
+                # on average the difference is very small and caused
+                # by decoding (around 1%)
-        del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
+                # TODO: asses empirically how to set this? atm it's 1%
+                # averaged over all frames
+                assert mean_delta.item() < 2.55
+            del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
+        else:
+            del reader, reader_md
    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_metadata(self, test_video, config):
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    def test_metadata(self, test_video, config, backend):
        """
        Test that the metadata returned via pyav corresponds to the one returned
        by the new video decoder API
        """
+        torchvision.set_video_backend(backend)
        full_path = os.path.join(VIDEO_DIR, test_video)
        reader = VideoReader(full_path, "video")
        reader_md = reader.get_metadata()
@@ -168,7 +180,9 @@ class TestVideoApi:
        assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5)
    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_seek_start(self, test_video):
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    def test_seek_start(self, test_video, backend):
+        torchvision.set_video_backend(backend)
        full_path = os.path.join(VIDEO_DIR, test_video)
        video_reader = VideoReader(full_path, "video")
        num_frames = 0
@@ -194,7 +208,9 @@ class TestVideoApi:
        assert start_num_frames == num_frames
    @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_accurateseek_middle(self, test_video):
+    @pytest.mark.parametrize("backend", ["video_reader"])
+    def test_accurateseek_middle(self, test_video, backend):
+        torchvision.set_video_backend(backend)
        full_path = os.path.join(VIDEO_DIR, test_video)
        stream = "video"
        video_reader = VideoReader(full_path, stream)
@@ -233,7 +249,9 @@ class TestVideoApi:
    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_keyframe_reading(self, test_video, config):
+    @pytest.mark.parametrize("backend", ["pyav", "video_reader"])
+    def test_keyframe_reading(self, test_video, config, backend):
+        torchvision.set_video_backend(backend)
        full_path = os.path.join(VIDEO_DIR, test_video)
        av_reader = av.open(full_path)

--- a/torchvision/__init__.py
+++ b/torchvision/__init__.py
 import os
 import warnings
+from modulefinder import Module
 import torch
 from torchvision import datasets, io, models, ops, transforms, utils
-from .extension import _HAS_OPS
+from .extension import _HAS_OPS, _load_library
 try:
    from .version import __version__  # noqa: F401
 except ImportError:
    pass
+try:
+    _load_library("Decoder")
+    _HAS_GPU_VIDEO_DECODER = True
+except (ImportError, OSError, ModuleNotFoundError):
+    _HAS_GPU_VIDEO_DECODER = False
 # Check if torchvision is being imported within the root folder
 if not _HAS_OPS and os.path.dirname(os.path.realpath(__file__)) == os.path.join(
    os.path.realpath(os.getcwd()), "torchvision"
@@ -66,11 +74,16 @@ def set_video_backend(backend):
        backend, please compile torchvision from source.
    """
    global _video_backend
-    if backend not in ["pyav", "video_reader"]:
+    if backend not in ["pyav", "video_reader", "cuda"]:
-        raise ValueError("Invalid video backend '%s'. Options are 'pyav' and 'video_reader'" % backend)
+        raise ValueError("Invalid video backend '%s'. Options are 'pyav', 'video_reader' and 'cuda'" % backend)
    if backend == "video_reader" and not io._HAS_VIDEO_OPT:
+        # TODO: better messages
        message = "video_reader video backend is not available. Please compile torchvision from source and try again"
-        warnings.warn(message)
+        raise RuntimeError(message)
+    elif backend == "cuda" and not _HAS_GPU_VIDEO_DECODER:
+        # TODO: better messages
+        message = "cuda video backend is not available."
+        raise RuntimeError(message)
    else:
        _video_backend = backend

--- a/torchvision/io/__init__.py
+++ b/torchvision/io/__init__.py
@@ -4,10 +4,6 @@ import torch
 from ..utils import _log_api_usage_once
-try:
-    from ._load_gpu_decoder import _HAS_GPU_VIDEO_DECODER
-except ModuleNotFoundError:
-    _HAS_GPU_VIDEO_DECODER = False
 from ._video_opt import (
    _HAS_VIDEO_OPT,
    _probe_video_from_file,
@@ -47,7 +43,6 @@ __all__ = [
    "_read_video_timestamps_from_memory",
    "_probe_video_from_memory",
    "_HAS_VIDEO_OPT",
-    "_HAS_GPU_VIDEO_DECODER",
    "_read_video_clip_from_memory",
    "_read_video_meta_data",
    "VideoMetaData",

--- a/torchvision/io/_load_gpu_decoder.py
+++ b/torchvision/io/_load_gpu_decoder.py
-from ..extension import _load_library
-try:
-    _load_library("Decoder")
-    _HAS_GPU_VIDEO_DECODER = True
-except (ImportError, OSError):
-    _HAS_GPU_VIDEO_DECODER = False
--- a/torchvision/io/video_reader.py
+++ b/torchvision/io/video_reader.py
+import io
 import warnings
 from typing import Any, Dict, Iterator, Optional
 import torch
 from ..utils import _log_api_usage_once
-try:
-    from ._load_gpu_decoder import _HAS_GPU_VIDEO_DECODER
-except ModuleNotFoundError:
-    _HAS_GPU_VIDEO_DECODER = False
 from ._video_opt import _HAS_VIDEO_OPT
 if _HAS_VIDEO_OPT:
@@ -22,11 +20,37 @@ else:
        return False
+try:
+    import av
+    av.logging.set_level(av.logging.ERROR)
+    if not hasattr(av.video.frame.VideoFrame, "pict_type"):
+        av = ImportError(
+            """\
+Your version of PyAV is too old for the necessary video operations in torchvision.
+If you are on Python 3.5, you will have to build from source (the conda-forge
+packages are not up-to-date).  See
+https://github.com/mikeboers/PyAV#installation for instructions on how to
+install PyAV on your system.
+"""
+        )
+except ImportError:
+    av = ImportError(
+        """\
+PyAV is not installed, and is necessary for the video operations in torchvision.
+See https://github.com/mikeboers/PyAV#installation for instructions on how to
+install PyAV on your system.
+"""
+    )
 class VideoReader:
    """
    Fine-grained video-reading API.
    Supports frame-by-frame reading of various streams from a single video
-    container.
+    container. Much like previous video_reader API it supports the following
+    backends: video_reader, pyav, and cuda.
+    Backends can be set via `torchvision.set_video_backend` function.
    .. betastatus:: VideoReader class
@@ -88,16 +112,11 @@ class VideoReader:
            Default value (0) enables multithreading with codec-dependent heuristic. The performance
            will depend on the version of FFMPEG codecs supported.
-        device (str, optional): Device to be used for decoding. Defaults to ``"cpu"``.
-            To use GPU decoding, pass ``device="cuda"``.
        path (str, optional):
            .. warning:
                This parameter was deprecated in ``0.15`` and will be removed in ``0.17``.
                Please use ``src`` instead.
    """
    def __init__(
@@ -105,45 +124,59 @@ class VideoReader:
        src: str = "",
        stream: str = "video",
        num_threads: int = 0,
-        device: str = "cpu",
        path: Optional[str] = None,
    ) -> None:
        _log_api_usage_once(self)
-        self.is_cuda = False
+        from .. import get_video_backend
-        device = torch.device(device)
-        if device.type == "cuda":
-            if not _HAS_GPU_VIDEO_DECODER:
-                raise RuntimeError("Not compiled with GPU decoder support.")
-            self.is_cuda = True
-            self._c = torch.classes.torchvision.GPUDecoder(src, device)
-            return
-        if not _has_video_opt():
-            raise RuntimeError(
-                "Not compiled with video_reader support, "
-                + "to enable video_reader support, please install "
-                + "ffmpeg (version 4.2 is currently supported) and "
-                + "build torchvision from source."
-            )
-        if src == "":
-            if path is None:
-                raise TypeError("src cannot be empty")
-            src = path
-            warnings.warn("path is deprecated and will be removed in 0.17. Please use src instead")
-        elif isinstance(src, bytes):
-            src = torch.frombuffer(src, dtype=torch.uint8)
+        self.backend = get_video_backend()
        if isinstance(src, str):
-            self._c = torch.classes.torchvision.Video(src, stream, num_threads)
+            if src == "":
+                if path is None:
+                    raise TypeError("src cannot be empty")
+                src = path
+                warnings.warn("path is deprecated and will be removed in 0.17. Please use src instead")
+        elif isinstance(src, bytes):
+            if self.backend in ["cuda"]:
+                raise RuntimeError(
+                    "VideoReader cannot be initialized from bytes object when using cuda or pyav backend."
+                )
+            elif self.backend == "pyav":
+                src = io.BytesIO(src)
+            else:
+                src = torch.frombuffer(src, dtype=torch.uint8)
        elif isinstance(src, torch.Tensor):
-            if self.is_cuda:
+            if self.backend in ["cuda", "pyav"]:
-                raise RuntimeError("GPU VideoReader cannot be initialized from Tensor or bytes object.")
+                raise RuntimeError(
-            self._c = torch.classes.torchvision.Video("", "", 0)
+                    "VideoReader cannot be initialized from Tensor object when using cuda or pyav backend."
-            self._c.init_from_memory(src, stream, num_threads)
+                )
        else:
            raise TypeError("`src` must be either string, Tensor or bytes object.")
+        if self.backend == "cuda":
+            device = torch.device("cuda")
+            self._c = torch.classes.torchvision.GPUDecoder(src, device)
+        elif self.backend == "video_reader":
+            if isinstance(src, str):
+                self._c = torch.classes.torchvision.Video(src, stream, num_threads)
+            elif isinstance(src, torch.Tensor):
+                self._c = torch.classes.torchvision.Video("", "", 0)
+                self._c.init_from_memory(src, stream, num_threads)
+        elif self.backend == "pyav":
+            self.container = av.open(src, metadata_errors="ignore")
+            # TODO: load metadata
+            stream_type = stream.split(":")[0]
+            stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
+            self.pyav_stream = {stream_type: stream_id}
+            self._c = self.container.decode(**self.pyav_stream)
+            # TODO: add extradata exception
+        else:
+            raise RuntimeError("Unknown video backend: {}".format(self.backend))
    def __next__(self) -> Dict[str, Any]:
        """Decodes and returns the next frame of the current stream.
        Frames are encoded as a dict with mandatory
@@ -156,14 +189,29 @@ class VideoReader:
            and corresponding timestamp (``pts``) in seconds
        """
-        if self.is_cuda:
+        if self.backend == "cuda":
            frame = self._c.next()
            if frame.numel() == 0:
                raise StopIteration
-            return {"data": frame}
+            return {"data": frame, "pts": None}
-        frame, pts = self._c.next()
+        elif self.backend == "video_reader":
+            frame, pts = self._c.next()
+        else:
+            try:
+                frame = next(self._c)
+                pts = float(frame.pts * frame.time_base)
+                if "video" in self.pyav_stream:
+                    frame = torch.tensor(frame.to_rgb().to_ndarray()).permute(2, 0, 1)
+                elif "audio" in self.pyav_stream:
+                    frame = torch.tensor(frame.to_ndarray()).permute(1, 0)
+                else:
+                    frame = None
+            except av.error.EOFError:
+                raise StopIteration
        if frame.numel() == 0:
            raise StopIteration
        return {"data": frame, "pts": pts}
    def __iter__(self) -> Iterator[Dict[str, Any]]:
@@ -182,7 +230,18 @@ class VideoReader:
            frame with the exact timestamp if it exists or
            the first frame with timestamp larger than ``time_s``.
        """
-        self._c.seek(time_s, keyframes_only)
+        if self.backend in ["cuda", "video_reader"]:
+            self._c.seek(time_s, keyframes_only)
+        else:
+            # handle special case as pyav doesn't catch it
+            if time_s < 0:
+                time_s = 0
+            temp_str = self.container.streams.get(**self.pyav_stream)[0]
+            offset = int(round(time_s / temp_str.time_base))
+            if not keyframes_only:
+                warnings.warn("Accurate seek is not implemented for pyav backend")
+            self.container.seek(offset, backward=True, any_frame=False, stream=temp_str)
+            self._c = self.container.decode(**self.pyav_stream)
        return self
    def get_metadata(self) -> Dict[str, Any]:
@@ -191,6 +250,21 @@ class VideoReader:
        Returns:
            (dict): dictionary containing duration and frame rate for every stream
        """
+        if self.backend == "pyav":
+            metadata = {}  # type:  Dict[str, Any]
+            for stream in self.container.streams:
+                if stream.type not in metadata:
+                    if stream.type == "video":
+                        rate_n = "fps"
+                    else:
+                        rate_n = "framerate"
+                    metadata[stream.type] = {rate_n: [], "duration": []}
+                rate = stream.average_rate if stream.average_rate is not None else stream.sample_rate
+                metadata[stream.type]["duration"].append(float(stream.duration * stream.time_base))
+                metadata[stream.type][rate_n].append(float(rate))
+            return metadata
        return self._c.get_metadata()
    def set_current_stream(self, stream: str) -> bool:
@@ -210,6 +284,12 @@ class VideoReader:
        Returns:
            (bool): True on succes, False otherwise
        """
-        if self.is_cuda:
+        if self.backend == "cuda":
-            print("GPU decoding only works with video stream.")
+            warnings.warn("GPU decoding only works with video stream.")
+        if self.backend == "pyav":
+            stream_type = stream.split(":")[0]
+            stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
+            self.pyav_stream = {stream_type: stream_id}
+            self._c = self.container.decode(**self.pyav_stream)
+            return True
        return self._c.set_current_stream(stream)