Unverified Commit d5379656 authored by Bruno Korbar's avatar Bruno Korbar Committed by GitHub
Browse files

[documentation] video API documentation and wrapper (#2778)



* initial API documentation attempt

* test the docs

* initial commit

* updating test to match the registration

* adding the warning on unsucessful import

* Try to do conditional import

* Simple fix?

* clearing up docs

* docstring commit

* Adding types in arguments
Co-authored-by: default avatarFrancisco Massa <fvsmassa@gmail.com>

* reverting warning commit

* addressing Francisco's comments

* Apply suggestions from code review
Co-authored-by: default avatarFrancisco Massa <fvsmassa@gmail.com>

* Revert "reverting warning commit"

This reverts commit bd1a3dd4f3b97709ab59c744962e11174757f8ce.

* Revert "adding the warning on unsucessful import"

This reverts commit afef7df9eaa73bf80246e6d9114cb4c30b16f0ce.

* remove warnings import
Co-authored-by: default avatarFrancisco Massa <fvsmassa@gmail.com>
parent b2171653
...@@ -17,6 +17,40 @@ Video ...@@ -17,6 +17,40 @@ Video
.. autofunction:: write_video .. autofunction:: write_video
Fine-grained video API
----------------------
In addition to the :func:`read_video` function, we provide a high-performance
lower-level API for more fine-grained control compared to the :func:`read_video` function.
It does all this whilst fully supporting torchscript.
.. autoclass:: Video
:members: next, get_metadata, set_current_stream, seek
Example of usage:
.. code:: python
import torchvision
video_path = "path to a test video"
# Constructor allocates memory and a threaded decoder
    # instance per video. At the moment it takes two arguments:
# path to the video file, and a wanted stream.
reader = torchvision.io.Video(video_path, "video")
# The information about the video can be retrieved using the
# `get_metadata()` method. It returns a dictionary for every stream, with
# duration and other relevant metadata (often frame rate)
reader_md = reader.get_metadata()
# metadata is structured as a dict of dicts with following structure
# {"stream_type": {"attribute": [attribute per stream]}}
#
# following would print out the list of frame rates for every present video stream
print(reader_md["video"]["fps"])
Image Image
----- -----
......
...@@ -10,7 +10,7 @@ import numpy as np ...@@ -10,7 +10,7 @@ import numpy as np
import torch import torch
import torchvision import torchvision
from torchvision.io import _HAS_VIDEO_OPT from torchvision.io import _HAS_VIDEO_OPT, Video
try: try:
import av import av
...@@ -289,7 +289,7 @@ class TestVideo(unittest.TestCase): ...@@ -289,7 +289,7 @@ class TestVideo(unittest.TestCase):
tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec") tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
tv_result = tv_result.permute(0, 3, 1, 2) tv_result = tv_result.permute(0, 3, 1, 2)
# pass 2: decode all frames using new api # pass 2: decode all frames using new api
reader = torch.classes.torchvision.Video(full_path, "video") reader = Video(full_path, "video")
frames = [] frames = []
t, _ = reader.next() t, _ = reader.next()
while t.numel() > 0: while t.numel() > 0:
...@@ -310,7 +310,7 @@ class TestVideo(unittest.TestCase): ...@@ -310,7 +310,7 @@ class TestVideo(unittest.TestCase):
# s = min(r) # s = min(r)
# e = max(r) # e = max(r)
# reader = torch.classes.torchvision.Video(full_path, "video") # reader = Video(full_path, "video")
# results = _template_read_video(reader, s, e) # results = _template_read_video(reader, s, e)
# tv_video, tv_audio, info = torchvision.io.read_video( # tv_video, tv_audio, info = torchvision.io.read_video(
# full_path, start_pts=s, end_pts=e, pts_unit="sec" # full_path, start_pts=s, end_pts=e, pts_unit="sec"
...@@ -329,7 +329,7 @@ class TestVideo(unittest.TestCase): ...@@ -329,7 +329,7 @@ class TestVideo(unittest.TestCase):
# full_path, pts_unit="sec" # full_path, pts_unit="sec"
# ) # )
# # pass 2: decode all frames using new api # # pass 2: decode all frames using new api
# reader = torch.classes.torchvision.Video(full_path, "video") # reader = Video(full_path, "video")
# pts = [] # pts = []
# t, p = reader.next() # t, p = reader.next()
# while t.numel() > 0: # while t.numel() > 0:
...@@ -353,7 +353,7 @@ class TestVideo(unittest.TestCase): ...@@ -353,7 +353,7 @@ class TestVideo(unittest.TestCase):
torchvision.set_video_backend("pyav") torchvision.set_video_backend("pyav")
for test_video, config in test_videos.items(): for test_video, config in test_videos.items():
full_path = os.path.join(VIDEO_DIR, test_video) full_path = os.path.join(VIDEO_DIR, test_video)
reader = torch.classes.torchvision.Video(full_path, "video") reader = Video(full_path, "video")
reader_md = reader.get_metadata() reader_md = reader.get_metadata()
self.assertAlmostEqual( self.assertAlmostEqual(
config.video_fps, reader_md["video"]["fps"][0], delta=0.0001 config.video_fps, reader_md["video"]["fps"][0], delta=0.0001
...@@ -372,7 +372,7 @@ class TestVideo(unittest.TestCase): ...@@ -372,7 +372,7 @@ class TestVideo(unittest.TestCase):
ref_result = _decode_frames_by_av_module(full_path) ref_result = _decode_frames_by_av_module(full_path)
reader = torch.classes.torchvision.Video(full_path, "video") reader = Video(full_path, "video")
newapi_result = _template_read_video(reader) newapi_result = _template_read_video(reader)
# First we check if the frames are approximately the same # First we check if the frames are approximately the same
......
import torch
from ._video_opt import ( from ._video_opt import (
Timebase, Timebase,
VideoMetaData, VideoMetaData,
...@@ -20,10 +22,94 @@ from .image import ( ...@@ -20,10 +22,94 @@ from .image import (
encode_jpeg, encode_jpeg,
write_jpeg, write_jpeg,
encode_png, encode_png,
write_png write_png,
) )
if _HAS_VIDEO_OPT:

    class Video:
        """
        Fine-grained video-reading API.

        Supports frame-by-frame reading of various streams from a single video
        container.

        This is a thin Python wrapper delegating every call to the
        torchscript-registered ``torch.classes.torchvision.Video`` custom class.

        Args:
            path (string): Path to the video file in supported format
            stream (string, optional): descriptor of the required stream. Defaults to "video:0"
                Currently available options include ``['video', 'audio', 'cc', 'sub']``

        Example:
            The following example creates a :class:`Video` object, seeks into the 2s
            point, and returns a single frame::

                import torchvision
                video_path = "path_to_a_test_video"

                reader = torchvision.io.Video(video_path, "video")
                reader.seek(2.0)
                frame, timestamp = reader.next()
        """

        def __init__(self, path: str, stream: str = "video"):
            # The C++ class allocates memory and a threaded decoder instance
            # per video; all methods below delegate to it.
            self._c = torch.classes.torchvision.Video(path, stream)

        def next(self):
            """Iterator that decodes the next frame of the current stream

            Returns:
                ([torch.Tensor, float]): list containing decoded frame and corresponding timestamp
            """
            return self._c.next()

        def seek(self, time_s: float) -> None:
            """Seek within current stream.

            Args:
                time_s (float): seek time in seconds

            .. note::
                Current implementation is the so-called precise seek. This
                means following seek, call to :func:`next` will return the
                frame with the exact timestamp if it exists or
                the first frame with timestamp larger than ``time_s``.
            """
            self._c.seek(time_s)

        def get_metadata(self):
            """Returns video metadata

            Returns:
                (dict): dictionary containing duration and frame rate for every stream
            """
            return self._c.get_metadata()

        def set_current_stream(self, stream: str):
            """Set current stream.

            Explicitly define the stream we are operating on.

            Args:
                stream (string): descriptor of the required stream. Defaults to "video:0"
                    Currently available stream types include ``['video', 'audio', 'cc', 'sub']``.
                    Each descriptor consists of two parts: stream type (e.g. 'video') and
                    a unique stream id (which are determined by video encoding).
                    In this way, if the video container contains multiple
                    streams of the same type, users can access the one they want.
                    If only stream type is passed, the decoder auto-detects first stream
                    of that type and returns it.

            Returns:
                (bool): True on success, False otherwise
            """
            return self._c.set_current_stream(stream)


else:
    # Video ops were not compiled into this build; expose a None placeholder
    # so `from torchvision.io import Video` still succeeds.
    Video = None
__all__ = [ __all__ = [
"write_video", "write_video",
"read_video", "read_video",
...@@ -39,10 +125,11 @@ __all__ = [ ...@@ -39,10 +125,11 @@ __all__ = [
"_read_video_meta_data", "_read_video_meta_data",
"VideoMetaData", "VideoMetaData",
"Timebase", "Timebase",
'read_image', "read_image",
'decode_image', "decode_image",
'encode_jpeg', "encode_jpeg",
'write_jpeg', "write_jpeg",
'encode_png', "encode_png",
'write_png', "write_png",
"Video",
] ]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment