Unverified Commit 2831f11a authored by Bruno Korbar, committed by GitHub

VideoAPI docs update (#2802)



* Video reader now returns dicts

* docs update

* Minor improvements
Co-authored-by: Bruno Korbar <bjuncek@Frazz.local>

Co-authored-by: Francisco Massa <fvsmassa@gmail.com>
parent b8e93084
@@ -25,10 +25,10 @@ lower-level API for more fine-grained control compared to the :mod:`read_video`

It does all this whilst fully supporting torchscript.

.. autoclass:: VideoReader
    :members: __next__, get_metadata, set_current_stream, seek

Example of inspecting a video:

.. code:: python
@@ -50,6 +50,11 @@ Example of usage:

    # following would print out the list of frame rates for every present video stream
    print(reader_md["video"]["fps"])
    # we explicitly select the stream we would like to operate on. In
    # the constructor we select a default video stream, but
    # in practice, we can set whichever stream we would like
    reader.set_current_stream("video:0")
Image
-----

...
@@ -244,11 +244,11 @@ def _template_read_video(video_object, s=0, e=None):
    video_frames = torch.empty(0)
    frames = []
    video_pts = []
    for frame in itertools.takewhile(lambda x: x['pts'] <= e, video_object):
        if frame['pts'] < s:
            continue
        frames.append(frame['data'])
        video_pts.append(frame['pts'])
    if len(frames) > 0:
        video_frames = torch.stack(frames, 0)
@@ -257,11 +257,11 @@ def _template_read_video(video_object, s=0, e=None):
    audio_frames = torch.empty(0)
    frames = []
    audio_pts = []
    for frame in itertools.takewhile(lambda x: x['pts'] <= e, video_object):
        if frame['pts'] < s:
            continue
        frames.append(frame['data'])
        audio_pts.append(frame['pts'])
    if len(frames) > 0:
        audio_frames = torch.stack(frames, 0)
@@ -293,8 +293,8 @@ class TestVideo(unittest.TestCase):
        # pass 2: decode all frames using new api
        reader = VideoReader(full_path, "video")
        frames = []
        for frame in reader:
            frames.append(frame['data'])
        new_api = torch.stack(frames, 0)
        self.assertEqual(tv_result.size(), new_api.size())

...
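The dict-based iteration that the updated test and `_template_read_video` rely on can be sketched without torchvision. `FakeReader` below is a hypothetical stand-in that yields `{'data': ..., 'pts': ...}` dicts the way the new `VideoReader` does, and `collect` mirrors the `takewhile` filtering above (with plain lists in place of `torch.stack`):

```python
import itertools

class FakeReader:
    """Hypothetical stand-in for VideoReader: yields dicts with 'data' and 'pts'."""
    def __init__(self, frames):
        # frames: list of (data, pts) pairs, sorted by pts
        self._frames = frames
        self._i = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self._i >= len(self._frames):
            raise StopIteration
        data, pts = self._frames[self._i]
        self._i += 1
        return {"data": data, "pts": pts}

def collect(reader, s=0, e=float("inf")):
    # same filtering logic as _template_read_video, minus the tensor stacking
    frames, pts = [], []
    for frame in itertools.takewhile(lambda x: x["pts"] <= e, reader):
        if frame["pts"] < s:
            continue
        frames.append(frame["data"])
        pts.append(frame["pts"])
    return frames, pts

reader = FakeReader([("f0", 0.0), ("f1", 0.5), ("f2", 1.0), ("f3", 1.5)])
data, pts = collect(reader, s=0.5, e=1.0)
print(data)  # ['f1', 'f2'] -- frames before s are skipped, takewhile stops past e
```

The same `collect` loop works unchanged against a real `VideoReader`, since both expose the iterator protocol and the same dict fields.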
@@ -41,21 +41,48 @@ class VideoReader:
    container.

    Example:
        The following example creates a :mod:`VideoReader` object, seeks into the
        2-second point, and returns a single frame::

            import torchvision
            video_path = "path_to_a_test_video"
            reader = torchvision.io.VideoReader(video_path, "video")
            reader.seek(2.0)
            frame = next(reader)
    :mod:`VideoReader` implements the iterable API, which makes it well suited
    for use in conjunction with :mod:`itertools` for more advanced reading.
    As such, we can use a :mod:`VideoReader` instance inside for loops::

        reader.seek(2)
        for frame in reader:
            frames.append(frame['data'])

        # additionally, `seek` implements a fluent API, so we can do
        for frame in reader.seek(2):
            frames.append(frame['data'])
    With :mod:`itertools`, we can read all frames between 2 and 5 seconds with the
    following code::

        for frame in itertools.takewhile(lambda x: x['pts'] <= 5, reader.seek(2)):
            frames.append(frame['data'])

    and similarly, reading 10 frames after the 2s timestamp can be achieved
    as follows::

        for frame in itertools.islice(reader.seek(2), 10):
            frames.append(frame['data'])
    .. note::

        Each stream descriptor consists of two parts: stream type (e.g. 'video') and
        a unique stream id (which are determined by the video encoding).
        In this way, if the video container contains multiple
        streams of the same type, users can access the one they want.
        If only the stream type is passed, the decoder auto-detects the first
        stream of that type.
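The descriptor format in the note can be made concrete with a small parsing sketch. `parse_stream` is a hypothetical helper written for illustration, not part of the torchvision API; the fallback to id 0 models the auto-detection described above:

```python
def parse_stream(descriptor, default_id=0):
    """Split a '{stream_type}:{stream_id}' descriptor into its two parts.

    If the id is omitted (e.g. just 'audio'), fall back to a default id,
    mirroring the decoder's auto-detection of the first stream of that type.
    """
    stream_type, sep, stream_id = descriptor.partition(":")
    return stream_type, int(stream_id) if sep else default_id

print(parse_stream("video:0"))  # ('video', 0)
print(parse_stream("audio"))    # ('audio', 0) -- id omitted, default used
```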
    Args:
        path (string): Path to the video file in supported format

        stream (string, optional): descriptor of the required stream, in the format
            ``{stream_type}:{stream_id}``. Defaults to ``"video:0"``.
            Currently available stream types include ``['video', 'audio']``.
    """

    def __init__(self, path, stream="video"):
@@ -67,13 +94,14 @@ class VideoReader:
        """Decodes and returns the next frame of the current stream

        Returns:
            (dict): a dictionary with fields ``data`` and ``pts``
                containing the decoded frame and its corresponding timestamp
        """
        frame, pts = self._c.next()
        if frame.numel() == 0:
            raise StopIteration
        return {"data": frame, "pts": pts}

    def __iter__(self):
        return self
@@ -88,7 +116,7 @@ class VideoReader:
        The current implementation is the so-called precise seek. This
        means that, following a seek, a call to :mod:`next()` will return the
        frame with the exact timestamp if it exists, or
        the first frame with a timestamp larger than ``time_s``.
        """
        self._c.seek(time_s)
        return self
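The precise-seek behaviour described in the docstring (return the frame at exactly ``time_s`` if one exists, else the first frame after it) can be modelled with :mod:`bisect` over a sorted list of timestamps. This is an illustrative model of the semantics, not the decoder's actual implementation:

```python
import bisect

def frame_after_seek(timestamps, time_s):
    """Return the pts that next() would yield after seek(time_s).

    timestamps: sorted presentation timestamps of the stream.
    Returns None if time_s is past the last frame (i.e. StopIteration).
    """
    i = bisect.bisect_left(timestamps, time_s)
    return timestamps[i] if i < len(timestamps) else None

pts = [0.0, 0.5, 1.0, 1.5, 2.0]
print(frame_after_seek(pts, 1.0))  # 1.0  (exact timestamp exists)
print(frame_after_seek(pts, 1.2))  # 1.5  (first frame with pts larger than time_s)
print(frame_after_seek(pts, 9.0))  # None (past end of stream)
```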
@@ -106,8 +134,8 @@ class VideoReader:
        Explicitly define the stream we are operating on.

        Args:
            stream (string): descriptor of the required stream. Defaults to ``"video:0"``.
                Currently available stream types include ``['video', 'audio']``.
                Each descriptor consists of two parts: stream type (e.g. 'video') and
                a unique stream id (which are determined by video encoding).
                In this way, if the video container contains multiple

...