Fixed missing audio with pyav backend (#4064)

693e0ae8 · Prabhat Roy · GitHub · bdc88f52 · 693e0ae8 · 693e0ae8
Unverified Commit 693e0ae8 authored Jul 25, 2021 by Prabhat Roy Committed by GitHub Jul 25, 2021
Show whitespace changes
Inline Side-by-side

Showing with 42 additions and 11 deletions

test/test_video_reader.py test/test_video_reader.py +30 -6

torchvision/io/video.py torchvision/io/video.py +12 -5

No files found.
--- a/test/test_video_reader.py
+++ b/test/test_video_reader.py
 import collections
+import itertools
 import math
 import os
 import unittest
@@ -1243,14 +1244,37 @@ class TestVideoReader(unittest.TestCase):
        with self.assertRaises(RuntimeError):
            io.read_video('foo.mp4')
-    def test_audio_present(self):
+    def test_audio_present_pts(self):
-        """Test if audio frames are returned with video_reader backend."""
+        """Test if audio frames are returned with pts unit."""
-        set_video_backend('video_reader')
+        backends = ['video_reader', 'pyav']
+        start_offsets = [0, 1000]
+        end_offsets = [3000, None]
+        for test_video, _ in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+            container = av.open(full_path)
+            if container.streams.audio:
+                for backend, start_offset, end_offset in itertools.product(
+                        backends, start_offsets, end_offsets):
+                    set_video_backend(backend)
+                    _, audio, _ = io.read_video(
+                        full_path, start_offset, end_offset, pts_unit='pts')
+                    self.assertGreaterEqual(audio.shape[0], 1)
+                    self.assertGreaterEqual(audio.shape[1], 1)
+    def test_audio_present_sec(self):
+        """Test if audio frames are returned with sec unit."""
+        backends = ['video_reader', 'pyav']
+        start_offsets = [0, 0.1]
+        end_offsets = [0.3, None]
        for test_video, _ in test_videos.items():
            full_path = os.path.join(VIDEO_DIR, test_video)
            container = av.open(full_path)
            if container.streams.audio:
-                _, audio, _ = io.read_video(full_path)
+                for backend, start_offset, end_offset in itertools.product(
+                        backends, start_offsets, end_offsets):
+                    set_video_backend(backend)
+                    _, audio, _ = io.read_video(
+                        full_path, start_offset, end_offset, pts_unit='sec')
                    self.assertGreaterEqual(audio.shape[0], 1)
                    self.assertGreaterEqual(audio.shape[1], 1)

--- a/torchvision/io/video.py
+++ b/torchvision/io/video.py
@@ -283,22 +283,25 @@ def read_video(
    info = {}
    video_frames = []
    audio_frames = []
+    audio_timebase = _video_opt.default_timebase
    try:
        with av.open(filename, metadata_errors="ignore") as container:
+            if container.streams.audio:
+                audio_timebase = container.streams.audio[0].time_base
            time_base = _video_opt.default_timebase
            if container.streams.video:
                time_base = container.streams.video[0].time_base
            elif container.streams.audio:
                time_base = container.streams.audio[0].time_base
            # video_timebase is the default time_base
-            start_pts_sec, end_pts_sec, pts_unit = _video_opt._convert_to_sec(
+            start_pts, end_pts, pts_unit = _video_opt._convert_to_sec(
                start_pts, end_pts, pts_unit, time_base)
            if container.streams.video:
                video_frames = _read_from_stream(
                    container,
-                    start_pts_sec,
+                    start_pts,
-                    end_pts_sec,
+                    end_pts,
                    pts_unit,
                    container.streams.video[0],
                    {"video": 0},
@@ -311,8 +314,8 @@ def read_video(
            if container.streams.audio:
                audio_frames = _read_from_stream(
                    container,
-                    start_pts_sec,
+                    start_pts,
-                    end_pts_sec,
+                    end_pts,
                    pts_unit,
                    container.streams.audio[0],
                    {"audio": 0},
@@ -334,6 +337,10 @@ def read_video(
    if aframes_list:
        aframes = np.concatenate(aframes_list, 1)
        aframes = torch.as_tensor(aframes)
+        if pts_unit == 'sec':
+            start_pts = int(math.floor(start_pts * (1 / audio_timebase)))
+            if end_pts != float("inf"):
+                end_pts = int(math.ceil(end_pts * (1 / audio_timebase)))
        aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts)
    else:
        aframes = torch.empty((1, 0), dtype=torch.float32)