Update the handling of videos without PTS values (#2970)

Summary: The filter graph does not fall back to `best_effort_timestamp`, so applying filters (such as changing the FPS) to videos without PTS values failed. This commit changes the behavior by overwriting missing PTS values with `best_effort_timestamp`. Pull Request resolved: https://github.com/pytorch/audio/pull/2970 Reviewed By: YosuaMichael Differential Revision: D42425771 Pulled By: mthrok fbshipit-source-id: 7b7a033ea2ad89bb49d6e1663d35d377dab2aae9

Update the handling of videos without PTS values (#2970)
Summary: The filter graph does not fall back to `best_effort_timestamp`, so applying filters (such as changing the FPS) to videos without PTS values failed. This commit changes the behavior by overwriting missing PTS values with `best_effort_timestamp`. Pull Request resolved: https://github.com/pytorch/audio/pull/2970 Reviewed By: YosuaMichael Differential Revision: D42425771 Pulled By: mthrok fbshipit-source-id: 7b7a033ea2ad89bb49d6e1663d35d377dab2aae9
1717edaa · moto · Facebook GitHub Bot · e1cddb46 · 1717edaa · 1717edaa
Commit 1717edaa authored Jan 10, 2023 by moto Committed by Facebook GitHub Bot Jan 10, 2023
4 changed files
--- a/test/torchaudio_unittest/io/stream_reader_test.py
+++ b/test/torchaudio_unittest/io/stream_reader_test.py
@@ -51,9 +51,6 @@ class _MediaSourceMixin:
            with open(path, "rb") as fileobj:
                data = fileobj.read()
            self.src = torch.frombuffer(data, dtype=torch.uint8)
-            print(self.src.data_ptr())
-            print(len(data))
-            print(self.src.shape)
        return self.src

    def tearDown(self):
@@ -467,10 +464,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
            ("nasa_13013.avi", "precise", 8.1, (0, slice(238, None))),
            ("nasa_13013.avi", "precise", 8.14, (0, slice(239, None))),
            ("nasa_13013.avi", "precise", 8.17, (0, slice(240, None))),
-            # Test precise seek on video with invalid PTS
+            # Test precise seek on video with missing PTS
            ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.0, (0, slice(None))),
-            ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.2, (0, slice(4, -1))),
-            ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.3, (0, slice(7, -1))),
+            ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.2, (0, slice(4, None))),
+            ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.3, (0, slice(7, None))),
            # Test any seek
            # The source avi video has one keyframe every twelve frames 0, 12, 24,.. or every 0.4004 seconds.
            ("nasa_13013.avi", "any", 0.0, (0, slice(None))),
@@ -514,6 +511,25 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
        print(hyp.shape, ref.shape)
        self.assertEqual(hyp, ref)

+    @parameterized.expand(
+        [
+            ("nasa_13013.mp4", [195, 3, 270, 480]),
+            # RATRACE does not have valid PTS metadata.
+            ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", [36, 3, 240, 560]),
+        ]
+    )
+    def test_change_fps(self, src, shape):
+        """Can change the FPS of videos"""
+        tgt_frame_rate = 15
+        s = StreamReader(self.get_src(src))
+        info = s.get_src_stream_info(s.default_video_stream)
+        assert info.frame_rate != tgt_frame_rate
+        s.add_basic_video_stream(frames_per_chunk=-1, frame_rate=tgt_frame_rate)
+        s.process_all_packets()
+        (chunk,) = s.pop_chunks()
+
+        assert chunk.shape == torch.Size(shape)
+

 def _to_fltp(original):
    """Convert Tensor to float32 with value range [-1, 1]"""

--- a/torchaudio/csrc/ffmpeg/stream_reader/decoder.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/decoder.cpp
@@ -123,6 +123,10 @@ int Decoder::get_frame(AVFrame* pFrame) {
  return avcodec_receive_frame(pCodecContext, pFrame);
 }

+int Decoder::get_frame_number() const {
+  return pCodecContext->frame_number;
+}
+
 void Decoder::flush_buffer() {
  avcodec_flush_buffers(pCodecContext);
 }

--- a/torchaudio/csrc/ffmpeg/stream_reader/decoder.h
+++ b/torchaudio/csrc/ffmpeg/stream_reader/decoder.h
@@ -29,6 +29,7 @@ class Decoder {
  int process_packet(AVPacket* pPacket);
  // Fetch a decoded frame
  int get_frame(AVFrame* pFrame);
+  int get_frame_number() const;
  // Flush buffer (for seek)
  void flush_buffer();
 };

--- a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
@@ -87,6 +87,27 @@ int StreamProcessor::process_packet(AVPacket* packet) {
    if (ret < 0)
      return ret;

+    // If pts is undefined then overwrite with best effort estimate.
+    // In this case, best_effort_timestamp is basically the number of frames
+    // emitted by the decoder.
+    //
+    // We need valid pts because filter_graph does not fall back to
+    // best_effort_timestamp.
+    if (pFrame1->pts == AV_NOPTS_VALUE) {
+      if (pFrame1->best_effort_timestamp == AV_NOPTS_VALUE) {
+        // This happens in drain mode.
+        // When the decoder enters drain mode, it starts flushing the internally
+        // buffered frames, whose PTS cannot be estimated.
+        //
+        // This is because they might be intra-frames not in chronological
+        // order. In this case, we use received frames as-is in the order they
+        // are received.
+        pFrame1->pts = decoder.get_frame_number() + 1;
+      } else {
+        pFrame1->pts = pFrame1->best_effort_timestamp;
+      }
+    }
+
    // When the value of discard_before_pts is 0, we consider that the seek is
    // not performed and all the frames are passed to downstream
    // unconditionally.
@@ -96,14 +117,9 @@ int StreamProcessor::process_packet(AVPacket* packet) {
    //    In this case discard_before_pts is set to zero.
    // 2. When users seek to zero, what they expect is to get to the beginning
    //    of the data.
-    //    There are many videos with invalid PTS values, such as
-    //    -9223372036854775808, and though it is not possible to seek videos
-    //    without decoding, we can still support `seek(0)` as a special case,
-    //    and just not discard any.
    //
    // Note: discard_before_pts < 0 is UB.
-    if (discard_before_pts <= 0 || pFrame1->pts >= discard_before_pts ||
-        pFrame1->best_effort_timestamp >= discard_before_pts) {
+    if (discard_before_pts <= 0 || pFrame1->pts >= discard_before_pts) {
      send_frame(pFrame1);
    }