Fix metadata fetch (#2464)

Summary: In https://github.com/pytorch/audio/issues/2461, a `metadata` field was added to StreamInfo. However, the value attached to this new field was the source-level metadata, whereas each stream can have its own metadata. * source-level metadata: [AVFormatContext->metadata](https://ffmpeg.org/doxygen/4.1/structAVFormatContext.html#a3019a56080ed2e3297ff25bc2ff88adf) * stream-level metadata: [AVFormatContext->streams[]->metadata](https://ffmpeg.org/doxygen/4.1/structAVStream.html#a50d250a128a3da9ce3d135e84213fb82) This commit moves the source-level metadata to a dedicated method, `get_metadata`, and fixes the stream-level `metadata` field so that it reports the metadata of the individual stream. Pull Request resolved: https://github.com/pytorch/audio/pull/2464 Reviewed By: hwangjeff, xiaohui-zhang Differential Revision: D36995452 Pulled By: mthrok fbshipit-source-id: 534be1f7feb07790a0ce8624c336cdb7b65a8697

Fix metadata fetch (#2464)
Summary: In https://github.com/pytorch/audio/issues/2461, a `metadata` field was added to StreamInfo. However, the value attached to this new field was the source-level metadata, whereas each stream can have its own metadata. * source-level metadata: [AVFormatContext->metadata](https://ffmpeg.org/doxygen/4.1/structAVFormatContext.html#a3019a56080ed2e3297ff25bc2ff88adf) * stream-level metadata: [AVFormatContext->streams[]->metadata](https://ffmpeg.org/doxygen/4.1/structAVStream.html#a50d250a128a3da9ce3d135e84213fb82) This commit moves the source-level metadata to a dedicated method, `get_metadata`, and fixes the stream-level `metadata` field so that it reports the metadata of the individual stream. Pull Request resolved: https://github.com/pytorch/audio/pull/2464 Reviewed By: hwangjeff, xiaohui-zhang Differential Revision: D36995452 Pulled By: mthrok fbshipit-source-id: 534be1f7feb07790a0ce8624c336cdb7b65a8697
4d2fa190 · moto · Facebook GitHub Bot · 711d6016 · 4d2fa190 · 4d2fa190
Commit 4d2fa190 authored Jun 08, 2022 by moto Committed by Facebook GitHub Bot Jun 08, 2022
8 changed files
--- a/test/torchaudio_unittest/io/stream_reader_test.py
+++ b/test/torchaudio_unittest/io/stream_reader_test.py
@@ -89,12 +89,11 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
        s = StreamReader(self.get_src())
        assert s.num_src_streams == 6
-        metadata = {
+        # Note:
-            "compatible_brands": "isomiso2avc1mp41",
+        # FFmpeg 4.4.1 and FFmpeg 5 also report
-            "encoder": "Lavf58.76.100",
+        # `"vendor_id": "[0][0][0][0]"` in audio/video metadata.
-            "major_brand": "isom",
+        # TODO:
-            "minor_version": "512",
+        # change expected metadata value based on FFmpeg version.
-        }
        expected = [
            StreamReaderSourceVideoStream(
                media_type="video",
@@ -104,7 +103,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                bit_rate=71925,
                num_frames=325,
                bits_per_sample=8,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "\x1fMainconcept Video Media Handler",
+                    "language": "eng",
+                },
                width=320,
                height=180,
                frame_rate=25.0,
@@ -117,7 +119,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                bit_rate=72093,
                num_frames=103,
                bits_per_sample=0,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "#Mainconcept MP4 Sound Media Handler",
+                    "language": "eng",
+                },
                sample_rate=8000.0,
                num_channels=2,
            ),
@@ -129,7 +134,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                bit_rate=None,
                num_frames=None,
                bits_per_sample=None,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "SubtitleHandler",
+                    "language": "eng",
+                },
            ),
            StreamReaderSourceVideoStream(
                media_type="video",
@@ -139,7 +147,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                bit_rate=128783,
                num_frames=390,
                bits_per_sample=8,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "\x1fMainconcept Video Media Handler",
+                    "language": "eng",
+                },
                width=480,
                height=270,
                frame_rate=29.97002997002997,
@@ -152,7 +163,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                bit_rate=128837,
                num_frames=205,
                bits_per_sample=0,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "#Mainconcept MP4 Sound Media Handler",
+                    "language": "eng",
+                },
                sample_rate=16000.0,
                num_channels=2,
            ),
@@ -164,32 +178,44 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                bit_rate=None,
                num_frames=None,
                bits_per_sample=None,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "SubtitleHandler",
+                    "language": "eng",
+                },
            ),
        ]
        output = [s.get_src_stream_info(i) for i in range(6)]
+        # Remove "vendor_id" if exists
+        # TODO: don't remove "vendor_id", instead,
+        # change expected based on FFmpeg version
+        for sinfo in output:
+            if "vendor_id" in sinfo.metadata:
+                del sinfo.metadata["vendor_id"]
        assert expected == output
    def test_id3tag(self):
+        """get_metadata method can fetch id3tag properly"""
        s = StreamReader(self.get_src("steam-train-whistle-daniel_simon.mp3"))
-        output = s.get_src_stream_info(s.default_audio_stream)
+        output = s.get_metadata()
-        expected = StreamReaderSourceAudioStream(
+        expected = {
-            media_type="audio",
-            codec="mp3",
-            codec_long_name="MP3 (MPEG audio layer 3)",
-            format="fltp",
-            bit_rate=210571,
-            num_frames=0,
-            bits_per_sample=0,
-            metadata={
            "title": "SoundBible.com Must Credit",
            "artist": "SoundBible.com Must Credit",
            "date": "2017",
-            },
+        }
-            sample_rate=44100.0,
+        assert output == expected
-            num_channels=2,
-        )
+    def test_video_metadata(self):
+        """get_metadata method can fetch video metadata"""
+        s = StreamReader(self.get_src())
+        output = s.get_metadata()
+        expected = {
+            "compatible_brands": "isomiso2avc1mp41",
+            "encoder": "Lavf58.76.100",
+            "major_brand": "isom",
+            "minor_version": "512",
+        }
        assert output == expected
    def test_src_info_invalid_index(self):

--- a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
+++ b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
@@ -22,6 +22,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
      .def(
          "find_best_video_stream",
          &StreamReaderFileObj::find_best_video_stream)
+      .def("get_metadata", &StreamReaderFileObj::get_metadata)
      .def(
          "get_src_stream_info",
          &StreamReaderFileObj::get_src_stream_info_pybind)

--- a/torchaudio/csrc/ffmpeg/pybind/stream_reader.cpp
+++ b/torchaudio/csrc/ffmpeg/pybind/stream_reader.cpp
@@ -15,5 +15,12 @@ StreamReaderFileObj::StreamReaderFileObj(
          option.value_or(OptionDict{}),
          pAVIO)) {}
+// Returns the source-level metadata as a std::map. This overload is the one
+// registered with the pybind11 module (presumably so that pybind11 can convert
+// the result to a Python dict -- confirm against the binding's STL casters).
+std::map<std::string, std::string> StreamReaderFileObj::get_metadata() const {
+  std::map<std::string, std::string> ret;
+  // Copy every entry of the c10::Dict produced by the base-class accessor.
+  for (const auto& entry : StreamReader::get_metadata()) {
+    ret.emplace(entry.key(), entry.value());
+  }
+  return ret;
+}
 } // namespace ffmpeg
 } // namespace torchaudio
--- a/torchaudio/csrc/ffmpeg/pybind/stream_reader.h
+++ b/torchaudio/csrc/ffmpeg/pybind/stream_reader.h
@@ -15,6 +15,8 @@ class StreamReaderFileObj : protected FileObj, public StreamReaderBinding {
      const c10::optional<std::string>& format,
      const c10::optional<OptionDict>& option,
      int64_t buffer_size);
+  std::map<std::string, std::string> get_metadata() const;
 };
 } // namespace ffmpeg

--- a/torchaudio/csrc/ffmpeg/stream_reader.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader.cpp
@@ -83,6 +83,10 @@ c10::Dict<std::string, std::string> parse_metadata(
 }
 } // namespace
+// Returns the metadata attached to the source as a whole
+// (pFormatContext->metadata), as opposed to the per-stream metadata that
+// get_src_stream_info reads from each stream.
+c10::Dict<std::string, std::string> StreamReader::get_metadata() const {
+  return parse_metadata(pFormatContext->metadata);
+}
 SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
  validate_src_stream_index(i);
  AVStream* stream = pFormatContext->streams[i];
@@ -93,7 +97,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
  ret.bit_rate = codecpar->bit_rate;
  ret.num_frames = stream->nb_frames;
  ret.bits_per_sample = codecpar->bits_per_raw_sample;
-  ret.metadata = parse_metadata(pFormatContext->metadata);
+  ret.metadata = parse_metadata(stream->metadata);
  const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id);
  if (desc) {
    ret.codec_name = desc->name;

--- a/torchaudio/csrc/ffmpeg/stream_reader.h
+++ b/torchaudio/csrc/ffmpeg/stream_reader.h
@@ -44,6 +44,8 @@ class StreamReader {
  // Find a suitable audio/video streams using heuristics from ffmpeg
  int64_t find_best_audio_stream() const;
  int64_t find_best_video_stream() const;
+  // Fetch metadata of the source
+  c10::Dict<std::string, std::string> get_metadata() const;
  // Fetch information about source streams
  int64_t num_src_streams() const;
  SrcStreamInfo get_src_stream_info(int i) const;

--- a/torchaudio/csrc/ffmpeg/stream_reader_binding.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader_binding.cpp
@@ -40,6 +40,7 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
      .def(torch::init<>(init))
      .def("num_src_streams", [](S self) { return self->num_src_streams(); })
      .def("num_out_streams", [](S self) { return self->num_out_streams(); })
+      .def("get_metadata", [](S self) { return self->get_metadata(); })
      .def(
          "get_src_stream_info",
          [](S s, int64_t i) { return s->get_src_stream_info(i); })

--- a/torchaudio/io/_stream_reader.py
+++ b/torchaudio/io/_stream_reader.py
@@ -62,8 +62,7 @@ class StreamReaderSourceStream:
    For compressed format, it can be 0.
    """
    metadata: Dict[str, str]
-    """Metadata attached to the source media.
+    """Metadata attached to the source stream."""
-    Note that metadata is common across the source streams."""
 @dataclass
@@ -397,6 +396,14 @@ class StreamReader:
        """
        return self._default_video_stream
+    def get_metadata(self) -> Dict[str, str]:
+        """Get the metadata of the source media.
+        This is the metadata attached to the source as a whole; metadata
+        attached to an individual stream is reported by ``get_src_stream_info``.
+        Returns:
+            dict: Mapping from metadata key to metadata value.
+        """
+        return self._be.get_metadata()
    def get_src_stream_info(self, i: int) -> torchaudio.io.StreamReaderSourceStream:
        """Get the metadata of source stream