Commit 4d2fa190 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Fix metadata fetch (#2464)

Summary:
In https://github.com/pytorch/audio/issues/2461, a `metadata` field was added to StreamInfo.
However, the value attached to this new field was the source-level metadata,
whereas each stream can carry its own, different metadata.

* source level metadata
[AVFormatContext->metadata](https://ffmpeg.org/doxygen/4.1/structAVFormatContext.html#a3019a56080ed2e3297ff25bc2ff88adf)
* stream level metadata
[AVFormatContext->streams[]->metadata](https://ffmpeg.org/doxygen/4.1/structAVStream.html#a50d250a128a3da9ce3d135e84213fb82)

This commit moves the source-level metadata to a dedicated method, `get_metadata`, and
fixes the stream-level `metadata` field so that it reports each stream's own metadata.

Pull Request resolved: https://github.com/pytorch/audio/pull/2464

Reviewed By: hwangjeff, xiaohui-zhang

Differential Revision: D36995452

Pulled By: mthrok

fbshipit-source-id: 534be1f7feb07790a0ce8624c336cdb7b65a8697
parent 711d6016
...@@ -89,12 +89,11 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC ...@@ -89,12 +89,11 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
s = StreamReader(self.get_src()) s = StreamReader(self.get_src())
assert s.num_src_streams == 6 assert s.num_src_streams == 6
metadata = { # Note:
"compatible_brands": "isomiso2avc1mp41", # FFmpeg 4.4.1 and FFmpeg 5 also report
"encoder": "Lavf58.76.100", # `"vendor_id": "[0][0][0][0]"` in audio/video metadata.
"major_brand": "isom", # TODO:
"minor_version": "512", # change expected metadata value based on FFmpeg version.
}
expected = [ expected = [
StreamReaderSourceVideoStream( StreamReaderSourceVideoStream(
media_type="video", media_type="video",
...@@ -104,7 +103,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC ...@@ -104,7 +103,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
bit_rate=71925, bit_rate=71925,
num_frames=325, num_frames=325,
bits_per_sample=8, bits_per_sample=8,
metadata=metadata, metadata={
"handler_name": "\x1fMainconcept Video Media Handler",
"language": "eng",
},
width=320, width=320,
height=180, height=180,
frame_rate=25.0, frame_rate=25.0,
...@@ -117,7 +119,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC ...@@ -117,7 +119,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
bit_rate=72093, bit_rate=72093,
num_frames=103, num_frames=103,
bits_per_sample=0, bits_per_sample=0,
metadata=metadata, metadata={
"handler_name": "#Mainconcept MP4 Sound Media Handler",
"language": "eng",
},
sample_rate=8000.0, sample_rate=8000.0,
num_channels=2, num_channels=2,
), ),
...@@ -129,7 +134,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC ...@@ -129,7 +134,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
bit_rate=None, bit_rate=None,
num_frames=None, num_frames=None,
bits_per_sample=None, bits_per_sample=None,
metadata=metadata, metadata={
"handler_name": "SubtitleHandler",
"language": "eng",
},
), ),
StreamReaderSourceVideoStream( StreamReaderSourceVideoStream(
media_type="video", media_type="video",
...@@ -139,7 +147,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC ...@@ -139,7 +147,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
bit_rate=128783, bit_rate=128783,
num_frames=390, num_frames=390,
bits_per_sample=8, bits_per_sample=8,
metadata=metadata, metadata={
"handler_name": "\x1fMainconcept Video Media Handler",
"language": "eng",
},
width=480, width=480,
height=270, height=270,
frame_rate=29.97002997002997, frame_rate=29.97002997002997,
...@@ -152,7 +163,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC ...@@ -152,7 +163,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
bit_rate=128837, bit_rate=128837,
num_frames=205, num_frames=205,
bits_per_sample=0, bits_per_sample=0,
metadata=metadata, metadata={
"handler_name": "#Mainconcept MP4 Sound Media Handler",
"language": "eng",
},
sample_rate=16000.0, sample_rate=16000.0,
num_channels=2, num_channels=2,
), ),
...@@ -164,32 +178,44 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC ...@@ -164,32 +178,44 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
bit_rate=None, bit_rate=None,
num_frames=None, num_frames=None,
bits_per_sample=None, bits_per_sample=None,
metadata=metadata, metadata={
"handler_name": "SubtitleHandler",
"language": "eng",
},
), ),
] ]
output = [s.get_src_stream_info(i) for i in range(6)] output = [s.get_src_stream_info(i) for i in range(6)]
# Remove "vendor_id" if exists
# TODO: don't remove "vendor_id", instead,
# change expected based on FFmpeg version
for sinfo in output:
if "vendor_id" in sinfo.metadata:
del sinfo.metadata["vendor_id"]
assert expected == output assert expected == output
def test_id3tag(self): def test_id3tag(self):
"""get_metadata method can fetch id3tag properly"""
s = StreamReader(self.get_src("steam-train-whistle-daniel_simon.mp3")) s = StreamReader(self.get_src("steam-train-whistle-daniel_simon.mp3"))
output = s.get_src_stream_info(s.default_audio_stream) output = s.get_metadata()
expected = StreamReaderSourceAudioStream( expected = {
media_type="audio",
codec="mp3",
codec_long_name="MP3 (MPEG audio layer 3)",
format="fltp",
bit_rate=210571,
num_frames=0,
bits_per_sample=0,
metadata={
"title": "SoundBible.com Must Credit", "title": "SoundBible.com Must Credit",
"artist": "SoundBible.com Must Credit", "artist": "SoundBible.com Must Credit",
"date": "2017", "date": "2017",
}, }
sample_rate=44100.0, assert output == expected
num_channels=2,
) def test_video_metadata(self):
"""get_metadata method can fetch video metadata"""
s = StreamReader(self.get_src())
output = s.get_metadata()
expected = {
"compatible_brands": "isomiso2avc1mp41",
"encoder": "Lavf58.76.100",
"major_brand": "isom",
"minor_version": "512",
}
assert output == expected assert output == expected
def test_src_info_invalid_index(self): def test_src_info_invalid_index(self):
......
...@@ -22,6 +22,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) { ...@@ -22,6 +22,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
.def( .def(
"find_best_video_stream", "find_best_video_stream",
&StreamReaderFileObj::find_best_video_stream) &StreamReaderFileObj::find_best_video_stream)
.def("get_metadata", &StreamReaderFileObj::get_metadata)
.def( .def(
"get_src_stream_info", "get_src_stream_info",
&StreamReaderFileObj::get_src_stream_info_pybind) &StreamReaderFileObj::get_src_stream_info_pybind)
......
...@@ -15,5 +15,12 @@ StreamReaderFileObj::StreamReaderFileObj( ...@@ -15,5 +15,12 @@ StreamReaderFileObj::StreamReaderFileObj(
option.value_or(OptionDict{}), option.value_or(OptionDict{}),
pAVIO)) {} pAVIO)) {}
// Return the metadata of the source as a std::map.
//
// StreamReader::get_metadata() yields a c10::Dict<std::string, std::string>;
// this method copies every key/value pair of that dictionary into a
// std::map<std::string, std::string> and returns the copy.
// NOTE(review): presumably the std::map return type exists so that the
// pybind11 binding can convert the result to a Python dict -- confirm.
std::map<std::string, std::string> StreamReaderFileObj::get_metadata() const {
  std::map<std::string, std::string> ret;
  for (const auto& it : StreamReader::get_metadata()) {
    ret.insert({it.key(), it.value()});
  }
  return ret;
}
} // namespace ffmpeg } // namespace ffmpeg
} // namespace torchaudio } // namespace torchaudio
...@@ -15,6 +15,8 @@ class StreamReaderFileObj : protected FileObj, public StreamReaderBinding { ...@@ -15,6 +15,8 @@ class StreamReaderFileObj : protected FileObj, public StreamReaderBinding {
const c10::optional<std::string>& format, const c10::optional<std::string>& format,
const c10::optional<OptionDict>& option, const c10::optional<OptionDict>& option,
int64_t buffer_size); int64_t buffer_size);
std::map<std::string, std::string> get_metadata() const;
}; };
} // namespace ffmpeg } // namespace ffmpeg
......
...@@ -83,6 +83,10 @@ c10::Dict<std::string, std::string> parse_metadata( ...@@ -83,6 +83,10 @@ c10::Dict<std::string, std::string> parse_metadata(
} }
} // namespace } // namespace
// Fetch the metadata attached to the source as a whole, i.e. the dictionary
// stored on the format context, converted by parse_metadata().
c10::Dict<std::string, std::string> StreamReader::get_metadata() const {
  auto* source_metadata = pFormatContext->metadata;
  return parse_metadata(source_metadata);
}
SrcStreamInfo StreamReader::get_src_stream_info(int i) const { SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
validate_src_stream_index(i); validate_src_stream_index(i);
AVStream* stream = pFormatContext->streams[i]; AVStream* stream = pFormatContext->streams[i];
...@@ -93,7 +97,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { ...@@ -93,7 +97,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
ret.bit_rate = codecpar->bit_rate; ret.bit_rate = codecpar->bit_rate;
ret.num_frames = stream->nb_frames; ret.num_frames = stream->nb_frames;
ret.bits_per_sample = codecpar->bits_per_raw_sample; ret.bits_per_sample = codecpar->bits_per_raw_sample;
ret.metadata = parse_metadata(pFormatContext->metadata); ret.metadata = parse_metadata(stream->metadata);
const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id); const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id);
if (desc) { if (desc) {
ret.codec_name = desc->name; ret.codec_name = desc->name;
......
...@@ -44,6 +44,8 @@ class StreamReader { ...@@ -44,6 +44,8 @@ class StreamReader {
// Find a suitable audio/video streams using heuristics from ffmpeg // Find a suitable audio/video streams using heuristics from ffmpeg
int64_t find_best_audio_stream() const; int64_t find_best_audio_stream() const;
int64_t find_best_video_stream() const; int64_t find_best_video_stream() const;
// Fetch metadata of the source
c10::Dict<std::string, std::string> get_metadata() const;
// Fetch information about source streams // Fetch information about source streams
int64_t num_src_streams() const; int64_t num_src_streams() const;
SrcStreamInfo get_src_stream_info(int i) const; SrcStreamInfo get_src_stream_info(int i) const;
......
...@@ -40,6 +40,7 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) { ...@@ -40,6 +40,7 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
.def(torch::init<>(init)) .def(torch::init<>(init))
.def("num_src_streams", [](S self) { return self->num_src_streams(); }) .def("num_src_streams", [](S self) { return self->num_src_streams(); })
.def("num_out_streams", [](S self) { return self->num_out_streams(); }) .def("num_out_streams", [](S self) { return self->num_out_streams(); })
.def("get_metadata", [](S self) { return self->get_metadata(); })
.def( .def(
"get_src_stream_info", "get_src_stream_info",
[](S s, int64_t i) { return s->get_src_stream_info(i); }) [](S s, int64_t i) { return s->get_src_stream_info(i); })
......
...@@ -62,8 +62,7 @@ class StreamReaderSourceStream: ...@@ -62,8 +62,7 @@ class StreamReaderSourceStream:
For compressed format, it can be 0. For compressed format, it can be 0.
""" """
metadata: Dict[str, str] metadata: Dict[str, str]
"""Metadata attached to the source media. """Metadata attached to the source stream."""
Note that metadata is common across the source streams."""
@dataclass @dataclass
...@@ -397,6 +396,14 @@ class StreamReader: ...@@ -397,6 +396,14 @@ class StreamReader:
""" """
return self._default_video_stream return self._default_video_stream
def get_metadata(self) -> Dict[str, str]:
    """Get the metadata of the source media.

    This is the metadata attached to the source as a whole (for example
    ``title``, ``artist`` and ``date`` tags of an MP3 file), delegated to
    the backend's ``get_metadata``.

    Returns:
        Dict[str, str]: Mapping from metadata key to its value.
    """
    return self._be.get_metadata()
def get_src_stream_info(self, i: int) -> torchaudio.io.StreamReaderSourceStream: def get_src_stream_info(self, i: int) -> torchaudio.io.StreamReaderSourceStream:
"""Get the metadata of source stream """Get the metadata of source stream
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment