Update source info (#2418)

Summary: Add `num_frames` and `bits_per_sample` to the source stream info so that it matches what `torchaudio.info` currently reports. Pull Request resolved: https://github.com/pytorch/audio/pull/2418 Reviewed By: carolineechen Differential Revision: D36749077 Pulled By: mthrok fbshipit-source-id: 7b368ee993cf5ed63ff2f53c9e3b1f50fcce7713

Update source info (#2418)
Summary: Add `num_frames` and `bits_per_sample` to the source stream info so that it matches what `torchaudio.info` currently reports. Pull Request resolved: https://github.com/pytorch/audio/pull/2418 Reviewed By: carolineechen Differential Revision: D36749077 Pulled By: mthrok fbshipit-source-id: 7b368ee993cf5ed63ff2f53c9e3b1f50fcce7713
bb77cbeb · moto · Facebook GitHub Bot · fd7ace17 · bb77cbeb · bb77cbeb
Commit bb77cbeb authored May 28, 2022 by moto Committed by Facebook GitHub Bot May 28, 2022
6 changed files
--- a/test/torchaudio_unittest/io/stream_reader_test.py
+++ b/test/torchaudio_unittest/io/stream_reader_test.py
@@ -96,6 +96,8 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                codec_long_name="H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10",
                format="yuv420p",
                bit_rate=71925,
+                num_frames=325,
+                bits_per_sample=8,
                width=320,
                height=180,
                frame_rate=25.0,
@@ -106,6 +108,8 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                codec_long_name="AAC (Advanced Audio Coding)",
                format="fltp",
                bit_rate=72093,
+                num_frames=103,
+                bits_per_sample=0,
                sample_rate=8000.0,
                num_channels=2,
            ),
@@ -115,6 +119,8 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                codec_long_name="MOV text",
                format=None,
                bit_rate=None,
+                num_frames=None,
+                bits_per_sample=None,
            ),
            StreamReaderSourceVideoStream(
                media_type="video",
@@ -122,6 +128,8 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                codec_long_name="H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10",
                format="yuv420p",
                bit_rate=128783,
+                num_frames=390,
+                bits_per_sample=8,
                width=480,
                height=270,
                frame_rate=29.97002997002997,
@@ -132,6 +140,8 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                codec_long_name="AAC (Advanced Audio Coding)",
                format="fltp",
                bit_rate=128837,
+                num_frames=205,
+                bits_per_sample=0,
                sample_rate=16000.0,
                num_channels=2,
            ),
@@ -141,6 +151,8 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
                codec_long_name="MOV text",
                format=None,
                bit_rate=None,
+                num_frames=None,
+                bits_per_sample=None,
            ),
        ]
        output = [s.get_src_stream_info(i) for i in range(6)]

--- a/torchaudio/csrc/ffmpeg/stream_reader.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader.cpp
@@ -79,6 +79,8 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
  SrcStreamInfo ret;
  ret.media_type = codecpar->codec_type;
  ret.bit_rate = codecpar->bit_rate;
+  ret.num_frames = stream->nb_frames;
+  ret.bits_per_sample = codecpar->bits_per_raw_sample;
  const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id);
  if (desc) {
    ret.codec_name = desc->name;

--- a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp
@@ -11,6 +11,8 @@ SrcInfo convert(SrcStreamInfo ssi) {
      ssi.codec_long_name,
      ssi.fmt_name,
      ssi.bit_rate,
+      ssi.num_frames,
+      ssi.bits_per_sample,
      ssi.sample_rate,
      ssi.num_channels,
      ssi.width,

--- a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h
+++ b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h
@@ -11,6 +11,8 @@ using SrcInfo = std::tuple<
    std::string, // codec long name
    std::string, // format name
    int64_t, // bit_rate
+    int64_t, // num_frames
+    int64_t, // bits_per_sample
    // Audio
    double, // sample_rate
    int64_t, // num_channels

--- a/torchaudio/csrc/ffmpeg/typedefs.h
+++ b/torchaudio/csrc/ffmpeg/typedefs.h
@@ -12,6 +12,8 @@ struct SrcStreamInfo {
  const char* codec_long_name = "N/A";
  const char* fmt_name = "N/A";
  int64_t bit_rate = 0;
+  int64_t num_frames = 0;
+  int bits_per_sample = 0;
  // Audio
  double sample_rate = 0;
  int num_channels = 0;

--- a/torchaudio/io/_stream_reader.py
+++ b/torchaudio/io/_stream_reader.py
@@ -55,6 +55,12 @@ class StreamReaderSourceStream:
    This is an estimated values based on the initial few frames of the stream.
    For container formats and variable bit rate, it can be 0.
    """
+    num_frames: Optional[int]
+    """The number of frames in the stream"""
+    bits_per_sample: Optional[int]
+    """This is the number of valid bits in each output sample.
+    For compressed format, it can be 0.
+    """
 @dataclass
@@ -100,41 +106,59 @@ _CODEC = 1
 _CODEC_LONG = 2
 _FORMAT = 3
 _BIT_RATE = 4
+_NUM_FRAMES = 5
+_BPS = 6
 # - AUDIO
-_SAMPLE_RATE = 5
+_SAMPLE_RATE = 7
-_NUM_CHANNELS = 6
+_NUM_CHANNELS = 8
 # - VIDEO
-_WIDTH = 7
+_WIDTH = 9
-_HEIGHT = 8
+_HEIGHT = 10
-_FRAME_RATE = 9
+_FRAME_RATE = 11
 def _parse_si(i):
    media_type = i[_MEDIA_TYPE]
    codec_name = i[_CODEC]
    codec_long_name = i[_CODEC_LONG]
+    fmt = i[_FORMAT]
+    bit_rate = i[_BIT_RATE]
+    num_frames = i[_NUM_FRAMES]
+    bps = i[_BPS]
    if media_type == "audio":
        return StreamReaderSourceAudioStream(
-            media_type,
+            media_type=media_type,
-            codec_name,
+            codec=codec_name,
-            codec_long_name,
+            codec_long_name=codec_long_name,
-            i[_FORMAT],
+            format=fmt,
-            i[_BIT_RATE],
+            bit_rate=bit_rate,
-            i[_SAMPLE_RATE],
+            num_frames=num_frames,
-            i[_NUM_CHANNELS],
+            bits_per_sample=bps,
+            sample_rate=i[_SAMPLE_RATE],
+            num_channels=i[_NUM_CHANNELS],
        )
    if media_type == "video":
        return StreamReaderSourceVideoStream(
-            media_type,
+            media_type=media_type,
-            codec_name,
+            codec=codec_name,
-            codec_long_name,
+            codec_long_name=codec_long_name,
-            i[_FORMAT],
+            format=fmt,
-            i[_BIT_RATE],
+            bit_rate=bit_rate,
-            i[_WIDTH],
+            num_frames=num_frames,
-            i[_HEIGHT],
+            bits_per_sample=bps,
-            i[_FRAME_RATE],
+            width=i[_WIDTH],
+            height=i[_HEIGHT],
+            frame_rate=i[_FRAME_RATE],
        )
-    return StreamReaderSourceStream(media_type, codec_name, codec_long_name, None, None)
+    return StreamReaderSourceStream(
+        media_type=media_type,
+        codec=codec_name,
+        codec_long_name=codec_long_name,
+        format=None,
+        bit_rate=None,
+        num_frames=None,
+        bits_per_sample=None,
+    )
 @dataclass