Document StreamReader/Writer C++ code (#2997)

Summary: Extraction from https://github.com/pytorch/audio/issues/2994 Add docstrings to C++ StreamReader/Writer. Pull Request resolved: https://github.com/pytorch/audio/pull/2997 Reviewed By: nateanl Differential Revision: D42628016 Pulled By: mthrok fbshipit-source-id: b22c43b80997af4a9087142340c67bed28e54917

Document StreamReader/Writer C++ code (#2997)
Summary: Extraction from https://github.com/pytorch/audio/issues/2994 Add docstrings to C++ StreamReader/Writer. Pull Request resolved: https://github.com/pytorch/audio/pull/2997 Reviewed By: nateanl Differential Revision: D42628016 Pulled By: mthrok fbshipit-source-id: b22c43b80997af4a9087142340c67bed28e54917
de628226 · moto · Facebook GitHub Bot · bcfa9eed · de628226 · de628226
Commit de628226 authored Jan 20, 2023 by moto Committed by Facebook GitHub Bot Jan 20, 2023
3 changed files
--- a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h
+++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h
@@ -7,6 +7,9 @@
 namespace torchaudio {
 namespace ffmpeg {

+///
+/// Fetch and decode audio/video streams chunk by chunk.
+///
 class StreamReader {
  AVFormatInputContextPtr pFormatContext;
  AVPacketPtr pPacket;
@@ -30,7 +33,17 @@ class StreamReader {
  int64_t seek_timestamp = 0;

 public:
+  /// @name Constructors
+  ///
+  ///@{
+
+  /// @todo Introduce a constructor that takes std::string and abstracts away
+  /// ffmpeg-native structs
+  ///
  explicit StreamReader(AVFormatInputContextPtr&& p);
+
+  ///@}
+
  ~StreamReader() = default;
  // Non-copyable
  StreamReader(const StreamReader&) = delete;
@@ -52,25 +65,99 @@ class StreamReader {
  // Query methods
  //////////////////////////////////////////////////////////////////////////////
 public:
-  // Find a suitable audio/video streams using heuristics from ffmpeg
+  /// @name Query methods
+  ///@{
+
+  /// Find a suitable audio stream using heuristics from ffmpeg.
+  ///
+  /// If successful, the index of the best stream (>=0) is returned.
+  /// Otherwise a negative value is returned.
  int64_t find_best_audio_stream() const;
+  /// Find a suitable video stream using heuristics from ffmpeg.
+  ///
+  /// If successful, the index of the best stream (>=0) is returned.
+  /// Otherwise a negative value is returned.
  int64_t find_best_video_stream() const;
-  // Fetch metadata of the source
+  /// Fetch metadata of the source media.
  OptionDict get_metadata() const;
-  // Fetch information about source streams
+  /// Fetch the number of source streams found in the input media.
+  ///
+  /// The source streams include not only audio/video streams but also
+  /// subtitle and others.
  int64_t num_src_streams() const;
+  /// Fetch information about the specified source stream.
+  ///
+  /// The valid value range is ``[0, num_src_streams())``.
  SrcStreamInfo get_src_stream_info(int i) const;
-  // Fetch information about output streams
+  /// Fetch the number of output streams defined by client code.
+  ///
+  /// @addtogroup stream_reader_query_methods
  int64_t num_out_streams() const;
+  /// Fetch information about the specified output stream.
+  ///
+  /// The valid value range is ``[0, num_out_streams())``.
  OutputStreamInfo get_out_stream_info(int i) const;
-  // Check if all the buffers of the output streams are ready.
+  /// Check if all the buffers of the output streams have enough decoded frames.
  bool is_buffer_ready() const;

+  ///@}
+
  //////////////////////////////////////////////////////////////////////////////
  // Configure methods
  //////////////////////////////////////////////////////////////////////////////
-  void seek(double timestamp_s, int64_t mode);
+  /// @name Configure methods
+  ///@{

+  /// Define an output audio stream.
+  ///
+  /// @param i The index of the source stream.
+  ///
+  /// @param frames_per_chunk Number of frames returned as one chunk.
+  /// @parblock
+  ///   If a source stream is exhausted before ``frames_per_chunk`` frames
+  ///   are buffered, the chunk is returned as-is. Thus the number of frames
+  ///   in the chunk may be smaller than ``frames_per_chunk``.
+  ///
+  ///   Providing ``-1`` disables chunking, in which case, method
+  /// ``pop_chunks()`` returns all the buffered frames as one chunk.
+  /// @endparblock
+  ///
+  /// @param num_chunks Internal buffer size.
+  /// @parblock
+  ///   When the number of buffered chunks exceeds this number, old chunks are
+  ///   dropped. For example, if `frames_per_chunk` is 5 and `num_chunks`
+  ///   is 3, then frames older than 15 are dropped.
+  ///
+  ///   Providing ``-1`` disables this behavior, forcing the retention of all
+  ///   chunks.
+  /// @endparblock
+  ///
+  /// @param filter_desc Description of filter graph applied to the source
+  /// stream.
+  ///
+  /// @param decoder The name of the decoder to be used.
+  ///   When provided, use the specified decoder instead of the default one.
+  ///
+  /// @param decoder_option Options passed to decoder.
+  /// @parblock
+  ///   To list decoder options for a decoder, you can use
+  ///   `ffmpeg -h decoder=<DECODER>` command.
+  ///
+  ///   In addition to decoder-specific options, you can also pass options
+  ///   related to multithreading. They are effective only if the decoder
+  ///   supports them. If neither of them is provided, StreamReader defaults to
+  ///   single thread.
+  ///    - ``"threads"``: The number of threads or the value ``"0"``
+  ///      to let FFmpeg decide based on its heuristics.
+  ///    - ``"thread_type"``: Which multithreading method to use.
+  ///      The valid values are ``"frame"`` or ``"slice"``.
+  ///      Note that each decoder supports a different set of methods.
+  ///      If not provided, a default value is used.
+  ///       - ``"frame"``: Decode more than one frame at once.
+  ///         Each thread handles one frame.
+  ///         This will increase decoding delay by one frame per thread
+  ///       - ``"slice"``: Decode more than one part of a single frame at once.
+  /// @endparblock
  void add_audio_stream(
      int64_t i,
      int64_t frames_per_chunk,
@@ -78,6 +165,20 @@ class StreamReader {
      const c10::optional<std::string>& filter_desc,
      const c10::optional<std::string>& decoder,
      const c10::optional<OptionDict>& decoder_option);
+  /// Define an output video stream.
+  ///
+  /// @param i,frames_per_chunk,num_chunks,filter_desc,decoder,decoder_option
+  /// See `add_audio_stream()`.
+  ///
+  /// @param hw_accel Enable hardware acceleration.
+  /// @parblock
+  /// When video is decoded on CUDA hardware, (for example by specifying
+  /// `"h264_cuvid"` decoder), passing CUDA device indicator to ``hw_accel``
+  /// (i.e. ``hw_accel="cuda:0"``) will make StreamReader place the resulting
+  /// frames directly on the specified CUDA device as a CUDA tensor.
+  ///
+  /// If `None`, the chunk will be moved to CPU memory.
+  /// @endparblock
  void add_video_stream(
      int64_t i,
      int64_t frames_per_chunk,
@@ -86,8 +187,14 @@ class StreamReader {
      const c10::optional<std::string>& decoder,
      const c10::optional<OptionDict>& decoder_option,
      const c10::optional<std::string>& hw_accel);
+  /// Remove an output stream.
+  ///
+  /// @param i The index of the output stream to be removed.
+  /// The valid value range is `[0, num_out_streams())`.
  void remove_stream(int64_t i);

+  ///@}
+
 private:
  void add_stream(
      int i,
@@ -99,13 +206,49 @@ class StreamReader {
      const c10::optional<OptionDict>& decoder_option,
      const torch::Device& device);

- public:
  //////////////////////////////////////////////////////////////////////////////
  // Stream methods
  //////////////////////////////////////////////////////////////////////////////
+ public:
+  /// @name Stream methods
+  ///@{
+
+  /// Seek into the given time stamp.
+  ///
+  /// @param timestamp Target time stamp in seconds.
+  /// @param mode Seek mode.
+  /// - ``0``: Keyframe mode. Seek into nearest key frame before the given
+  /// timestamp.
+  /// - ``1``: Any mode. Seek into any frame (including non-key frames) before
+  ///   the given timestamp.
+  /// - ``2``: Precise mode. First seek into the nearest key frame before the
+  ///   given timestamp, then decode frames until it reaches the frame closest
+  ///   to the given timestamp.
+  void seek(double timestamp, int64_t mode);
+
+  /// Demultiplex and process one packet.
+  ///
+  /// @return
+  /// - ``0``: A packet was processed successfully and there are still
+  ///   packets left in the stream, so client code can call this method again.
+  /// - ``1``: A packet was processed successfully and it reached EOF.
+  ///   Client code should not call this method again.
+  /// - ``<0``: An error has happened.
  int process_packet();
+  /// Similar to `process_packet()`, but in case it fails due to resource
+  /// temporarily being unavailable, it automatically retries.
+  ///
+  /// This behavior is helpful when using device input, such as a microphone,
+  /// during which the buffer may be busy while sample acquisition is happening.
+  ///
+  /// @param timeout Timeout in milliseconds.
+  /// - ``>=0``: Keep retrying until the given time passes.
+  /// - ``<0``: Keep retrying forever.
+  /// @param backoff Time to wait before retrying, in milliseconds.
  int process_packet_block(double timeout, double backoff);

+  ///@}
+
 private:
  int drain();

@@ -113,7 +256,13 @@ class StreamReader {
  // Retrieval
  //////////////////////////////////////////////////////////////////////////////
 public:
+  /// @name Retrieval methods
+  ///@{
+
+  /// Pop one chunk from each output stream if it is available.
  std::vector<c10::optional<torch::Tensor>> pop_chunks();
+
+  ///@}
 };

 } // namespace ffmpeg

--- a/torchaudio/csrc/ffmpeg/stream_reader/typedefs.h
+++ b/torchaudio/csrc/ffmpeg/stream_reader/typedefs.h
@@ -7,25 +7,100 @@ namespace torchaudio {
 namespace ffmpeg {

 struct SrcStreamInfo {
+  /// @name COMMON MEMBERS
+  ///@{
+
+  ///
+  /// The stream media type.
+  ///
+  /// Please refer to
+  /// [the FFmpeg
+  /// documentation](https://ffmpeg.org/doxygen/4.1/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48)
+  /// for the available values.
+  ///
+  /// @todo Introduce own enum and get rid of FFmpeg dependency
+  ///
  AVMediaType media_type;
+  /// The name of codec.
  const char* codec_name = "N/A";
+  /// The name of codec in long, human friendly form.
  const char* codec_long_name = "N/A";
+  /// For audio, it is sample format.
+  ///
+  /// Commonly found values are:
+  /// - ``"u8"``, ``"u8p"``: 8-bit unsigned integer.
+  /// - ``"s16"``, ``"s16p"``: 16-bit signed integer.
+  /// - ``"s32"``, ``"s32p"``: 32-bit signed integer.
+  /// - ``"s64"``, ``"s64p"``: 64-bit signed integer.
+  /// - ``"flt"``, ``"fltp"``: 32-bit floating point.
+  /// - ``"dbl"``, ``"dblp"``: 64-bit floating point.
+  ///
+  /// For video, it is color channel format.
+  ///
+  /// Commonly found values include:
+  /// - ``"gray8"``: grayscale
+  /// - ``"rgb24"``: RGB
+  /// - ``"bgr24"``: BGR
+  /// - ``"yuv420p"``: YUV420p
  const char* fmt_name = "N/A";
+
+  /// Bit rate
  int64_t bit_rate = 0;
+
+  /// Number of frames.
+  /// @note In some formats, the value is not reliable or unavailable.
  int64_t num_frames = 0;
+
+  /// Bits per sample
  int bits_per_sample = 0;
+
+  /// Metadata
+  ///
+  /// For example, this member can hold the ID3 tag fetched from an MP3.
+  ///
+  /// Example:
+  ///
+  /// ```
+  /// {
+  ///   "title": "foo",
+  ///   "artist": "bar",
+  ///   "date": "2017"
+  /// }
+  /// ```
  OptionDict metadata{};
-  // Audio
+
+  ///@}
+
+  /// @name AUDIO-SPECIFIC MEMBERS
+  ///@{
+
+  /// Sample rate
  double sample_rate = 0;
+
+  /// The number of channels
  int num_channels = 0;
-  // Video
+
+  ///@}
+
+  /// @name VIDEO-SPECIFIC MEMBERS
+  ///@{
+
+  /// Width
  int width = 0;
+
+  /// Height
  int height = 0;
+
+  /// Frame rate
  double frame_rate = 0;
+  ///@}
 };

 struct OutputStreamInfo {
+  /// The index of the input source stream
  int source_index;
+  /// Filter graph definition, such as
+  /// ``"aresample=16000,aformat=sample_fmts=fltp"``.
  std::string filter_description;
 };


--- a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h
+++ b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h
@@ -22,6 +22,9 @@ struct OutputStream {
  AVBufferRefPtr hw_frame_ctx;
 };

+///
+/// Encode and write audio/video streams chunk by chunk
+///
 class StreamWriter {
  AVFormatOutputContextPtr pFormatContext;
  AVBufferRefPtr pHWBufferRef;
@@ -38,13 +41,48 @@ class StreamWriter {
  // Query methods
  //////////////////////////////////////////////////////////////////////////////
 public:
-  // Print the configured outputs
+  /// @internal
+
+  /// Print the configured outputs
  void dump_format(int64_t i);

+  /// @endinternal
+
  //////////////////////////////////////////////////////////////////////////////
  // Configure methods
  //////////////////////////////////////////////////////////////////////////////
 public:
+  /// Add an output audio stream.
+  ///
+  /// @param sample_rate The sample rate.
+  /// @param num_channels The number of channels.
+  /// @param format Input sample format, which determines the dtype
+  /// of the input tensor.
+  /// @parblock
+  ///
+  /// - ``"u8"``: The input tensor must be ``torch.uint8`` type.
+  /// - ``"s16"``: The input tensor must be ``torch.int16`` type.
+  /// - ``"s32"``: The input tensor must be ``torch.int32`` type.
+  /// - ``"s64"``: The input tensor must be ``torch.int64`` type.
+  /// - ``"flt"``: The input tensor must be ``torch.float32`` type.
+  /// - ``"dbl"``: The input tensor must be ``torch.float64`` type.
+  ///
+  /// Default: ``"flt"``.
+  /// @endparblock
+  /// @param encoder The name of the encoder to be used.
+  /// @parblock
+  /// When provided, use the specified encoder instead of the default one.
+  ///
+  /// To list the available encoders, you can use ``ffmpeg -encoders`` command.
+  /// @endparblock
+  /// @param encoder_option Options passed to encoder.
+  /// To list encoder options for an encoder, you can use
+  /// ``ffmpeg -h encoder=<ENCODER>`` command.
+  /// @param encoder_format Format used to encode media.
+  /// When encoder supports multiple formats, passing this argument will
+  /// override the format used for encoding.
+  ///  To list supported formats for the encoder, you can use
+  /// ``ffmpeg -h encoder=<ENCODER>`` command.
  void add_audio_stream(
      int64_t sample_rate,
      int64_t num_channels,
@@ -52,6 +90,35 @@ class StreamWriter {
      const c10::optional<std::string>& encoder,
      const c10::optional<OptionDict>& encoder_option,
      const c10::optional<std::string>& encoder_format);
+  /// Add an output video stream.
+  ///
+  /// @param frame_rate Frame rate
+  /// @param width Width
+  /// @param height Height
+  /// @param format Input pixel format, which determines the
+  /// color channel order of the input tensor.
+  /// @parblock
+  ///
+  /// - ``"gray8"``: One channel, grayscale.
+  /// - ``"rgb24"``: Three channels in the order of RGB.
+  /// - ``"bgr24"``: Three channels in the order of BGR.
+  /// - ``"yuv444p"``: Three channels in the order of YUV.
+  ///
+  /// In all cases, the input tensor has to be ``torch.uint8`` type and
+  /// the shape must be (frame, channel, height, width).
+  /// @endparblock
+  /// @param encoder See ``add_audio_stream()``.
+  /// @param encoder_option See ``add_audio_stream()``.
+  /// @param encoder_format See ``add_audio_stream()``.
+  /// @param hw_accel Enable hardware acceleration.
+  /// @parblock
+  /// When video is encoded on CUDA hardware, for example
+  /// `encoder="h264_nvenc"`, passing CUDA device indicator to `hw_accel`
+  /// (i.e. `hw_accel="cuda:0"`) will make StreamWriter expect video
+  /// chunk to be a CUDA Tensor. Passing CPU Tensor will result in an error.
+  ///
+  /// If `None`, the video chunk Tensor has to be a CPU Tensor.
+  /// @endparblock
  void add_video_stream(
      double frame_rate,
      int64_t width,
@@ -61,6 +128,8 @@ class StreamWriter {
      const c10::optional<OptionDict>& encoder_option,
      const c10::optional<std::string>& encoder_format,
      const c10::optional<std::string>& hw_accel);
+  /// Set file-level metadata
+  /// @param metadata metadata.
  void set_metadata(const OptionDict& metadata);

 private:
@@ -70,11 +139,26 @@ class StreamWriter {
  // Write methods
  //////////////////////////////////////////////////////////////////////////////
 public:
+  /// Open the output file / device and write the header.
+  ///
+  /// @param opt Private options for protocol, device and muxer.
  void open(const c10::optional<OptionDict>& opt);
+  /// Close the output file / device and finalize metadata.
  void close();

+  /// Write audio data
+  /// @param i Stream index.
+  /// @param chunk Waveform tensor. Shape: ``(frame, channel)``.
+  /// The ``dtype`` must match what was passed to ``add_audio_stream()`` method.
  void write_audio_chunk(int i, const torch::Tensor& chunk);
+  /// Write video data
+  /// @param i Stream index.
+  /// @param chunk Video/image tensor. Shape: ``(time, channel, height,
+  /// width)``. The ``dtype`` must be ``torch.uint8``. The shape ``(height,
+  /// width and the number of channels)`` must match what was configured when
+  /// calling ``add_video_stream()``.
  void write_video_chunk(int i, const torch::Tensor& chunk);
+  /// Flush the frames from encoders and write the frames to the destination.
  void flush();

 private: