Commit de628226 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Document StreamReader/Writer C++ code (#2997)

Summary:
Extraction from https://github.com/pytorch/audio/issues/2994

Add docstrings to C++ StreamReader/Writer.

Pull Request resolved: https://github.com/pytorch/audio/pull/2997

Reviewed By: nateanl

Differential Revision: D42628016

Pulled By: mthrok

fbshipit-source-id: b22c43b80997af4a9087142340c67bed28e54917
parent bcfa9eed
......@@ -7,6 +7,9 @@
namespace torchaudio {
namespace ffmpeg {
///
/// Fetch and decode audio/video streams chunk by chunk.
///
class StreamReader {
AVFormatInputContextPtr pFormatContext;
AVPacketPtr pPacket;
......@@ -30,7 +33,17 @@ class StreamReader {
int64_t seek_timestamp = 0;
public:
/// @name Constructors
///
///@{
/// @todo Introduce a constructor that takes std::string and abstracts away
/// ffmpeg-native structs
///
explicit StreamReader(AVFormatInputContextPtr&& p);
///@}
~StreamReader() = default;
// Non-copyable
StreamReader(const StreamReader&) = delete;
......@@ -52,25 +65,99 @@ class StreamReader {
// Query methods
//////////////////////////////////////////////////////////////////////////////
public:
// Find a suitable audio/video streams using heuristics from ffmpeg
/// @name Query methods
///@{
/// Find a suitable audio stream using heuristics from ffmpeg.
///
/// If successful, the index of the best stream (>=0) is returned.
/// Otherwise a negative value is returned.
int64_t find_best_audio_stream() const;
/// Find a suitable video stream using heuristics from ffmpeg.
///
/// If successful, the index of the best stream (>=0) is returned.
/// Otherwise a negative value is returned.
int64_t find_best_video_stream() const;
// Fetch metadata of the source
/// Fetch metadata of the source media.
OptionDict get_metadata() const;
// Fetch information about source streams
/// Fetch the number of source streams found in the input media.
///
/// The source streams include not only audio/video streams but also
/// subtitle and others.
int64_t num_src_streams() const;
/// Fetch information about the specified source stream.
///
/// @param i The index of the source stream.
/// The valid value range is ``[0, num_src_streams())``.
SrcStreamInfo get_src_stream_info(int i) const;
// Fetch information about output streams
/// Fetch the number of output streams defined by client code.
///
/// @addtogroup stream_reader_query_methods
int64_t num_out_streams() const;
/// Fetch information about the specified output stream.
///
/// @param i The index of the output stream.
/// The valid value range is ``[0, num_out_streams())``.
OutputStreamInfo get_out_stream_info(int i) const;
// Check if all the buffers of the output streams are ready.
/// Check if all the buffers of the output streams have enough decoded frames.
bool is_buffer_ready() const;
///@}
//////////////////////////////////////////////////////////////////////////////
// Configure methods
//////////////////////////////////////////////////////////////////////////////
void seek(double timestamp_s, int64_t mode);
/// @name Configure methods
///@{
/// Define an output audio stream.
///
/// @param i The index of the source stream.
///
/// @param frames_per_chunk Number of frames returned as one chunk.
/// @parblock
/// If a source stream is exhausted before ``frames_per_chunk`` frames
/// are buffered, the chunk is returned as-is. Thus the number of frames
/// in the chunk may be smaller than ``frames_per_chunk``.
///
/// Providing ``-1`` disables chunking, in which case, method
/// ``pop_chunks()`` returns all the buffered frames as one chunk.
/// @endparblock
///
/// @param num_chunks Internal buffer size.
/// @parblock
/// When the number of buffered chunks exceeds this number, old chunks are
/// dropped. For example, if `frames_per_chunk` is 5 and `num_chunks`
/// is 3, then frames older than 15 are dropped.
///
/// Providing ``-1`` disables this behavior, forcing the retention of all
/// chunks.
/// @endparblock
///
/// @param filter_desc Description of filter graph applied to the source
/// stream.
///
/// @param decoder The name of the decoder to be used.
/// When provided, use the specified decoder instead of the default one.
///
/// @param decoder_option Options passed to decoder.
/// @parblock
/// To list decoder options for a decoder, you can use
/// `ffmpeg -h decoder=<DECODER>` command.
///
/// In addition to decoder-specific options, you can also pass options
/// related to multithreading. They are effective only if the decoder
/// supports them. If neither of them is provided, StreamReader defaults to
/// a single thread.
/// - ``"threads"``: The number of threads or the value ``"0"``
/// to let FFmpeg decide based on its heuristics.
/// - ``"thread_type"``: Which multithreading method to use.
/// The valid values are ``"frame"`` or ``"slice"``.
/// Note that each decoder supports a different set of methods.
/// If not provided, a default value is used.
/// - ``"frame"``: Decode more than one frame at once.
/// Each thread handles one frame.
/// This will increase decoding delay by one frame per thread
/// - ``"slice"``: Decode more than one part of a single frame at once.
/// @endparblock
void add_audio_stream(
int64_t i,
int64_t frames_per_chunk,
......@@ -78,6 +165,20 @@ class StreamReader {
const c10::optional<std::string>& filter_desc,
const c10::optional<std::string>& decoder,
const c10::optional<OptionDict>& decoder_option);
/// Define an output video stream.
///
/// @param i,frames_per_chunk,num_chunks,filter_desc,decoder,decoder_option
/// See `add_audio_stream()`.
///
/// @param hw_accel Enable hardware acceleration.
/// @parblock
/// When video is decoded on CUDA hardware, (for example by specifying
/// `"h264_cuvid"` decoder), passing CUDA device indicator to ``hw_accel``
/// (i.e. ``hw_accel="cuda:0"``) will make StreamReader place the resulting
/// frames directly on the specified CUDA device as a CUDA tensor.
///
/// If not provided (``c10::nullopt``), the chunk will be moved to CPU memory.
/// @endparblock
void add_video_stream(
int64_t i,
int64_t frames_per_chunk,
......@@ -86,8 +187,14 @@ class StreamReader {
const c10::optional<std::string>& decoder,
const c10::optional<OptionDict>& decoder_option,
const c10::optional<std::string>& hw_accel);
/// Remove an output stream.
///
/// @param i The index of the output stream to be removed.
/// The valid value range is `[0, num_out_streams())`.
void remove_stream(int64_t i);
///@}
private:
void add_stream(
int i,
......@@ -99,13 +206,49 @@ class StreamReader {
const c10::optional<OptionDict>& decoder_option,
const torch::Device& device);
public:
//////////////////////////////////////////////////////////////////////////////
// Stream methods
//////////////////////////////////////////////////////////////////////////////
public:
/// @name Stream methods
///@{
/// Seek into the given time stamp.
///
/// @param timestamp Target time stamp in seconds.
/// @param mode Seek mode.
/// - ``0``: Keyframe mode. Seek into nearest key frame before the given
/// timestamp.
/// - ``1``: Any mode. Seek into any frame (including non-key frames) before
/// the given timestamp.
/// - ``2``: Precise mode. First seek into the nearest key frame before the
/// given timestamp, then decode frames until it reaches the frame closest
/// to the given timestamp.
void seek(double timestamp, int64_t mode);
/// Demultiplex and process one packet.
///
/// @return
/// - ``0``: A packet was processed successfully and there are still
/// packets left in the stream, so client code can call this method again.
/// - ``1``: A packet was processed successfully and it reached EOF.
/// Client code should not call this method again.
/// - ``<0``: An error has happened.
int process_packet();
/// Similar to `process_packet()`, but in case it fails due to resource
/// temporarily being unavailable, it automatically retries.
///
/// This behavior is helpful when using device input, such as a microphone,
/// during which the buffer may be busy while sample acquisition is happening.
///
/// @param timeout Timeout in milliseconds.
/// - ``>=0``: Keep retrying until the given time passes.
/// - ``<0``: Keep retrying forever.
/// @param backoff Time to wait before retrying, in milliseconds.
int process_packet_block(double timeout, double backoff);
///@}
private:
int drain();
......@@ -113,7 +256,13 @@ class StreamReader {
// Retrieval
//////////////////////////////////////////////////////////////////////////////
public:
/// @name Retrieval methods
///@{
/// Pop one chunk from each output stream if it is available.
std::vector<c10::optional<torch::Tensor>> pop_chunks();
///@}
};
} // namespace ffmpeg
......
......@@ -7,25 +7,100 @@ namespace torchaudio {
namespace ffmpeg {
/// Information about a source stream found in the input media.
///
/// Every member except ``media_type`` has a default member initializer
/// (``"N/A"``, ``0``, or a value-initialized ``OptionDict``).
/// ``media_type`` has no default initializer.
struct SrcStreamInfo {
/// @name COMMON MEMBERS
///@{
///
/// The stream media type.
///
/// Please refer to
/// [the FFmpeg
/// documentation](https://ffmpeg.org/doxygen/4.1/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48)
/// for the available values.
///
/// @todo Introduce own enum and get rid of FFmpeg dependency
///
AVMediaType media_type;
/// The name of the codec.
const char* codec_name = "N/A";
/// The name of the codec in long, human-friendly form.
const char* codec_long_name = "N/A";
/// The name of the format.
///
/// For audio, it is the sample format.
///
/// Commonly found values include:
/// - ``"u8"``, ``"u8p"``: 8-bit unsigned integer.
/// - ``"s16"``, ``"s16p"``: 16-bit signed integer.
/// - ``"s32"``, ``"s32p"``: 32-bit signed integer.
/// - ``"s64"``, ``"s64p"``: 64-bit signed integer.
/// - ``"flt"``, ``"fltp"``: 32-bit floating point.
/// - ``"dbl"``, ``"dblp"``: 64-bit floating point.
///
/// For video, it is the color channel format.
///
/// Commonly found values include:
/// - ``"gray8"``: grayscale
/// - ``"rgb24"``: RGB
/// - ``"bgr24"``: BGR
/// - ``"yuv420p"``: YUV420p
const char* fmt_name = "N/A";
/// Bit rate
int64_t bit_rate = 0;
/// Number of frames.
/// @note In some formats, the value is not reliable or unavailable.
int64_t num_frames = 0;
/// Bits per sample
int bits_per_sample = 0;
/// Metadata of the stream.
///
/// For example, the ID3 tags of an MP3 source are stored here.
///
/// Example:
///
/// ```
/// {
/// "title": "foo",
/// "artist": "bar",
/// "date": "2017"
/// }
/// ```
OptionDict metadata{};
// Audio
///@}
/// @name AUDIO-SPECIFIC MEMBERS
///@{
/// Sample rate
double sample_rate = 0;
/// The number of channels
int num_channels = 0;
// Video
///@}
/// @name VIDEO-SPECIFIC MEMBERS
///@{
/// Width
int width = 0;
/// Height
int height = 0;
/// Frame rate
double frame_rate = 0;
///@}
};
/// Information about an output stream.
///
/// Neither member has a default member initializer.
struct OutputStreamInfo {
/// The index of the input source stream
int source_index;
/// Filter graph definition, such as
/// ``"aresample=16000,aformat=sample_fmts=fltp"``.
std::string filter_description;
};
......
......@@ -22,6 +22,9 @@ struct OutputStream {
AVBufferRefPtr hw_frame_ctx;
};
///
/// Encode and write audio/video streams chunk by chunk
///
class StreamWriter {
AVFormatOutputContextPtr pFormatContext;
AVBufferRefPtr pHWBufferRef;
......@@ -38,13 +41,48 @@ class StreamWriter {
// Query methods
//////////////////////////////////////////////////////////////////////////////
public:
// Print the configured outputs
/// @internal
/// Print the configured outputs
void dump_format(int64_t i);
/// @endinternal
//////////////////////////////////////////////////////////////////////////////
// Configure methods
//////////////////////////////////////////////////////////////////////////////
public:
/// Add an output audio stream.
///
/// @param sample_rate The sample rate.
/// @param num_channels The number of channels.
/// @param format Input sample format, which determines the dtype
/// of the input tensor.
/// @parblock
///
/// - ``"u8"``: The input tensor must be ``torch.uint8`` type.
/// - ``"s16"``: The input tensor must be ``torch.int16`` type.
/// - ``"s32"``: The input tensor must be ``torch.int32`` type.
/// - ``"s64"``: The input tensor must be ``torch.int64`` type.
/// - ``"flt"``: The input tensor must be ``torch.float32`` type.
/// - ``"dbl"``: The input tensor must be ``torch.float64`` type.
///
/// Default: ``"flt"``.
/// @endparblock
/// @param encoder The name of the encoder to be used.
/// @parblock
/// When provided, use the specified encoder instead of the default one.
///
/// To list the available encoders, you can use ``ffmpeg -encoders`` command.
/// @endparblock
/// @param encoder_option Options passed to encoder.
/// To list encoder options for an encoder, you can use
/// ``ffmpeg -h encoder=<ENCODER>``.
/// @param encoder_format Format used to encode media.
/// When encoder supports multiple formats, passing this argument will
/// override the format used for encoding.
/// To list supported formats for the encoder, you can use
/// ``ffmpeg -h encoder=<ENCODER>`` command.
void add_audio_stream(
int64_t sample_rate,
int64_t num_channels,
......@@ -52,6 +90,35 @@ class StreamWriter {
const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format);
/// Add an output video stream.
///
/// @param frame_rate Frame rate
/// @param width Width
/// @param height Height
/// @param format Input pixel format, which determines the
/// color channel order of the input tensor.
/// @parblock
///
/// - ``"gray8"``: One channel, grayscale.
/// - ``"rgb24"``: Three channels in the order of RGB.
/// - ``"bgr24"``: Three channels in the order of BGR.
/// - ``"yuv444p"``: Three channels in the order of YUV.
///
/// In all cases, the input tensor has to be ``torch.uint8`` type and
/// the shape must be (frame, channel, height, width).
/// @endparblock
/// @param encoder See ``add_audio_stream()``.
/// @param encoder_option See ``add_audio_stream()``.
/// @param encoder_format See ``add_audio_stream()``.
/// @param hw_accel Enable hardware acceleration.
/// @parblock
/// When video is encoded on CUDA hardware, for example
/// `encoder="h264_nvenc"`, passing CUDA device indicator to `hw_accel`
/// (i.e. `hw_accel="cuda:0"`) will make StreamWriter expect video
/// chunk to be a CUDA Tensor. Passing CPU Tensor will result in an error.
///
/// If not provided (``c10::nullopt``), the video chunk Tensor has to be a CPU Tensor.
/// @endparblock
void add_video_stream(
double frame_rate,
int64_t width,
......@@ -61,6 +128,8 @@ class StreamWriter {
const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel);
/// Set file-level metadata
/// @param metadata The file-level metadata to set.
void set_metadata(const OptionDict& metadata);
private:
......@@ -70,11 +139,26 @@ class StreamWriter {
// Write methods
//////////////////////////////////////////////////////////////////////////////
public:
/// Open the output file / device and write the header.
///
/// @param opt Private options for protocol, device and muxer.
void open(const c10::optional<OptionDict>& opt);
/// Close the output file / device and finalize metadata.
void close();
/// Write audio data
/// @param i Stream index.
/// @param chunk Waveform tensor. Shape: ``(frame, channel)``.
/// The ``dtype`` must match what was passed to ``add_audio_stream()`` method.
void write_audio_chunk(int i, const torch::Tensor& chunk);
/// Write video data
/// @param i Stream index.
/// @param chunk Video/image tensor. Shape: ``(time, channel, height,
/// width)``. The ``dtype`` must be ``torch.uint8``. The shape ``(height,
/// width and the number of channels)`` must match what was configured when
/// calling ``add_video_stream()``.
void write_video_chunk(int i, const torch::Tensor& chunk);
/// Flush the frames from encoders and write the frames to the destination.
void flush();
private:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment