"...text-generation-inference.git" did not exist on "1f69fb9ed4fb91fe0bb9b94edda5729c67e6f02a"
Commit 1b648626 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Support encode spec change in StreamWriter (#3207)

Summary:
This commit adds support for changing the spec of media
(such as sample rate, #channels, image size and frame rate)
on-the-fly at encoding time.

The motivation behind this addition is that certain media
formats support only a limited number of specs, and it is
cumbersome to require client code to change the spec
every time.

For example, OPUS supports only 48kHz sampling rate, and
vorbis only supports stereo.

To make it easy to work with media of different formats,
this commit makes it so that anything that's not compatible
with the format is automatically converted, and allows
users to specify the override.

A notable implementation detail is that, for sample format and
pixel format, the encoder's default value takes precedence over
the source value, while for other attributes like sample rate and
#channels, the source value takes precedence as long as
it is supported.

Pull Request resolved: https://github.com/pytorch/audio/pull/3207

Reviewed By: nateanl

Differential Revision: D44439622

Pulled By: mthrok

fbshipit-source-id: 09524f201d485d201150481884a3e9e4d2aab081
parent 4bc4ca75
...@@ -594,8 +594,10 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase): ...@@ -594,8 +594,10 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase):
def test_filter_graph_video(self): def test_filter_graph_video(self):
"""Can apply additional effect with filter graph""" """Can apply additional effect with filter graph"""
rate = 30 src_rate = 30
num_frames, width, height = 400, 160, 90 num_frames, width, height = 400, 160, 90
filter_desc = "framestep=2"
enc_rate = 15
ext = "mp4" ext = "mp4"
filename = f"test.{ext}" filename = f"test.{ext}"
...@@ -603,7 +605,15 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase): ...@@ -603,7 +605,15 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase):
dst = self.get_temp_path(filename) dst = self.get_temp_path(filename)
w = StreamWriter(dst, format=ext) w = StreamWriter(dst, format=ext)
w.add_video_stream(frame_rate=rate, format="rgb24", height=height, width=width, filter_desc="framestep=2") w.add_video_stream(
frame_rate=src_rate,
format="rgb24",
height=height,
width=width,
filter_desc=filter_desc,
encoder_format="yuv420p",
encoder_frame_rate=enc_rate,
)
with w.open(): with w.open():
w.write_video_chunk(0, original) w.write_video_chunk(0, original)
...@@ -614,3 +624,129 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase): ...@@ -614,3 +624,129 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase):
(output,) = reader.pop_chunks() (output,) = reader.pop_chunks()
self.assertEqual(output.shape, [num_frames // 2, 3, height, width]) self.assertEqual(output.shape, [num_frames // 2, 3, height, width])
@parameterized.expand(
    [
        ("wav", "pcm_s16le", 8000, 16000, 1, 2),
        ("wav", "pcm_s16le", 8000, 16000, 2, 1),
        ("wav", "pcm_s16le", 8000, 16000, 2, 4),
        ("wav", "pcm_s16le", 16000, 8000, 1, 2),
        ("wav", "pcm_s16le", 16000, 8000, 2, 1),
        ("wav", "pcm_s16le", 16000, 8000, 2, 4),
        ("wav", "pcm_f32le", 8000, 16000, 1, 2),
        ("wav", "pcm_f32le", 8000, 16000, 2, 1),
        ("wav", "pcm_f32le", 8000, 16000, 2, 4),
        ("wav", "pcm_f32le", 16000, 8000, 1, 2),
        ("wav", "pcm_f32le", 16000, 8000, 2, 1),
        ("wav", "pcm_f32le", 16000, 8000, 2, 4),
        ("ogg", "opus", 8000, 48000, 1, 2),
        ("ogg", "opus", 8000, 48000, 2, 1),
        ("ogg", "flac", 8000, 41000, 1, 2),
        ("ogg", "flac", 8000, 41000, 2, 1),
        ("ogg", "vorbis", 16000, 8000, 1, 2),
        ("ogg", "vorbis", 16000, 8000, 4, 2),
    ]
)
def test_change_audio_encoder_spec(self, ext, encoder, src_sr, enc_sr, src_num_channels, enc_num_channels):
    """Explicit ``encoder_sample_rate`` / ``encoder_num_channels`` are reflected in the output.

    Each case writes a short sinusoid with the source spec
    (``src_sr``, ``src_num_channels``) while requesting a different
    encoder spec, then reads the file back and checks the stream info
    reports the requested encoder spec.
    """
    fname = f"test.{ext}"
    waveform = get_sinusoid(
        sample_rate=src_sr,
        n_channels=src_num_channels,
        channels_first=False,
        duration=0.1,
    )
    out_path = self.get_temp_path(fname)

    writer = StreamWriter(out_path, format=ext)
    # The source spec describes the tensor being written; the ``encoder_*``
    # arguments describe what should end up in the file.
    stream_config = {
        "sample_rate": src_sr,
        "format": "flt",
        "num_channels": src_num_channels,
        "encoder": encoder,
        "encoder_sample_rate": enc_sr,
        "encoder_num_channels": enc_num_channels,
    }
    writer.add_audio_stream(**stream_config)
    with writer.open():
        writer.write_audio_chunk(0, waveform)

    # Read the written file back and inspect its first stream.
    reader = torchaudio.io.StreamReader(src=self.get_temp_path(fname))
    info = reader.get_src_stream_info(0)
    self.assertEqual(info.sample_rate, enc_sr)
    self.assertEqual(info.num_channels, enc_num_channels)
@parameterized.expand(
    [
        # opus only supports 48kHz
        ("ogg", "opus", 8000, 48000, 1, 1),
        ("ogg", "opus", 16000, 48000, 2, 2),
        # vorbis only supports 2 channels
        ("ogg", "vorbis", 16000, 16000, 1, 2),
        ("ogg", "vorbis", 16000, 16000, 2, 2),
        ("ogg", "vorbis", 16000, 16000, 4, 2),
    ]
)
def test_change_encoder_spec_default(
    self, ext, encoder, src_sr, expected_sr, src_num_channels, expected_num_channels
):
    """Without explicit overrides, the written stream has the expected rate/channels.

    No ``encoder_sample_rate`` / ``encoder_num_channels`` is passed here;
    the test only supplies the source spec and the encoder name, then
    checks the resulting file against ``expected_sr`` and
    ``expected_num_channels``.
    """
    fname = f"test.{ext}"
    waveform = get_sinusoid(
        sample_rate=src_sr,
        n_channels=src_num_channels,
        channels_first=False,
        duration=0.1,
    )
    out_path = self.get_temp_path(fname)

    writer = StreamWriter(out_path, format=ext)
    writer.add_audio_stream(
        sample_rate=src_sr,
        format="flt",
        num_channels=src_num_channels,
        encoder=encoder,
    )
    with writer.open():
        writer.write_audio_chunk(0, waveform)

    # Read the written file back and inspect its first stream.
    reader = torchaudio.io.StreamReader(src=self.get_temp_path(fname))
    info = reader.get_src_stream_info(0)
    self.assertEqual(info.sample_rate, expected_sr)
    self.assertEqual(info.num_channels, expected_num_channels)
@parameterized.expand(
    [
        ("mp4", None, 10, 30, (100, 160), (200, 320)),
        ("mp4", None, 10, 30, (100, 160), (50, 80)),
        ("mp4", None, 30, 10, (100, 160), (200, 320)),
        ("mp4", None, 30, 10, (100, 160), (50, 80)),
    ]
)
def test_change_video_encoder_spec(self, ext, encoder, src_rate, enc_rate, src_size, enc_size):
    """Can change the frame rate and image size on-the-fly

    Args:
        ext: Container format / file extension of the output.
        encoder: Encoder name forwarded to ``add_video_stream``
            (``None`` leaves the choice to the writer).
        src_rate: Frame rate of the source frames.
        enc_rate: Frame rate requested for encoding.
        src_size: ``(width, height)`` of the source frames.
        enc_size: ``(width, height)`` requested for encoding.
    """
    width, height = src_size
    enc_width, enc_height = enc_size
    # ``ext`` and ``encoder`` come from the parameterization; they are used
    # as-is rather than being overwritten/ignored so new cases take effect.
    filename = f"test.{ext}"
    num_frames = 256
    original = torch.zeros((num_frames, 3, height, width), dtype=torch.uint8)

    dst = self.get_temp_path(filename)
    w = StreamWriter(dst, format=ext)
    w.add_video_stream(
        frame_rate=src_rate,
        format="rgb24",
        height=height,
        width=width,
        encoder=encoder,
        encoder_format="yuv420p",
        encoder_frame_rate=enc_rate,
        encoder_width=enc_width,
        encoder_height=enc_height,
    )
    with w.open():
        w.write_video_chunk(0, original)

    # check
    reader = torchaudio.io.StreamReader(src=self.get_temp_path(filename))
    i = reader.get_src_stream_info(0)
    self.assertEqual(i.frame_rate, enc_rate)
    self.assertEqual(i.width, enc_width)
    self.assertEqual(i.height, enc_height)
...@@ -28,9 +28,10 @@ class EncodeProcess { ...@@ -28,9 +28,10 @@ class EncodeProcess {
void process(const torch::Tensor& tensor, const c10::optional<double>& pts); void process(const torch::Tensor& tensor, const c10::optional<double>& pts);
void process_frame(AVFrame* src);
void flush(); void flush();
private:
void process_frame(AVFrame* src);
}; };
EncodeProcess get_audio_encode_process( EncodeProcess get_audio_encode_process(
...@@ -41,6 +42,8 @@ EncodeProcess get_audio_encode_process( ...@@ -41,6 +42,8 @@ EncodeProcess get_audio_encode_process(
const c10::optional<std::string>& encoder, const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option, const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format, const c10::optional<std::string>& encoder_format,
const c10::optional<int>& encoder_sample_rate,
const c10::optional<int>& encoder_num_channels,
const c10::optional<CodecConfig>& codec_config, const c10::optional<CodecConfig>& codec_config,
const c10::optional<std::string>& filter_desc); const c10::optional<std::string>& filter_desc);
...@@ -53,6 +56,9 @@ EncodeProcess get_video_encode_process( ...@@ -53,6 +56,9 @@ EncodeProcess get_video_encode_process(
const c10::optional<std::string>& encoder, const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option, const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format, const c10::optional<std::string>& encoder_format,
const c10::optional<double>& encoder_frame_rate,
const c10::optional<int>& encoder_width,
const c10::optional<int>& encoder_height,
const c10::optional<std::string>& hw_accel, const c10::optional<std::string>& hw_accel,
const c10::optional<CodecConfig>& codec_config, const c10::optional<CodecConfig>& codec_config,
const c10::optional<std::string>& filter_desc); const c10::optional<std::string>& filter_desc);
......
...@@ -60,6 +60,8 @@ void StreamWriter::add_audio_stream( ...@@ -60,6 +60,8 @@ void StreamWriter::add_audio_stream(
const c10::optional<std::string>& encoder, const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option, const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format, const c10::optional<std::string>& encoder_format,
const c10::optional<int>& encoder_sample_rate,
const c10::optional<int>& encoder_num_channels,
const c10::optional<CodecConfig>& codec_config, const c10::optional<CodecConfig>& codec_config,
const c10::optional<std::string>& filter_desc) { const c10::optional<std::string>& filter_desc) {
TORCH_CHECK(!is_open, "Output is already opened. Cannot add a new stream."); TORCH_CHECK(!is_open, "Output is already opened. Cannot add a new stream.");
...@@ -74,6 +76,8 @@ void StreamWriter::add_audio_stream( ...@@ -74,6 +76,8 @@ void StreamWriter::add_audio_stream(
encoder, encoder,
encoder_option, encoder_option,
encoder_format, encoder_format,
encoder_sample_rate,
encoder_num_channels,
codec_config, codec_config,
filter_desc)); filter_desc));
} }
...@@ -86,6 +90,9 @@ void StreamWriter::add_video_stream( ...@@ -86,6 +90,9 @@ void StreamWriter::add_video_stream(
const c10::optional<std::string>& encoder, const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option, const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format, const c10::optional<std::string>& encoder_format,
const c10::optional<double>& encoder_frame_rate,
const c10::optional<int>& encoder_width,
const c10::optional<int>& encoder_height,
const c10::optional<std::string>& hw_accel, const c10::optional<std::string>& hw_accel,
const c10::optional<CodecConfig>& codec_config, const c10::optional<CodecConfig>& codec_config,
const c10::optional<std::string>& filter_desc) { const c10::optional<std::string>& filter_desc) {
...@@ -102,6 +109,9 @@ void StreamWriter::add_video_stream( ...@@ -102,6 +109,9 @@ void StreamWriter::add_video_stream(
encoder, encoder,
encoder_option, encoder_option,
encoder_format, encoder_format,
encoder_frame_rate,
encoder_width,
encoder_height,
hw_accel, hw_accel,
codec_config, codec_config,
filter_desc)); filter_desc));
......
...@@ -109,6 +109,8 @@ class StreamWriter { ...@@ -109,6 +109,8 @@ class StreamWriter {
const c10::optional<std::string>& encoder = c10::nullopt, const c10::optional<std::string>& encoder = c10::nullopt,
const c10::optional<OptionDict>& encoder_option = c10::nullopt, const c10::optional<OptionDict>& encoder_option = c10::nullopt,
const c10::optional<std::string>& encoder_format = c10::nullopt, const c10::optional<std::string>& encoder_format = c10::nullopt,
const c10::optional<int>& encoder_sample_rate = c10::nullopt,
const c10::optional<int>& encoder_num_channels = c10::nullopt,
const c10::optional<CodecConfig>& codec_config = c10::nullopt, const c10::optional<CodecConfig>& codec_config = c10::nullopt,
const c10::optional<std::string>& filter_desc = c10::nullopt); const c10::optional<std::string>& filter_desc = c10::nullopt);
...@@ -152,6 +154,9 @@ class StreamWriter { ...@@ -152,6 +154,9 @@ class StreamWriter {
const c10::optional<std::string>& encoder = c10::nullopt, const c10::optional<std::string>& encoder = c10::nullopt,
const c10::optional<OptionDict>& encoder_option = c10::nullopt, const c10::optional<OptionDict>& encoder_option = c10::nullopt,
const c10::optional<std::string>& encoder_format = c10::nullopt, const c10::optional<std::string>& encoder_format = c10::nullopt,
const c10::optional<double>& encoder_frame_rate = c10::nullopt,
const c10::optional<int>& encoder_width = c10::nullopt,
const c10::optional<int>& encoder_height = c10::nullopt,
const c10::optional<std::string>& hw_accel = c10::nullopt, const c10::optional<std::string>& hw_accel = c10::nullopt,
const c10::optional<CodecConfig>& codec_config = c10::nullopt, const c10::optional<CodecConfig>& codec_config = c10::nullopt,
const c10::optional<std::string>& filter_desc = c10::nullopt); const c10::optional<std::string>& filter_desc = c10::nullopt);
......
...@@ -45,7 +45,18 @@ _encoder_format = """Format used to encode media. ...@@ -45,7 +45,18 @@ _encoder_format = """Format used to encode media.
To list supported formats for the encoder, you can use To list supported formats for the encoder, you can use
``ffmpeg -h encoder=<ENCODER>`` command. ``ffmpeg -h encoder=<ENCODER>`` command.
Default: ``None``.""" Default: ``None``.
Note:
When ``encoder_format`` option is not provided, encoder uses its default format.
For example, when encoding audio into wav format, 16-bit signed integer is used,
and when encoding video into mp4 format (h264 encoder), one of the YUV formats is used.
This is because typically, 32-bit or 16-bit floating point is used in audio models but
they are not commonly used in audio formats. Similarly, RGB24 is commonly used in vision
models, but video formats usually (and better) support YUV formats.
"""
_codec_config = """Codec configuration. Please refer to :py:class:`CodecConfig` for _codec_config = """Codec configuration. Please refer to :py:class:`CodecConfig` for
configuration options. configuration options.
...@@ -162,6 +173,8 @@ class StreamWriter: ...@@ -162,6 +173,8 @@ class StreamWriter:
encoder: Optional[str] = None, encoder: Optional[str] = None,
encoder_option: Optional[Dict[str, str]] = None, encoder_option: Optional[Dict[str, str]] = None,
encoder_format: Optional[str] = None, encoder_format: Optional[str] = None,
encoder_sample_rate: Optional[int] = None,
encoder_num_channels: Optional[int] = None,
codec_config: Optional[CodecConfig] = None, codec_config: Optional[CodecConfig] = None,
filter_desc: Optional[str] = None, filter_desc: Optional[str] = None,
): ):
...@@ -190,12 +203,53 @@ class StreamWriter: ...@@ -190,12 +203,53 @@ class StreamWriter:
encoder_format (str or None, optional): {encoder_format} encoder_format (str or None, optional): {encoder_format}
encoder_sample_rate (int or None, optional): Override the sample rate used at encoding time.
Some encoders pose restriction on the sample rate used for encoding.
If the source sample rate is supported by the encoder, the source sample rate is used,
otherwise a default one is picked.
For example, ``"opus"`` encoder only supports 48k Hz, so, when encoding a
waveform with ``"opus"`` encoder, it is always encoded as 48k Hz.
Meanwhile ``"mp3"`` (``"libmp3lame"``) supports 44.1k, 48k, 32k, 22.05k,
24k, 16k, 11.025k, 12k and 8k Hz.
If the original sample rate is one of these, then the original sample rate
is used, otherwise it will be resampled to a default one (44.1k).
When encoding into WAV format, there is no restriction on sample rate,
so the original sample rate will be used.
Providing ``encoder_sample_rate`` will override this behavior and
make encoder attempt to use the provided sample rate.
The provided value must be one supported by the encoder.
encoder_num_channels (int or None, optional): Override the number of channels used for encoding.
Similar to sample rate, some encoders (such as ``"opus"``,
``"vorbis"`` and ``"g722"``) pose restriction on
the number of channels that can be used for encoding.
If the original number of channels is supported by encoder,
then it will be used, otherwise, the encoder attempts to
remix the channel to one of the supported ones.
Providing ``encoder_num_channels`` will override this behavior and
make encoder attempt to use the provided number of channels.
The provided value must be one supported by the encoder.
codec_config (CodecConfig or None, optional): {codec_config} codec_config (CodecConfig or None, optional): {codec_config}
filter_desc (str or None, optional): {filter_desc} filter_desc (str or None, optional): {filter_desc}
""" """
self._s.add_audio_stream( self._s.add_audio_stream(
sample_rate, num_channels, format, encoder, encoder_option, encoder_format, codec_config, filter_desc sample_rate,
num_channels,
format,
encoder,
encoder_option,
encoder_format,
encoder_sample_rate,
encoder_num_channels,
codec_config,
filter_desc,
) )
@_format_common_args @_format_common_args
...@@ -208,6 +262,9 @@ class StreamWriter: ...@@ -208,6 +262,9 @@ class StreamWriter:
encoder: Optional[str] = None, encoder: Optional[str] = None,
encoder_option: Optional[Dict[str, str]] = None, encoder_option: Optional[Dict[str, str]] = None,
encoder_format: Optional[str] = None, encoder_format: Optional[str] = None,
encoder_frame_rate: Optional[float] = None,
encoder_width: Optional[int] = None,
encoder_height: Optional[int] = None,
hw_accel: Optional[str] = None, hw_accel: Optional[str] = None,
codec_config: Optional[CodecConfig] = None, codec_config: Optional[CodecConfig] = None,
filter_desc: Optional[str] = None, filter_desc: Optional[str] = None,
...@@ -242,6 +299,24 @@ class StreamWriter: ...@@ -242,6 +299,24 @@ class StreamWriter:
encoder_format (str or None, optional): {encoder_format} encoder_format (str or None, optional): {encoder_format}
encoder_frame_rate (float or None, optional): Override the frame rate used for encoding.
Some encoders (such as ``"mpeg1"`` and ``"mpeg2"``) pose restriction on the
frame rate that can be used for encoding.
In such cases, if the source frame rate (provided as ``frame_rate``) is not
one of the supported frame rates, then a default one is picked, and the frame rate
is changed on-the-fly. Otherwise the source frame rate is used.
Providing ``encoder_frame_rate`` will override this behavior and
make the encoder attempt to use the provided frame rate.
The provided value must be one supported by the encoder.
encoder_width (int or None, optional): Width of the image used for encoding.
This allows to change the image size during encoding.
encoder_height (int or None, optional): Height of the image used for encoding.
This allows to change the image size during encoding.
hw_accel (str or None, optional): Enable hardware acceleration. hw_accel (str or None, optional): Enable hardware acceleration.
When video is encoded on CUDA hardware, for example When video is encoded on CUDA hardware, for example
...@@ -264,6 +339,9 @@ class StreamWriter: ...@@ -264,6 +339,9 @@ class StreamWriter:
encoder, encoder,
encoder_option, encoder_option,
encoder_format, encoder_format,
encoder_frame_rate,
encoder_width,
encoder_height,
hw_accel, hw_accel,
codec_config, codec_config,
filter_desc, filter_desc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment