Commit 1717edaa authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Update the handling of videos without PTS values (#2970)

Summary:
The filter graph does not fall back to `best_effort_timestamp`, so applying filters (such as changing the FPS) to videos without PTS values failed.

This commit changes the behavior by overwriting missing PTS values with `best_effort_timestamp`.

Pull Request resolved: https://github.com/pytorch/audio/pull/2970

Reviewed By: YosuaMichael

Differential Revision: D42425771

Pulled By: mthrok

fbshipit-source-id: 7b7a033ea2ad89bb49d6e1663d35d377dab2aae9
parent e1cddb46
...@@ -51,9 +51,6 @@ class _MediaSourceMixin: ...@@ -51,9 +51,6 @@ class _MediaSourceMixin:
with open(path, "rb") as fileobj: with open(path, "rb") as fileobj:
data = fileobj.read() data = fileobj.read()
self.src = torch.frombuffer(data, dtype=torch.uint8) self.src = torch.frombuffer(data, dtype=torch.uint8)
print(self.src.data_ptr())
print(len(data))
print(self.src.shape)
return self.src return self.src
def tearDown(self): def tearDown(self):
...@@ -467,10 +464,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC ...@@ -467,10 +464,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
("nasa_13013.avi", "precise", 8.1, (0, slice(238, None))), ("nasa_13013.avi", "precise", 8.1, (0, slice(238, None))),
("nasa_13013.avi", "precise", 8.14, (0, slice(239, None))), ("nasa_13013.avi", "precise", 8.14, (0, slice(239, None))),
("nasa_13013.avi", "precise", 8.17, (0, slice(240, None))), ("nasa_13013.avi", "precise", 8.17, (0, slice(240, None))),
# Test precise seek on video with invalid PTS # Test precise seek on video with missing PTS
("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.0, (0, slice(None))), ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.0, (0, slice(None))),
("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.2, (0, slice(4, -1))), ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.2, (0, slice(4, None))),
("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.3, (0, slice(7, -1))), ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.3, (0, slice(7, None))),
# Test any seek # Test any seek
# The source avi video has one keyframe every twelve frames 0, 12, 24,.. or every 0.4004 seconds. # The source avi video has one keyframe every twelve frames 0, 12, 24,.. or every 0.4004 seconds.
("nasa_13013.avi", "any", 0.0, (0, slice(None))), ("nasa_13013.avi", "any", 0.0, (0, slice(None))),
...@@ -514,6 +511,25 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC ...@@ -514,6 +511,25 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
print(hyp.shape, ref.shape) print(hyp.shape, ref.shape)
self.assertEqual(hyp, ref) self.assertEqual(hyp, ref)
@parameterized.expand(
    [
        ("nasa_13013.mp4", [195, 3, 270, 480]),
        # This AVI file carries no valid PTS metadata.
        ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", [36, 3, 240, 560]),
    ]
)
def test_change_fps(self, src, shape):
    """Can change the FPS of videos"""
    target_fps = 15
    reader = StreamReader(self.get_src(src))

    # Precondition: the source must not already run at the target rate,
    # otherwise the shape check below would not exercise a rate change.
    source_info = reader.get_src_stream_info(reader.default_video_stream)
    assert source_info.frame_rate != target_fps

    # frames_per_chunk=-1 together with process_all_packets() is expected to
    # yield the whole stream as a single chunk; the one-element unpacking
    # below fails if that is not the case.
    reader.add_basic_video_stream(frames_per_chunk=-1, frame_rate=target_fps)
    reader.process_all_packets()
    [frames] = reader.pop_chunks()

    expected_shape = torch.Size(shape)
    assert frames.shape == expected_shape
def _to_fltp(original): def _to_fltp(original):
"""Convert Tensor to float32 with value range [-1, 1]""" """Convert Tensor to float32 with value range [-1, 1]"""
......
...@@ -123,6 +123,10 @@ int Decoder::get_frame(AVFrame* pFrame) { ...@@ -123,6 +123,10 @@ int Decoder::get_frame(AVFrame* pFrame) {
return avcodec_receive_frame(pCodecContext, pFrame); return avcodec_receive_frame(pCodecContext, pFrame);
} }
// Report the `frame_number` counter stored on the codec context.
// NOTE(review): presumably this is the running count of frames the decoder
// has output so far -- confirm against the FFmpeg AVCodecContext docs.
int Decoder::get_frame_number() const {
  const int frame_count = pCodecContext->frame_number;
  return frame_count;
}
void Decoder::flush_buffer() { void Decoder::flush_buffer() {
avcodec_flush_buffers(pCodecContext); avcodec_flush_buffers(pCodecContext);
} }
......
...@@ -29,6 +29,7 @@ class Decoder { ...@@ -29,6 +29,7 @@ class Decoder {
int process_packet(AVPacket* pPacket); int process_packet(AVPacket* pPacket);
// Fetch a decoded frame // Fetch a decoded frame
int get_frame(AVFrame* pFrame); int get_frame(AVFrame* pFrame);
int get_frame_number() const;
// Flush buffer (for seek) // Flush buffer (for seek)
void flush_buffer(); void flush_buffer();
}; };
......
...@@ -87,6 +87,27 @@ int StreamProcessor::process_packet(AVPacket* packet) { ...@@ -87,6 +87,27 @@ int StreamProcessor::process_packet(AVPacket* packet) {
if (ret < 0) if (ret < 0)
return ret; return ret;
// If pts is undefined, overwrite it with the best-effort estimate.
// In this case, best_effort_timestamp is basically the number of frames
// emitted by the decoder.
//
// We need valid pts because filter_graph does not fall back to
// best_effort_timestamp.
if (pFrame1->pts == AV_NOPTS_VALUE) {
if (pFrame1->best_effort_timestamp == AV_NOPTS_VALUE) {
// This happens in drain mode.
// When the decoder enters drain mode, it starts flushing the internally
// buffered frames, of which PTS cannot be estimated.
//
// This is because they might be intra-frames not in chronological
// order. In this case, we use received frames as-is in the order they
// are received.
pFrame1->pts = decoder.get_frame_number() + 1;
} else {
pFrame1->pts = pFrame1->best_effort_timestamp;
}
}
// When the value of discard_before_pts is 0, we consider that the seek is // When the value of discard_before_pts is 0, we consider that the seek is
// not performed and all the frames are passed to downstream // not performed and all the frames are passed to downstream
// unconditionally. // unconditionally.
...@@ -96,14 +117,9 @@ int StreamProcessor::process_packet(AVPacket* packet) { ...@@ -96,14 +117,9 @@ int StreamProcessor::process_packet(AVPacket* packet) {
// In this case discard_before_pts is set to zero. // In this case discard_before_pts is set to zero.
// 2. When users seek to zero, what they expect is to get to the beginning // 2. When users seek to zero, what they expect is to get to the beginning
// of the data. // of the data.
// There are many videos with invalid PTS values, such as
// -9223372036854775808, and though it is not possible to seek videos
// without decoding, we can still support `seek(0)` as a special case,
// and just not discard any.
// //
// Note: discard_before_pts < 0 is UB. // Note: discard_before_pts < 0 is UB.
if (discard_before_pts <= 0 || pFrame1->pts >= discard_before_pts || if (discard_before_pts <= 0 || pFrame1->pts >= discard_before_pts) {
pFrame1->best_effort_timestamp >= discard_before_pts) {
send_frame(pFrame1); send_frame(pFrame1);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment