Commit 1717edaa authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Update the handling of videos without PTS values (#2970)

Summary:
The filter graph does not fall back to `best_effort_timestamp`, so applying filters (such as changing the FPS) to videos without PTS values failed.

This commit changes the behavior by overwriting undefined PTS values with `best_effort_timestamp`.

Pull Request resolved: https://github.com/pytorch/audio/pull/2970

Reviewed By: YosuaMichael

Differential Revision: D42425771

Pulled By: mthrok

fbshipit-source-id: 7b7a033ea2ad89bb49d6e1663d35d377dab2aae9
parent e1cddb46
......@@ -51,9 +51,6 @@ class _MediaSourceMixin:
with open(path, "rb") as fileobj:
data = fileobj.read()
self.src = torch.frombuffer(data, dtype=torch.uint8)
print(self.src.data_ptr())
print(len(data))
print(self.src.shape)
return self.src
def tearDown(self):
......@@ -467,10 +464,10 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
("nasa_13013.avi", "precise", 8.1, (0, slice(238, None))),
("nasa_13013.avi", "precise", 8.14, (0, slice(239, None))),
("nasa_13013.avi", "precise", 8.17, (0, slice(240, None))),
# Test precise seek on video with invalid PTS
# Test precise seek on video with missing PTS
("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.0, (0, slice(None))),
("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.2, (0, slice(4, -1))),
("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.3, (0, slice(7, -1))),
("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.2, (0, slice(4, None))),
("RATRACE_wave_f_nm_np1_fr_goo_37.avi", "precise", 0.3, (0, slice(7, None))),
# Test any seek
# The source avi video has one keyframe every twelve frames 0, 12, 24,.. or every 0.4004 seconds.
("nasa_13013.avi", "any", 0.0, (0, slice(None))),
......@@ -514,6 +511,25 @@ class StreamReaderInterfaceTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestC
print(hyp.shape, ref.shape)
self.assertEqual(hyp, ref)
@parameterized.expand(
    [
        ("nasa_13013.mp4", [195, 3, 270, 480]),
        # The RATRACE clip carries no valid PTS metadata.
        ("RATRACE_wave_f_nm_np1_fr_goo_37.avi", [36, 3, 240, 560]),
    ]
)
def test_change_fps(self, src, shape):
    """Decoding a video at a different target frame rate yields one chunk of the expected shape.

    Args:
        src: Name of the test asset handed to ``self.get_src``.
        shape: Expected shape of the single decoded chunk, as a list of ints.
    """
    target_fps = 15
    reader = StreamReader(self.get_src(src))
    source_info = reader.get_src_stream_info(reader.default_video_stream)
    # The test is only meaningful when the source rate differs from the target.
    assert source_info.frame_rate != target_fps
    reader.add_basic_video_stream(frames_per_chunk=-1, frame_rate=target_fps)
    reader.process_all_packets()
    # Unpacking fails unless exactly one chunk (one output stream) comes back.
    [decoded] = reader.pop_chunks()
    actual_shape = decoded.shape
    assert actual_shape == torch.Size(shape)
def _to_fltp(original):
"""Convert Tensor to float32 with value range [-1, 1]"""
......
......@@ -123,6 +123,10 @@ int Decoder::get_frame(AVFrame* pFrame) {
return avcodec_receive_frame(pCodecContext, pFrame);
}
// Report the frame counter currently stored on the codec context.
// NOTE(review): presumably this is the count of frames the decoder has
// returned so far -- confirm against the FFmpeg AVCodecContext documentation.
int Decoder::get_frame_number() const {
  const int frame_count = pCodecContext->frame_number;
  return frame_count;
}
// Flush the decoder by calling avcodec_flush_buffers on the codec context.
// The class declaration notes this is intended for use around seeking.
void Decoder::flush_buffer() {
avcodec_flush_buffers(pCodecContext);
}
......
......@@ -29,6 +29,7 @@ class Decoder {
int process_packet(AVPacket* pPacket);
// Fetch a decoded frame
int get_frame(AVFrame* pFrame);
int get_frame_number() const;
// Flush buffer (for seek)
void flush_buffer();
};
......
......@@ -87,6 +87,27 @@ int StreamProcessor::process_packet(AVPacket* packet) {
if (ret < 0)
return ret;
// If pts is undefined then overwrite with best effort estimate.
// In this case, best_effort_timestamp is basically the number of frames
// emitted by the decoder.
//
// We need valid pts because filter_graph does not fall back to
// best_effort_timestamp.
if (pFrame1->pts == AV_NOPTS_VALUE) {
if (pFrame1->best_effort_timestamp == AV_NOPTS_VALUE) {
// This happens in drain mode.
// When the decoder enters drain mode, it starts flushing the internally
// buffered frames, of which PTS cannot be estimated.
//
// This is because they might be intra-frames not in chronological
// order. In this case, we use received frames as-is in the order they
// are received.
pFrame1->pts = decoder.get_frame_number() + 1;
} else {
pFrame1->pts = pFrame1->best_effort_timestamp;
}
}
// When the value of discard_before_pts is 0, we consider that the seek is
// not performed and all the frames are passed to downstream
// unconditionally.
......@@ -96,14 +117,9 @@ int StreamProcessor::process_packet(AVPacket* packet) {
// In this case discard_before_pts is set to zero.
// 2. When users seek to zero, what they expect is to get to the beginning
// of the data.
// There are many videos with invalid PTS values, such as
// -9223372036854775808, and though it is not possible to seek videos
// without decoding, we can still support `seek(0)` as a special case,
// and just not discard any.
//
// Note: discard_before_pts < 0 is UB.
if (discard_before_pts <= 0 || pFrame1->pts >= discard_before_pts ||
pFrame1->best_effort_timestamp >= discard_before_pts) {
if (discard_before_pts <= 0 || pFrame1->pts >= discard_before_pts) {
send_frame(pFrame1);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment