Unverified Commit c50d4884 authored by Prabhat Roy, committed by GitHub

Improve test_video_reader (#5498)

* Improve test_video_reader

* Fix linter error
parent e3f1a822
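
The pattern applied in every hunk below is the same: tests that looped over test_videos (and, in a few places, over extra option lists) inside their bodies are converted to pytest parametrized tests, so each video becomes its own collected test case. A minimal sketch of the conversion; test_decode and _check_decoding are hypothetical names for illustration, not part of the change:

    import pytest

    # before: one test, every video checked inside a single pass/fail
    def test_decode_loop(self):
        for test_video, config in test_videos.items():
            _check_decoding(test_video, config)

    # after: one collected case per video, reported and selectable individually
    @pytest.mark.parametrize("test_video,config", test_videos.items())
    def test_decode(self, test_video, config):
        _check_decoding(test_video, config)

One practical payoff: a failure on a single video no longer hides the results for the remaining videos, and `pytest -k` can target an individual case.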
 import collections
-import itertools
 import math
 import os
 from fractions import Fraction
@@ -112,7 +111,7 @@ DecoderResult = collections.namedtuple("DecoderResult", "vframes vframe_pts vtim
 # av_seek_frame is imprecise so seek to a timestamp earlier by a margin
 # The unit of margin is second
-seek_frame_margin = 0.25
+SEEK_FRAME_MARGIN = 0.25


 def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer_size=4):
@@ -369,7 +368,8 @@ class TestVideoReader:
         assert_equal(atimebase, ref_result.atimebase)

-    def test_stress_test_read_video_from_file(self):
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    def test_stress_test_read_video_from_file(self, test_video):
         pytest.skip(
             "This stress test will iteratively decode the same set of videos."
             "It helps to detect memory leak but it takes lots of time to run."
@@ -386,52 +386,12 @@ class TestVideoReader:
         audio_timebase_num, audio_timebase_den = 0, 1
         for _i in range(num_iter):
-            for test_video, _config in test_videos.items():
-                full_path = os.path.join(VIDEO_DIR, test_video)
-                # pass 1: decode all frames using new decoder
-                torch.ops.video_reader.read_video_from_file(
-                    full_path,
-                    seek_frame_margin,
-                    0,  # getPtsOnly
-                    1,  # readVideoStream
-                    width,
-                    height,
-                    min_dimension,
-                    max_dimension,
-                    video_start_pts,
-                    video_end_pts,
-                    video_timebase_num,
-                    video_timebase_den,
-                    1,  # readAudioStream
-                    samples,
-                    channels,
-                    audio_start_pts,
-                    audio_end_pts,
-                    audio_timebase_num,
-                    audio_timebase_den,
-                )
-
-    def test_read_video_from_file(self):
-        """
-        Test the case when decoder starts with a video file to decode frames.
-        """
-        # video related
-        width, height, min_dimension, max_dimension = 0, 0, 0, 0
-        video_start_pts, video_end_pts = 0, -1
-        video_timebase_num, video_timebase_den = 0, 1
-        # audio related
-        samples, channels = 0, 0
-        audio_start_pts, audio_end_pts = 0, -1
-        audio_timebase_num, audio_timebase_den = 0, 1
-        for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
             # pass 1: decode all frames using new decoder
-            tv_result = torch.ops.video_reader.read_video_from_file(
+            torch.ops.video_reader.read_video_from_file(
                 full_path,
-                seek_frame_margin,
+                SEEK_FRAME_MARGIN,
                 0,  # getPtsOnly
                 1,  # readVideoStream
                 width,
@@ -450,14 +410,57 @@ class TestVideoReader:
                 audio_timebase_num,
                 audio_timebase_den,
             )
-            # pass 2: decode all frames using av
-            pyav_result = _decode_frames_by_av_module(full_path)
-            # check results from TorchVision decoder
-            self.check_separate_decoding_result(tv_result, config)
-            # compare decoding results
-            self.compare_decoding_result(tv_result, pyav_result, config)
-
-    def test_read_video_from_file_read_single_stream_only(self):
+
+    @pytest.mark.parametrize("test_video,config", test_videos.items())
+    def test_read_video_from_file(self, test_video, config):
+        """
+        Test the case when decoder starts with a video file to decode frames.
+        """
+        # video related
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
+        video_start_pts, video_end_pts = 0, -1
+        video_timebase_num, video_timebase_den = 0, 1
+        # audio related
+        samples, channels = 0, 0
+        audio_start_pts, audio_end_pts = 0, -1
+        audio_timebase_num, audio_timebase_den = 0, 1
+        full_path = os.path.join(VIDEO_DIR, test_video)
+        # pass 1: decode all frames using new decoder
+        tv_result = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        # pass 2: decode all frames using av
+        pyav_result = _decode_frames_by_av_module(full_path)
+        # check results from TorchVision decoder
+        self.check_separate_decoding_result(tv_result, config)
+        # compare decoding results
+        self.compare_decoding_result(tv_result, pyav_result, config)
+
+    @pytest.mark.parametrize("test_video,config", test_videos.items())
+    @pytest.mark.parametrize("read_video_stream,read_audio_stream", [(1, 0), (0, 1)])
+    def test_read_video_from_file_read_single_stream_only(
+        self, test_video, config, read_video_stream, read_audio_stream
+    ):
         """
         Test the case when decoder starts with a video file to decode frames, and
         only reads video stream and ignores audio stream
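
Stacking two parametrize decorators, as on test_read_video_from_file_read_single_stream_only above, collects the cross product of the parameter sets, which reproduces the removed nested loop over (readVideoStream, readAudioStream) for each video. A self-contained illustration with a hypothetical test:

    import pytest

    @pytest.mark.parametrize("x", [1, 2])
    @pytest.mark.parametrize("y", ["a", "b"])
    def test_cross_product(x, y):
        # pytest collects four cases: (1, "a"), (2, "a"), (1, "b"), (2, "b")
        assert x in (1, 2) and y in ("a", "b")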
@@ -471,57 +474,56 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-            for readVideoStream, readAudioStream in [(1, 0), (0, 1)]:
-                # decode all frames using new decoder
-                tv_result = torch.ops.video_reader.read_video_from_file(
-                    full_path,
-                    seek_frame_margin,
-                    0,  # getPtsOnly
-                    readVideoStream,
-                    width,
-                    height,
-                    min_dimension,
-                    max_dimension,
-                    video_start_pts,
-                    video_end_pts,
-                    video_timebase_num,
-                    video_timebase_den,
-                    readAudioStream,
-                    samples,
-                    channels,
-                    audio_start_pts,
-                    audio_end_pts,
-                    audio_timebase_num,
-                    audio_timebase_den,
-                )
-
-                (
-                    vframes,
-                    vframe_pts,
-                    vtimebase,
-                    vfps,
-                    vduration,
-                    aframes,
-                    aframe_pts,
-                    atimebase,
-                    asample_rate,
-                    aduration,
-                ) = tv_result
-
-                assert (vframes.numel() > 0) is bool(readVideoStream)
-                assert (vframe_pts.numel() > 0) is bool(readVideoStream)
-                assert (vtimebase.numel() > 0) is bool(readVideoStream)
-                assert (vfps.numel() > 0) is bool(readVideoStream)
-
-                expect_audio_data = readAudioStream == 1 and config.audio_sample_rate is not None
-                assert (aframes.numel() > 0) is bool(expect_audio_data)
-                assert (aframe_pts.numel() > 0) is bool(expect_audio_data)
-                assert (atimebase.numel() > 0) is bool(expect_audio_data)
-                assert (asample_rate.numel() > 0) is bool(expect_audio_data)
-
-    def test_read_video_from_file_rescale_min_dimension(self):
+        full_path = os.path.join(VIDEO_DIR, test_video)
+        # decode all frames using new decoder
+        tv_result = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            read_video_stream,
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            read_audio_stream,
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+
+        (
+            vframes,
+            vframe_pts,
+            vtimebase,
+            vfps,
+            vduration,
+            aframes,
+            aframe_pts,
+            atimebase,
+            asample_rate,
+            aduration,
+        ) = tv_result
+
+        assert (vframes.numel() > 0) is bool(read_video_stream)
+        assert (vframe_pts.numel() > 0) is bool(read_video_stream)
+        assert (vtimebase.numel() > 0) is bool(read_video_stream)
+        assert (vfps.numel() > 0) is bool(read_video_stream)
+
+        expect_audio_data = read_audio_stream == 1 and config.audio_sample_rate is not None
+        assert (aframes.numel() > 0) is bool(expect_audio_data)
+        assert (aframe_pts.numel() > 0) is bool(expect_audio_data)
+        assert (atimebase.numel() > 0) is bool(expect_audio_data)
+        assert (asample_rate.numel() > 0) is bool(expect_audio_data)
+
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    def test_read_video_from_file_rescale_min_dimension(self, test_video):
         """
         Test the case when decoder starts with a video file to decode frames, and
         video min dimension between height and width is set.
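
The assertions in the hunk above rely on tensor.numel(), the total element count, so `tensor.numel() > 0` is a plain Python bool and can safely be compared by identity against bool(flag). The idiom in isolation:

    import torch

    vframes = torch.zeros(3, 16, 16, 3)   # non-empty: the stream was requested
    assert (vframes.numel() > 0) is bool(1)

    aframes = torch.empty(0)              # empty: the stream was not requested
    assert (aframes.numel() > 0) is bool(0)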
@@ -535,33 +537,33 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, _config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            tv_result = torch.ops.video_reader.read_video_from_file(
-                full_path,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2))
-
-    def test_read_video_from_file_rescale_max_dimension(self):
+        full_path = os.path.join(VIDEO_DIR, test_video)
+
+        tv_result = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2))
+
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    def test_read_video_from_file_rescale_max_dimension(self, test_video):
         """
         Test the case when decoder starts with a video file to decode frames, and
         video min dimension between height and width is set.
@@ -575,33 +577,33 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, _config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            tv_result = torch.ops.video_reader.read_video_from_file(
-                full_path,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2))
-
-    def test_read_video_from_file_rescale_both_min_max_dimension(self):
+        full_path = os.path.join(VIDEO_DIR, test_video)
+
+        tv_result = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2))
+
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    def test_read_video_from_file_rescale_both_min_max_dimension(self, test_video):
         """
         Test the case when decoder starts with a video file to decode frames, and
         video min dimension between height and width is set.
@@ -615,34 +617,34 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, _config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            tv_result = torch.ops.video_reader.read_video_from_file(
-                full_path,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2))
-            assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2))
-
-    def test_read_video_from_file_rescale_width(self):
+        full_path = os.path.join(VIDEO_DIR, test_video)
+
+        tv_result = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2))
+        assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2))
+
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    def test_read_video_from_file_rescale_width(self, test_video):
         """
         Test the case when decoder starts with a video file to decode frames, and
         video width is set.
@@ -656,33 +658,33 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, _config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            tv_result = torch.ops.video_reader.read_video_from_file(
-                full_path,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            assert tv_result[0].size(2) == width
-
-    def test_read_video_from_file_rescale_height(self):
+        full_path = os.path.join(VIDEO_DIR, test_video)
+
+        tv_result = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        assert tv_result[0].size(2) == width
+
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    def test_read_video_from_file_rescale_height(self, test_video):
         """
         Test the case when decoder starts with a video file to decode frames, and
         video height is set.
@@ -696,33 +698,33 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, _config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            tv_result = torch.ops.video_reader.read_video_from_file(
-                full_path,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            assert tv_result[0].size(1) == height
-
-    def test_read_video_from_file_rescale_width_and_height(self):
+        full_path = os.path.join(VIDEO_DIR, test_video)
+
+        tv_result = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        assert tv_result[0].size(1) == height
+
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    def test_read_video_from_file_rescale_width_and_height(self, test_video):
         """
         Test the case when decoder starts with a video file to decode frames, and
         both video height and width are set.
@@ -736,93 +738,92 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, _config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-
-            tv_result = torch.ops.video_reader.read_video_from_file(
-                full_path,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            assert tv_result[0].size(1) == height
-            assert tv_result[0].size(2) == width
-
-    def test_read_video_from_file_audio_resampling(self):
+        full_path = os.path.join(VIDEO_DIR, test_video)
+
+        tv_result = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        assert tv_result[0].size(1) == height
+        assert tv_result[0].size(2) == width
+
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    @pytest.mark.parametrize("samples", [9600, 96000])
+    def test_read_video_from_file_audio_resampling(self, test_video, samples):
         """
         Test the case when decoder starts with a video file to decode frames, and
         audio waveform are resampled
         """
-
-        for samples in [9600, 96000]:  # downsampling  # upsampling
-            # video related
-            width, height, min_dimension, max_dimension = 0, 0, 0, 0
-            video_start_pts, video_end_pts = 0, -1
-            video_timebase_num, video_timebase_den = 0, 1
-            # audio related
-            channels = 0
-            audio_start_pts, audio_end_pts = 0, -1
-            audio_timebase_num, audio_timebase_den = 0, 1
-
-            for test_video, _config in test_videos.items():
-                full_path = os.path.join(VIDEO_DIR, test_video)
-
-                tv_result = torch.ops.video_reader.read_video_from_file(
-                    full_path,
-                    seek_frame_margin,
-                    0,  # getPtsOnly
-                    1,  # readVideoStream
-                    width,
-                    height,
-                    min_dimension,
-                    max_dimension,
-                    video_start_pts,
-                    video_end_pts,
-                    video_timebase_num,
-                    video_timebase_den,
-                    1,  # readAudioStream
-                    samples,
-                    channels,
-                    audio_start_pts,
-                    audio_end_pts,
-                    audio_timebase_num,
-                    audio_timebase_den,
-                )
-                (
-                    vframes,
-                    vframe_pts,
-                    vtimebase,
-                    vfps,
-                    vduration,
-                    aframes,
-                    aframe_pts,
-                    atimebase,
-                    asample_rate,
-                    aduration,
-                ) = tv_result
-                if aframes.numel() > 0:
-                    assert samples == asample_rate.item()
-                    assert 1 == aframes.size(1)
-                    # when audio stream is found
-                    duration = float(aframe_pts[-1]) * float(atimebase[0]) / float(atimebase[1])
-                    assert aframes.size(0) == approx(int(duration * asample_rate.item()), abs=0.1 * asample_rate.item())
-
-    def test_compare_read_video_from_memory_and_file(self):
+        # video related
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
+        video_start_pts, video_end_pts = 0, -1
+        video_timebase_num, video_timebase_den = 0, 1
+        # audio related
+        channels = 0
+        audio_start_pts, audio_end_pts = 0, -1
+        audio_timebase_num, audio_timebase_den = 0, 1
+
+        full_path = os.path.join(VIDEO_DIR, test_video)
+
+        tv_result = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        (
+            vframes,
+            vframe_pts,
+            vtimebase,
+            vfps,
+            vduration,
+            aframes,
+            aframe_pts,
+            atimebase,
+            asample_rate,
+            aduration,
+        ) = tv_result
+        if aframes.numel() > 0:
+            assert samples == asample_rate.item()
+            assert 1 == aframes.size(1)
+            # when audio stream is found
+            duration = float(aframe_pts[-1]) * float(atimebase[0]) / float(atimebase[1])
+            assert aframes.size(0) == approx(int(duration * asample_rate.item()), abs=0.1 * asample_rate.item())
+
+    @pytest.mark.parametrize("test_video,config", test_videos.items())
+    def test_compare_read_video_from_memory_and_file(self, test_video, config):
         """
         Test the case when video is already in memory, and decoder reads data in memory
         """
@@ -835,60 +836,60 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, config in test_videos.items():
-            full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-
-            # pass 1: decode all frames using cpp decoder
-            tv_result_memory = torch.ops.video_reader.read_video_from_memory(
-                video_tensor,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            self.check_separate_decoding_result(tv_result_memory, config)
-            # pass 2: decode all frames from file
-            tv_result_file = torch.ops.video_reader.read_video_from_file(
-                full_path,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-
-            self.check_separate_decoding_result(tv_result_file, config)
-            # finally, compare results decoded from memory and file
-            self.compare_decoding_result(tv_result_memory, tv_result_file)
-
-    def test_read_video_from_memory(self):
+        full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
+
+        # pass 1: decode all frames using cpp decoder
+        tv_result_memory = torch.ops.video_reader.read_video_from_memory(
+            video_tensor,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        self.check_separate_decoding_result(tv_result_memory, config)
+        # pass 2: decode all frames from file
+        tv_result_file = torch.ops.video_reader.read_video_from_file(
+            full_path,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+
+        self.check_separate_decoding_result(tv_result_file, config)
+        # finally, compare results decoded from memory and file
+        self.compare_decoding_result(tv_result_memory, tv_result_file)
+
+    @pytest.mark.parametrize("test_video,config", test_videos.items())
+    def test_read_video_from_memory(self, test_video, config):
         """
         Test the case when video is already in memory, and decoder reads data in memory
         """
@@ -901,38 +902,38 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, config in test_videos.items():
-            full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-
-            # pass 1: decode all frames using cpp decoder
-            tv_result = torch.ops.video_reader.read_video_from_memory(
-                video_tensor,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            # pass 2: decode all frames using av
-            pyav_result = _decode_frames_by_av_module(full_path)
-
-            self.check_separate_decoding_result(tv_result, config)
-            self.compare_decoding_result(tv_result, pyav_result, config)
-
-    def test_read_video_from_memory_get_pts_only(self):
+        full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
+
+        # pass 1: decode all frames using cpp decoder
+        tv_result = torch.ops.video_reader.read_video_from_memory(
+            video_tensor,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        # pass 2: decode all frames using av
+        pyav_result = _decode_frames_by_av_module(full_path)
+
+        self.check_separate_decoding_result(tv_result, config)
+        self.compare_decoding_result(tv_result, pyav_result, config)
+
+    @pytest.mark.parametrize("test_video,config", test_videos.items())
+    def test_read_video_from_memory_get_pts_only(self, test_video, config):
         """
         Test the case when video is already in memory, and decoder reads data in memory.
         Compare frame pts between decoding for pts only and full decoding
@@ -947,234 +948,234 @@ class TestVideoReader:
         audio_start_pts, audio_end_pts = 0, -1
         audio_timebase_num, audio_timebase_den = 0, 1

-        for test_video, config in test_videos.items():
-            full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-
-            # pass 1: decode all frames using cpp decoder
-            tv_result = torch.ops.video_reader.read_video_from_memory(
-                video_tensor,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            assert abs(config.video_fps - tv_result[3].item()) < 0.01
-
-            # pass 2: decode all frames to get PTS only using cpp decoder
-            tv_result_pts_only = torch.ops.video_reader.read_video_from_memory(
-                video_tensor,
-                seek_frame_margin,
-                1,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-
-            assert not tv_result_pts_only[0].numel()
-            assert not tv_result_pts_only[5].numel()
-            self.compare_decoding_result(tv_result, tv_result_pts_only)
-
-    def test_read_video_in_range_from_memory(self):
+        _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
+
+        # pass 1: decode all frames using cpp decoder
+        tv_result = torch.ops.video_reader.read_video_from_memory(
+            video_tensor,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        assert abs(config.video_fps - tv_result[3].item()) < 0.01
+
+        # pass 2: decode all frames to get PTS only using cpp decoder
+        tv_result_pts_only = torch.ops.video_reader.read_video_from_memory(
+            video_tensor,
+            SEEK_FRAME_MARGIN,
+            1,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+
+        assert not tv_result_pts_only[0].numel()
+        assert not tv_result_pts_only[5].numel()
+        self.compare_decoding_result(tv_result, tv_result_pts_only)
+
+    @pytest.mark.parametrize("test_video,config", test_videos.items())
+    @pytest.mark.parametrize("num_frames", [4, 8, 16, 32, 64, 128])
+    def test_read_video_in_range_from_memory(self, test_video, config, num_frames):
         """
         Test the case when video is already in memory, and decoder reads data in memory.
         In addition, decoder takes meaningful start- and end PTS as input, and decode
         frames within that interval
         """
-        for test_video, config in test_videos.items():
-            full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-            # video related
-            width, height, min_dimension, max_dimension = 0, 0, 0, 0
-            video_start_pts, video_end_pts = 0, -1
-            video_timebase_num, video_timebase_den = 0, 1
-            # audio related
-            samples, channels = 0, 0
-            audio_start_pts, audio_end_pts = 0, -1
-            audio_timebase_num, audio_timebase_den = 0, 1
-            # pass 1: decode all frames using new decoder
-            tv_result = torch.ops.video_reader.read_video_from_memory(
-                video_tensor,
-                seek_frame_margin,
-                0,  # getPtsOnly
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                video_start_pts,
-                video_end_pts,
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                audio_start_pts,
-                audio_end_pts,
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            (
-                vframes,
-                vframe_pts,
-                vtimebase,
-                vfps,
-                vduration,
-                aframes,
-                aframe_pts,
-                atimebase,
-                asample_rate,
-                aduration,
-            ) = tv_result
-            assert abs(config.video_fps - vfps.item()) < 0.01
-
-            for num_frames in [4, 8, 16, 32, 64, 128]:
-                start_pts_ind_max = vframe_pts.size(0) - num_frames
-                if start_pts_ind_max <= 0:
-                    continue
-                # randomly pick start pts
-                start_pts_ind = randint(0, start_pts_ind_max)
-                end_pts_ind = start_pts_ind + num_frames - 1
-                video_start_pts = vframe_pts[start_pts_ind]
-                video_end_pts = vframe_pts[end_pts_ind]
-
-                video_timebase_num, video_timebase_den = vtimebase[0], vtimebase[1]
-                if len(atimebase) > 0:
-                    # when audio stream is available
-                    audio_timebase_num, audio_timebase_den = atimebase[0], atimebase[1]
-                    audio_start_pts = _pts_convert(
-                        video_start_pts.item(),
-                        Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                        Fraction(audio_timebase_num.item(), audio_timebase_den.item()),
-                        math.floor,
-                    )
-                    audio_end_pts = _pts_convert(
-                        video_end_pts.item(),
-                        Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                        Fraction(audio_timebase_num.item(), audio_timebase_den.item()),
-                        math.ceil,
-                    )
-
-                # pass 2: decode frames in the randomly generated range
-                tv_result = torch.ops.video_reader.read_video_from_memory(
-                    video_tensor,
-                    seek_frame_margin,
-                    0,  # getPtsOnly
-                    1,  # readVideoStream
-                    width,
-                    height,
-                    min_dimension,
-                    max_dimension,
-                    video_start_pts,
-                    video_end_pts,
-                    video_timebase_num,
-                    video_timebase_den,
-                    1,  # readAudioStream
-                    samples,
-                    channels,
-                    audio_start_pts,
-                    audio_end_pts,
-                    audio_timebase_num,
-                    audio_timebase_den,
-                )
-                # pass 3: decode frames in range using PyAv
-                video_timebase_av, audio_timebase_av = _get_timebase_by_av_module(full_path)
-                video_start_pts_av = _pts_convert(
-                    video_start_pts.item(),
-                    Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                    Fraction(video_timebase_av.numerator, video_timebase_av.denominator),
-                    math.floor,
-                )
-                video_end_pts_av = _pts_convert(
-                    video_end_pts.item(),
-                    Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                    Fraction(video_timebase_av.numerator, video_timebase_av.denominator),
-                    math.ceil,
-                )
-                if audio_timebase_av:
-                    audio_start_pts = _pts_convert(
-                        video_start_pts.item(),
-                        Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                        Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator),
-                        math.floor,
-                    )
-                    audio_end_pts = _pts_convert(
-                        video_end_pts.item(),
-                        Fraction(video_timebase_num.item(), video_timebase_den.item()),
-                        Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator),
-                        math.ceil,
-                    )
-
-                pyav_result = _decode_frames_by_av_module(
-                    full_path,
-                    video_start_pts_av,
-                    video_end_pts_av,
-                    audio_start_pts,
-                    audio_end_pts,
-                )
-
-                assert tv_result[0].size(0) == num_frames
-                if pyav_result.vframes.size(0) == num_frames:
-                    # if PyAv decodes a different number of video frames, skip
-                    # comparing the decoding results between Torchvision video reader
-                    # and PyAv
-                    self.compare_decoding_result(tv_result, pyav_result, config)
-
-    def test_probe_video_from_file(self):
+        full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
+        # video related
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
+        video_start_pts, video_end_pts = 0, -1
+        video_timebase_num, video_timebase_den = 0, 1
+        # audio related
+        samples, channels = 0, 0
+        audio_start_pts, audio_end_pts = 0, -1
+        audio_timebase_num, audio_timebase_den = 0, 1
+        # pass 1: decode all frames using new decoder
+        tv_result = torch.ops.video_reader.read_video_from_memory(
+            video_tensor,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        (
+            vframes,
+            vframe_pts,
+            vtimebase,
+            vfps,
+            vduration,
+            aframes,
+            aframe_pts,
+            atimebase,
+            asample_rate,
+            aduration,
+        ) = tv_result
+        assert abs(config.video_fps - vfps.item()) < 0.01
+
+        start_pts_ind_max = vframe_pts.size(0) - num_frames
+        if start_pts_ind_max <= 0:
+            return
+        # randomly pick start pts
+        start_pts_ind = randint(0, start_pts_ind_max)
+        end_pts_ind = start_pts_ind + num_frames - 1
+        video_start_pts = vframe_pts[start_pts_ind]
+        video_end_pts = vframe_pts[end_pts_ind]
+
+        video_timebase_num, video_timebase_den = vtimebase[0], vtimebase[1]
+        if len(atimebase) > 0:
+            # when audio stream is available
+            audio_timebase_num, audio_timebase_den = atimebase[0], atimebase[1]
+            audio_start_pts = _pts_convert(
+                video_start_pts.item(),
+                Fraction(video_timebase_num.item(), video_timebase_den.item()),
+                Fraction(audio_timebase_num.item(), audio_timebase_den.item()),
+                math.floor,
+            )
+            audio_end_pts = _pts_convert(
+                video_end_pts.item(),
+                Fraction(video_timebase_num.item(), video_timebase_den.item()),
+                Fraction(audio_timebase_num.item(), audio_timebase_den.item()),
+                math.ceil,
+            )
+
+        # pass 2: decode frames in the randomly generated range
+        tv_result = torch.ops.video_reader.read_video_from_memory(
+            video_tensor,
+            SEEK_FRAME_MARGIN,
+            0,  # getPtsOnly
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            video_start_pts,
+            video_end_pts,
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            audio_start_pts,
+            audio_end_pts,
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        # pass 3: decode frames in range using PyAv
+        video_timebase_av, audio_timebase_av = _get_timebase_by_av_module(full_path)
+        video_start_pts_av = _pts_convert(
+            video_start_pts.item(),
+            Fraction(video_timebase_num.item(), video_timebase_den.item()),
+            Fraction(video_timebase_av.numerator, video_timebase_av.denominator),
+            math.floor,
+        )
+        video_end_pts_av = _pts_convert(
+            video_end_pts.item(),
+            Fraction(video_timebase_num.item(), video_timebase_den.item()),
+            Fraction(video_timebase_av.numerator, video_timebase_av.denominator),
+            math.ceil,
+        )
+        if audio_timebase_av:
+            audio_start_pts = _pts_convert(
+                video_start_pts.item(),
+                Fraction(video_timebase_num.item(), video_timebase_den.item()),
+                Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator),
+                math.floor,
+            )
+            audio_end_pts = _pts_convert(
+                video_end_pts.item(),
+                Fraction(video_timebase_num.item(), video_timebase_den.item()),
+                Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator),
+                math.ceil,
+            )
+
+        pyav_result = _decode_frames_by_av_module(
+            full_path,
+            video_start_pts_av,
+            video_end_pts_av,
+            audio_start_pts,
+            audio_end_pts,
+        )
+
+        assert tv_result[0].size(0) == num_frames
+        if pyav_result.vframes.size(0) == num_frames:
+            # if PyAv decodes a different number of video frames, skip
+            # comparing the decoding results between Torchvision video reader
+            # and PyAv
+            self.compare_decoding_result(tv_result, pyav_result, config)
+
+    @pytest.mark.parametrize("test_video,config", test_videos.items())
+    def test_probe_video_from_file(self, test_video, config):
         """
         Test the case when decoder probes a video file
         """
-        for test_video, config in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-            probe_result = torch.ops.video_reader.probe_video_from_file(full_path)
-            self.check_probe_result(probe_result, config)
+        full_path = os.path.join(VIDEO_DIR, test_video)
+        probe_result = torch.ops.video_reader.probe_video_from_file(full_path)
+        self.check_probe_result(probe_result, config)

-    def test_probe_video_from_memory(self):
+    @pytest.mark.parametrize("test_video,config", test_videos.items())
+    def test_probe_video_from_memory(self, test_video, config):
         """
         Test the case when decoder probes a video in memory
         """
-        for test_video, config in test_videos.items():
-            full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-            probe_result = torch.ops.video_reader.probe_video_from_memory(video_tensor)
-            self.check_probe_result(probe_result, config)
+        _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
+        probe_result = torch.ops.video_reader.probe_video_from_memory(video_tensor)
+        self.check_probe_result(probe_result, config)

-    def test_probe_video_from_memory_script(self):
+    @pytest.mark.parametrize("test_video,config", test_videos.items())
+    def test_probe_video_from_memory_script(self, test_video, config):
         scripted_fun = torch.jit.script(io._probe_video_from_memory)
         assert scripted_fun is not None
-        for test_video, config in test_videos.items():
-            full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-            probe_result = scripted_fun(video_tensor)
-            self.check_meta_result(probe_result, config)
+        _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
+        probe_result = scripted_fun(video_tensor)
+        self.check_meta_result(probe_result, config)

-    def test_read_video_from_memory_scripted(self):
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    def test_read_video_from_memory_scripted(self, test_video):
         """
         Test the case when video is already in memory, and decoder reads data in memory
         """
@@ -1190,29 +1191,28 @@ class TestVideoReader:
         scripted_fun = torch.jit.script(io._read_video_from_memory)
         assert scripted_fun is not None

-        for test_video, _config in test_videos.items():
-            full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
-
-            # decode all frames using cpp decoder
-            scripted_fun(
-                video_tensor,
-                seek_frame_margin,
-                1,  # readVideoStream
-                width,
-                height,
-                min_dimension,
-                max_dimension,
-                [video_start_pts, video_end_pts],
-                video_timebase_num,
-                video_timebase_den,
-                1,  # readAudioStream
-                samples,
-                channels,
-                [audio_start_pts, audio_end_pts],
-                audio_timebase_num,
-                audio_timebase_den,
-            )
-            # FUTURE: check value of video / audio frames
+        _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
+
+        # decode all frames using cpp decoder
+        scripted_fun(
+            video_tensor,
+            SEEK_FRAME_MARGIN,
+            1,  # readVideoStream
+            width,
+            height,
+            min_dimension,
+            max_dimension,
+            [video_start_pts, video_end_pts],
+            video_timebase_num,
+            video_timebase_den,
+            1,  # readAudioStream
+            samples,
+            channels,
+            [audio_start_pts, audio_end_pts],
+            audio_timebase_num,
+            audio_timebase_den,
+        )
+        # FUTURE: check value of video / audio frames

     def test_invalid_file(self):
         set_video_backend("video_reader")
@@ -1223,33 +1223,31 @@ class TestVideoReader:
         with pytest.raises(RuntimeError):
             io.read_video("foo.mp4")

-    def test_audio_present_pts(self):
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    @pytest.mark.parametrize("start_offset", [0, 1000])
+    @pytest.mark.parametrize("end_offset", [3000, None])
+    def test_audio_present_pts(self, test_video, backend, start_offset, end_offset):
         """Test if audio frames are returned with pts unit."""
-        backends = ["video_reader", "pyav"]
-        start_offsets = [0, 1000]
-        end_offsets = [3000, None]
-        for test_video, _ in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-            container = av.open(full_path)
-            if container.streams.audio:
-                for backend, start_offset, end_offset in itertools.product(backends, start_offsets, end_offsets):
-                    set_video_backend(backend)
-                    _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="pts")
-                    assert all([dimension > 0 for dimension in audio.shape[:2]])
+        full_path = os.path.join(VIDEO_DIR, test_video)
+        container = av.open(full_path)
+        if container.streams.audio:
+            set_video_backend(backend)
+            _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="pts")
+            assert all([dimension > 0 for dimension in audio.shape[:2]])

-    def test_audio_present_sec(self):
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    @pytest.mark.parametrize("start_offset", [0, 0.1])
+    @pytest.mark.parametrize("end_offset", [0.3, None])
+    def test_audio_present_sec(self, test_video, backend, start_offset, end_offset):
         """Test if audio frames are returned with sec unit."""
-        backends = ["video_reader", "pyav"]
-        start_offsets = [0, 0.1]
-        end_offsets = [0.3, None]
-        for test_video, _ in test_videos.items():
-            full_path = os.path.join(VIDEO_DIR, test_video)
-            container = av.open(full_path)
-            if container.streams.audio:
-                for backend, start_offset, end_offset in itertools.product(backends, start_offsets, end_offsets):
-                    set_video_backend(backend)
-                    _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="sec")
-                    assert all([dimension > 0 for dimension in audio.shape[:2]])
+        full_path = os.path.join(VIDEO_DIR, test_video)
+        container = av.open(full_path)
+        if container.streams.audio:
+            set_video_backend(backend)
+            _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="sec")
+            assert all([dimension > 0 for dimension in audio.shape[:2]])


 if __name__ == "__main__":
...
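
With the two audio-present tests parametrized, nothing in the file uses itertools.product any more, which is why the import is dropped in the first hunk. Both forms enumerate the same 2 x 2 x 2 combinations; the parametrized form just surfaces them as separate test cases. A self-contained comparison with hypothetical names:

    import itertools

    import pytest

    BACKENDS = ["video_reader", "pyav"]
    START_OFFSETS = [0, 1000]
    END_OFFSETS = [3000, None]

    # old style: eight iterations hidden inside one test body
    def test_offsets_loop():
        for backend, start, end in itertools.product(BACKENDS, START_OFFSETS, END_OFFSETS):
            assert backend in BACKENDS

    # new style: eight collected cases, one per combination
    @pytest.mark.parametrize("backend", BACKENDS)
    @pytest.mark.parametrize("start_offset", START_OFFSETS)
    @pytest.mark.parametrize("end_offset", END_OFFSETS)
    def test_offsets_parametrized(backend, start_offset, end_offset):
        assert backend in BACKENDS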