nvdec_tutorial.py 23.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
"""
Accelerated video decoding with NVDEC
=====================================

.. _nvdec_tutorial:

**Author**: `Moto Hira <moto@meta.com>`__

This tutorial shows how to use NVIDIA’s hardware video decoder (NVDEC)
with TorchAudio, and how it improves the performance of video decoding.
"""

######################################################################
#
# .. note::
#
#    This tutorial requires FFmpeg libraries compiled with HW
#    acceleration enabled.
#
#    Please refer to
#    :ref:`Enabling GPU video decoder/encoder <enabling_hw_decoder>`
#    for how to build FFmpeg with HW acceleration.
#

import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

######################################################################
#
moto's avatar
moto committed
33
import os
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import time

import matplotlib
import matplotlib.pyplot as plt
from torchaudio.io import StreamReader

matplotlib.rcParams["image.interpolation"] = "none"

######################################################################
#
# Check the prerequisites
# -----------------------
#
# First, we check that TorchAudio correctly detects FFmpeg libraries
# that support HW decoder/encoder.
#

from torchaudio.utils import ffmpeg_utils

######################################################################
#

print("FFmpeg Library versions:")
for k, ver in ffmpeg_utils.get_versions().items():
moto's avatar
moto committed
58
    print(f"  {k}:\t{'.'.join(str(v) for v in ver)}")
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

######################################################################
#

print("Available NVDEC Decoders:")
for k in ffmpeg_utils.get_video_decoders().keys():
    if "cuvid" in k:
        print(f" - {k}")

######################################################################
#

print("Avaialbe GPU:")
print(torch.cuda.get_device_properties(0))

######################################################################
#
# We will use the following video which has the following properties;
#
# - Codec: H.264
# - Resolution: 960x540
# - FPS: 29.97
# - Pixel format: YUV420P
#
moto's avatar
moto committed
83
84
85
86
87
# .. raw:: html
#
#    <video style="max-width: 100%" controls>
#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" type="video/mp4">
#    </video>
88
89
90
91
92
93
94
95
96
97
98
99
100

######################################################################
#

src = torchaudio.utils.download_asset(
    "tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
)

######################################################################
# Decoding videos with NVDEC
# --------------------------
#
# To use HW video decoder, you need to specify the HW decoder when
moto's avatar
moto committed
101
102
# defining the output video stream by passing ``decoder`` option to
# :py:meth:`~torchaudio.io.StreamReader.add_video_stream` method.
103
104
105
106
107
108
109
110
111
#

s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid")
s.fill_buffer()
(video,) = s.pop_chunks()

######################################################################
#
moto's avatar
moto committed
112
# The video frames are decoded and returned as tensor of NCHW format.
113

moto's avatar
moto committed
114
115
116
117
118
119
120
121
print(video.shape, video.dtype)

######################################################################
#
# By default, the decoded frames are sent back to CPU memory, and
# CPU tensors are created.

print(video.device)
122
123
124

######################################################################
#
moto's avatar
moto committed
125
126
127
# By specifying ``hw_accel`` option, you can convert the decoded frames
# to CUDA tensor.
# ``hw_accel`` option takes string values and pass it
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# to :py:class:`torch.device`.
#
# .. note::
#
#    Currently, ``hw_accel`` option and
#    :py:meth:`~torchaudio.io.StreamReader.add_basic_video_stream`
#    are not compatible. ``add_basic_video_stream`` adds post-decoding
#    process, which is designed for frames in CPU memory.
#    Please use :py:meth:`~torchaudio.io.StreamReader.add_video_stream`.
#

s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid", hw_accel="cuda:0")
s.fill_buffer()
(video,) = s.pop_chunks()

print(video.shape, video.dtype, video.device)


######################################################################
moto's avatar
moto committed
148
# .. note::
149
#
moto's avatar
moto committed
150
151
152
#    When there are multiple of GPUs available, ``StreamReader`` by
#    default uses the first GPU. You can change this by providing
#    ``"gpu"`` option.
153
154
155
156
157
158
159
160
161
162
163
164
#
# .. code::
#
#    # Video data is sent to CUDA device 0, decoded and
#    # converted on the same device.
#    s.add_video_stream(
#        ...,
#        decoder="h264_cuvid",
#        decoder_option={"gpu": "0"},
#        hw_accel="cuda:0",
#    )
#
moto's avatar
moto committed
165
166
167
168
169
170
171
172
173
# .. note::
#
#    ``"gpu"`` option and ``hw_accel`` option can be specified
#    independently. If they do not match, decoded frames are
#    transfered to the device specified by ``hw_accell``
#    automatically.
#
# .. code::
#
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#    # Video data is sent to CUDA device 0, and decoded there.
#    # Then it is transfered to CUDA device 1, and converted to
#    # CUDA tensor.
#    s.add_video_stream(
#        ...,
#        decoder="h264_cuvid",
#        decoder_option={"gpu": "0"},
#        hw_accel="cuda:1",
#    )

######################################################################
# Visualization
# -------------
#
# Let's look at the frames decoded by HW decoder and compare them
# against equivalent results from software decoders.
#
# The following function seeks into the given timestamp and decode one
# frame with the specificed decoder.


def test_decode(decoder: str, seek: float):
    s = StreamReader(src)
    s.seek(seek)
    s.add_video_stream(1, decoder=decoder)
    s.fill_buffer()
    (video,) = s.pop_chunks()
    return video[0]


######################################################################
#

timestamps = [12, 19, 45, 131, 180]

cpu_frames = [test_decode(decoder="h264", seek=ts) for ts in timestamps]
cuda_frames = [test_decode(decoder="h264_cuvid", seek=ts) for ts in timestamps]

######################################################################
#
# .. note::
#
#    Currently, HW decoder does not support colorspace conversion.
#    Decoded frames are YUV format.
#    The following function performs YUV to RGB covnersion
#    (and axis shuffling for plotting).


def yuv_to_rgb(frames):
    frames = frames.cpu().to(torch.float)
    y = frames[..., 0, :, :]
    u = frames[..., 1, :, :]
    v = frames[..., 2, :, :]

    y /= 255
    u = u / 255 - 0.5
    v = v / 255 - 0.5

    r = y + 1.14 * v
    g = y + -0.396 * u - 0.581 * v
    b = y + 2.029 * u

    rgb = torch.stack([r, g, b], -1)
    rgb = (rgb * 255).clamp(0, 255).to(torch.uint8)
    return rgb.numpy()


######################################################################
#
# Now we visualize the resutls.
#


def plot():
    n_rows = len(timestamps)
    fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0])
    for i in range(n_rows):
        axes[i][0].imshow(yuv_to_rgb(cpu_frames[i]))
        axes[i][1].imshow(yuv_to_rgb(cuda_frames[i]))

    axes[0][0].set_title("Software decoder")
    axes[0][1].set_title("HW decoder")
    plt.setp(axes, xticks=[], yticks=[])
    plt.tight_layout()


plot()

######################################################################
#
# They are indistinguishable to the eyes of the author.
# Feel free to let us know if you spot something. :)
#


######################################################################
# HW resizing and cropping
# ------------------------
#
# You can use ``decoder_option`` argument to provide decoder-specific
# options.
#
# The following options are often relevant in preprocessing.
#
# - ``resize``: Resize the frame into ``(width)x(height)``.
# - ``crop``: Crop the frame ``(top)x(bottom)x(left)x(right)``.
#   Note that the specified values are the amount of rows/columns removed.
#   The final image size is ``(width - left - right)x(height - top -bottom)``.
#   If ``crop`` and ``resize`` options are used together,
#   ``crop`` is performed first.
#
# For other available options, please run
# ``ffmpeg -h decoder=h264_cuvid``.
#


def test_options(option):
    s = StreamReader(src)
    s.seek(87)
    s.add_video_stream(1, decoder="h264_cuvid", hw_accel="cuda:0", decoder_option=option)
    s.fill_buffer()
    (video,) = s.pop_chunks()
moto's avatar
moto committed
296
    print(f"Option: {option}:\t{video.shape}")
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
    return video[0]


######################################################################
#

original = test_options(option=None)
resized = test_options(option={"resize": "480x270"})
cropped = test_options(option={"crop": "135x135x240x240"})
cropped_and_resized = test_options(option={"crop": "135x135x240x240", "resize": "640x360"})


######################################################################
#


def plot():
    fig, axes = plt.subplots(2, 2, figsize=[12.8, 9.6])
    axes[0][0].imshow(yuv_to_rgb(original))
    axes[0][1].imshow(yuv_to_rgb(resized))
    axes[1][0].imshow(yuv_to_rgb(cropped))
    axes[1][1].imshow(yuv_to_rgb(cropped_and_resized))

    axes[0][0].set_title("Original")
    axes[0][1].set_title("Resized")
    axes[1][0].set_title("Cropped")
    axes[1][1].set_title("Cropped and resized")
    plt.tight_layout()
    return fig


plot()

######################################################################
# Comparing resizing methods
# --------------------------
#
moto's avatar
moto committed
334
335
# Unlike software scaling, NVDEC does not provide an option to choose
# the scaling algorithm.
336
337
338
339
340
341
342
343
344
345
# In ML applicatoins, it is often necessary to construct a
# preprocessing pipeline with a similar numerical property.
# So here we compare the result of hardware resizing with software
# resizing of different algorithms.
#
# We will use the following video, which contains the test pattern
# generated using the following command.
#
# .. code::
#
moto's avatar
moto committed
346
347
348
349
350
351
352
#    ffmpeg -y -f lavfi -t 12.05 -i mptestsrc -movflags +faststart mptestsrc.mp4
#
# .. raw:: html
#
#    <video style="max-width: 100%" controls>
#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/mptestsrc.mp4" type="video/mp4">
#    </video>
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452


######################################################################
#

test_src = torchaudio.utils.download_asset("tutorial-assets/mptestsrc.mp4")


######################################################################
# The following function decodes video and
# apply the specified scaling algorithm.
#


def decode_resize_ffmpeg(mode, height, width, seek):
    filter_desc = None if mode is None else f"scale={width}:{height}:sws_flags={mode}"
    s = StreamReader(test_src)
    s.add_video_stream(1, filter_desc=filter_desc)
    s.seek(seek)
    s.fill_buffer()
    (chunk,) = s.pop_chunks()
    return chunk


######################################################################
# The following function uses HW decoder to decode video and resize.
#


def decode_resize_cuvid(height, width, seek):
    s = StreamReader(test_src)
    s.add_video_stream(1, decoder="h264_cuvid", decoder_option={"resize": f"{width}x{height}"}, hw_accel="cuda:0")
    s.seek(seek)
    s.fill_buffer()
    (chunk,) = s.pop_chunks()
    return chunk.cpu()


######################################################################
# Now we execute them and visualize the resulting frames.

params = {"height": 224, "width": 224, "seek": 3}

frames = [
    decode_resize_ffmpeg(None, **params),
    decode_resize_ffmpeg("neighbor", **params),
    decode_resize_ffmpeg("bilinear", **params),
    decode_resize_ffmpeg("bicubic", **params),
    decode_resize_cuvid(**params),
    decode_resize_ffmpeg("spline", **params),
    decode_resize_ffmpeg("lanczos:param0=1", **params),
    decode_resize_ffmpeg("lanczos:param0=3", **params),
    decode_resize_ffmpeg("lanczos:param0=5", **params),
]


######################################################################
#


def plot():
    fig, axes = plt.subplots(3, 3, figsize=[12.8, 15.2])
    for i, f in enumerate(frames):
        h, w = f.shape[2:4]
        f = f[..., : h // 4, : w // 4]
        axes[i // 3][i % 3].imshow(yuv_to_rgb(f[0]))
    axes[0][0].set_title("Original")
    axes[0][1].set_title("nearest neighbor")
    axes[0][2].set_title("bilinear")
    axes[1][0].set_title("bicubic")
    axes[1][1].set_title("NVDEC")
    axes[1][2].set_title("spline")
    axes[2][0].set_title("lanczos(1)")
    axes[2][1].set_title("lanczos(3)")
    axes[2][2].set_title("lanczos(5)")

    plt.setp(axes, xticks=[], yticks=[])
    plt.tight_layout()


plot()

######################################################################
# None of them is exactly the same. To the eyes of authors, lanczos(1)
# appears to be most similar to NVDEC.
# The bicubic looks close as well.

######################################################################
#
# Benchmark NVDEC with StreamReader
# ---------------------------------
#
# In this section, we compare the performace of software video
# decoding and HW video decoding.
#

######################################################################
# Decode as CUDA frames
# ---------------------
#
moto's avatar
moto committed
453
454
455
456
457
458
# First, we compare the time it takes for software decoder and
# hardware encoder to decode the same video.
# To make the result comparable, when using software decoder, we move
# the resulting tensor to CUDA.
#
# The procedures to test look like the following
459
#
moto's avatar
moto committed
460
461
# - Use hardware decoder and place data on CUDA directly
# - Use software decoder, generate CPU Tensors and move them to CUDA.
462
#
moto's avatar
moto committed
463
# .. note:
464
#
moto's avatar
moto committed
465
466
467
468
469
470
471
472
#    Because HW decoder currently only supports reading videos as
#    YUV444P format, we decode frames into YUV444P format for the case of
#    software decoder as well.
#


######################################################################
# The following function implements the hardware decoder test case.
473
474


moto's avatar
moto committed
475
def test_decode_cuda(src, decoder, hw_accel="cuda", frames_per_chunk=5):
476
    s = StreamReader(src)
moto's avatar
moto committed
477
    s.add_video_stream(frames_per_chunk, decoder=decoder, hw_accel=hw_accel)
478
479

    num_frames = 0
moto's avatar
moto committed
480
    chunk = None
481
    t0 = time.monotonic()
moto's avatar
moto committed
482
    for (chunk,) in s.stream():
483
484
        num_frames += chunk.shape[0]
    elapsed = time.monotonic() - t0
moto's avatar
moto committed
485
    print(f" - Shape: {chunk.shape}")
486
    fps = num_frames / elapsed
moto's avatar
moto committed
487
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
488
489
490
491
    return fps


######################################################################
moto's avatar
moto committed
492
493
494
495
# The following function implements the software decoder test case.


def test_decode_cpu(src, threads, decoder=None, frames_per_chunk=5):
496
    s = StreamReader(src)
moto's avatar
moto committed
497
    s.add_video_stream(frames_per_chunk, decoder=decoder, decoder_option={"threads": f"{threads}"})
498
499

    num_frames = 0
moto's avatar
moto committed
500
    device = torch.device("cuda")
501
502
503
504
505
    t0 = time.monotonic()
    for i, (chunk,) in enumerate(s.stream()):
        if i == 0:
            print(f" - Shape: {chunk.shape}")
        num_frames += chunk.shape[0]
moto's avatar
moto committed
506
        chunk = chunk.to(device)
507
508
    elapsed = time.monotonic() - t0
    fps = num_frames / elapsed
moto's avatar
moto committed
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# For each resolution of video, we run multiple software decoder test
# cases with different number of threads.


def run_decode_tests(src, frames_per_chunk=5):
    fps = []
    print(f"Testing: {os.path.basename(src)}")
    for threads in [1, 4, 8, 16]:
        print(f"* Software decoding (num_threads={threads})")
        fps.append(test_decode_cpu(src, threads))
    print("* Hardware decoding")
    fps.append(test_decode_cuda(src, decoder="h264_cuvid"))
526
527
528
529
    return fps


######################################################################
moto's avatar
moto committed
530
# Now we run the tests with videos of different resolutions.
531
#
moto's avatar
moto committed
532
533
# QVGA
# ----
534

moto's avatar
moto committed
535
536
src_qvga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_qvga.h264.mp4")
fps_qvga = run_decode_tests(src_qvga)
537
538

######################################################################
moto's avatar
moto committed
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
# VGA
# ---

src_vga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_vga.h264.mp4")
fps_vga = run_decode_tests(src_vga)

######################################################################
# XGA
# ---

src_xga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_xga.h264.mp4")
fps_xga = run_decode_tests(src_xga)


######################################################################
# Result
# ------
556
#
moto's avatar
moto committed
557
# Now we plot the result.
558

moto's avatar
moto committed
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602

def plot():
    fig, ax = plt.subplots(figsize=[9.6, 6.4])

    for items in zip(fps_qvga, fps_vga, fps_xga, "ov^sx"):
        ax.plot(items[:-1], marker=items[-1])
    ax.grid(axis="both")
    ax.set_xticks([0, 1, 2], ["QVGA (320x240)", "VGA (640x480)", "XGA (1024x768)"])
    ax.legend(
        [
            "Software Decoding (threads=1)",
            "Software Decoding (threads=4)",
            "Software Decoding (threads=8)",
            "Software Decoding (threads=16)",
            "Hardware Decoding (CUDA Tensor)",
        ]
    )
    ax.set_title("Speed of processing video frames")
    ax.set_ylabel("Frames per second")
    plt.tight_layout()


plot()

######################################################################
#
# We observe couple of things
#
# - Increasing the number of threads in software decoding makes the
#   pipeline faster, but the performance saturates around 8 threads.
# - The performance gain from using hardware decoder depends on the
#   resolution of video.
# - At lower resolutions like QVGA, hardware decoding is slower than
#   software decoding
# - At higher resolutions like XGA, hardware decoding is faster
#   than software decoding.
#
#
# It is worth noting that the performance gain also depends on the
# type of GPU.
# We observed that when decoding VGA videos using V100 or A100 GPUs,
# hardware decoders are slower than software decoders. But using A10
# GPU hardware deocder is faster than software decodr.
#
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620

######################################################################
# Decode and resize
# -----------------
#
# Next, we add resize operation to the pipeline.
# We will compare the following pipelines.
#
# 1. Decode video using software decoder and read the frames as
#    PyTorch Tensor. Resize the tensor using
#    :py:func:`torch.nn.functional.interpolate`, then send
#    the resulting tensor to CUDA device.
# 2. Decode video using software decoder, resize the frame with
#    FFmpeg's filter graph, read the resized frames as PyTorch tensor,
#    then send it to CUDA device.
# 3. Decode and resize video simulaneously with HW decoder, read the
#    resulting frames as CUDA tensor.
#
moto's avatar
moto committed
621
# The pipeline 1 represents common video loading implementations.
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
#
# The pipeline 2 uses FFmpeg's filter graph, which allows to manipulate
# raw frames before converting them to Tensors.
#
# The pipeline 3 has the minimum amount of data transfer from CPU to
# CUDA, which significantly contribute to performant data loading.
#


######################################################################
# The following function implements the pipeline 1. It uses PyTorch's
# :py:func:`torch.nn.functional.interpolate`.
# We use ``bincubic`` mode, as we saw that the resulting frames are
# closest to NVDEC resizing.
#


moto's avatar
moto committed
639
def test_decode_then_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
640
    s = StreamReader(src)
moto's avatar
moto committed
641
    s.add_video_stream(frames_per_chunk, decoder_option={"threads": "8"})
642
643

    num_frames = 0
moto's avatar
moto committed
644
645
    device = torch.device("cuda")
    chunk = None
646
    t0 = time.monotonic()
moto's avatar
moto committed
647
    for (chunk,) in s.stream():
648
649
650
651
652
        num_frames += chunk.shape[0]
        chunk = torch.nn.functional.interpolate(chunk, [height, width], mode=mode, antialias=True)
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0
    fps = num_frames / elapsed
moto's avatar
moto committed
653
654
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
655
656
657
658
659
660
661
662
663
664
665
666
    return fps


######################################################################
# The following function implements the pipeline 2. Frames are resized
# as part of decoding process, then sent to CUDA device.
#
# We use ``bincubic`` mode, to make the result comparable with
# PyTorch-based implementation above.
#


moto's avatar
moto committed
667
def test_decode_and_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
668
    s = StreamReader(src)
moto's avatar
moto committed
669
670
671
    s.add_video_stream(
        frames_per_chunk, filter_desc=f"scale={width}:{height}:sws_flags={mode}", decoder_option={"threads": "8"}
    )
672
673

    num_frames = 0
moto's avatar
moto committed
674
675
    device = torch.device("cuda")
    chunk = None
676
    t0 = time.monotonic()
moto's avatar
moto committed
677
    for (chunk,) in s.stream():
678
679
680
681
        num_frames += chunk.shape[0]
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0
    fps = num_frames / elapsed
moto's avatar
moto committed
682
683
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
684
685
686
687
688
689
690
691
    return fps


######################################################################
# The following function implements the pipeline 3. Resizing is
# performed by NVDEC and the resulting tensor is placed on CUDA memory.


moto's avatar
moto committed
692
def test_hw_decode_and_resize(src, decoder, decoder_option, hw_accel="cuda", frames_per_chunk=5):
693
694
695
696
    s = StreamReader(src)
    s.add_video_stream(5, decoder=decoder, decoder_option=decoder_option, hw_accel=hw_accel)

    num_frames = 0
moto's avatar
moto committed
697
    chunk = None
698
    t0 = time.monotonic()
moto's avatar
moto committed
699
    for (chunk,) in s.stream():
700
701
702
        num_frames += chunk.shape[0]
    elapsed = time.monotonic() - t0
    fps = num_frames / elapsed
moto's avatar
moto committed
703
704
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
705
706
707
708
709
    return fps


######################################################################
#
moto's avatar
moto committed
710
# The following function run the benchmark functions on given sources.
711
712
713
#


moto's avatar
moto committed
714
715
716
717
718
719
720
721
722
723
def run_resize_tests(src):
    print(f"Testing: {os.path.basename(src)}")
    height, width = 224, 224
    print("* Software decoding with PyTorch interpolate")
    cpu_resize1 = test_decode_then_resize(src, height=height, width=width)
    print("* Software decoding with FFmpeg scale")
    cpu_resize2 = test_decode_and_resize(src, height=height, width=width)
    print("* Hardware decoding with resize")
    cuda_resize = test_hw_decode_and_resize(src, decoder="h264_cuvid", decoder_option={"resize": f"{width}x{height}"})
    return [cpu_resize1, cpu_resize2, cuda_resize]
724
725
726
727


######################################################################
#
moto's avatar
moto committed
728
# Now we run the tests.
729
730

######################################################################
moto's avatar
moto committed
731
732
# QVGA
# ----
733

moto's avatar
moto committed
734
fps_qvga = run_resize_tests(src_qvga)
735
736

######################################################################
moto's avatar
moto committed
737
738
# VGA
# ---
739

moto's avatar
moto committed
740
fps_vga = run_resize_tests(src_vga)
741
742

######################################################################
moto's avatar
moto committed
743
744
# XGA
# ---
745

moto's avatar
moto committed
746
fps_xga = run_resize_tests(src_xga)
747
748

######################################################################
moto's avatar
moto committed
749
750
751
# Result
# ------
# Now we plot the result.
752
753
754
#


moto's avatar
moto committed
755
756
def plot():
    fig, ax = plt.subplots(figsize=[9.6, 6.4])
757

moto's avatar
moto committed
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
    for items in zip(fps_qvga, fps_vga, fps_xga, "ov^sx"):
        ax.plot(items[:-1], marker=items[-1])
    ax.grid(axis="both")
    ax.set_xticks([0, 1, 2], ["QVGA (320x240)", "VGA (640x480)", "XGA (1024x768)"])
    ax.legend(
        [
            "Software decoding\nwith resize\n(PyTorch interpolate)",
            "Software decoding\nwith resize\n(FFmpeg scale)",
            "NVDEC\nwith resizing",
        ]
    )
    ax.set_title("Speed of processing video frames")
    ax.set_xlabel("Input video resolution")
    ax.set_ylabel("Frames per second")
    plt.tight_layout()
773
774


moto's avatar
moto committed
775
plot()
776
777
778

######################################################################
#
moto's avatar
moto committed
779
780
781
# Hardware deocder shows a similar trend as previous experiment.
# In fact, the performance is almost the same. Hardware resizing has
# almost zero overhead for scaling down the frames.
782
#
moto's avatar
moto committed
783
784
785
786
787
788
789
# Software decoding also shows a similar trend. Performing resizing as
# part of decoding is faster. One possible explanation is that, video
# frames are internally stored as YUV420P, which has half the number
# of pixels compared to RGB24, or YUV444P. This means that if resizing
# before copying frame data to PyTorch tensor, the number of pixels
# manipulated and copied are smaller than the case where applying
# resizing after frames are converted to Tensor.
790
791
792
793
#

######################################################################
#
moto's avatar
moto committed
794
# Tag: :obj:`torchaudio.io`