nvdec_tutorial.py 23.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
"""
Accelerated video decoding with NVDEC
=====================================

.. _nvdec_tutorial:

**Author**: `Moto Hira <moto@meta.com>`__

This tutorial shows how to use NVIDIA’s hardware video decoder (NVDEC)
with TorchAudio, and how it improves the performance of video decoding.
"""

######################################################################
#
# .. note::
#
#    This tutorial requires FFmpeg libraries compiled with HW
#    acceleration enabled.
#
#    Please refer to
#    :ref:`Enabling GPU video decoder/encoder <enabling_hw_decoder>`
#    for how to build FFmpeg with HW acceleration.
#

import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

######################################################################
#
moto's avatar
moto committed
33
import os
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import time

import matplotlib.pyplot as plt
from torchaudio.io import StreamReader

######################################################################
#
# Check the prerequisites
# -----------------------
#
# First, we check that TorchAudio correctly detects FFmpeg libraries
# that support HW decoder/encoder.
#

from torchaudio.utils import ffmpeg_utils

######################################################################
#

# Report the version of every FFmpeg library that TorchAudio detected.
# Each version is an iterable of components, joined with dots for display.
print("FFmpeg Library versions:")
for k, ver in ffmpeg_utils.get_versions().items():
    print(f"  {k}:\t{'.'.join(str(v) for v in ver)}")
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

######################################################################
#

# List only the decoders whose name contains "cuvid".
print("Available NVDEC Decoders:")
cuvid_decoders = (name for name in ffmpeg_utils.get_video_decoders() if "cuvid" in name)
for name in cuvid_decoders:
    print(f" - {name}")

######################################################################
#

# Show the properties of CUDA device 0.
print("Available GPU:")
print(torch.cuda.get_device_properties(0))

######################################################################
#
# We will use the following video which has the following properties;
#
# - Codec: H.264
# - Resolution: 960x540
# - FPS: 29.97
# - Pixel format: YUV420P
#
moto's avatar
moto committed
80
81
82
83
84
# .. raw:: html
#
#    <video style="max-width: 100%" controls>
#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" type="video/mp4">
#    </video>
85
86
87
88
89
90
91
92
93
94
95
96
97

######################################################################
#

src = torchaudio.utils.download_asset(
    "tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
)

######################################################################
# Decoding videos with NVDEC
# --------------------------
#
# To use the HW video decoder, you need to specify the HW decoder when
# defining the output video stream by passing the ``decoder`` option to
# the :py:meth:`~torchaudio.io.StreamReader.add_video_stream` method.
#

# Open the downloaded video and request chunks of 5 frames decoded with
# the ``h264_cuvid`` decoder.
s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid")
# Fill the buffer once, then take the single chunk of the one
# configured output stream.
s.fill_buffer()
(video,) = s.pop_chunks()

######################################################################
#
moto's avatar
moto committed
109
# The video frames are decoded and returned as tensor of NCHW format.
110

moto's avatar
moto committed
111
112
113
114
115
116
117
118
print(video.shape, video.dtype)

######################################################################
#
# By default, the decoded frames are sent back to CPU memory, and
# CPU tensors are created.

print(video.device)
119
120
121

######################################################################
#
moto's avatar
moto committed
122
123
124
# By specifying the ``hw_accel`` option, you can convert the decoded frames
# to CUDA tensors.
# The ``hw_accel`` option takes a string value and passes it
# to :py:class:`torch.device`.
#
# .. note::
#
#    Currently, ``hw_accel`` option and
#    :py:meth:`~torchaudio.io.StreamReader.add_basic_video_stream`
#    are not compatible. ``add_basic_video_stream`` adds post-decoding
#    process, which is designed for frames in CPU memory.
#    Please use :py:meth:`~torchaudio.io.StreamReader.add_video_stream`.
#

# Same pipeline as the previous example, with ``hw_accel="cuda:0"`` added
# to the stream definition.
s = StreamReader(src)
s.add_video_stream(5, decoder="h264_cuvid", hw_accel="cuda:0")
s.fill_buffer()
(video,) = s.pop_chunks()

# Print the device as well, to show where the resulting tensor lives.
print(video.shape, video.dtype, video.device)


######################################################################
moto's avatar
moto committed
145
# .. note::
146
#
moto's avatar
moto committed
147
148
149
#    When there are multiple GPUs available, ``StreamReader`` by
#    default uses the first GPU. You can change this by providing
#    the ``"gpu"`` option.
150
151
152
153
154
155
156
157
158
159
160
161
#
# .. code::
#
#    # Video data is sent to CUDA device 0, decoded and
#    # converted on the same device.
#    s.add_video_stream(
#        ...,
#        decoder="h264_cuvid",
#        decoder_option={"gpu": "0"},
#        hw_accel="cuda:0",
#    )
#
moto's avatar
moto committed
162
163
164
165
166
167
168
169
170
# .. note::
#
#    The ``"gpu"`` option and the ``hw_accel`` option can be specified
#    independently. If they do not match, decoded frames are
#    transferred to the device specified by ``hw_accel``
#    automatically.
#
# .. code::
#
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#    # Video data is sent to CUDA device 0, and decoded there.
#    # Then it is transferred to CUDA device 1, and converted to
#    # CUDA tensor.
#    s.add_video_stream(
#        ...,
#        decoder="h264_cuvid",
#        decoder_option={"gpu": "0"},
#        hw_accel="cuda:1",
#    )

######################################################################
# Visualization
# -------------
#
# Let's look at the frames decoded by HW decoder and compare them
# against equivalent results from software decoders.
#
# The following function seeks to the given timestamp and decodes one
# frame with the specified decoder.


def test_decode(decoder: str, seek: float):
    """Decode one frame of ``src`` at the given timestamp.

    Args:
        decoder: Decoder name handed to ``add_video_stream``.
        seek: Timestamp handed to ``StreamReader.seek`` before decoding.

    Returns:
        The first frame of the single-frame chunk that was decoded.
    """
    reader = StreamReader(src)
    reader.seek(seek)
    # One frame per chunk is enough; only the first frame is returned.
    reader.add_video_stream(1, decoder=decoder)
    reader.fill_buffer()
    [chunk] = reader.pop_chunks()
    first_frame = chunk[0]
    return first_frame


######################################################################
#

# Timestamps at which one frame is decoded for the comparison.
timestamps = [12, 19, 45, 131, 180]

# Decode the same timestamps twice: once with the ``h264`` decoder and
# once with the ``h264_cuvid`` decoder, so that the two lists line up
# index by index.
cpu_frames = [test_decode(decoder="h264", seek=ts) for ts in timestamps]
cuda_frames = [test_decode(decoder="h264_cuvid", seek=ts) for ts in timestamps]

######################################################################
#
# .. note::
#
#    Currently, HW decoder does not support colorspace conversion.
#    Decoded frames are in YUV format.
#    The following function performs YUV to RGB conversion
#    (and axis shuffling for plotting).


def yuv_to_rgb(frames):
    """Convert YUV frames to RGB images laid out for plotting.

    Args:
        frames: Tensor whose third-from-last axis holds the Y, U and V
            planes, i.e. shaped ``(..., 3, H, W)``, with values on a
            0-255 scale. The input tensor is never modified.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8`` shaped ``(..., H, W, 3)``,
        with the R, G and B channels on the last axis.
    """
    frames = frames.cpu().to(torch.float)
    # Use out-of-place arithmetic on the planes. When the input is already
    # a CPU float tensor, ``.cpu().to(torch.float)`` hands back the very
    # same tensor, so an in-place ``/=`` on a plane (which is a view)
    # would silently rescale the caller's data.
    y = frames[..., 0, :, :] / 255
    u = frames[..., 1, :, :] / 255 - 0.5
    v = frames[..., 2, :, :] / 255 - 0.5

    r = y + 1.14 * v
    g = y - 0.396 * u - 0.581 * v
    b = y + 2.029 * u

    # Channels go last so the result can be passed directly to ``imshow``.
    rgb = torch.stack([r, g, b], -1)
    rgb = (rgb * 255).clamp(0, 255).to(torch.uint8)
    return rgb.numpy()


######################################################################
#
# Now we visualize the results.
#


def plot():
    """Draw the software- and HW-decoded frames side by side.

    One row is drawn per entry of ``timestamps``; the left column shows
    ``cpu_frames`` and the right column shows ``cuda_frames``.
    """
    n_rows = len(timestamps)
    fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0])
    for row in range(n_rows):
        sw_ax, hw_ax = axes[row]
        sw_ax.imshow(yuv_to_rgb(cpu_frames[row]))
        hw_ax.imshow(yuv_to_rgb(cuda_frames[row]))

    # Only the top row carries column titles.
    top_left, top_right = axes[0]
    top_left.set_title("Software decoder")
    top_right.set_title("HW decoder")
    plt.setp(axes, xticks=[], yticks=[])
    plt.tight_layout()


plot()

######################################################################
#
# They are indistinguishable to the eyes of the author.
# Feel free to let us know if you spot something. :)
#


######################################################################
# HW resizing and cropping
# ------------------------
#
# You can use ``decoder_option`` argument to provide decoder-specific
# options.
#
# The following options are often relevant in preprocessing.
#
# - ``resize``: Resize the frame into ``(width)x(height)``.
# - ``crop``: Crop the frame ``(top)x(bottom)x(left)x(right)``.
#   Note that the specified values are the amount of rows/columns removed.
#   The final image size is ``(width - left - right)x(height - top -bottom)``.
#   If ``crop`` and ``resize`` options are used together,
#   ``crop`` is performed first.
#
# For other available options, please run
# ``ffmpeg -h decoder=h264_cuvid``.
#


def test_options(option):
    """Decode one frame of ``src`` with ``h264_cuvid`` using ``option``.

    Args:
        option: Value passed as ``decoder_option`` to ``add_video_stream``
            (``None`` or a dict such as ``{"resize": "480x270"}``).

    Returns:
        The first frame of the decoded single-frame chunk. The option and
        the chunk shape are printed as a side effect.
    """
    s = StreamReader(src)
    s.seek(87)
    s.add_video_stream(1, decoder="h264_cuvid", hw_accel="cuda:0", decoder_option=option)
    s.fill_buffer()
    (video,) = s.pop_chunks()
    print(f"Option: {option}:\t{video.shape}")
    return video[0]


######################################################################
#

# Decode the same frame four times: without options, resized, cropped,
# and cropped then resized. The crop string follows the
# (top)x(bottom)x(left)x(right) layout described above.
original = test_options(option=None)
resized = test_options(option={"resize": "480x270"})
cropped = test_options(option={"crop": "135x135x240x240"})
cropped_and_resized = test_options(option={"crop": "135x135x240x240", "resize": "640x360"})


######################################################################
#


def plot():
    """Show the original, resized, cropped and cropped-and-resized frames.

    Returns:
        The figure holding the 2x2 grid of images.
    """
    fig, axes = plt.subplots(2, 2, figsize=[12.8, 9.6])
    panels = [
        (original, "Original"),
        (resized, "Resized"),
        (cropped, "Cropped"),
        (cropped_and_resized, "Cropped and resized"),
    ]
    # ``axes.flat`` walks the grid row by row, matching the panel order.
    for ax, (frame, _) in zip(axes.flat, panels):
        ax.imshow(yuv_to_rgb(frame))
    for ax, (_, title) in zip(axes.flat, panels):
        ax.set_title(title)
    plt.tight_layout()
    return fig


plot()

######################################################################
# Comparing resizing methods
# --------------------------
#
moto's avatar
moto committed
331
332
# Unlike software scaling, NVDEC does not provide an option to choose
# the scaling algorithm.
333
334
335
336
337
338
339
340
341
342
# In ML applications, it is often necessary to construct a
# preprocessing pipeline with a similar numerical property.
# So here we compare the result of hardware resizing with software
# resizing using different algorithms.
#
# We will use the following video, which contains the test pattern
# generated using the following command.
#
# .. code::
#
moto's avatar
moto committed
343
344
345
346
347
348
349
#    ffmpeg -y -f lavfi -t 12.05 -i mptestsrc -movflags +faststart mptestsrc.mp4
#
# .. raw:: html
#
#    <video style="max-width: 100%" controls>
#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/mptestsrc.mp4" type="video/mp4">
#    </video>
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449


######################################################################
#

test_src = torchaudio.utils.download_asset("tutorial-assets/mptestsrc.mp4")


######################################################################
# The following function decodes video and
# apply the specified scaling algorithm.
#


def decode_resize_ffmpeg(mode, height, width, seek):
    """Decode one frame of ``test_src`` and optionally rescale it with FFmpeg.

    Args:
        mode: Value for the ``sws_flags`` of the ``scale`` filter, or
            ``None`` to decode without any filter.
        height: Target height used in the ``scale`` filter.
        width: Target width used in the ``scale`` filter.
        seek: Timestamp handed to ``StreamReader.seek``.

    Returns:
        The single-frame chunk that was decoded.
    """
    if mode is None:
        filter_desc = None
    else:
        filter_desc = f"scale={width}:{height}:sws_flags={mode}"

    reader = StreamReader(test_src)
    reader.add_video_stream(1, filter_desc=filter_desc)
    reader.seek(seek)
    reader.fill_buffer()
    [chunk] = reader.pop_chunks()
    return chunk


######################################################################
# The following function uses HW decoder to decode video and resize.
#


def decode_resize_cuvid(height, width, seek):
    """Decode one frame of ``test_src`` with ``h264_cuvid``, resizing in the decoder.

    Args:
        height: Target height for the decoder's ``resize`` option.
        width: Target width for the decoder's ``resize`` option.
        seek: Timestamp handed to ``StreamReader.seek``.

    Returns:
        The single-frame chunk, copied back to CPU.
    """
    reader = StreamReader(test_src)
    resize_option = {"resize": f"{width}x{height}"}
    reader.add_video_stream(
        1,
        decoder="h264_cuvid",
        decoder_option=resize_option,
        hw_accel="cuda:0",
    )
    reader.seek(seek)
    reader.fill_buffer()
    [chunk] = reader.pop_chunks()
    cpu_chunk = chunk.cpu()
    return cpu_chunk


######################################################################
# Now we execute them and visualize the resulting frames.

# Target size and timestamp shared by every decoding call below.
params = {"height": 224, "width": 224, "seek": 3}

# NOTE: the order of this list must stay in sync with the subplot titles
# assigned in ``plot`` (row by row in the 3x3 grid).
frames = [
    decode_resize_ffmpeg(None, **params),
    decode_resize_ffmpeg("neighbor", **params),
    decode_resize_ffmpeg("bilinear", **params),
    decode_resize_ffmpeg("bicubic", **params),
    decode_resize_cuvid(**params),
    decode_resize_ffmpeg("spline", **params),
    decode_resize_ffmpeg("lanczos:param0=1", **params),
    decode_resize_ffmpeg("lanczos:param0=3", **params),
    decode_resize_ffmpeg("lanczos:param0=5", **params),
]


######################################################################
#


def plot():
    """Show the top-left quarter of every entry in ``frames`` in a 3x3 grid."""
    fig, axes = plt.subplots(3, 3, figsize=[12.8, 15.2])
    for idx, chunk in enumerate(frames):
        row, col = divmod(idx, 3)
        height, width = chunk.shape[2:4]
        # Keep only the top-left quarter along each spatial axis.
        corner = chunk[..., : height // 4, : width // 4]
        axes[row][col].imshow(yuv_to_rgb(corner[0]))

    titles = [
        "Original",
        "nearest neighbor",
        "bilinear",
        "bicubic",
        "NVDEC",
        "spline",
        "lanczos(1)",
        "lanczos(3)",
        "lanczos(5)",
    ]
    # ``axes.flat`` iterates row by row, the same order as ``titles``.
    for ax, title in zip(axes.flat, titles):
        ax.set_title(title)

    plt.setp(axes, xticks=[], yticks=[])
    plt.tight_layout()


plot()

######################################################################
# None of them is exactly the same. To the eyes of authors, lanczos(1)
# appears to be most similar to NVDEC.
# The bicubic looks close as well.

######################################################################
#
# Benchmark NVDEC with StreamReader
# ---------------------------------
#
# In this section, we compare the performance of software video
# decoding and HW video decoding.
#

######################################################################
# Decode as CUDA frames
# ---------------------
#
moto's avatar
moto committed
450
451
452
453
454
455
# First, we compare the time it takes for the software decoder and
# the hardware decoder to decode the same video.
# To make the result comparable, when using software decoder, we move
# the resulting tensor to CUDA.
#
# The procedures to test look like the following
456
#
moto's avatar
moto committed
457
458
# - Use hardware decoder and place data on CUDA directly
# - Use software decoder, generate CPU Tensors and move them to CUDA.
459
#
moto's avatar
moto committed
460
# .. note::
461
#
moto's avatar
moto committed
462
463
464
465
466
467
468
469
#    Because HW decoder currently only supports reading videos as
#    YUV444P format, we decode frames into YUV444P format for the case of
#    software decoder as well.
#


######################################################################
# The following function implements the hardware decoder test case.
470
471


moto's avatar
moto committed
472
def test_decode_cuda(src, decoder, hw_accel="cuda", frames_per_chunk=5):
    """Measure decoding throughput when frames are produced with ``hw_accel``.

    Only the streaming loop is timed; creating the reader and configuring
    the stream happen before the clock starts.

    Args:
        src: Source handed to ``StreamReader``.
        decoder: Decoder name handed to ``add_video_stream``.
        hw_accel: ``hw_accel`` value handed to ``add_video_stream``.
        frames_per_chunk: Number of frames per chunk requested from the stream.

    Returns:
        Number of decoded frames divided by the elapsed seconds.
    """
    s = StreamReader(src)
    s.add_video_stream(frames_per_chunk, decoder=decoder, hw_accel=hw_accel)

    num_frames = 0
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
    elapsed = time.monotonic() - t0
    # The shape reported is that of the last chunk seen.
    print(f" - Shape: {chunk.shape}")
    fps = num_frames / elapsed
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
moto's avatar
moto committed
489
490
491
492
# The following function implements the software decoder test case.


def test_decode_cpu(src, threads, decoder=None, frames_per_chunk=5):
    """Measure decoding throughput with a software decoder plus a copy to CUDA.

    Each chunk is moved to the CUDA device inside the timed loop so the
    measurement includes the transfer.

    Args:
        src: Source handed to ``StreamReader``.
        threads: Value for the decoder's ``threads`` option.
        decoder: Decoder name handed to ``add_video_stream``
            (``None`` leaves the choice to ``add_video_stream``).
        frames_per_chunk: Number of frames per chunk requested from the stream.

    Returns:
        Number of decoded frames divided by the elapsed seconds.
    """
    s = StreamReader(src)
    s.add_video_stream(frames_per_chunk, decoder=decoder, decoder_option={"threads": f"{threads}"})

    num_frames = 0
    device = torch.device("cuda")
    t0 = time.monotonic()
    for i, (chunk,) in enumerate(s.stream()):
        if i == 0:
            # Report the shape once, using the first chunk.
            print(f" - Shape: {chunk.shape}")
        num_frames += chunk.shape[0]
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0
    fps = num_frames / elapsed
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# For each resolution of video, we run multiple software decoder test
# cases with different number of threads.


def run_decode_tests(src, frames_per_chunk=5):
    """Run the software and hardware decoding benchmarks on ``src``.

    Args:
        src: Path of the video to decode; its base name is printed.
        frames_per_chunk: Chunk size forwarded to every benchmark.

    Returns:
        List of fps values: software decoding with 1, 4, 8 and 16 threads,
        followed by hardware decoding.
    """
    fps = []
    print(f"Testing: {os.path.basename(src)}")
    for threads in [1, 4, 8, 16]:
        print(f"* Software decoding (num_threads={threads})")
        # Forward the chunk size so that the parameter actually takes effect.
        fps.append(test_decode_cpu(src, threads, frames_per_chunk=frames_per_chunk))
    print("* Hardware decoding")
    fps.append(test_decode_cuda(src, decoder="h264_cuvid", frames_per_chunk=frames_per_chunk))
    return fps


######################################################################
moto's avatar
moto committed
527
# Now we run the tests with videos of different resolutions.
528
#
moto's avatar
moto committed
529
530
# QVGA
# ----
531

moto's avatar
moto committed
532
533
src_qvga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_qvga.h264.mp4")
fps_qvga = run_decode_tests(src_qvga)
534
535

######################################################################
moto's avatar
moto committed
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
# VGA
# ---

src_vga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_vga.h264.mp4")
fps_vga = run_decode_tests(src_vga)

######################################################################
# XGA
# ---

src_xga = torchaudio.utils.download_asset("tutorial-assets/testsrc2_xga.h264.mp4")
fps_xga = run_decode_tests(src_xga)


######################################################################
# Result
# ------
553
#
moto's avatar
moto committed
554
# Now we plot the result.
555

moto's avatar
moto committed
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599

def plot():
    """Plot the decoding speed of each configuration against video resolution."""
    fig, ax = plt.subplots(figsize=[9.6, 6.4])

    # Each zipped item holds one configuration's fps at QVGA, VGA and XGA,
    # followed by the marker used for its line.
    for *series, marker in zip(fps_qvga, fps_vga, fps_xga, "ov^sx"):
        ax.plot(series, marker=marker)
    ax.grid(axis="both")

    resolutions = ["QVGA (320x240)", "VGA (640x480)", "XGA (1024x768)"]
    ax.set_xticks([0, 1, 2], resolutions)

    labels = [
        "Software Decoding (threads=1)",
        "Software Decoding (threads=4)",
        "Software Decoding (threads=8)",
        "Software Decoding (threads=16)",
        "Hardware Decoding (CUDA Tensor)",
    ]
    ax.legend(labels)
    ax.set_title("Speed of processing video frames")
    ax.set_ylabel("Frames per second")
    plt.tight_layout()


plot()

######################################################################
#
# We observe a couple of things:
#
# - Increasing the number of threads in software decoding makes the
#   pipeline faster, but the performance saturates around 8 threads.
# - The performance gain from using hardware decoder depends on the
#   resolution of video.
# - At lower resolutions like QVGA, hardware decoding is slower than
#   software decoding
# - At higher resolutions like XGA, hardware decoding is faster
#   than software decoding.
#
#
# It is worth noting that the performance gain also depends on the
# type of GPU.
# We observed that when decoding VGA videos using V100 or A100 GPUs,
# hardware decoders are slower than software decoders. But when using an
# A10 GPU, the hardware decoder is faster than the software decoder.
#
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617

######################################################################
# Decode and resize
# -----------------
#
# Next, we add resize operation to the pipeline.
# We will compare the following pipelines.
#
# 1. Decode video using software decoder and read the frames as
#    PyTorch Tensor. Resize the tensor using
#    :py:func:`torch.nn.functional.interpolate`, then send
#    the resulting tensor to CUDA device.
# 2. Decode video using software decoder, resize the frame with
#    FFmpeg's filter graph, read the resized frames as PyTorch tensor,
#    then send it to CUDA device.
# 3. Decode and resize video simultaneously with HW decoder, read the
#    resulting frames as CUDA tensor.
#
moto's avatar
moto committed
618
# The pipeline 1 represents common video loading implementations.
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
#
# The pipeline 2 uses FFmpeg's filter graph, which allows to manipulate
# raw frames before converting them to Tensors.
#
# The pipeline 3 has the minimum amount of data transfer from CPU to
# CUDA, which significantly contributes to performant data loading.
#


######################################################################
# The following function implements the pipeline 1. It uses PyTorch's
# :py:func:`torch.nn.functional.interpolate`.
# We use ``bicubic`` mode, as we saw that the resulting frames are
# close to NVDEC resizing.
#


moto's avatar
moto committed
636
def test_decode_then_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
    """Benchmark pipeline 1: software decode, resize in PyTorch, copy to CUDA.

    Args:
        src: Source handed to ``StreamReader``.
        height: Target height for ``interpolate``.
        width: Target width for ``interpolate``.
        mode: Interpolation mode handed to ``interpolate``.
        frames_per_chunk: Number of frames per chunk requested from the stream.

    Returns:
        Number of decoded frames divided by the elapsed seconds.
    """
    s = StreamReader(src)
    s.add_video_stream(frames_per_chunk, decoder_option={"threads": "8"})

    num_frames = 0
    device = torch.device("cuda")
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
        # Resize first, then move the resized chunk to the CUDA device.
        chunk = torch.nn.functional.interpolate(chunk, [height, width], mode=mode, antialias=True)
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0
    fps = num_frames / elapsed
    # The shape reported is that of the last (resized) chunk.
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# The following function implements the pipeline 2. Frames are resized
# as part of decoding process, then sent to CUDA device.
#
# We use ``bicubic`` mode, to make the result comparable with
# PyTorch-based implementation above.
#


moto's avatar
moto committed
664
def test_decode_and_resize(src, height, width, mode="bicubic", frames_per_chunk=5):
    """Benchmark pipeline 2: software decode with FFmpeg ``scale``, copy to CUDA.

    Args:
        src: Source handed to ``StreamReader``.
        height: Target height used in the ``scale`` filter.
        width: Target width used in the ``scale`` filter.
        mode: Value for the ``sws_flags`` of the ``scale`` filter.
        frames_per_chunk: Number of frames per chunk requested from the stream.

    Returns:
        Number of decoded frames divided by the elapsed seconds.
    """
    s = StreamReader(src)
    s.add_video_stream(
        frames_per_chunk, filter_desc=f"scale={width}:{height}:sws_flags={mode}", decoder_option={"threads": "8"}
    )

    num_frames = 0
    device = torch.device("cuda")
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
        chunk = chunk.to(device)
    elapsed = time.monotonic() - t0
    fps = num_frames / elapsed
    # The shape reported is that of the last chunk.
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
# The following function implements the pipeline 3. Resizing is
# performed by NVDEC and the resulting tensor is placed on CUDA memory.


moto's avatar
moto committed
689
def test_hw_decode_and_resize(src, decoder, decoder_option, hw_accel="cuda", frames_per_chunk=5):
    """Benchmark pipeline 3: decode and resize in the HW decoder.

    Args:
        src: Source handed to ``StreamReader``.
        decoder: Decoder name handed to ``add_video_stream``.
        decoder_option: Decoder options handed to ``add_video_stream``,
            e.g. ``{"resize": "224x224"}``.
        hw_accel: ``hw_accel`` value handed to ``add_video_stream``.
        frames_per_chunk: Number of frames per chunk requested from the stream.

    Returns:
        Number of decoded frames divided by the elapsed seconds.
    """
    s = StreamReader(src)
    # Honor ``frames_per_chunk`` instead of a hard-coded chunk size, in line
    # with the other benchmark functions.
    s.add_video_stream(frames_per_chunk, decoder=decoder, decoder_option=decoder_option, hw_accel=hw_accel)

    num_frames = 0
    chunk = None
    t0 = time.monotonic()
    for (chunk,) in s.stream():
        num_frames += chunk.shape[0]
    elapsed = time.monotonic() - t0
    fps = num_frames / elapsed
    # The shape reported is that of the last chunk.
    print(f" - Shape: {chunk.shape}")
    print(f" - Processed {num_frames} frames in {elapsed:.2f} seconds. ({fps:.2f} fps)")
    return fps


######################################################################
#
moto's avatar
moto committed
707
# The following function runs the benchmark functions on the given source.
708
709
710
#


moto's avatar
moto committed
711
712
713
714
715
716
717
718
719
720
def run_resize_tests(src):
    """Run the three decode-and-resize benchmarks on ``src`` at 224x224.

    Args:
        src: Path of the video to decode; its base name is printed.

    Returns:
        List of fps values in the order: PyTorch interpolate, FFmpeg scale,
        hardware decoding with resize.
    """
    print(f"Testing: {os.path.basename(src)}")
    height = width = 224

    print("* Software decoding with PyTorch interpolate")
    interpolate_fps = test_decode_then_resize(src, height=height, width=width)

    print("* Software decoding with FFmpeg scale")
    ffmpeg_fps = test_decode_and_resize(src, height=height, width=width)

    print("* Hardware decoding with resize")
    resize_option = {"resize": f"{width}x{height}"}
    nvdec_fps = test_hw_decode_and_resize(src, decoder="h264_cuvid", decoder_option=resize_option)

    return [interpolate_fps, ffmpeg_fps, nvdec_fps]
721
722
723
724


######################################################################
#
moto's avatar
moto committed
725
# Now we run the tests.
726
727

######################################################################
moto's avatar
moto committed
728
729
# QVGA
# ----
730

moto's avatar
moto committed
731
fps_qvga = run_resize_tests(src_qvga)
732
733

######################################################################
moto's avatar
moto committed
734
735
# VGA
# ---
736

moto's avatar
moto committed
737
fps_vga = run_resize_tests(src_vga)
738
739

######################################################################
moto's avatar
moto committed
740
741
# XGA
# ---
742

moto's avatar
moto committed
743
fps_xga = run_resize_tests(src_xga)
744
745

######################################################################
moto's avatar
moto committed
746
747
748
# Result
# ------
# Now we plot the result.
749
750
751
#


moto's avatar
moto committed
752
753
def plot():
    """Plot the decode-and-resize speed of each pipeline against video resolution."""
    fig, ax = plt.subplots(figsize=[9.6, 6.4])

    # Each zipped item holds one pipeline's fps at QVGA, VGA and XGA,
    # followed by its marker. ``zip`` stops at the shortest input, so only
    # as many markers as there are pipelines are consumed.
    for items in zip(fps_qvga, fps_vga, fps_xga, "ov^sx"):
        ax.plot(items[:-1], marker=items[-1])
    ax.grid(axis="both")
    ax.set_xticks([0, 1, 2], ["QVGA (320x240)", "VGA (640x480)", "XGA (1024x768)"])
    ax.legend(
        [
            "Software decoding\nwith resize\n(PyTorch interpolate)",
            "Software decoding\nwith resize\n(FFmpeg scale)",
            "NVDEC\nwith resizing",
        ]
    )
    ax.set_title("Speed of processing video frames")
    ax.set_xlabel("Input video resolution")
    ax.set_ylabel("Frames per second")
    plt.tight_layout()
770
771


moto's avatar
moto committed
772
plot()
773
774
775

######################################################################
#
moto's avatar
moto committed
776
777
778
# Hardware decoder shows a similar trend as the previous experiment.
# In fact, the performance is almost the same. Hardware resizing has
# almost zero overhead for scaling down the frames.
779
#
moto's avatar
moto committed
780
781
782
783
784
785
786
# Software decoding also shows a similar trend. Performing resizing as
# part of decoding is faster. One possible explanation is that, video
# frames are internally stored as YUV420P, which has half the number
# of pixels compared to RGB24, or YUV444P. This means that if resizing
# before copying frame data to PyTorch tensor, the number of pixels
# manipulated and copied are smaller than the case where applying
# resizing after frames are converted to Tensor.
787
788
789
790
#

######################################################################
#
moto's avatar
moto committed
791
# Tag: :obj:`torchaudio.io`