Commit ffeba11a authored by mayp777's avatar mayp777
Browse files

UPDATE

parent 29deb085
......@@ -15,6 +15,26 @@ article.pytorch-article img.shield-badge {
margin-top: -18px;
margin-bottom: 9px;
}
/* Apply the code highlight to whole sentences instead of each word */
code.docutils.literal.notranslate {
background-color: #f3f4f7;
border-color: #f3f4f7;
border-radius: 5px;
padding: 1px 2px;
}
code.docutils.literal.notranslate span.pre {
background-color: transparent;
padding: 0;
}
/* Do not override in tables generated by autosummary */
tr.row-odd code.docutils.literal.notranslate span.pre {
background-color: transparent;
border-color: transparent;
}
tr.row-even code.docutils.literal.notranslate {
background-color: transparent;
border-color: transparent;
}
/* Fix for Sphinx gallery 0.11
See https://github.com/sphinx-gallery/sphinx-gallery/issues/990
*/
......@@ -35,3 +55,23 @@ article.pytorch-article div.tutorials-card div.card-body code {
border-bottom: none;
background-color: #AFB8C133;
}
/* C++ doc */
/* Fix the mis-indentation of the `class` header */
article.pytorch-article .cpp.class dt {
padding-left: 0.5em;
}
/* Keywords like const, explicit, and class */
dt.sig.sig-object.cpp > span.k > span.pre {
color: #c95362;
}
dt.sig.sig-object.cpp > span.n:not(.sig-param) > span.pre {
color: #9257c6;
}
/* Fix the color of namespace/class name prefixes in signatures */
dt.sig.sig-object.cpp > span.sig-prename.descclassname > span.n > span.pre {
color: #6c6c6d;
}
dt.sig.sig-object.cpp > span.sig-prename.descname > span.n > span.pre {
color: #6c6c6d;
}
......@@ -5,7 +5,11 @@
.. autoclass:: {{ fullname }}()
{%- if name in ["RNNTBundle.FeatureExtractor", "RNNTBundle.TokenProcessor"] %}
{%- set support_classes = [] %}
{%- if name in ["RNNTBundle.FeatureExtractor", "RNNTBundle.TokenProcessor", "Wav2Vec2FABundle.Tokenizer"] %}
{%- set methods = ["__call__"] %}
{%- elif name == "Wav2Vec2FABundle.Aligner" %}
{%- set attributes = [] %}
{%- set methods = ["__call__"] %}
{%- elif name == "Tacotron2TTSBundle.TextProcessor" %}
{%- set attributes = ["tokens"] %}
......@@ -13,14 +17,25 @@
{%- elif name == "Tacotron2TTSBundle.Vocoder" %}
{%- set attributes=["sample_rate"] %}
{%- set methods = ["__call__"] %}
{%- elif name == "VGGishBundle.VGGish" %}
{%- set attributes = [] %}
{%- set methods = ["forward"] %}
{%- elif name == "VGGishBundle.VGGishInputProcessor" %}
{%- set attributes = [] %}
{%- set methods = ["__call__"] %}
{% endif %}
..
ATTRIBUTES
{%- if attributes %}
Properties
----------
{%- endif %}
{%- for item in attributes %}
{%- if not item.startswith('_') %}
{{ item | underline("-") }}
{{ item | underline("~") }}
.. container:: py attribute
......@@ -29,13 +44,17 @@
{%- endif %}
{%- endfor %}
..
METHODS
{%- if methods %}
Methods
-------
{%- endif %}
{%- for item in methods %}
{%- if item != "__init__" %}
{{item | underline("-") }}
{{item | underline("~") }}
.. container:: py attribute
......@@ -43,3 +62,24 @@
{%- endif %}
{%- endfor %}
{%- if support_classes %}
Support Structures
------------------
{%- endif %}
{%- for item in support_classes %}
{% set components = item.split('.') %}
{{ components[-1] | underline("~") }}
.. container:: py attribute
.. autoclass:: {{[fullname, item] | join('.')}}
:members:
{%- endfor %}
..
autogenerated from source/_templates/autosummary/cuda_ctc_decoder_class.rst
{#
################################################################################
# autosummary template for CUCTCDecoder
# Since the class has multiple methods and support structures,
# we want them to show up in the table of contents.
# The default class template does not do this, so we use custom one here.
################################################################################
#}
{{ name | underline }}
{%- if name != "CUCTCDecoder" %}
.. autofunction:: {{fullname}}
{%- else %}
.. autoclass:: {{ fullname }}()
Methods
=======
{%- for item in members %}
{%- if not item.startswith('_') or item == "__call__" %}
{{ item | underline("-") }}
.. container:: py attribute
.. automethod:: {{[fullname, item] | join('.')}}
{%- endif %}
{%- endfor %}
Support Structures
==================
{%- for item in ["CUCTCHypothesis"] %}
{{ item | underline("-") }}
.. autoclass:: torchaudio.models.decoder.{{item}}
:members:
{%- endfor %}
{%- endif %}
..
autogenerated from source/_templates/autosummary/io.rst
{{ fullname | underline }}
.. autofunction:: {{ fullname }}
{%- if name == "info" %}
Support Structure
-----------------
AudioMetaData
~~~~~~~~~~~~~
.. autoclass:: torchaudio.AudioMetaData
{%- endif %}
......@@ -17,12 +17,12 @@
{%- if attributes %}
Properties
==========
----------
{%- for item in attributes %}
{%- if not item.startswith('_') and item not in inherited_members %}
{{ item | underline("-") }}
{{ item | underline("~") }}
.. container:: py attribute
......@@ -32,13 +32,19 @@ Properties
{%- endfor %}
{%- endif %}
{%- if members %}
Methods
=======
-------
{%- for item in members %}
{%- if not item.startswith('_') and item not in inherited_members and item not in attributes %}
{%- if
not item.startswith('_')
and item not in inherited_members
and item not in attributes
%}
{{ item | underline("-") }}
{{ item | underline("~") }}
.. container:: py attribute
......@@ -46,18 +52,39 @@ Methods
{%- endif %}
{%- endfor %}
{%- endif %}
{%- if name == "StreamReader" %}
{%- if name in ["StreamReader", "StreamWriter"] %}
Support Structures
==================
------------------
{%- for item in ["StreamReaderSourceStream", "StreamReaderSourceAudioStream", "StreamReaderSourceVideoStream", "StreamReaderOutputStream"] %}
{%- if name == "StreamReader" %}
{%- for item in [
"ChunkTensor",
"SourceStream",
"SourceAudioStream",
"SourceVideoStream",
"OutputStream",
"OutputAudioStream",
"OutputVideoStream",
] %}
{{ item | underline("~") }}
.. autoclass:: torchaudio.io._stream_reader.{{item}}()
:members:
{{ item | underline("-") }}
{%- endfor %}
{%- elif name == "StreamWriter" %}
CodecConfig
~~~~~~~~~~~
.. autoclass:: torchaudio.io.{{item}}()
.. autoclass:: torchaudio.io::CodecConfig
:members:
{%- endfor %}
{%- endif %}
{%- endif %}
..
autogenerated from source/_templates/autosummary/model_class.rst
.. currentmodule:: torchaudio.models
..
{%- set methods=["forward"] %}
{%- set helpers={
"torchaudio.models.RNNTBeamSearch": [
"Hypothesis",
],
}
-%}
{%- set factory={
"torchaudio.models.ConvTasNet": [
"conv_tasnet_base",
],
"torchaudio.models.Wav2Vec2Model": [
"wav2vec2_model",
"wav2vec2_base",
"wav2vec2_large",
"wav2vec2_large_lv60k",
"wav2vec2_xlsr_300m",
"wav2vec2_xlsr_1b",
"wav2vec2_xlsr_2b",
"hubert_base",
"hubert_large",
"hubert_xlarge",
"wavlm_model",
"wavlm_base",
"wavlm_large",
],
"torchaudio.models.HuBERTPretrainModel": [
"hubert_pretrain_model",
"hubert_pretrain_base",
"hubert_pretrain_large",
"hubert_pretrain_xlarge",
],
"torchaudio.models.RNNT": [
"emformer_rnnt_model",
"emformer_rnnt_base",
],
"torchaudio.models.HDemucs": [
"hdemucs_low",
"hdemucs_medium",
"hdemucs_high",
],
"torchaudio.models.SquimObjective": [
"squim_objective_model",
"squim_objective_base",
],
"torchaudio.models.SquimSubjective": [
"squim_subjective_model",
"squim_subjective_base",
],
}
-%}
{%- set utils={
"torchaudio.models.Wav2Vec2Model": [
"~torchaudio.models.wav2vec2.utils.import_fairseq_model",
"~torchaudio.models.wav2vec2.utils.import_huggingface_model",
]
}
-%}
{%- if name in ["Wav2Vec2Model"] %}
{{ methods.extend(["extract_features"]) }}
{%- elif name in ["Emformer", "RNNTBeamSearch", "WaveRNN", "Tacotron2", ] %}
......@@ -10,10 +72,17 @@
{{ methods.extend(["transcribe_streaming", "transcribe", "predict", "join"]) }}
{%- endif %}
.. TITLE
{{ name | underline }}
.. CLASS DEFINITIONS
.. autoclass:: {{ fullname }}
Methods
=======
{% for item in methods %}
{{item | underline("-") }}
......@@ -24,17 +93,58 @@
{%- endfor %}
{%- if name == "RNNTBeamSearch" %}
.. HELPER STRUCTURES
{%- if helpers[fullname] %}
Support Structures
==================
Hypothesis
----------
{%- for item in helpers[fullname] %}
{{item | underline("-") }}
.. container:: py attribute
.. autodata:: torchaudio.models.Hypothesis
.. autodata:: {{["torchaudio.models", item] | join('.')}}
:no-value:
{%- endfor %}
{%- endif %}
.. FACTORY FUNCTIONS
{%- if factory[fullname] %}
Factory Functions
=================
.. autosummary::
:toctree: ../generated
:nosignatures:
{% for item in factory[fullname] %}
{{["~torchaudio.models", item] | join('.')}}
{%- endfor %}
{%- endif %}
.. UTILITY FUNCTIONS
{%- if utils[fullname] %}
Utility Functions
=================
.. currentmodule:: torchaudio.models
.. autosummary::
:toctree: ../generated
:nosignatures:
{% for item in utils[fullname] %}
{{ item }}
{%- endfor %}
{%- endif %}
{# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #}
{% if page_source_suffix %}
{% set suffix = page_source_suffix %}
{% else %}
{% set suffix = source_suffix %}
{% endif %}
{% if meta is defined and meta is not none %}
{% set check_meta = True %}
{% else %}
{% set check_meta = False %}
{% endif %}
{% if check_meta and 'github_url' in meta %}
{% set display_github = True %}
{% endif %}
{% if check_meta and 'bitbucket_url' in meta %}
{% set display_bitbucket = True %}
{% endif %}
{% if check_meta and 'gitlab_url' in meta %}
{% set display_gitlab = True %}
{% endif %}
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="pytorch-breadcrumbs">
{% block breadcrumbs %}
<li>
<a href="{{ pathto(master_doc) }}">
{% if theme_pytorch_project == 'tutorials' %}
Tutorials
{% else %}
Docs
{% endif %}
</a> &gt;
</li>
{% for doc in parents %}
<li><a href="{{ doc.link|e }}">{{ doc.title }}</a> &gt;</li>
{% endfor %}
<li>{{ title }} &gt;</li>
{% if 'dev' in version %}
<li>Nightly (unstable)</li>
{% elif version_stable in version %}
<li>Current (stable)</li>
{% else %}
<li>Old version (stable)</li>
{% endif %}
{% endblock %}
{% block breadcrumbs_aside %}
<li class="pytorch-breadcrumbs-aside">
{% if hasdoc(pagename) %}
{% if display_github %}
{% if check_meta and 'github_url' in meta %}
<!-- User defined GitHub URL -->
<a href="{{ meta['github_url'] }}" class="fa fa-github"> {{ _('Edit on GitHub') }}</a>
{% else %}
<a href="https://{{ github_host|default("github.com") }}/{{ github_user }}/{{ github_repo }}/{{ theme_vcs_pageview_mode|default("blob") }}/{{ github_version }}{{ conf_py_path }}{{ pagename }}{{ suffix }}" class="fa fa-github"> {{ _('Edit on GitHub') }}</a>
{% endif %}
{% elif display_bitbucket %}
{% if check_meta and 'bitbucket_url' in meta %}
<!-- User defined Bitbucket URL -->
<a href="{{ meta['bitbucket_url'] }}" class="fa fa-bitbucket"> {{ _('Edit on Bitbucket') }}</a>
{% else %}
<a href="https://bitbucket.org/{{ bitbucket_user }}/{{ bitbucket_repo }}/src/{{ bitbucket_version}}{{ conf_py_path }}{{ pagename }}{{ suffix }}?mode={{ theme_vcs_pageview_mode|default("view") }}" class="fa fa-bitbucket"> {{ _('Edit on Bitbucket') }}</a>
{% endif %}
{% elif display_gitlab %}
{% if check_meta and 'gitlab_url' in meta %}
<!-- User defined GitLab URL -->
<a href="{{ meta['gitlab_url'] }}" class="fa fa-gitlab"> {{ _('Edit on GitLab') }}</a>
{% else %}
<a href="https://{{ gitlab_host|default("gitlab.com") }}/{{ gitlab_user }}/{{ gitlab_repo }}/{{ theme_vcs_pageview_mode|default("blob") }}/{{ gitlab_version }}{{ conf_py_path }}{{ pagename }}{{ suffix }}" class="fa fa-gitlab"> {{ _('Edit on GitLab') }}</a>
{% endif %}
{% elif show_source and source_url_prefix %}
<a href="{{ source_url_prefix }}{{ pagename }}{{ suffix }}"><img src="{{ pathto('_static/images/view-page-source-icon.svg', 1) }}"></a>
{% elif show_source and has_source and sourcename %}
<a href="{{ pathto('_sources/' + sourcename, true)|e }}" rel="nofollow"><img src="{{ pathto('_static/images/view-page-source-icon.svg', 1) }}"></a>
{% endif %}
{% endif %}
</li>
{% endblock %}
</ul>
{% if (theme_prev_next_buttons_location == 'top' or theme_prev_next_buttons_location == 'both') and (next or prev) %}
<div class="rst-breadcrumbs-buttons" role="navigation" aria-label="breadcrumb navigation">
{% if next %}
<a href="{{ next.link|e }}" class="btn btn-neutral float-right" title="{{ next.title|striptags|e }}" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>
{% endif %}
{% if prev %}
<a href="{{ prev.link|e }}" class="btn btn-neutral" title="{{ prev.title|striptags|e }}" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>
{% endif %}
</div>
{% endif %}
</div>
......@@ -2,7 +2,7 @@
{% block sidebartitle %}
<div class="version">
<a href="{{ pathto('../versions.html', 1) }}">{{ version }} &#x25BC</a>
<a href="{{ pathto('../versions.html', 1) }}"><span style="font-size:110%">{{ version }} &#x25BC</span></a>
</div>
{% include "searchbox.html" %}
{% endblock %}
......@@ -83,7 +83,6 @@
$(".main-menu a:contains('Github')").each(overwrite);
});
{% if 'tutorial' in pagename %}
{#
# Override the right side menu bar behavior so that subsections
# are shown by default in tutorial pages.
......@@ -107,6 +106,30 @@
}
};
});
{% endif %}
</script>
{% if 'libtorchaudio' in pagename %}
{#
# change the signature layout to one parameter per line if #params >= 2
#}
<script type="text/javascript">
$(window).ready(function() {
$("dt.sig.sig-object.cpp").each(function(i) {
let newline = "<br>\x20\x20\x20\x20";  // "\x20" is a space ("\20" was an invalid octal escape)
let params = $(this).children(".sig-param");
if (params.length >= 2) {
$(this).html($(this).html().replace(/, /g, "," + newline));
$(this).children(".sig-paren").each(function(i) {
if (i == 0) {
$(this).html($(this).html() + newline + "\x20");
} else if (i == 1) {
$(this).html("<br>" + $(this).html());
}
});
}
});
});
</script>
{% endif %}
{% endblock %}
.. _enabling_hw_decoder:
Enabling GPU video decoder/encoder
==================================
TorchAudio can make use of hardware-based video decoding and encoding supported by underlying FFmpeg libraries that are linked at runtime.
With NVIDIA's GPU decoder and encoder, it is also possible to pass CUDA tensors around directly, that is, to decode video into a CUDA tensor or encode video from a CUDA tensor, without moving data to/from the CPU.
This improves video throughput significantly. However, please note that not all video formats are supported by hardware acceleration.
This page goes through how to build FFmpeg with hardware acceleration. For details on the performance of the GPU decoder and encoder, please see the :ref:`NVDEC tutorial <nvdec_tutorial>` and :ref:`NVENC tutorial <nvenc_tutorial>`.
Overview
--------
Using them in TorchAudio requires additional FFmpeg configuration.
In the following, we look into how to enable GPU video decoding with `NVIDIA's Video codec SDK <https://developer.nvidia.com/nvidia-video-codec-sdk>`_.
To use NVENC/NVDEC with TorchAudio, the following items are required.
1. NVIDIA GPU with hardware video decoder/encoder.
2. FFmpeg libraries compiled with NVDEC/NVENC support. †
3. PyTorch / TorchAudio with CUDA support.
TorchAudio’s official binary distributions are compiled to work with FFmpeg libraries, and they contain the logic to use hardware decoding/encoding.
In the following, we build FFmpeg 4 libraries with NVDEC/NVENC support. You can also use FFmpeg 5 or 6.
The following procedure was tested on Ubuntu.
† For details on NVDEC/NVENC and FFmpeg, please refer to the following articles.
- https://docs.nvidia.com/video-technologies/video-codec-sdk/11.1/nvdec-video-decoder-api-prog-guide/
- https://docs.nvidia.com/video-technologies/video-codec-sdk/11.1/ffmpeg-with-nvidia-gpu/index.html#compiling-ffmpeg
- https://developer.nvidia.com/blog/nvidia-ffmpeg-transcoding-guide/
Check the GPU and CUDA version
------------------------------
First, check the available GPU. Here, we have a Tesla T4 with CUDA Toolkit 11.2 installed.
.. code-block::
$ nvidia-smi
Fri Oct 7 13:01:26 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |
| N/A 56C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
Checking the compute capability
-------------------------------
Later, we need the compute capability supported by this GPU. The following page lists GPUs and their corresponding compute capabilities. The compute capability of the T4 is ``7.5``.
https://developer.nvidia.com/cuda-gpus
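If a recent NVIDIA driver is installed, ``nvidia-smi`` can also report the compute capability directly. A minimal sketch, assuming the driver supports the ``compute_cap`` query field (older drivers do not; the output shown is illustrative):
.. code-block:: bash
# Query the compute capability directly from the driver.
$ nvidia-smi --query-gpu=name,compute_cap --format=csv
name, compute_cap
Tesla T4, 7.5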
Install NVIDIA Video Codec Headers
----------------------------------
To build FFmpeg with NVDEC/NVENC, we first need to install the headers that FFmpeg uses to interact with the Video Codec SDK.
Since we have CUDA 11 working in the system, we use one of the ``n11`` tags.
.. code-block:: bash
git clone https://git.videolan.org/git/ffmpeg/nv-codec-headers.git
cd nv-codec-headers
git checkout n11.0.10.1
sudo make install
The installation location can be changed with ``make PREFIX=<DESIRED_DIRECTORY> install``.
.. code-block:: text
Cloning into 'nv-codec-headers'...
remote: Enumerating objects: 819, done.
remote: Counting objects: 100% (819/819), done.
remote: Compressing objects: 100% (697/697), done.
remote: Total 819 (delta 439), reused 0 (delta 0)
Receiving objects: 100% (819/819), 156.42 KiB | 410.00 KiB/s, done.
Resolving deltas: 100% (439/439), done.
Note: checking out 'n11.0.10.1'.
You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.
If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:
git checkout -b <new-branch-name>
HEAD is now at 315ad74 add cuMemcpy
sed 's#@@PREFIX@@#/usr/local#' ffnvcodec.pc.in > ffnvcodec.pc
install -m 0755 -d '/usr/local/include/ffnvcodec'
install -m 0644 include/ffnvcodec/*.h '/usr/local/include/ffnvcodec'
install -m 0755 -d '/usr/local/lib/pkgconfig'
install -m 0644 ffnvcodec.pc '/usr/local/lib/pkgconfig'
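If you install the headers under a non-default prefix as mentioned above, the FFmpeg build later finds them through ``pkg-config``, so the custom prefix must be added to its search path. A sketch, assuming a hypothetical prefix ``/opt/ffnvcodec``:
.. code-block:: bash
# Install the headers under a custom prefix ...
sudo make PREFIX=/opt/ffnvcodec install
# ... and let pkg-config find the resulting ffnvcodec.pc
export PKG_CONFIG_PATH=/opt/ffnvcodec/lib/pkgconfig:${PKG_CONFIG_PATH}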
Install FFmpeg dependencies
---------------------------
Next, we install tools and libraries required during the FFmpeg build.
The minimum requirement is `Yasm <https://yasm.tortall.net/>`_.
Here we additionally install the H264 video codec and HTTPS protocol support,
which we use later to verify the installation.
.. code-block:: bash
sudo apt -qq update
sudo apt -qq install -y yasm libx264-dev libgnutls28-dev
.. code-block:: text
... Omitted for brevity ...
STRIP install-libavutil-shared
Setting up libx264-dev:amd64 (2:0.152.2854+gite9a5903-2) ...
Setting up yasm (1.3.0-2build1) ...
Setting up libunbound2:amd64 (1.6.7-1ubuntu2.5) ...
Setting up libp11-kit-dev:amd64 (0.23.9-2ubuntu0.1) ...
Setting up libtasn1-6-dev:amd64 (4.13-2) ...
Setting up libtasn1-doc (4.13-2) ...
Setting up libgnutlsxx28:amd64 (3.5.18-1ubuntu1.6) ...
Setting up libgnutls-dane0:amd64 (3.5.18-1ubuntu1.6) ...
Setting up libgnutls-openssl27:amd64 (3.5.18-1ubuntu1.6) ...
Setting up libgmpxx4ldbl:amd64 (2:6.1.2+dfsg-2) ...
Setting up libidn2-dev:amd64 (2.0.4-1.1ubuntu0.2) ...
Setting up libidn2-0-dev (2.0.4-1.1ubuntu0.2) ...
Setting up libgmp-dev:amd64 (2:6.1.2+dfsg-2) ...
Setting up nettle-dev:amd64 (3.4.1-0ubuntu0.18.04.1) ...
Setting up libgnutls28-dev:amd64 (3.5.18-1ubuntu1.6) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
Processing triggers for libc-bin (2.27-3ubuntu1.6) ...
Build FFmpeg with NVDEC/NVENC support
-------------------------------------
Next, we download the FFmpeg 4 source code. We use 4.4.2 here.
.. code-block:: bash
wget -q https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n4.4.2.tar.gz
tar -xf n4.4.2.tar.gz
cd FFmpeg-n4.4.2
Next, we configure the FFmpeg build. Note the following:
1. We provide flags like ``-I/usr/local/cuda/include`` and ``-L/usr/local/cuda/lib64`` to let the build process know where the CUDA libraries are located.
2. We provide flags like ``--enable-nvdec`` and ``--enable-nvenc`` to enable NVDEC/NVENC.
3. We also provide NVCC flags with compute capability ``75``, which corresponds to the T4's ``7.5``. †
4. We install the library in ``/usr/lib/``.
.. note::
† The configuration script verifies NVCC by compiling sample code. By default it uses an old compute capability such as ``30``, which is no longer supported by CUDA 11, so it is required to set a correct compute capability.
.. code-block:: bash
prefix=/usr/
ccap=75
./configure \
--prefix="${prefix}" \
--extra-cflags='-I/usr/local/cuda/include' \
--extra-ldflags='-L/usr/local/cuda/lib64' \
--nvccflags="-gencode arch=compute_${ccap},code=sm_${ccap} -O2" \
--disable-doc \
--enable-decoder=aac \
--enable-decoder=h264 \
--enable-decoder=h264_cuvid \
--enable-decoder=rawvideo \
--enable-indev=lavfi \
--enable-encoder=libx264 \
--enable-encoder=h264_nvenc \
--enable-demuxer=mov \
--enable-muxer=mp4 \
--enable-filter=scale \
--enable-filter=testsrc2 \
--enable-protocol=file \
--enable-protocol=https \
--enable-gnutls \
--enable-shared \
--enable-gpl \
--enable-nonfree \
--enable-cuda-nvcc \
--enable-libx264 \
--enable-nvenc \
--enable-cuvid \
--enable-nvdec
.. code-block:: text
install prefix /usr/
source path .
C compiler gcc
C library glibc
ARCH x86 (generic)
big-endian no
runtime cpu detection yes
standalone assembly yes
x86 assembler yasm
MMX enabled yes
MMXEXT enabled yes
3DNow! enabled yes
3DNow! extended enabled yes
SSE enabled yes
SSSE3 enabled yes
AESNI enabled yes
AVX enabled yes
AVX2 enabled yes
AVX-512 enabled yes
XOP enabled yes
FMA3 enabled yes
FMA4 enabled yes
i686 features enabled yes
CMOV is fast yes
EBX available yes
EBP available yes
debug symbols yes
strip symbols yes
optimize for size no
optimizations yes
static no
shared yes
postprocessing support no
network support yes
threading support pthreads
safe bitstream reader yes
texi2html enabled no
perl enabled yes
pod2man enabled yes
makeinfo enabled no
makeinfo supports HTML no
External libraries:
alsa libx264 lzma
bzlib libxcb zlib
gnutls libxcb_shape
iconv libxcb_xfixes
External libraries providing hardware acceleration:
cuda cuvid nvenc
cuda_llvm ffnvcodec v4l2_m2m
cuda_nvcc nvdec
Libraries:
avcodec avformat swscale
avdevice avutil
avfilter swresample
Programs:
ffmpeg ffprobe
Enabled decoders:
aac hevc rawvideo
av1 mjpeg vc1
h263 mpeg1video vp8
h264 mpeg2video vp9
h264_cuvid mpeg4
Enabled encoders:
h264_nvenc libx264
Enabled hwaccels:
av1_nvdec mpeg1_nvdec vp8_nvdec
h264_nvdec mpeg2_nvdec vp9_nvdec
hevc_nvdec mpeg4_nvdec wmv3_nvdec
mjpeg_nvdec vc1_nvdec
Enabled parsers:
h263 mpeg4video vp9
Enabled demuxers:
mov
Enabled muxers:
mov mp4
Enabled protocols:
file tcp
https tls
Enabled filters:
aformat hflip transpose
anull null trim
atrim scale vflip
format testsrc2
Enabled bsfs:
aac_adtstoasc null vp9_superframe_split
h264_mp4toannexb vp9_superframe
Enabled indevs:
lavfi
Enabled outdevs:
License: nonfree and unredistributable
Now we build and install:
.. code-block:: bash
make clean
make -j
sudo make install
.. code-block:: text
... Omitted for brevity ...
INSTALL libavdevice/libavdevice.so
INSTALL libavfilter/libavfilter.so
INSTALL libavformat/libavformat.so
INSTALL libavcodec/libavcodec.so
INSTALL libswresample/libswresample.so
INSTALL libswscale/libswscale.so
INSTALL libavutil/libavutil.so
INSTALL install-progs-yes
INSTALL ffmpeg
INSTALL ffprobe
Checking the installation
-------------------------
To verify that the FFmpeg we built has CUDA support, we can check the lists of available decoders and encoders.
.. code-block:: bash
ffprobe -hide_banner -decoders | grep h264
.. code-block:: text
VFS..D h264 H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10
V..... h264_cuvid Nvidia CUVID H264 decoder (codec h264)
.. code-block:: bash
ffmpeg -hide_banner -encoders | grep 264
.. code-block:: text
V..... libx264 libx264 H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (codec h264)
V....D h264_nvenc NVIDIA NVENC H.264 encoder (codec h264)
The following command fetches a video from a remote server, decodes it with NVDEC (cuvid), and re-encodes it with NVENC. If this command does not work, there is an issue with the FFmpeg installation, and TorchAudio would not be able to use it either.
.. code-block:: bash
$ src="https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
$ ffmpeg -hide_banner -y -vsync 0 \
-hwaccel cuvid \
-hwaccel_output_format cuda \
-c:v h264_cuvid \
-resize 360x240 \
-i "${src}" \
-c:a copy \
-c:v h264_nvenc \
-b:v 5M test.mp4
Note that there is ``Stream #0:0 -> #0:0 (h264 (h264_cuvid) -> h264 (h264_nvenc))``, which means that the video is decoded with the ``h264_cuvid`` decoder and encoded with the ``h264_nvenc`` encoder.
.. code-block::
Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4':
Metadata:
major_brand : mp42
minor_version : 512
compatible_brands: mp42iso2avc1mp41
encoder : Lavf58.76.100
Duration: 00:03:26.04, start: 0.000000, bitrate: 1294 kb/s
Stream #0:0(eng): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, bt709), 960x540 [SAR 1:1 DAR 16:9], 1156 kb/s, 29.97 fps, 29.97 tbr, 30k tbn, 59.94 tbc (default)
Metadata:
handler_name : ?Mainconcept Video Media Handler
vendor_id : [0][0][0][0]
Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s (default)
Metadata:
handler_name : #Mainconcept MP4 Sound Media Handler
vendor_id : [0][0][0][0]
Stream mapping:
Stream #0:0 -> #0:0 (h264 (h264_cuvid) -> h264 (h264_nvenc))
Stream #0:1 -> #0:1 (copy)
Press [q] to stop, [?] for help
Output #0, mp4, to 'test.mp4':
Metadata:
major_brand : mp42
minor_version : 512
compatible_brands: mp42iso2avc1mp41
encoder : Lavf58.76.100
Stream #0:0(eng): Video: h264 (Main) (avc1 / 0x31637661), cuda(tv, bt709, progressive), 360x240 [SAR 1:1 DAR 3:2], q=2-31, 5000 kb/s, 29.97 fps, 30k tbn (default)
Metadata:
handler_name : ?Mainconcept Video Media Handler
vendor_id : [0][0][0][0]
encoder : Lavc58.134.100 h264_nvenc
Side data:
cpb: bitrate max/min/avg: 0/0/5000000 buffer size: 10000000 vbv_delay: N/A
Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s (default)
Metadata:
handler_name : #Mainconcept MP4 Sound Media Handler
vendor_id : [0][0][0][0]
frame= 6175 fps=1712 q=11.0 Lsize= 37935kB time=00:03:26.01 bitrate=1508.5kbits/s speed=57.1x
video:34502kB audio:3234kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.526932%
Using the GPU decoder/encoder from TorchAudio
---------------------------------------------
Checking the installation
~~~~~~~~~~~~~~~~~~~~~~~~~
Once FFmpeg is working properly with hardware acceleration, we need to check whether TorchAudio can pick it up correctly.
There are utility functions to query the capability of FFmpeg in :py:mod:`torchaudio.utils.ffmpeg_utils`.
You can first use :py:func:`~torchaudio.utils.ffmpeg_utils.get_video_decoders` and :py:func:`~torchaudio.utils.ffmpeg_utils.get_video_encoders` to check if GPU decoders and encoders (such as ``h264_cuvid`` and ``h264_nvenc``) are listed.
It is often the case that there are multiple FFmpeg installations on the system, and TorchAudio loads a different one than expected. In such cases, checking the installation with the ``ffmpeg`` command does not help. You can use functions like :py:func:`~torchaudio.utils.ffmpeg_utils.get_build_config` and :py:func:`~torchaudio.utils.ffmpeg_utils.get_versions` to get information about the FFmpeg libraries TorchAudio loaded.
.. code-block:: python
from torchaudio.utils import ffmpeg_utils
print("Library versions:")
print(ffmpeg_utils.get_versions())
print("\nBuild config:")
print(ffmpeg_utils.get_build_config())
print("\nDecoders:")
print([k for k in ffmpeg_utils.get_video_decoders().keys() if "cuvid" in k])
print("\nEncoders:")
print([k for k in ffmpeg_utils.get_video_encoders().keys() if "nvenc" in k])
.. code-block:: text
Library versions:
{'libavutil': (56, 31, 100), 'libavcodec': (58, 54, 100), 'libavformat': (58, 29, 100), 'libavfilter': (7, 57, 100), 'libavdevice': (58, 8, 100)}
Build config:
--prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared
Decoders:
['h264_cuvid', 'hevc_cuvid', 'mjpeg_cuvid', 'mpeg1_cuvid', 'mpeg2_cuvid', 'mpeg4_cuvid', 'vc1_cuvid', 'vp8_cuvid', 'vp9_cuvid']
Encoders:
['h264_nvenc', 'nvenc', 'nvenc_h264', 'nvenc_hevc', 'hevc_nvenc']
Using the hardware decoder and encoder
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Once the installation and runtime linking work fine, you can test GPU decoding with a short script like the one below.
For details on the performance of the GPU decoder and encoder, please see the :ref:`NVDEC tutorial <nvdec_tutorial>` and :ref:`NVENC tutorial <nvenc_tutorial>`.
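The following is a minimal sketch of such a test (the asset URL is the one used earlier on this page). It decodes one chunk of video with ``h264_cuvid`` directly into CUDA memory via :py:class:`torchaudio.io.StreamReader`; if everything is set up correctly, the printed device should be ``cuda:0``.
.. code-block:: bash
python - <<'EOF'
from torchaudio.io import StreamReader
# Same test asset as used with the ffmpeg command earlier on this page.
src = "https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
reader = StreamReader(src)
# Decode with NVDEC and keep the decoded frames on the GPU as CUDA tensors.
reader.add_video_stream(frames_per_chunk=10, decoder="h264_cuvid", hw_accel="cuda:0")
(chunk,) = next(reader.stream())
print(chunk.shape, chunk.dtype, chunk.device)
EOF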
Building on Jetson
==================
1. Install JetPack
------------------
JetPack includes the collection of CUDA-related libraries required to run PyTorch with CUDA.
Please refer to https://developer.nvidia.com/embedded/learn/get-started-jetson-agx-orin-devkit for up-to-date instructions.
.. code-block::
sudo bash -c 'echo "deb https://repo.download.nvidia.com/jetson/common r34.1 main" >> /etc/apt/sources.list.d/nvidia-l4t-apt-source.list'
sudo bash -c 'echo "deb https://repo.download.nvidia.com/jetson/t234 r34.1 main" >> /etc/apt/sources.list.d/nvidia-l4t-apt-source.list'
sudo apt update
sudo apt dist-upgrade
# REBOOT
sudo apt install nvidia-jetpack
Checking the versions
~~~~~~~~~~~~~~~~~~~~~
To check the installed versions, you can use the following commands:
.. code-block::
# JetPack
$ apt list --installed | grep nvidia-jetpack
nvidia-jetpack-dev/stable,now 5.0.1-b118 arm64 [installed,automatic]
nvidia-jetpack-runtime/stable,now 5.0.1-b118 arm64 [installed,automatic]
nvidia-jetpack/stable,now 5.0.1-b118 arm64 [installed]
# CUDA
$ apt list --installed | grep cuda-toolkit
cuda-toolkit-11-4-config-common/stable,now 11.4.243-1 all [installed,automatic]
cuda-toolkit-11-4/stable,now 11.4.14-1 arm64 [installed,automatic]
cuda-toolkit-11-config-common/stable,now 11.4.243-1 all [installed,automatic]
cuda-toolkit-config-common/stable,now 11.4.243-1 all [installed,automatic]
# cuDNN
$ apt list --installed | grep cudnn
libcudnn8-dev/stable,now 8.3.2.49-1+cuda11.4 arm64 [installed,automatic]
libcudnn8-samples/stable,now 8.3.2.49-1+cuda11.4 arm64 [installed,automatic]
libcudnn8/stable,now 8.3.2.49-1+cuda11.4 arm64 [installed,automatic]
nvidia-cudnn8-dev/stable,now 5.0.1-b118 arm64 [installed,automatic]
nvidia-cudnn8-runtime/stable,now 5.0.1-b118 arm64 [installed,automatic]
.. image:: https://download.pytorch.org/torchaudio/doc-assets/jetson-package-versions.png
:width: 360px
2. [Optional] Install jtop
--------------------------
Since Tegra GPUs are not supported by the ``nvidia-smi`` command, it is recommended to install ``jtop``.
Only the super-user can install ``jtop``, so make sure to add ``-U`` so that running ``jtop`` won't require super-user privileges.
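A sketch of the installation, assuming ``jtop`` is distributed via the ``jetson-stats`` package on PyPI:
.. code-block:: bash
# Install as super-user; per the note above, -U lets regular users run jtop.
sudo pip3 install -U jetson-stats
jtop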
3. Install ``pip`` in user env
------------------------------
By default, the ``pip`` / ``pip3`` commands use the ones from the system directory ``/usr/bin/``, whose ``site-packages`` directory is protected and cannot be modified without ``sudo``.
One way to work around this is to install ``pip`` in the user directory.
https://forums.developer.nvidia.com/t/python-3-module-install-folder/181321
.. code-block::
wget https://bootstrap.pypa.io/get-pip.py
python get-pip.py --user
After this, verify that the ``pip`` command points to the one in the user directory.
.. code-block::
$ which pip
/home/USER/.local/bin/pip
4. Install PyTorch
------------------
As of PyTorch 1.13 and torchaudio 0.13, there are no official pre-built binaries for Linux ARM64. NVIDIA provides custom pre-built binaries for PyTorch, which work with specific JetPack versions.
Please refer to https://docs.nvidia.com/deeplearning/frameworks/install-pytorch-jetson-platform/index.html for up-to-date instructions on how to install PyTorch.
.. code-block::
$ package=torch-1.13.0a0+340c4120.nv22.06-cp38-cp38-linux_aarch64.whl
$ wget "https://developer.download.nvidia.com/compute/redist/jp/v50/pytorch/${package}"
$ pip install --no-cache "${package}"
Verify the installation by checking the version and CUDA device accessibility.
.. code-block::
$ python -c '
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.empty((1, 2), device=torch.device("cuda")))
'
1.13.0a0+410ce96a.nv22.12
True
tensor([[0., 0.]], device='cuda:0')
.. image:: https://download.pytorch.org/torchaudio/doc-assets/jetson-torch.png
:width: 360px
5. Build TorchAudio
-------------------
1. Install build tools
~~~~~~~~~~~~~~~~~~~~~~
.. code-block::
pip install cmake ninja
2. Install dependencies
~~~~~~~~~~~~~~~~~~~~~~~
.. code-block::
sudo apt install ffmpeg libavformat-dev libavcodec-dev libavutil-dev libavdevice-dev libavfilter-dev
3. Build TorchAudio
~~~~~~~~~~~~~~~~~~~
.. code-block::
git clone https://github.com/pytorch/audio
cd audio
USE_CUDA=1 pip install -v -e . --no-use-pep517
4. Check the installation
~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block::
import torchaudio
print(torchaudio.__version__)
torchaudio.utils.ffmpeg_utils.get_build_config()
.. code-block::
2.0.0a0+2ead941
--prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/aarch64-linux-gnu --incdir=/usr/include/aarch64-linux-gnu --arch=arm64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared
.. image:: https://download.pytorch.org/torchaudio/doc-assets/jetson-verify-build.png
:width: 360px
Building on Linux and macOS
===========================
1. Install Conda and activate conda environment
-----------------------------------------------
Please follow the instructions at https://docs.conda.io/en/latest/miniconda.html
2. Install PyTorch
------------------
Please select the version of PyTorch you want to install from https://pytorch.org/get-started/locally/
Here, we install the nightly build.
.. code-block::
conda install pytorch -c pytorch-nightly
3. Install build tools
----------------------
.. code-block::
conda install cmake ninja
4. Clone the torchaudio repository
----------------------------------
.. code-block::
git clone https://github.com/pytorch/audio
cd audio
5. Build
--------
.. code-block::
python setup.py develop
.. note::
Due to the complexity of the build process, TorchAudio only supports in-place builds.
To use ``pip``, please use the ``--no-use-pep517`` option.
``pip install -v -e . --no-use-pep517``
[Optional] Build TorchAudio with a custom built FFmpeg
------------------------------------------------------
By default, torchaudio tries to build the FFmpeg extension with support for multiple FFmpeg versions. This process uses pre-built FFmpeg libraries compiled for specific CPU architectures like ``x86_64`` and ``aarch64`` (``arm64``).
If your CPU is not one of those, the build process can fail. To work around this, you can disable the FFmpeg integration (by setting the environment variable ``USE_FFMPEG=0``) or switch to the single-version FFmpeg extension.
To build the single-version FFmpeg extension, FFmpeg binaries must be provided by the user and available in the build environment. To do so, install FFmpeg and set the ``FFMPEG_ROOT`` environment variable to the location of FFmpeg.
.. code-block::
conda install -c conda-forge ffmpeg
FFMPEG_ROOT=${CONDA_PREFIX} python setup.py develop
Building from source
====================
TorchAudio integrates PyTorch for numerical computation and third-party libraries for multimedia I/O. Building from source requires the following tools.
- `PyTorch <https://pytorch.org>`_
- `CMake <https://cmake.org/>`_
- `Ninja <https://ninja-build.org/>`_
- C++ compiler with C++17 support
- `GCC <https://gcc.gnu.org/>`_ (Linux)
- `Clang <https://clang.llvm.org/>`_ (macOS)
- `MSVC <https://visualstudio.microsoft.com>`_ 2019 or newer (Windows)
- `CUDA toolkit <https://developer.nvidia.com/cuda-toolkit>`_ and `cuDNN <https://developer.nvidia.com/cudnn>`_ (if building the CUDA extension)
Most of the tools are available in `Conda <https://conda.io/>`_, so we recommend using conda.
.. toctree::
:maxdepth: 1
build.linux
build.windows
build.jetson
Customizing the build
---------------------
TorchAudio's integration with third-party libraries can be enabled/disabled via
environment variables.
They can be enabled by passing ``1`` and disabled by passing ``0`` (see the example further below).
- ``BUILD_SOX``: Enable/disable I/O features based on libsox.
- ``BUILD_KALDI``: Enable/disable feature extraction based on Kaldi.
- ``BUILD_RNNT``: Enable/disable custom RNN-T loss function.
- ``USE_FFMPEG``: Enable/disable I/O features based on FFmpeg libraries.
- ``USE_ROCM``: Enable/disable AMD ROCm support.
- ``USE_CUDA``: Enable/disable CUDA support.
For the latest configurations and their default values, please check the source code.
https://github.com/pytorch/audio/blob/main/tools/setup_helpers/extension.py
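For example, a hypothetical build that enables FFmpeg and CUDA support while skipping the Kaldi component could look like this:
.. code-block:: bash
# 1 enables a component, 0 disables it.
USE_FFMPEG=1 USE_CUDA=1 BUILD_KALDI=0 python setup.py develop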
Building on Windows
===================
To build TorchAudio on Windows, we need to enable the C++ compiler and install the build tools and runtime dependencies.
We use Microsoft Visual C++ for compiling C++ and Conda for managing the other build tools and runtime dependencies.
1. Install build tools
----------------------
MSVC
~~~~
Please follow the instructions at https://visualstudio.microsoft.com/downloads/, and make sure to install the C++ development tools.
.. note::
The official binary distributions are compiled with MSVC 2019.
The following section uses paths from MSVC 2019 Community Edition.
Conda
~~~~~
Please follow the instructions at https://docs.conda.io/en/latest/miniconda.html.
2. Start the dev environment
----------------------------
In the following, we need to use the C++ compiler (``cl``) and the Conda package manager (``conda``).
We also use Bash for an experience similar to Linux/macOS.
To do so, the following three steps are required.
1. Open command prompt
2. Enable developer environment
3. [Optional] Launch bash
|
The following combination is known to work.
1. Launch Anaconda3 Command Prompt.
|
.. image:: https://download.pytorch.org/torchaudio/doc-assets/windows-conda.png
:width: 360px
|
Please make sure that the ``conda`` command is recognized.
|
.. image:: https://download.pytorch.org/torchaudio/doc-assets/windows-conda2.png
:width: 360px
|
2. Activate dev tools by running the following command.
We need to use the MSVC x64 toolset for compilation.
To enable the toolset, one can use the ``vcvarsall.bat`` or ``vcvars64.bat`` files, which
are found under Visual Studio's installation folder, under ``VC\Auxiliary\Build\``.
More information is available at https://docs.microsoft.com/en-us/cpp/build/how-to-enable-a-64-bit-visual-cpp-toolset-on-the-command-line?view=msvc-160#use-vcvarsallbat-to-set-a-64-bit-hosted-build-architecture
.. code-block::
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
Please make sure that the ``cl`` command is recognized.
.. image:: https://download.pytorch.org/torchaudio/doc-assets/windows-msvc.png
:width: 360px
3. [Optional] Launch bash with the following command.
If you want a UX similar to Linux/macOS, you can launch Bash. However, please note that in the Bash environment, file paths differ from the native Windows style, and the ``torchaudio.datasets`` module does not work.
.. code-block::
Miniconda3\Library\bin\bash.exe
.. image:: https://download.pytorch.org/torchaudio/doc-assets/windows-bash.png
:width: 360px
3. Install PyTorch
------------------
Please refer to https://pytorch.org/get-started/locally/ for the up-to-date way to install PyTorch.
The following command installs the nightly build version of PyTorch.
.. code-block::
# CPU-only
conda install pytorch cpuonly -c pytorch-nightly
# With CUDA support
conda install pytorch pytorch-cuda=11.7 -c pytorch-nightly -c nvidia
When installing the CUDA-enabled version, the CUDA toolkit is installed as well.
4. [Optional] cuDNN
-------------------
If you intend to build CUDA-related features, please install cuDNN.
Download cuDNN from https://developer.nvidia.com/cudnn, and extract the files into
the same directories as the CUDA toolkit.
When using conda, the directories are ``${CONDA_PREFIX}/bin``, ``${CONDA_PREFIX}/include``, ``${CONDA_PREFIX}/Lib/x64``.
5. Install external dependencies
--------------------------------
.. code-block::
conda install cmake ninja
6. Build TorchAudio
-------------------
Now that we have everything ready, we can build TorchAudio.
.. code-block::
git clone https://github.com/pytorch/audio
cd audio
.. code-block::
# In Command Prompt
python setup.py develop
.. code-block::
# In Bash
python setup.py develop
.. note::
Due to the complexity of the build process, TorchAudio only supports in-place builds.
To use ``pip``, please use the ``--no-use-pep517`` option.
``pip install -v -e . --no-use-pep517``
[Optional] Build TorchAudio with a custom FFmpeg
------------------------------------------------
By default, torchaudio tries to build the FFmpeg extension with support for multiple FFmpeg versions. This process uses pre-built FFmpeg libraries compiled for specific CPU architectures like ``x86_64``.
If your CPU is different, the build process can fail. To work around this, you can disable the FFmpeg integration (by setting the environment variable ``USE_FFMPEG=0``) or switch to the single-version FFmpeg extension.
To build the single-version FFmpeg extension, FFmpeg binaries must be provided by the user and available in the build environment. To do so, install FFmpeg and set the ``FFMPEG_ROOT`` environment variable to the location of FFmpeg.
.. code-block::
conda install -c conda-forge ffmpeg
FFMPEG_ROOT=${CONDA_PREFIX}/Library python setup.py develop
[Optional] Building FFmpeg from source
--------------------------------------
The following section illustrates a way to build FFmpeg libraries from source.
Conda-forge's FFmpeg package comes with support for major codecs and GPU decoders, so regular users and developers do not need to build FFmpeg from source.
If you are not using Conda, then you can either find a pre-built binary distribution or build FFmpeg by yourself.
Also, in case a torchaudio developer needs to update and customize the CI for the FFmpeg build, this section might be helpful.
1. Install MSYS2
~~~~~~~~~~~~~~~~
To build FFmpeg in a way that is usable from the TorchAudio development environment, we need to build binaries native to ``MINGW64``. To do so, we need the tools required by FFmpeg's build process, such as ``pkg-config`` and ``make``, that work in the ``MINGW64`` environment. For this purpose, we use MSYS2.
FFmpeg's official documentation touches on this: https://trac.ffmpeg.org/wiki/CompilationGuide/MinGW
Please follow the instructions at https://www.msys2.org/ to install MSYS2.
.. note::
In CI environments, `Chocolatey <https://chocolatey.org/>`_ can often be used to install MSYS2.
2. Launch MSYS2
~~~~~~~~~~~~~~~
Use the shortcut to launch MSYS2 (MINGW64).
.. image:: https://download.pytorch.org/torchaudio/doc-assets/windows-msys2.png
:width: 360px
.. note::
The Bash environment in MSYS2 does not play well with the Conda env, so do not add the Conda initialization script to the ``~/.bashrc`` of the MSYS2 environment (i.e. ``C:\msys2\home\USER\.bashrc``). Instead, add it to ``C:\Users\USER\.bashrc``.
3. Install build tools
~~~~~~~~~~~~~~~~~~~~~~
.. code-block::
$ pacman -S mingw-w64-x86_64-make
$ pacman -S mingw-w64-x86_64-yasm
After the installation, you should have packages similar to the following:
.. code-block::
$ pacman -Qe
base 2020.12-1
base-devel 2022.01-2
filesystem 2023.01-2
mingw-w64-x86_64-make 4.3-1
mingw-w64-x86_64-pkgconf 1.8.0-2
mingw-w64-x86_64-yasm 1.3.0-4
msys2-runtime 3.4.3-5
4. Build FFmpeg
~~~~~~~~~~~~~~~
Check out the FFmpeg source code.
.. code-block::
git clone https://github.com/ffmpeg/ffmpeg
cd ffmpeg
git checkout <VERSION>
Build:
.. code-block::
./configure --toolchain=msvc
make -j
If the build succeeds, ``ffmpeg.exe`` should be found in the same directory. Make sure that you can run it.
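For example, still inside the MSYS2 (MINGW64) shell:
.. code-block:: bash
# The freshly built binary should run and print its configuration.
./ffmpeg.exe -version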
5. Verify the build
~~~~~~~~~~~~~~~~~~~
Check that the resulting FFmpeg binary is accessible from the Conda env.
Now launch a new command prompt and enable the TorchAudio development environment. Make sure that you can run the ``ffmpeg.exe`` binary generated in the previous step.
......@@ -25,6 +25,7 @@ from datetime import datetime
sys.path.insert(0, os.path.abspath("."))
import pytorch_sphinx_theme
# -- General configuration ------------------------------------------------
......@@ -52,8 +53,20 @@ extensions = [
"sphinxcontrib.bibtex",
"sphinx_gallery.gen_gallery",
"nbsphinx",
"breathe",
]
breathe_projects = {"libtorchaudio": "cpp/xml"}
breathe_default_project = "libtorchaudio"
breathe_projects_source = {
"libtorchaudio": (
"../../torchaudio/csrc/ffmpeg/",
["ffmpeg.h"],
)
}
nbsphinx_requirejs_path = ""
autodoc_member_order = "bysource"
......@@ -114,6 +127,22 @@ def _get_pattern():
return ret
def reset_mpl(gallery_conf, fname):
from sphinx_gallery.scrapers import _reset_matplotlib
_reset_matplotlib(gallery_conf, fname)
import matplotlib
matplotlib.rcParams.update(
{
"image.interpolation": "none",
"figure.figsize": (9.6, 4.8),
"font.size": 8.0,
"axes.axisbelow": True,
}
)
sphinx_gallery_conf = {
"examples_dirs": [
"../../examples/tutorials",
......@@ -126,6 +155,7 @@ sphinx_gallery_conf = {
"promote_jupyter_magic": True,
"first_notebook_cell": None,
"doc_module": ("torchaudio",),
"reset_modules": (reset_mpl, "seaborn"),
}
autosummary_generate = True
......@@ -164,6 +194,15 @@ else:
version = f"Nightly Build ({torchaudio.__version__})"
release = "nightly"
#
# Specify the version of the current stable release.
# Used in `docs/source/_templates/breadcrumbs.html`
#
# https://stackoverflow.com/a/33845358/1106930
#
html_context = {"version_stable": "2.1.1"}
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path
......@@ -201,7 +240,7 @@ html_theme_options = {
"display_version": True,
"logo_only": True,
"navigation_with_keys": True,
"analytics_id": "UA-117752657-2",
"analytics_id": "GTM-T8XT4PS",
}
# Add any paths that contain custom static files (such as style sheets) here,
......
......@@ -26,6 +26,23 @@ Utility
apply_codec
resample
loudness
convolve
fftconvolve
add_noise
preemphasis
deemphasis
speed
frechet_distance
Forced Alignment
----------------
.. autosummary::
:toctree: generated
:nosignatures:
forced_align
merge_tokens
TokenSpan
Filtering
......@@ -73,7 +90,6 @@ Feature Extractions
compute_deltas
detect_pitch_frequency
sliding_window_cmn
compute_kaldi_pitch
spectral_centroid
Multi-channel
......
......@@ -22,6 +22,18 @@ model implementations and application components.
logo
references
.. toctree::
:maxdepth: 2
:caption: Installation
:hidden:
installation
build
build.linux
build.windows
build.jetson
build.ffmpeg
.. toctree::
:maxdepth: 1
:caption: API Tutorials
......@@ -32,12 +44,20 @@ model implementations and application components.
tutorials/streamreader_advanced_tutorial
tutorials/streamwriter_basic_tutorial
tutorials/streamwriter_advanced
hw_acceleration_tutorial
tutorials/nvdec_tutorial
tutorials/nvenc_tutorial
tutorials/effector_tutorial
tutorials/audio_resampling_tutorial
tutorials/audio_data_augmentation_tutorial
tutorials/audio_feature_extractions_tutorial
tutorials/audio_feature_augmentation_tutorial
tutorials/ctc_forced_alignment_api_tutorial
tutorials/oscillator_tutorial
tutorials/additive_synthesis_tutorial
tutorials/filter_design_tutorial
tutorials/subtractive_synthesis_tutorial
tutorials/audio_datasets_tutorial
......@@ -48,12 +68,15 @@ model implementations and application components.
tutorials/speech_recognition_pipeline_tutorial
tutorials/asr_inference_with_ctc_decoder_tutorial
tutorials/asr_inference_with_cuda_ctc_decoder_tutorial
tutorials/online_asr_tutorial
tutorials/device_asr
tutorials/forced_alignment_tutorial
tutorials/forced_alignment_for_multilingual_data_tutorial
tutorials/tacotron2_pipeline_tutorial
tutorials/mvdr_tutorial
tutorials/hybrid_demucs_tutorial
tutorials/squim_tutorial
.. toctree::
:maxdepth: 1
......@@ -64,15 +87,15 @@ model implementations and application components.
Emformer RNN-T ASR <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>
Conv-TasNet Source Separation <https://github.com/pytorch/audio/tree/main/examples/source_separation>
HuBERT Pre-training and Fine-tuning (ASR) <https://github.com/pytorch/audio/tree/main/examples/hubert>
Real-time AV-ASR <https://github.com/pytorch/audio/tree/main/examples/avsr>
.. toctree::
:maxdepth: 1
:caption: API Reference
:caption: Python API Reference
:hidden:
torchaudio
io
backend
functional
transforms
datasets
......@@ -102,6 +125,13 @@ Tutorials
.. customcardstart::
.. customcarditem::
:header: AM inference with CUDA CTC Beam Search Decoder
:card_description: Learn how to perform ASR beam search decoding on GPU, using <code>torchaudio.models.decoder.cuda_ctc_decoder</code>.
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/asr_inference_with_ctc_decoder_tutorial.png
:link: tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.html
:tags: Pipelines,ASR,CTC-Decoder,CUDA-CTC-Decoder
.. customcarditem::
:header: Loading waveform Tensors from files and saving them
:card_description: Learn how to query/load audio files and save waveform tensors to files, using <code>torchaudio.info</code>, <code>torchaudio.load</code> and <code>torchaudio.save</code> functions.
......@@ -109,6 +139,20 @@ Tutorials
:link: tutorials/audio_io_tutorial.html
:tags: I/O
.. customcarditem::
:header: CTC Forced Alignment API
:card_description: Learn how to use TorchAudio's CTC forced alignment API (<code>torchaudio.functional.forced_align</code>).
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/ctc_forced_alignment_api_tutorial.png
:link: tutorials/ctc_forced_alignment_api_tutorial.html
:tags: CTC,Forced-Alignment
.. customcarditem::
:header: Forced alignment for multilingual data
:card_description: Learn how to align multilingual data using TorchAudio's CTC forced alignment API (<code>torchaudio.functional.forced_align</code>) and a multilingual Wav2Vec2 model.
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/forced_alignment_for_multilingual_data_tutorial.png
:link: tutorials/forced_alignment_for_multilingual_data_tutorial.html
:tags: Forced-Alignment
.. customcarditem::
:header: Streaming media decoding with StreamReader
:card_description: Learn how to load audio/video to Tensors using <code>torchaudio.io.StreamReader</code> class.
......@@ -129,7 +173,7 @@ Tutorials
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/streamwriter_basic_tutorial.gif
:link: tutorials/streamwriter_basic_tutorial.html
:tags: I/O,StreamWriter
.. customcarditem::
:header: Playing media with StreamWriter
:card_description: Learn how to play audio/video with <code>torchaudio.io.StreamWriter</code>.
......@@ -138,11 +182,25 @@ Tutorials
:tags: I/O,StreamWriter
.. customcarditem::
:header: Hardware accelerated video I/O with NVDEC/NVENC
:card_description: Learn how to setup and use HW accelerated video I/O.
:header: Hardware accelerated video decoding with NVDEC
:card_description: Learn how to use the HW video decoder.
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/hw_acceleration_tutorial.png
:link: hw_acceleration_tutorial.html
:tags: I/O,StreamReader,StreamWriter
:link: tutorials/nvdec_tutorial.html
:tags: I/O,StreamReader
.. customcarditem::
:header: Hardware accelerated video encoding with NVENC
:card_description: Learn how to use the HW video encoder.
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/hw_acceleration_tutorial.png
:link: tutorials/nvenc_tutorial.html
:tags: I/O,StreamWriter
.. customcarditem::
:header: Apply effects and codecs to waveform
:card_description: Learn how to apply effects and codecs to waveform using <code>torchaudio.io.AudioEffector</code>.
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/effector_tutorial.png
:link: tutorials/effector_tutorial.html
:tags: Preprocessing
.. customcarditem::
:header: Audio resampling with bandlimited sinc interpolation
......@@ -199,7 +257,7 @@ Tutorials
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/online_asr_tutorial.gif
:link: tutorials/online_asr_tutorial.html
:tags: Pipelines,ASR,RNNT,StreamReader
.. customcarditem::
:header: Real-time microphone ASR with Emformer RNN-T
:card_description: Learn how to transcribe speech from a microphone with Emformer RNN-T (<code>torchaudio.pipelines.RNNTBundle</code>) and <code>torchaudio.io.StreamReader</code>.
......@@ -220,7 +278,7 @@ Tutorials
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/tacotron2_pipeline_tutorial.png
:link: tutorials/tacotron2_pipeline_tutorial.html
:tags: Pipelines,TTS-(Text-to-Speech)
.. customcarditem::
:header: Speech Enhancement with MVDR Beamforming
:card_description: Learn how to improve speech quality with MVDR Beamforming.
......@@ -235,6 +293,12 @@ Tutorials
:link: tutorials/hybrid_demucs_tutorial.html
:tags: Pipelines,Source-Separation
.. customcarditem::
:header: Torchaudio-Squim: Non-intrusive Speech Assessment in TorchAudio
:card_description: Learn how to estimate subjective and objective metrics with pre-trained TorchAudio-SQUIM models (<code>torchaudio.pipelines.SQUIMObjective</code>).
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/squim_tutorial.png
:link: tutorials/squim_tutorial.html
:tags: Pipelines,Speech Assessment,Speech Enhancement
.. customcardend::
......@@ -267,3 +331,14 @@ In BibTeX format:
journal={arXiv preprint arXiv:2110.15018},
year={2021}
}
.. code-block:: bibtex
@misc{hwang2023torchaudio,
title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
year={2023},
eprint={2310.17864},
archivePrefix={arXiv},
primaryClass={eess.AS}
}
Installing pre-built binaries
=============================
``torchaudio`` has binary distributions for PyPI (``pip``) and Anaconda (``conda``).
Please refer to https://pytorch.org/get-started/locally/ for the details.
.. note::
Each ``torchaudio`` package is compiled against a specific version of ``torch``.
Please refer to the following table and install the correct pair of ``torch`` and ``torchaudio``.
.. note::
Starting with ``0.10``, torchaudio provides CPU-only and CUDA-enabled binary distributions,
each of which requires a corresponding PyTorch distribution.
.. note::
This software was compiled against unmodified copies of FFmpeg, with the specific rpath removed so as to enable the use of system libraries. The LGPL source can be downloaded from the following locations: `n4.4.4 <https://github.com/FFmpeg/FFmpeg/releases/tag/n4.4.4>`__ (`license <https://github.com/FFmpeg/FFmpeg/blob/n4.4.4/COPYING.LGPLv2.1>`__), `n5.0.3 <https://github.com/FFmpeg/FFmpeg/releases/tag/n5.0.3>`__ (`license <https://github.com/FFmpeg/FFmpeg/blob/n5.0.3/COPYING.LGPLv2.1>`__) and `n6.0 <https://github.com/FFmpeg/FFmpeg/releases/tag/n6.0>`__ (`license <https://github.com/FFmpeg/FFmpeg/blob/n6.0/COPYING.LGPLv2.1>`__).
Dependencies
------------
* `PyTorch <https://pytorch.org>`_
Please refer to the compatibility matrix below for supported PyTorch versions.
.. _optional_dependencies:
Optional Dependencies
~~~~~~~~~~~~~~~~~~~~~
.. _ffmpeg_dependency:
* `FFmpeg <https://ffmpeg.org>`__
Required to use the :py:mod:`torchaudio.io` module and ``backend="ffmpeg"`` in
`I/O functions <./torchaudio.html#i-o>`__.

Starting with version 2.1, the official TorchAudio binary distributions are compatible
with FFmpeg versions 6, 5, and 4 (>=4.4, <7). At runtime, TorchAudio first looks for
FFmpeg 6; if that is not found, it falls back to 5 and then to 4.
There are multiple ways to install FFmpeg libraries.
Please refer to the official documentation for how to install FFmpeg.
If you are using the Anaconda Python distribution,
``conda install -c conda-forge 'ffmpeg<7'`` will install
compatible FFmpeg libraries.
If you need to control which version of FFmpeg TorchAudio searches for and links
against, you can specify it via the environment variable
``TORCHAUDIO_USE_FFMPEG_VERSION``. For example, setting
``TORCHAUDIO_USE_FFMPEG_VERSION=5`` makes TorchAudio look only for FFmpeg 5.
If this search mechanism causes an issue for some reason, you can disable the
FFmpeg integration entirely by setting the environment variable
``TORCHAUDIO_USE_FFMPEG=0`` (a short sketch of both variables follows the note below).
.. note::
When searching for an FFmpeg installation, TorchAudio looks for library files
whose names include version numbers,
i.e. ``libavutil.so.<VERSION>`` on Linux, ``libavutil.<VERSION>.dylib``
on macOS, and ``avutil-<VERSION>.dll`` on Windows.
Many public pre-built binaries follow this naming scheme, but some distributions
ship unversioned file names.
If TorchAudio fails to detect FFmpeg, double-check that the library files you
installed follow this naming scheme, and that they are located in one of the
directories on the library search path.
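As a minimal sketch of the environment variables above (an assumption here is
that they are read when TorchAudio initializes its FFmpeg extension, so they
should be set before ``torchaudio`` is imported):

.. code-block:: python

   import os

   # Pin the search to FFmpeg 5 only (assumption: the variable is read at
   # FFmpeg-extension initialization, hence set it before the import).
   os.environ["TORCHAUDIO_USE_FFMPEG_VERSION"] = "5"

   # Alternatively, disable the FFmpeg integration entirely:
   # os.environ["TORCHAUDIO_USE_FFMPEG"] = "0"

   import torchaudio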
* `SoX <https://sox.sourceforge.net/>`__
Required to use ``backend="sox"`` in `I/O functions <./torchaudio.html#i-o>`__.
Starting with version 2.1, TorchAudio requires libsox to be installed separately.
If dynamic linking is causing an issue, you can set the environment variable
``TORCHAUDIO_USE_SOX=0``, and TorchAudio won't use SoX.
.. note::
TorchAudio looks for a library file with an unversioned name, that is, ``libsox.so``
on Linux and ``libsox.dylib`` on macOS. Some package managers install the library
file under a different name. For example, aptitude on Ubuntu installs ``libsox.so.3``.
To have TorchAudio link against it, you can create a symbolic link to it named
``libsox.so`` and put the symlink in a directory on the library search path
(a sketch follows this note).
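A hedged sketch of creating such a symlink (the paths below are illustrative
and distribution-specific; adjust them to your system, and note that writing
to system library directories usually requires elevated privileges):

.. code-block:: python

   import os

   # Illustrative Ubuntu paths; verify where your package manager put libsox.
   versioned = "/usr/lib/x86_64-linux-gnu/libsox.so.3"
   alias = "/usr/lib/x86_64-linux-gnu/libsox.so"

   if os.path.exists(versioned) and not os.path.exists(alias):
       os.symlink(versioned, alias)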
.. note::
TorchAudio is tested against libsox 14.4.2; it is unlikely that other
versions would work.
* `SoundFile <https://pysoundfile.readthedocs.io/>`__
Required to use ``backend="soundfile"`` in `I/O functions <./torchaudio.html#i-o>`__.
* `sentencepiece <https://pypi.org/project/sentencepiece/>`__
Required for performing automatic speech recognition with :ref:`Emformer RNN-T<RNNT>`.
You can install it by running ``pip install sentencepiece``.
* `deep-phonemizer <https://pypi.org/project/deep-phonemizer/>`__
Required for performing text-to-speech with :ref:`Tacotron2`.
* `kaldi_io <https://pypi.org/project/kaldi-io/>`__
Required to use :py:mod:`torchaudio.kaldi_io` module.
Compatibility Matrix
--------------------
The official binary distributions of TorchAudio contain extension modules
which are written in C++ and linked against specific versions of PyTorch.
TorchAudio and PyTorch from different releases cannot be used together.
Please refer to the following table for the matching versions.
.. list-table::
:header-rows: 1
* - ``PyTorch``
- ``TorchAudio``
- ``Python``
* - ``2.1.0``
- ``2.1.0``
- ``>=3.8``, ``<=3.11``
* - ``2.0.1``
- ``2.0.2``
- ``>=3.8``, ``<=3.11``
* - ``2.0.0``
- ``2.0.1``
- ``>=3.8``, ``<=3.11``
* - ``1.13.1``
- ``0.13.1``
- ``>=3.7``, ``<=3.10``
* - ``1.13.0``
- ``0.13.0``
- ``>=3.7``, ``<=3.10``
* - ``1.12.1``
- ``0.12.1``
- ``>=3.7``, ``<=3.10``
* - ``1.12.0``
- ``0.12.0``
- ``>=3.7``, ``<=3.10``
* - ``1.11.0``
- ``0.11.0``
- ``>=3.7``, ``<=3.9``
* - ``1.10.0``
- ``0.10.0``
- ``>=3.6``, ``<=3.9``
* - ``1.9.1``
- ``0.9.1``
- ``>=3.6``, ``<=3.9``
* - ``1.8.1``
- ``0.8.1``
- ``>=3.6``, ``<=3.9``
* - ``1.7.1``
- ``0.7.2``
- ``>=3.6``, ``<=3.9``
* - ``1.7.0``
- ``0.7.0``
- ``>=3.6``, ``<=3.8``
* - ``1.6.0``
- ``0.6.0``
- ``>=3.6``, ``<=3.8``
* - ``1.5.0``
- ``0.5.0``
- ``>=3.5``, ``<=3.8``
* - ``1.4.0``
- ``0.4.0``
- ``==2.7``, ``>=3.5``, ``<=3.8``
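To check an installed pair against this matrix, a quick sketch (both packages
expose standard ``__version__`` attributes):

.. code-block:: python

   import torch
   import torchaudio

   # e.g. torch 2.1.0 should pair with torchaudio 2.1.0 per the table above.
   print(f"torch {torch.__version__} / torchaudio {torchaudio.__version__}")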
......@@ -12,6 +12,8 @@ torchaudio.io
StreamReader
StreamWriter
AudioEffector
play_audio
.. rubric:: Tutorials using ``torchaudio.io``
......
......@@ -20,3 +20,19 @@ CTC Decoder
.. rubric:: Tutorials using CTC Decoder
.. minigallery:: torchaudio.models.decoder.CTCDecoder
CUDA CTC Decoder
----------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/cuda_ctc_decoder_class.rst
CUCTCDecoder
cuda_ctc_decoder
.. rubric:: Tutorials using CUDA CTC Decoder
.. minigallery:: torchaudio.models.decoder.CUCTCDecoder
......@@ -7,15 +7,13 @@ torchaudio.models
The ``torchaudio.models`` subpackage contains definitions of models for addressing common audio tasks.

Model Definitions
-----------------

.. note::
   For models with pre-trained parameters, please refer to the :mod:`torchaudio.pipelines` module.
Model definitions are responsible for constructing computation graphs and executing them.
Some models have complex structure and variations.
For such models, factory functions are provided.
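For instance, a sketch using one of the factory functions listed below (the
model is untrained here; its weights are randomly initialized):

.. code-block:: python

   import torch
   from torchaudio.models import wav2vec2_base

   # Build a Wav2Vec2Model with the "base" configuration.
   model = wav2vec2_base(aux_num_out=None)

   waveform = torch.randn(1, 16000)  # one second of dummy audio at 16 kHz
   features, lengths = model.extract_features(waveform)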
.. autosummary::
:toctree: generated
......@@ -30,42 +28,9 @@ For such models, `Factory Functions`_ are provided.
HuBERTPretrainModel
RNNT
RNNTBeamSearch
SquimObjective
SquimSubjective
Tacotron2
Wav2Letter
Wav2Vec2Model
WaveRNN
Factory Functions
-----------------
.. autosummary::
:toctree: generated
:nosignatures:
conv_tasnet_base
emformer_rnnt_model
emformer_rnnt_base
wav2vec2_model
wav2vec2_base
wav2vec2_large
wav2vec2_large_lv60k
hubert_base
hubert_large
hubert_xlarge
hubert_pretrain_model
hubert_pretrain_base
hubert_pretrain_large
hubert_pretrain_xlarge
hdemucs_low
hdemucs_medium
hdemucs_high
Utility Functions
-----------------
.. autosummary::
:toctree: generated
:nosignatures:
~wav2vec2.utils.import_fairseq_model
~wav2vec2.utils.import_huggingface_model