Commit 72f5785f authored by huaerkl's avatar huaerkl
Browse files

v1.0

parents
Pipeline #505 canceled with stages
# Creating a New Release
In order to create a new release:
1. Navigate to the [Fairseq Workflows](https://github.com/facebookresearch/fairseq/actions) and find the one named _Fairseq Release_.
2. Under _Run Workflow_ choose the branch `main` and for _Release Type_ enter either `major`, `minor`, or `patch`.
3. A branch named `$new_version-release` will be created where the `version.txt` file is updated. Merge those changes into `main`.
4. Make sure that a [new PYPI package](https://pypi.org/project/fairseq/) has been uploaded.
5. Make sure that a [new github release](https://github.com/facebookresearch/fairseq/releases) has been created.
python fairseq_cli/hydra_train.py -m task.data=`pwd`/data task.local_cache_path=`pwd`/scratch/cache_abaevski/imagenet --config-dir examples/data2vec/config/v2 --config-name base_images_only_task
# python setup.py build_ext --inplace
#!/bin/bash
#SBATCH -p wzhdexclu10
#SBATCH -N 2
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --mem 50G
#SBATCH --gres=dcu:4
#SBATCH -J data2vec
#SBATCH -o logs/%x-%j.txt
#SBATCH -e logs/%x-%j.txt
ulimit -u 200000
# export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export OMP_NUM_THREADS=1
echo "START TIME: $(date)"
module purge
module load compiler/devtoolset/7.3.1
module load mpi/hpcx/gcc-7.3.1
module load compiler/dtk/23.04
# source /opt/dtk-23.04/env.sh
source /public/home/chenzk/dtk-23.04/env.sh
module list
which mpirun
# load env
source /public/home/chenzk/anaconda3/bin/activate fairseq
# conda activate fairseq
which python3
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/public/home/chenzk/anaconda3/envs/fairseq/lib
#source activate fairseq
export PYTHON=python3
export NPROC_PER_NODE=4
rm -f ./hostfile/*
rm -f core.*
dir="./hostfile"
if [ ! -d "$dir" ];then
mkdir $dir
echo "$dir created successfully"
else
echo "$dir already existed"
fi
dir="./checkpoints"
if [ ! -d "$dir" ];then
mkdir $dir
echo "$dir created successfully"
else
echo "$dir already existed"
fi
hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
done
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/pretrain_datavec_mpi.sh $dist_url
#!/bin/bash
#SBATCH -p wzhdexclu10
#SBATCH -N 2
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --mem 50G
#SBATCH --gres=dcu:4
#SBATCH -J data2vec
#SBATCH -o logs/%x-%j.txt
#SBATCH -e logs/%x-%j.txt
ulimit -u 200000
# export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export OMP_NUM_THREADS=1
echo "START TIME: $(date)"
module purge
module load compiler/devtoolset/7.3.1
module load mpi/hpcx/gcc-7.3.1
module load compiler/dtk/23.04
# source /opt/dtk-23.04/env.sh
source /public/home/chenzk/dtk-23.04/env.sh
module list
which mpirun
# load env
source /public/home/chenzk/anaconda3/bin/activate fairseq
# conda activate fairseq
which python3
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/public/home/chenzk/anaconda3/envs/fairseq/lib
#source activate fairseq
export PYTHON=python3
export NPROC_PER_NODE=4
rm -f ./hostfile/*
rm -f core.*
dir="./hostfile"
if [ ! -d "$dir" ];then
mkdir $dir
echo "$dir created successfully"
else
echo "$dir already existed"
fi
# sleep 1d
dir="./checkpoints"
if [ ! -d "$dir" ];then
mkdir $dir
echo "$dir created successfully"
else
echo "$dir already existed"
fi
hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
done
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
CONFIG_DIR="examples/data2vec/config/v2"
# CONFIG_DIR="examples/data2vec/config/vision/finetuning"
CONFIG_NAME="base_images_only_task"
# CONFIG_NAME="mae_imagenet_clean"
DATA_PATH=`pwd`/"data"
CACHE_PATH=`pwd`/"scratch/cache_abaevski/imagenet"
CHECKPOINT_PATH=`pwd`/"checkpoints/checkpoint_last.pt"
srun python3 fairseq_cli/hydra_train.py -m \
task.data=$DATA_PATH \
task.local_cache_path=$CACHE_PATH \
distributed_training.distributed_init_method=tcp://$dist_url:34566 \
distributed_training.distributed_world_size=$np \
--config-dir $CONFIG_DIR \
--config-name $CONFIG_NAME \
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python -msphinx
SPHINXPROJ = fairseq
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
.. _Command-line Tools:
Command-line Tools
==================
Fairseq provides several command-line tools for training and evaluating models:
- :ref:`fairseq-preprocess`: Data pre-processing: build vocabularies and binarize training data
- :ref:`fairseq-train`: Train a new model on one or multiple GPUs
- :ref:`fairseq-generate`: Translate pre-processed data with a trained model
- :ref:`fairseq-interactive`: Translate raw text with a trained model
- :ref:`fairseq-score`: BLEU scoring of generated translations against reference translations
- :ref:`fairseq-eval-lm`: Language model evaluation
.. _fairseq-preprocess:
fairseq-preprocess
~~~~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.preprocess
.. argparse::
:module: fairseq.options
:func: get_preprocessing_parser
:prog: fairseq-preprocess
.. _fairseq-train:
fairseq-train
~~~~~~~~~~~~~
.. automodule:: fairseq_cli.train
.. argparse::
:module: fairseq.options
:func: get_training_parser
:prog: fairseq-train
.. _fairseq-generate:
fairseq-generate
~~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.generate
.. argparse::
:module: fairseq.options
:func: get_generation_parser
:prog: fairseq-generate
.. _fairseq-interactive:
fairseq-interactive
~~~~~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.interactive
.. argparse::
:module: fairseq.options
:func: get_interactive_generation_parser
:prog: fairseq-interactive
.. _fairseq-score:
fairseq-score
~~~~~~~~~~~~~
.. automodule:: fairseq_cli.score
.. argparse::
:module: fairseq_cli.score
:func: get_parser
:prog: fairseq-score
.. _fairseq-eval-lm:
fairseq-eval-lm
~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.eval_lm
.. argparse::
:module: fairseq.options
:func: get_eval_lm_parser
:prog: fairseq-eval-lm
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# fairseq documentation build configuration file, created by
# sphinx-quickstart on Fri Aug 17 21:45:30 2018.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import os
import sys
from fairseq import __version__
# source code directory, relative to this file, for sphinx-autobuild
sys.path.insert(0, os.path.abspath(".."))
source_suffix = [".rst"]
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.viewcode",
"sphinx.ext.napoleon",
"sphinxarg.ext",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The master toctree document.
master_doc = "index"
# General information about the project.
project = "fairseq"
copyright = "Facebook AI Research (FAIR)"
author = "Facebook AI Research (FAIR)"
github_doc_root = "https://github.com/pytorch/fairseq/tree/main/docs/"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = __version__
# The full version, including alpha/beta/rc tags.
release = __version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
highlight_language = "python"
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
html_theme = "classic"
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
"numpy": ("http://docs.scipy.org/doc/numpy/", None),
"python": ("https://docs.python.org/", None),
"torch": ("https://pytorch.org/docs/master/", None),
}
.. role:: hidden
:class: hidden-section
.. _Criterions:
Criterions
==========
Criterions compute the loss function given the model and batch, roughly::
loss = criterion(model, batch)
.. automodule:: fairseq.criterions
:members:
.. autoclass:: fairseq.criterions.FairseqCriterion
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.adaptive_loss.AdaptiveLoss
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.composite_loss.CompositeLoss
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.cross_entropy.CrossEntropyCriterion
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.label_smoothed_cross_entropy.LabelSmoothedCrossEntropyCriterion
:members:
:undoc-members:
.. role:: hidden
:class: hidden-section
.. module:: fairseq.data
Data Loading and Utilities
==========================
.. _datasets:
Datasets
--------
**Datasets** define the data format and provide helpers for creating
mini-batches.
.. autoclass:: fairseq.data.FairseqDataset
:members:
.. autoclass:: fairseq.data.LanguagePairDataset
:members:
.. autoclass:: fairseq.data.MonolingualDataset
:members:
**Helper Datasets**
These datasets wrap other :class:`fairseq.data.FairseqDataset` instances and
provide additional functionality:
.. autoclass:: fairseq.data.BacktranslationDataset
:members:
.. autoclass:: fairseq.data.ConcatDataset
:members:
.. autoclass:: fairseq.data.ResamplingDataset
:members:
.. autoclass:: fairseq.data.RoundRobinZipDatasets
:members:
.. autoclass:: fairseq.data.TransformEosDataset
:members:
Dictionary
----------
.. autoclass:: fairseq.data.Dictionary
:members:
Iterators
---------
.. autoclass:: fairseq.data.CountingIterator
:members:
.. autoclass:: fairseq.data.EpochBatchIterator
:members:
.. autoclass:: fairseq.data.GroupedIterator
:members:
.. autoclass:: fairseq.data.ShardedIterator
:members:
[writers]
option-limit=0
Evaluating Pre-trained Models
=============================
First, download a pre-trained model along with its vocabularies:
.. code-block:: console
> curl https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2 | tar xvjf -
This model uses a `Byte Pair Encoding (BPE)
vocabulary <https://arxiv.org/abs/1508.07909>`__, so we'll have to apply
the encoding to the source text before it can be translated. This can be
done with the
`apply\_bpe.py <https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/apply_bpe.py>`__
script using the ``wmt14.en-fr.fconv-cuda/bpecodes`` file. ``@@`` is
used as a continuation marker and the original text can be easily
recovered with e.g. ``sed s/@@ //g`` or by passing the ``--remove-bpe``
flag to :ref:`fairseq-generate`. Prior to BPE, input text needs to be tokenized
using ``tokenizer.perl`` from
`mosesdecoder <https://github.com/moses-smt/mosesdecoder>`__.
Let's use :ref:`fairseq-interactive` to generate translations interactively.
Here, we use a beam size of 5 and preprocess the input with the Moses
tokenizer and the given Byte-Pair Encoding vocabulary. It will automatically
remove the BPE continuation markers and detokenize the output.
.. code-block:: console
> MODEL_DIR=wmt14.en-fr.fconv-py
> fairseq-interactive \
--path $MODEL_DIR/model.pt $MODEL_DIR \
--beam 5 --source-lang en --target-lang fr \
--tokenizer moses \
--bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes
| loading model(s) from wmt14.en-fr.fconv-py/model.pt
| [en] dictionary: 44206 types
| [fr] dictionary: 44463 types
| Type the input sentence and press return:
Why is it rare to discover new marine mammal species?
S-0 Why is it rare to discover new marine mam@@ mal species ?
H-0 -0.0643349438905716 Pourquoi est-il rare de découvrir de nouvelles espèces de mammifères marins?
P-0 -0.0763 -0.1849 -0.0956 -0.0946 -0.0735 -0.1150 -0.1301 -0.0042 -0.0321 -0.0171 -0.0052 -0.0062 -0.0015
This generation script produces three types of outputs: a line prefixed
with *O* is a copy of the original source sentence; *H* is the
hypothesis along with an average log-likelihood; and *P* is the
positional score per token position, including the
end-of-sentence marker which is omitted from the text.
Other types of output lines you might see are *D*, the detokenized hypothesis,
*T*, the reference target, *A*, alignment info, *E* the history of generation steps.
See the `README <https://github.com/pytorch/fairseq#pre-trained-models>`__ for a
full list of pre-trained models available.
Training a New Model
====================
The following tutorial is for machine translation. For an example of how
to use Fairseq for other tasks, such as :ref:`language modeling`, please see the
``examples/`` directory.
Data Pre-processing
-------------------
Fairseq contains example pre-processing scripts for several translation
datasets: IWSLT 2014 (German-English), WMT 2014 (English-French) and WMT
2014 (English-German). To pre-process and binarize the IWSLT dataset:
.. code-block:: console
> cd examples/translation/
> bash prepare-iwslt14.sh
> cd ../..
> TEXT=examples/translation/iwslt14.tokenized.de-en
> fairseq-preprocess --source-lang de --target-lang en \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--destdir data-bin/iwslt14.tokenized.de-en
This will write binarized data that can be used for model training to
``data-bin/iwslt14.tokenized.de-en``.
Training
--------
Use :ref:`fairseq-train` to train a new model. Here a few example settings that work
well for the IWSLT 2014 dataset:
.. code-block:: console
> mkdir -p checkpoints/fconv
> CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \
--optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
--arch fconv_iwslt_de_en --save-dir checkpoints/fconv
By default, :ref:`fairseq-train` will use all available GPUs on your machine. Use the
``CUDA_VISIBLE_DEVICES`` environment variable to select specific GPUs and/or to
change the number of GPU devices that will be used.
Also note that the batch size is specified in terms of the maximum
number of tokens per batch (``--max-tokens``). You may need to use a
smaller value depending on the available GPU memory on your system.
Generation
----------
Once your model is trained, you can generate translations using
:ref:`fairseq-generate` **(for binarized data)** or
:ref:`fairseq-interactive` **(for raw text)**:
.. code-block:: console
> fairseq-generate data-bin/iwslt14.tokenized.de-en \
--path checkpoints/fconv/checkpoint_best.pt \
--batch-size 128 --beam 5
| [de] dictionary: 35475 types
| [en] dictionary: 24739 types
| data-bin/iwslt14.tokenized.de-en test 6750 examples
| model fconv
| loaded checkpoint trainings/fconv/checkpoint_best.pt
S-721 danke .
T-721 thank you .
...
To generate translations with only a CPU, use the ``--cpu`` flag. BPE
continuation markers can be removed with the ``--remove-bpe`` flag.
Advanced Training Options
=========================
Large mini-batch training with delayed updates
----------------------------------------------
The ``--update-freq`` option can be used to accumulate gradients from
multiple mini-batches and delay updating, creating a larger effective
batch size. Delayed updates can also improve training speed by reducing
inter-GPU communication costs and by saving idle time caused by variance
in workload across GPUs. See `Ott et al.
(2018) <https://arxiv.org/abs/1806.00187>`__ for more details.
To train on a single GPU with an effective batch size that is equivalent
to training on 8 GPUs:
.. code-block:: console
> CUDA_VISIBLE_DEVICES=0 fairseq-train --update-freq 8 (...)
Training with half precision floating point (FP16)
--------------------------------------------------
.. note::
FP16 training requires a Volta GPU and CUDA 9.1 or greater
Recent GPUs enable efficient half precision floating point computation,
e.g., using `Nvidia Tensor Cores
<https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html>`__.
Fairseq supports FP16 training with the ``--fp16`` flag:
.. code-block:: console
> fairseq-train --fp16 (...)
Distributed training
--------------------
Distributed training in fairseq is implemented on top of ``torch.distributed``.
The easiest way to launch jobs is with the `torch.distributed.launch
<https://pytorch.org/docs/stable/distributed.html#launch-utility>`__ tool.
For example, to train a large English-German Transformer model on 2 nodes each
with 8 GPUs (in total 16 GPUs), run the following command on each node,
replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making
sure to update ``--master_addr`` to the IP address of the first node:
.. code-block:: console
> python -m torch.distributed.launch --nproc_per_node=8 \
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \
--master_port=12345 \
$(which fairseq-train) data-bin/wmt16_en_de_bpe32k \
--arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
--lr 0.0005 \
--dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens 3584 \
--max-epoch 70 \
--fp16
On SLURM clusters, fairseq will automatically detect the number of nodes and
GPUs, but a port number must be provided:
.. code-block:: console
> salloc --gpus=16 --nodes 2 (...)
> srun fairseq-train --distributed-port 12345 (...).
Sharding very large datasets
----------------------------
It can be challenging to train over very large datasets, particularly if your
machine does not have much system RAM. Most tasks in fairseq support training
over "sharded" datasets, in which the original dataset has been preprocessed
into non-overlapping chunks (or "shards").
For example, instead of preprocessing all your data into a single "data-bin"
directory, you can split the data and create "data-bin1", "data-bin2", etc.
Then you can adapt your training command like so:
.. code-block:: console
> fairseq-train data-bin1:data-bin2:data-bin3 (...)
Training will now iterate over each shard, one by one, with each shard
corresponding to an "epoch", thus reducing system memory usage.
## Hydra
[Hydra](https://github.com/facebookresearch/hydra) is an open-source Python
framework that simplifies the development of research and other complex
applications. The key feature is the ability to dynamically create a
hierarchical configuration by composition and override it through config files
and the command line. The name Hydra comes from its ability to run multiple
similar jobs - much like a Hydra with multiple heads.
## Motivation
Until recently, all components in fairseq were configured through a shared
`args` namespace that was created at application startup. Components declared
their own `add_args` method to update the argparse parser, hoping that the names
would not clash with arguments from other components. While this model works for
smaller applications, as fairseq grew and became integrated into other
applications, this became problematic. In order to determine how to configure
each component, one needed to a) examine what args were added by this component,
and b) read the code to figure out what shared arguments it is using that were
added in other places. Reproducing models involved sharing commands that often
contained dozens of command line switches.
The model described above is still supported by fairseq for backward
compatibility, but will be deprecated some time in the future.
New components in fairseq should now create a dataclass that encapsulates all
parameters required to configure this component. The dataclass is registered
along with the component, and fairseq takes care of constructing and providing
this configuration object to the component's constructor. Note that sharing
parameters can optionally still work, but one has to explicitly point to the
"source of truth" (see inheritance example below). These changes make components
in fairseq more independent and re-usable by other applications: all that is
needed to create a component is to initialize its dataclass and overwrite some
of the defaults.
While configuring fairseq through command line (using either the legacy argparse
based or the new Hydra based entry points) is still fully supported, you can now
take advantage of configuring fairseq completely or piece-by-piece through
hierarchical YAML configuration files. These files can also be shipped as
examples that others can use to run an identically configured job.
Additionally, Hydra has a rich and growing [library of
plugins](https://github.com/facebookresearch/hydra/tree/master/plugins) that
provide functionality such as hyperparameter sweeping (including using bayesian
optimization through the [Ax](https://github.com/facebook/Ax) library), job
launching across various platforms, and more.
## Creating or migrating components
In general, each new (or updated) component should provide a companion
[dataclass](https://www.python.org/dev/peps/pep-0557/). These dataclass are
typically located in the same file as the component and are passed as arguments
to the `register_*()` functions. Top-level configs that should be present in
every fairseq application are placed in the
[global](fairseq/dataclass/configs.py) config file and added to the
`FairseqConfig` object.
Each dataclass is a plain-old-data object, similar to a `NamedTuple`. These
classes are decorated with a `@dataclass` decorator, and typically inherit from
`FairseqDataclass` (which adds some functionality for backward compatibility).
Each field must have a type, and generally has metadata (such as a help string)
and a default value. Only primitive types or other config objects are allowed as
data types for each field.
#### Example:
```python
from dataclasses import dataclass, field
from fairseq.dataclass import FairseqDataclass
@dataclass
class InteractiveConfig(FairseqDataclass):
buffer_size: int = field(
default=0,
metadata={
"help": "read this many sentences into a buffer before processing them"
},
)
input: str = field(
default="-",
metadata={"help": "file to read from; use - for stdin"},
)
```
### Inherting values
Some components require sharing a value. For example, a learning rate scheduler
and an optimizer may both need to know the initial learning rate value. One can
declare a field that, by default, will inherit its value from another config
node in the same hierarchy:
```python
@dataclass
FairseqAdamConfig(FairseqDataclass):
...
lr: List[float] = II("optimization.lr")
...
```
`II("optimization.lr")` is syntactic sugar for `"${optimization.lr}"`, which is
the value one can use in a YAML config file or through command line to achieve
the same effect. Note that this assumes that there is an "optimization" config
object in the root config and it has a field called "lr".
### Tasks and Models
Creating Tasks and Models works same as before, except that legacy
implementations now inherit from `LegacyFairseq*` base classes, while new
components inherit from `FairseqTask` and `FairseqModel` and provide a dataclass
to the `register_*()` functions.
#### Task example:
```python
@dataclass
class LanguageModelingConfig(FairseqDataclass):
data: Optional[str] = field(
default=None, metadata={"help": "path to data directory"}
)
...
@register_task("language_modeling", dataclass=LanguageModelingConfig)
class LanguageModelingTask(FairseqTask):
...
@classmethod
def setup_task(cls, cfg: LanguageModelingConfig):
...
```
#### Model example:
```python
@dataclass
class TransformerLanguageModelConfig(FairseqDataclass):
activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
default="relu", metadata={"help": "activation function to use"}
)
dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
...
@register_model("transformer_lm", dataclass=TransformerLanguageModelConfig)
class TransformerLanguageModel(FairseqLanguageModel):
...
@classmethod
def build_model(cls, cfg: TransformerLanguageModelConfig, task: FairseqTask):
...
```
### Other components
Other components work as before, but they now take their configuration dataclass
as the only constructor argument:
```python
@dataclass
class MosesTokenizerConfig(FairseqDataclass):
source_lang: str = field(default="en", metadata={"help": "source language"})
...
@register_tokenizer("moses", dataclass=MosesTokenizerConfig)
class MosesTokenizer(object):
def __init__(self, cfg: MosesTokenizerConfig):
...
```
Note that if you are adding a new registry for a new set of components, you need
to add it to the `FairseqConfig` object in `fairseq/dataclass/configs.py`:
```python
@dataclass
class FairseqConfig(object):
...
my_new_registry: Any = None
```
## Training with `fairseq-hydra-train`
To fully take advantage of configuration flexibility offered by Hydra, you may
want to train new models using the `fairseq-hydra-train` entry point. Legacy CLI
tools such as `fairseq-train` will remain supported for the foreseeable future
but will be deprecated eventually.
On startup, Hydra will create a configuration object that contains a hierarchy
of all the necessary dataclasses populated with their default values in the
code. The default values are overwritten by values found in YAML files in
`fairseq/config` directory (which currently sets minimal defaults) and then
further overwritten by values provided through command line arguments.
Some of the most common use cases are shown below:
### 1. Override default values through command line:
```shell script
$ fairseq-hydra-train \
distributed_training.distributed_world_size=1 \
dataset.batch_size=2 \
task.data=data-bin \
model=transformer_lm/transformer_lm_gpt \
task=language_modeling \
optimization.max_update=5000
```
Note that along with explicitly providing values for parameters such as
`dataset.batch_size`, this also tells Hydra to overlay configuration found in
`fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml` over the default
values in the dataclass. If you want to train a model without specifying a
particular architecture you can simply specify `model=transformer_lm`. This only
works for migrated tasks and models.
### 2. Replace bundled configs with an external config:
```shell script
$ fairseq-hydra-train \
--config-dir /path/to/external/configs \
--config-name wiki103
```
where `/path/to/external/configs/wiki103.yaml` contains:
```yaml
# @package _group_
model:
_name: transformer_lm
distributed_training:
distributed_world_size: 1
dataset:
batch_size: 2
task:
_name: language_modeling
data: /path/to/data
add_bos_token: false
max_target_positions: 1024
optimization:
max_update: 50000
lr: [ 0.25 ]
criterion: cross_entropy
optimizer: adam
lr_scheduler:
_name: cosine
```
Note that here bundled configs from `fairseq/config` directory are not used,
however the defaults from each dataclass will still be used (unless overwritten
by your external config).
Additionally you can choose to break up your configs by creating a directory
structure in the same location as your main config file, with the names of the
top-level fields (such as "model", "dataset", etc), and placing config files
with meaningful names that would populate that specific section of your
top-level config file (for example, you might have
`model/small_transformer_lm.yaml`, `model/big_transformer_lm.yaml`, etc). You
can then specify the correct configuration via command line, defaults in the
main config, or even launch all of them as a sweep (see Hydra documentation on
how to do this).
### 3. Add an external config directory to Hydra search path:
This allows combining default configuration (including using any bundled config
files), while specifying your own config files for some parts of the
configuration.
```shell script
$ fairseq-hydra-train \
distributed_training.distributed_world_size=1 \
dataset.batch_size=2 \
task.data=/path/to/data/ \
model=transformer_lm/2_layers \
task=language_modeling \
optimization.max_update=5000 \
--config-dir /path/to/external/configs
```
where `/path/to/external/configs` has the following structure:
```
.
+-- model
| +-- transformer_lm
| | +-- 2_layers.yaml
```
and `2_layers.yaml` contains a copy of `transformer_lm_gpt.yaml` but with
`decoder_layers` set to 2. You can add other configs to configure other
components as well.
.. fairseq documentation master file, created by
sphinx-quickstart on Fri Aug 17 21:45:30 2018.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
:github_url: https://github.com/pytorch/fairseq
fairseq documentation
=====================
Fairseq is a sequence modeling toolkit written in `PyTorch
<http://pytorch.org/>`_ that allows researchers and developers to
train custom models for translation, summarization, language modeling and other
text generation tasks.
.. toctree::
:maxdepth: 1
:caption: Getting Started
getting_started
command_line_tools
.. toctree::
:maxdepth: 1
:caption: Extending Fairseq
overview
tutorial_simple_lstm
tutorial_classifying_names
.. toctree::
:maxdepth: 2
:caption: Library Reference
tasks
models
criterions
optim
lr_scheduler
data
modules
Indices and tables
==================
* :ref:`genindex`
* :ref:`search`
.. role:: hidden
:class: hidden-section
.. _Learning Rate Schedulers:
Learning Rate Schedulers
========================
Learning Rate Schedulers update the learning rate over the course of training.
Learning rates can be updated after each update via :func:`step_update` or at
epoch boundaries via :func:`step`.
.. automodule:: fairseq.optim.lr_scheduler
:members:
.. autoclass:: fairseq.optim.lr_scheduler.FairseqLRScheduler
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.cosine_lr_scheduler.CosineSchedule
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.fixed_schedule.FixedSchedule
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.inverse_square_root_schedule.InverseSquareRootSchedule
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.reduce_lr_on_plateau.ReduceLROnPlateau
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.triangular_lr_scheduler.TriangularSchedule
:members:
:undoc-members:
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=python -msphinx
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=fairseq
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The Sphinx module was not found. Make sure you have Sphinx installed,
echo.then set the SPHINXBUILD environment variable to point to the full
echo.path of the 'sphinx-build' executable. Alternatively you may add the
echo.Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd
.. role:: hidden
:class: hidden-section
.. module:: fairseq.models
.. _Models:
Models
======
A Model defines the neural network's ``forward()`` method and encapsulates all
of the learnable parameters in the network. Each model also provides a set of
named *architectures* that define the precise network configuration (e.g.,
embedding dimension, number of layers, etc.).
Both the model type and architecture are selected via the ``--arch``
command-line argument. Once selected, a model may expose additional command-line
arguments for further configuration.
.. note::
All fairseq Models extend :class:`BaseFairseqModel`, which in turn extends
:class:`torch.nn.Module`. Thus any fairseq Model can be used as a
stand-alone Module in other PyTorch code.
Convolutional Neural Networks (CNN)
-----------------------------------
.. module:: fairseq.models.fconv
.. autoclass:: fairseq.models.fconv.FConvModel
:members:
.. autoclass:: fairseq.models.fconv.FConvEncoder
:members:
:undoc-members:
.. autoclass:: fairseq.models.fconv.FConvDecoder
:members:
Long Short-Term Memory (LSTM) networks
--------------------------------------
.. module:: fairseq.models.lstm
.. autoclass:: fairseq.models.lstm.LSTMModel
:members:
.. autoclass:: fairseq.models.lstm.LSTMEncoder
:members:
.. autoclass:: fairseq.models.lstm.LSTMDecoder
:members:
Transformer (self-attention) networks
-------------------------------------
.. module:: fairseq.models.transformer
.. autoclass:: fairseq.models.transformer.TransformerModel
:members:
.. autoclass:: fairseq.models.transformer.TransformerEncoder
:members:
.. autoclass:: fairseq.models.transformer.TransformerEncoderLayer
:members:
.. autoclass:: fairseq.models.transformer.TransformerDecoder
:members:
.. autoclass:: fairseq.models.transformer.TransformerDecoderLayer
:members:
Adding new models
-----------------
.. currentmodule:: fairseq.models
.. autofunction:: fairseq.models.register_model
.. autofunction:: fairseq.models.register_model_architecture
.. autoclass:: fairseq.models.BaseFairseqModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqEncoderDecoderModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqEncoderModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqLanguageModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqMultiModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqEncoder
:members:
.. autoclass:: fairseq.models.CompositeEncoder
:members:
.. autoclass:: fairseq.models.FairseqDecoder
:members:
.. _Incremental decoding:
Incremental decoding
--------------------
.. autoclass:: fairseq.models.FairseqIncrementalDecoder
:members:
:undoc-members:
Modules
=======
Fairseq provides several stand-alone :class:`torch.nn.Module` classes that may
be helpful when implementing a new :class:`~fairseq.models.BaseFairseqModel`.
.. automodule:: fairseq.modules
:members:
:undoc-members:
.. role:: hidden
:class: hidden-section
.. _optimizers:
Optimizers
==========
Optimizers update the Model parameters based on the gradients.
.. automodule:: fairseq.optim
:members:
.. autoclass:: fairseq.optim.FairseqOptimizer
:members:
:undoc-members:
.. autoclass:: fairseq.optim.adadelta.Adadelta
:members:
:undoc-members:
.. autoclass:: fairseq.optim.adagrad.Adagrad
:members:
:undoc-members:
.. autoclass:: fairseq.optim.adafactor.FairseqAdafactor
:members:
:undoc-members:
.. autoclass:: fairseq.optim.adam.FairseqAdam
:members:
:undoc-members:
.. autoclass:: fairseq.optim.fp16_optimizer.FP16Optimizer
:members:
:undoc-members:
.. autoclass:: fairseq.optim.nag.FairseqNAG
:members:
:undoc-members:
.. autoclass:: fairseq.optim.sgd.SGD
:members:
:undoc-members:
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment