Commit 7df61696 authored by Sugon_ldc

add fairseq0.10.2
# @package _group_
activation_fn: "relu"
dropout: 0.3
attention_dropout: 0.1
activation_dropout: 0.1
relu_dropout: 0.1
decoder_embed_dim: 1024
decoder_output_dim: 1024
decoder_input_dim: 1024
decoder_ffn_embed_dim: 4096
decoder_layers: 16
decoder_attention_heads: 8
decoder_normalize_before: true
no_decoder_final_norm: true
adaptive_softmax_cutoff: "20000,60000"
adaptive_softmax_dropout: 0.2
adaptive_softmax_factor: 4
no_token_positional_embeddings: false
share_decoder_input_output_embed: false
character_embeddings: false
character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
character_embedding_dim: 4
char_embedder_highway_layers: 2
adaptive_input: true
adaptive_input_factor: 4
adaptive_input_cutoff: "20000,60000"
tie_adaptive_weights: true
tie_adaptive_proj: true
decoder_learned_pos: false
decoder_layerdrop: 0
decoder_layers_to_keep: null
layernorm_embedding: false
no_scale_embedding: false
quant_noise_pq: 0
quant_noise_pq_block_size: 8
quant_noise_scalar: 0
# @package _group_
adam_betas: "(0.9, 0.999)"
adam_eps: 1.0e-8
weight_decay: 0
use_old_adam: false
# @package _group_
momentum: 0.99
weight_decay: 0.0
# @package _group_
common:
  no_progress_bar: false
  log_interval: 100
  log_format: null
  tensorboard_logdir: null
  seed: 1
  cpu: false
  fp16: false
  memory_efficient_fp16: false
  fp16_no_flatten_grads: false
  fp16_init_scale: 128
  fp16_scale_window: null
  fp16_scale_tolerance: 0.0
  min_loss_scale: 1.0e-4
  threshold_loss_scale: null
  user_dir: null
  empty_cache_freq: 0
  all_gather_list_size: 16384
  model_parallel_size: 1
  checkpoint_suffix: ""
  quantization_config_path: null
distributed_training:
  distributed_rank: 0
  distributed_backend: "nccl"
  distributed_init_method: null
  distributed_port: -1
  device_id: 0
  local_rank: 0
  distributed_no_spawn: false
  ddp_backend: "c10d"
  bucket_cap_mb: 25
  fix_batches_to_gpus: false
  find_unused_parameters: false
  fast_stat_sync: false
  broadcast_buffers: false
  distributed_wrapper: "DDP"
  slowmo_momentum: null
  slowmo_algorithm: "LocalSGD"
  localsgd_frequency: 3
dataset:
  num_workers: 1
  skip_invalid_size_inputs_valid_test: false
  max_tokens: null
  batch_size: ${params.dataset.batch_size}
  required_batch_size_multiple: 8
  dataset_impl: null
  data_buffer_size: 10
  train_subset: "train"
  valid_subset: "valid"
  validate_interval: 1
  fixed_validation_seed: null
  disable_validation: false
  curriculum: 0
  gen_subset: "test"
  num_shards: 1
  shard_id: 0
  max_tokens_valid: ${params.dataset.max_tokens}
  batch_size_valid: ${params.dataset.batch_size}
optimization:
  max_epoch: 0
  max_update: 0
  clip_norm: 25.0
  sentence_avg: false
  update_freq: [1]
  lr: [0.25]
  min_lr: -1.0
  use_bmuf: false
checkpoint:
  save_dir: "checkpoints"
  restore_file: "checkpoint_last.pt"
  reset_dataloader: false
  reset_lr_scheduler: false
  reset_meters: false
  reset_optimizer: false
  optimizer_overrides: "{}"
  save_interval: 1
  save_interval_updates: 0
  keep_interval_updates: -1
  keep_last_epochs: -1
  keep_best_checkpoints: -1
  no_save: false
  no_epoch_checkpoints: false
  no_last_checkpoints: false
  no_save_optimizer_state: false
  best_checkpoint_metric: "loss"
  maximize_best_checkpoint_metric: false
  patience: -1
common_eval:
  path: null
  remove_bpe: null
  quiet: false
  model_overrides: "{}"
  results_path: null
eval_lm:
  output_word_probs: false
  output_word_stats: false
  context_window: 0
bmuf:
  block_lr: 1
  block_momentum: 0.875
  global_sync_iter: 50
  warmup_iterations: 500
  use_nbm: false
  average_sync: false
# @package _group_
common:
  no_progress_bar: false
  log_interval: 100
  log_format: null
  tensorboard_logdir: null
  seed: 1
  cpu: false
  fp16: false
  memory_efficient_fp16: false
  fp16_no_flatten_grads: false
  fp16_init_scale: 128
  fp16_scale_window: null
  fp16_scale_tolerance: 0.0
  min_loss_scale: 1.0e-4
  threshold_loss_scale: null
  user_dir: null
  empty_cache_freq: 0
  all_gather_list_size: 16384
  model_parallel_size: 1
  checkpoint_suffix: ""
  quantization_config_path: null
distributed_training:
  distributed_rank: 0
  distributed_backend: "nccl"
  distributed_init_method: null
  distributed_port: -1
  device_id: 0
  local_rank: 0
  distributed_no_spawn: false
  ddp_backend: "c10d"
  bucket_cap_mb: 25
  fix_batches_to_gpus: false
  find_unused_parameters: false
  fast_stat_sync: false
  broadcast_buffers: false
  distributed_wrapper: "DDP"
  slowmo_momentum: null
  slowmo_algorithm: "LocalSGD"
  localsgd_frequency: 3
dataset:
  num_workers: 1
  skip_invalid_size_inputs_valid_test: false
  max_tokens: null
  batch_size: ${params.dataset.batch_size}
  required_batch_size_multiple: 8
  dataset_impl: null
  data_buffer_size: 10
  train_subset: "train"
  valid_subset: "valid"
  validate_interval: 1
  fixed_validation_seed: null
  disable_validation: false
  curriculum: 0
  gen_subset: "test"
  num_shards: 1
  shard_id: 0
  max_tokens_valid: ${params.dataset.max_tokens}
  batch_size_valid: ${params.dataset.batch_size}
optimization:
  max_epoch: 0
  max_update: 0
  clip_norm: 25.0
  sentence_avg: false
  update_freq: [1]
  lr: [0.25]
  min_lr: -1.0
  use_bmuf: false
checkpoint:
  save_dir: "checkpoints"
  restore_file: "checkpoint_last.pt"
  reset_dataloader: false
  reset_lr_scheduler: false
  reset_meters: false
  reset_optimizer: false
  optimizer_overrides: "{}"
  save_interval: 1
  save_interval_updates: 0
  keep_interval_updates: -1
  keep_last_epochs: -1
  keep_best_checkpoints: -1
  no_save: false
  no_epoch_checkpoints: false
  no_last_checkpoints: false
  no_save_optimizer_state: false
  best_checkpoint_metric: "loss"
  maximize_best_checkpoint_metric: false
  patience: -1
bmuf:
  block_lr: 1
  block_momentum: 0.875
  global_sync_iter: 50
  warmup_iterations: 500
  use_nbm: false
  average_sync: false
# @package _group_
data: ???
sample_break_mode: "none"
tokens_per_sample: 1024
output_dictionary_size: -1
self_target: false
future_target: false
past_target: false
add_bos_token: false
max_target_positions: null
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python -msphinx
SPHINXPROJ = fairseq
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.wy-table-responsive table td kbd {
white-space: nowrap;
}
.wy-table-responsive table td {
white-space: normal !important;
}
.wy-table-responsive {
overflow: visible !important;
}
.. _Command-line Tools:
Command-line Tools
==================
Fairseq provides several command-line tools for training and evaluating models:
- :ref:`fairseq-preprocess`: Data pre-processing: build vocabularies and binarize training data
- :ref:`fairseq-train`: Train a new model on one or multiple GPUs
- :ref:`fairseq-generate`: Translate pre-processed data with a trained model
- :ref:`fairseq-interactive`: Translate raw text with a trained model
- :ref:`fairseq-score`: BLEU scoring of generated translations against reference translations
- :ref:`fairseq-eval-lm`: Language model evaluation
.. _fairseq-preprocess:
fairseq-preprocess
~~~~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.preprocess
.. argparse::
:module: fairseq.options
:func: get_preprocessing_parser
:prog: fairseq-preprocess
.. _fairseq-train:
fairseq-train
~~~~~~~~~~~~~
.. automodule:: fairseq_cli.train
.. argparse::
:module: fairseq.options
:func: get_training_parser
:prog: fairseq-train
.. _fairseq-generate:
fairseq-generate
~~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.generate
.. argparse::
:module: fairseq.options
:func: get_generation_parser
:prog: fairseq-generate
.. _fairseq-interactive:
fairseq-interactive
~~~~~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.interactive
.. argparse::
:module: fairseq.options
:func: get_interactive_generation_parser
:prog: fairseq-interactive
.. _fairseq-score:
fairseq-score
~~~~~~~~~~~~~
.. automodule:: fairseq_cli.score
.. argparse::
:module: fairseq_cli.score
:func: get_parser
:prog: fairseq-score
.. _fairseq-eval-lm:
fairseq-eval-lm
~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.eval_lm
.. argparse::
:module: fairseq.options
:func: get_eval_lm_parser
:prog: fairseq-eval-lm
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# fairseq documentation build configuration file, created by
# sphinx-quickstart on Fri Aug 17 21:45:30 2018.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import os
import sys
# source code directory, relative to this file, for sphinx-autobuild
sys.path.insert(0, os.path.abspath(".."))
source_suffix = [".rst"]
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.viewcode",
"sphinx.ext.napoleon",
"sphinxarg.ext",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The master toctree document.
master_doc = "index"
# General information about the project.
project = "fairseq"
copyright = "2019, Facebook AI Research (FAIR)"
author = "Facebook AI Research (FAIR)"
github_doc_root = "https://github.com/pytorch/fairseq/tree/master/docs/"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = "0.10.2"
# The full version, including alpha/beta/rc tags.
release = "0.10.2"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
highlight_language = "python"
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
html_context = {
"css_files": [
"_static/theme_overrides.css", # override wide tables in RTD theme
],
}
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
# html_sidebars = {
# '**': [
# 'about.html',
# 'navigation.html',
# 'relations.html', # needs 'show_related': True theme option to display
# 'searchbox.html',
# 'donate.html',
# ]
# }
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
"numpy": ("http://docs.scipy.org/doc/numpy/", None),
"python": ("https://docs.python.org/", None),
"torch": ("https://pytorch.org/docs/master/", None),
}
.. role:: hidden
:class: hidden-section
.. _Criterions:
Criterions
==========
Criterions compute the loss function given the model and batch, roughly::
loss = criterion(model, batch)
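For instance, a minimal custom criterion might look like the following sketch, modeled on the cross-entropy criterion (illustrative only; it assumes the ``FairseqCriterion`` interface where ``forward(model, sample)`` returns a ``(loss, sample_size, logging_output)`` tuple and new criterions are registered with ``@register_criterion``):

.. code-block:: python

    import torch.nn.functional as F

    from fairseq.criterions import FairseqCriterion, register_criterion


    @register_criterion("my_cross_entropy")
    class MyCrossEntropyCriterion(FairseqCriterion):
        def forward(self, model, sample, reduce=True):
            # run the model on the batch and compute token-level NLL loss
            net_output = model(**sample["net_input"])
            lprobs = model.get_normalized_probs(net_output, log_probs=True)
            target = model.get_targets(sample, net_output)
            loss = F.nll_loss(
                lprobs.view(-1, lprobs.size(-1)),
                target.view(-1),
                reduction="sum" if reduce else "none",
            )
            sample_size = sample["ntokens"]
            logging_output = {"loss": loss.data, "sample_size": sample_size}
            return loss, sample_size, logging_output
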
.. automodule:: fairseq.criterions
:members:
.. autoclass:: fairseq.criterions.FairseqCriterion
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.adaptive_loss.AdaptiveLoss
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.composite_loss.CompositeLoss
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.cross_entropy.CrossEntropyCriterion
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.label_smoothed_cross_entropy.LabelSmoothedCrossEntropyCriterion
:members:
:undoc-members:
.. role:: hidden
:class: hidden-section
.. module:: fairseq.data
Data Loading and Utilities
==========================
.. _datasets:
Datasets
--------
**Datasets** define the data format and provide helpers for creating
mini-batches.
.. autoclass:: fairseq.data.FairseqDataset
:members:
.. autoclass:: fairseq.data.LanguagePairDataset
:members:
.. autoclass:: fairseq.data.MonolingualDataset
:members:
**Helper Datasets**
These datasets wrap other :class:`fairseq.data.FairseqDataset` instances and
provide additional functionality:
.. autoclass:: fairseq.data.BacktranslationDataset
:members:
.. autoclass:: fairseq.data.ConcatDataset
:members:
.. autoclass:: fairseq.data.ResamplingDataset
:members:
.. autoclass:: fairseq.data.RoundRobinZipDatasets
:members:
.. autoclass:: fairseq.data.TransformEosDataset
:members:
Dictionary
----------
.. autoclass:: fairseq.data.Dictionary
:members:
Iterators
---------
.. autoclass:: fairseq.data.CountingIterator
:members:
.. autoclass:: fairseq.data.EpochBatchIterator
:members:
.. autoclass:: fairseq.data.GroupedIterator
:members:
.. autoclass:: fairseq.data.ShardedIterator
:members:
[writers]
option-limit=0
Evaluating Pre-trained Models
=============================
First, download a pre-trained model along with its vocabularies:
.. code-block:: console
> curl https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2 | tar xvjf -
This model uses a `Byte Pair Encoding (BPE)
vocabulary <https://arxiv.org/abs/1508.07909>`__, so we'll have to apply
the encoding to the source text before it can be translated. This can be
done with the
`apply\_bpe.py <https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/apply_bpe.py>`__
script using the ``wmt14.en-fr.fconv-cuda/bpecodes`` file. ``@@`` is
used as a continuation marker and the original text can be easily
recovered with e.g. ``sed 's/@@ //g'`` or by passing the ``--remove-bpe``
flag to :ref:`fairseq-generate`. Prior to BPE, input text needs to be tokenized
using ``tokenizer.perl`` from
`mosesdecoder <https://github.com/moses-smt/mosesdecoder>`__.
Let's use :ref:`fairseq-interactive` to generate translations interactively.
Here, we use a beam size of 5 and preprocess the input with the Moses
tokenizer and the given Byte-Pair Encoding vocabulary. It will automatically
remove the BPE continuation markers and detokenize the output.
.. code-block:: console
> MODEL_DIR=wmt14.en-fr.fconv-py
> fairseq-interactive \
--path $MODEL_DIR/model.pt $MODEL_DIR \
--beam 5 --source-lang en --target-lang fr \
--tokenizer moses \
--bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes
| loading model(s) from wmt14.en-fr.fconv-py/model.pt
| [en] dictionary: 44206 types
| [fr] dictionary: 44463 types
| Type the input sentence and press return:
Why is it rare to discover new marine mammal species?
S-0 Why is it rare to discover new marine mam@@ mal species ?
H-0 -0.0643349438905716 Pourquoi est-il rare de découvrir de nouvelles espèces de mammifères marins?
P-0 -0.0763 -0.1849 -0.0956 -0.0946 -0.0735 -0.1150 -0.1301 -0.0042 -0.0321 -0.0171 -0.0052 -0.0062 -0.0015
This generation script produces three types of outputs: a line prefixed
with *S* is a copy of the original source sentence; *H* is the
hypothesis along with an average log-likelihood; and *P* is the
positional score per token position, including the
end-of-sentence marker which is omitted from the text.
Other types of output lines you might see are *D*, the detokenized hypothesis,
*T*, the reference target, *A*, alignment info, and *E*, the history of generation steps.
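If you need to post-process this output programmatically, the prefixes above make it easy to pick out what you want. The helper below is a hypothetical example (not part of fairseq) that collects hypotheses and their scores from the whitespace-separated output lines:

.. code-block:: python

    def parse_hypotheses(lines):
        # collect {sample_id: (average log-likelihood, hypothesis text)}
        hyps = {}
        for line in lines:
            if line.startswith("H-"):
                tag, score, text = line.split(None, 2)
                hyps[int(tag[2:])] = (float(score), text)
        return hyps

    output = [
        "S-0 Why is it rare to discover new marine mam@@ mal species ?",
        "H-0 -0.0643349438905716 Pourquoi est-il rare de découvrir de nouvelles espèces de mammifères marins?",
    ]
    print(parse_hypotheses(output))
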
See the `README <https://github.com/pytorch/fairseq#pre-trained-models>`__ for a
full list of pre-trained models available.
Training a New Model
====================
The following tutorial is for machine translation. For an example of how
to use Fairseq for other tasks, such as :ref:`language modeling`, please see the
``examples/`` directory.
Data Pre-processing
-------------------
Fairseq contains example pre-processing scripts for several translation
datasets: IWSLT 2014 (German-English), WMT 2014 (English-French) and WMT
2014 (English-German). To pre-process and binarize the IWSLT dataset:
.. code-block:: console
> cd examples/translation/
> bash prepare-iwslt14.sh
> cd ../..
> TEXT=examples/translation/iwslt14.tokenized.de-en
> fairseq-preprocess --source-lang de --target-lang en \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--destdir data-bin/iwslt14.tokenized.de-en
This will write binarized data that can be used for model training to
``data-bin/iwslt14.tokenized.de-en``.
Training
--------
Use :ref:`fairseq-train` to train a new model. Here are a few example settings that work
well for the IWSLT 2014 dataset:
.. code-block:: console
> mkdir -p checkpoints/fconv
> CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \
--lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
--arch fconv_iwslt_de_en --save-dir checkpoints/fconv
By default, :ref:`fairseq-train` will use all available GPUs on your machine. Use the
``CUDA_VISIBLE_DEVICES`` environment variable to select specific GPUs and/or to
change the number of GPU devices that will be used.
Also note that the batch size is specified in terms of the maximum
number of tokens per batch (``--max-tokens``). You may need to use a
smaller value depending on the available GPU memory on your system.
Generation
----------
Once your model is trained, you can generate translations using
:ref:`fairseq-generate` **(for binarized data)** or
:ref:`fairseq-interactive` **(for raw text)**:
.. code-block:: console
> fairseq-generate data-bin/iwslt14.tokenized.de-en \
--path checkpoints/fconv/checkpoint_best.pt \
--batch-size 128 --beam 5
| [de] dictionary: 35475 types
| [en] dictionary: 24739 types
| data-bin/iwslt14.tokenized.de-en test 6750 examples
| model fconv
| loaded checkpoint trainings/fconv/checkpoint_best.pt
S-721 danke .
T-721 thank you .
...
To generate translations with only a CPU, use the ``--cpu`` flag. BPE
continuation markers can be removed with the ``--remove-bpe`` flag.
Advanced Training Options
=========================
Large mini-batch training with delayed updates
----------------------------------------------
The ``--update-freq`` option can be used to accumulate gradients from
multiple mini-batches and delay updating, creating a larger effective
batch size. Delayed updates can also improve training speed by reducing
inter-GPU communication costs and by saving idle time caused by variance
in workload across GPUs. See `Ott et al.
(2018) <https://arxiv.org/abs/1806.00187>`__ for more details.
To train on a single GPU with an effective batch size that is equivalent
to training on 8 GPUs:
.. code-block:: console
> CUDA_VISIBLE_DEVICES=0 fairseq-train --update-freq 8 (...)
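Conceptually, ``--update-freq N`` accumulates gradients over ``N`` mini-batches and only then applies the optimizer step. The following standalone PyTorch sketch (illustrative only, not fairseq's trainer code) shows the idea:

.. code-block:: python

    import torch

    model = torch.nn.Linear(10, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss_fn = torch.nn.MSELoss()
    update_freq = 8  # mirrors --update-freq 8

    optimizer.zero_grad()
    for step in range(32):
        x, y = torch.randn(4, 10), torch.randn(4, 1)  # stand-in mini-batch
        loss = loss_fn(model(x), y)
        (loss / update_freq).backward()  # accumulate scaled gradients
        if (step + 1) % update_freq == 0:
            optimizer.step()             # one parameter update per 8 mini-batches
            optimizer.zero_grad()
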
Training with half precision floating point (FP16)
--------------------------------------------------
.. note::
FP16 training requires a Volta GPU and CUDA 9.1 or greater
Recent GPUs enable efficient half precision floating point computation,
e.g., using `Nvidia Tensor Cores
<https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html>`__.
Fairseq supports FP16 training with the ``--fp16`` flag:
.. code-block:: console
> fairseq-train --fp16 (...)
Distributed training
--------------------
Distributed training in fairseq is implemented on top of ``torch.distributed``.
The easiest way to launch jobs is with the `torch.distributed.launch
<https://pytorch.org/docs/stable/distributed.html#launch-utility>`__ tool.
For example, to train a large English-German Transformer model on 2 nodes each
with 8 GPUs (in total 16 GPUs), run the following command on each node,
replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
.. code-block:: console
> python -m torch.distributed.launch --nproc_per_node=8 \
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \
--master_port=1234 \
$(which fairseq-train) data-bin/wmt16_en_de_bpe32k \
--arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
--lr 0.0005 --min-lr 1e-09 \
--dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens 3584 \
--fp16 --distributed-no-spawn
Sharding very large datasets
----------------------------
It can be challenging to train over very large datasets, particularly if your
machine does not have much system RAM. Most tasks in fairseq support training
over "sharded" datasets, in which the original dataset has been preprocessed
into non-overlapping chunks (or "shards").
For example, instead of preprocessing all your data into a single "data-bin"
directory, you can split the data and create "data-bin1", "data-bin2", etc.
Then you can adapt your training command like so:
.. code-block:: console
> fairseq-train data-bin1:data-bin2:data-bin3 (...)
Training will now iterate over each shard, one by one, with each shard
corresponding to an "epoch", thus reducing system memory usage.
## Hydra
Hydra is an open-source Python framework that simplifies the development of research and other complex applications. The key feature is the ability to dynamically create a hierarchical configuration by composition and override it through config files and the command line. The name Hydra comes from its ability to run multiple similar jobs - much like a Hydra with multiple heads.
## Train models with the hydra interface
#### Provide parameters in `.yaml` files
For example, to train a transformer language model, we can provide the parameters in yaml files. Note that the modules used in training (task, model, criterion, optimizer, lr scheduler) must already be migrated to the hydra interface (see the section below).
- Provide the top-level choices of which generic parameter file and which modules to use in `config/config.yaml`; for example:
```
defaults:
- params: training_params
- task: language_modeling
- model: transformer_lm
- criterion: cross_entropy
- optimizer: adam
- lr_scheduler: inverse_sqrt
```
- Provide generic parameters common across different training jobs: `config/params/training_params.yaml`
- Provide task parameters: `config/task/language_modeling.yaml`
- Provide model parameters: `config/model/transformer_lm.yaml`
- Provide criterion parameters: `config/criterion/cross_entropy.yaml`
- Provide optimizer parameters: `config/optimizer/adam.yaml`
- Provide lr_scheduler parameters: `config/lr_scheduler/inverse_sqrt.yaml`
#### Command line overriding
`train_hydra.py` is the main entry point for training with the hydra interface. If we specify all the parameters we want in `.yaml` files, we can simply run:
```
# task.data is a required field, marked by `???` in the yaml
python fairseq_cli/train_hydra.py \
task.data=/private/home/abaevski/data/wiki103 \
```
Alternatively, if we need to override certain parameters from the command line, we can do so as below (note where each parameter sits in the config structure):
```
python fairseq_cli/train_hydra.py \
params=training_params \
task=language_modeling \
task.data=/private/home/abaevski/data/wiki103 \
task.tokens_per_sample=512 \
task.sample_break_mode=none \
model=transformer_lm \
model.share_decoder_input_output_embed=true \
model.dropout=0.1 \
optimizer=adam \
optimizer.adam_betas="'(0.9, 0.98)'" \
optimizer.weight_decay=0.01 \
lr_scheduler=inverse_sqrt \
lr_scheduler.warmup_updates=4000 \
lr_scheduler.warmup_init_lr=1e-07 \
criterion=cross_entropy \
params.common.fp16=true \
params.common.log_format=json \
params.common.log_interval=1 \
params.dataset.max_tokens=1024 \
params.dataset.num_workers=4 \
params.optimization.update_freq=[16] \
params.optimization.max_update=50000 \
params.optimization.clip_norm=0.0 \
params.optimization.lr=[0.0005] \
params.checkpoint.save_dir=/checkpoint/mtian/transformer_wikitext-103-hydra-args-cli \
params.checkpoint.save_interval_updates=10
```
## Migrating existing modules or creating new ones with the hydra interface
In each module that we want to migrate to (or create with) the hydra interface, we fundamentally need to:
- Provide a dataclass that lays out the parameters used in the module.
- Modify the builder and/or constructor that previously took an `argparse.Namespace` argument `args` so that it takes an `omegaconf.DictConfig` config object instead. For now we accept `Union[omegaconf.DictConfig, argparse.Namespace]` to preserve backward compatibility.
- For `add_args()`, extract the arguments from the dataclass defined in the same file and append them to `parser`. This also preserves backward compatibility and is handled by the `gen_parser_from_dataclass` API; see the sketch below and the migrated example files.
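A minimal sketch of this pattern (illustrative only, not fairseq's actual code; the real implementation relies on `gen_parser_from_dataclass` and handles many more cases):
```python
import argparse
from dataclasses import dataclass, fields
from typing import Union

from omegaconf import DictConfig


@dataclass
class MyCriterionConfig:
    # parameters this module needs, with types and defaults in one place
    label_smoothing: float = 0.0
    report_accuracy: bool = False


class MyCriterion:
    def __init__(self, cfg: Union[DictConfig, argparse.Namespace]):
        # Namespace and DictConfig both support attribute access, so the body
        # can read parameters the same way during the migration period.
        self.label_smoothing = cfg.label_smoothing
        self.report_accuracy = cfg.report_accuracy

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        # simplified stand-in for gen_parser_from_dataclass(): expose each
        # dataclass field as a legacy command-line flag
        for f in fields(MyCriterionConfig):
            name = "--" + f.name.replace("_", "-")
            if f.type is bool:
                parser.add_argument(name, action="store_true", default=f.default)
            else:
                parser.add_argument(name, type=f.type, default=f.default)
```
With this in place, the module can be constructed either from a hydra/omegaconf config (e.g. `OmegaConf.structured(MyCriterionConfig())`) or from legacy parsed arguments.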
#### Migrated examples:
- Task: `fairseq/tasks/language_modeling.py`
- Model: `fairseq/models/transformer_lm.py`
- Criterion: `fairseq/criterions/adaptive_loss.py` and `fairseq/criterions/cross_entropy.py`
- Optimizer: `fairseq/optim/adam.py` and `fairseq/optim/nag.py`
- LR scheduler: `fairseq/optim/lr_scheduler/cosine_lr_scheduler.py` and `fairseq/optim/lr_scheduler/inverse_square_root_schedule.py`
## Interpolate parameters across different places
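OmegaConf/hydra interpolation (`${...}`) lets one config value reference another, as in `max_tokens_valid: ${params.dataset.max_tokens}` in the params files above; the reference is resolved when the config is accessed, so the two values stay in sync. A small illustrative example (plain omegaconf, not fairseq code):
```python
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "params": {
            "dataset": {
                "batch_size": 16,
                # interpolations reference other nodes of the same config tree
                "batch_size_valid": "${params.dataset.batch_size}",
            }
        }
    }
)

print(cfg.params.dataset.batch_size_valid)  # -> 16, resolved on access
```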
## Support for the legacy interface
If you would still like to pass legacy-style arguments on the command line, `fairseq_cli/train.py` supports this. Internally it converts `args` into hydra config objects wherever migrated modules are available.
```
python fairseq_cli/train.py --task language_modeling \
/private/home/abaevski/data/wiki103 \
--save-dir /checkpoint/mtian/transformer_wikitext-103-hydra-args-cli \
--arch transformer_lm --share-decoder-input-output-embed \
--dropout 0.1 \
--optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \
--lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--tokens-per-sample 512 --sample-break-mode none \
--max-tokens 1024 --update-freq 16 \
--fp16 \
--max-update 50000 --log-format json --log-interval 1 --num-workers 4 \
--save-interval-updates 10
```
.. fairseq documentation master file, created by
sphinx-quickstart on Fri Aug 17 21:45:30 2018.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
:github_url: https://github.com/pytorch/fairseq
fairseq documentation
=====================
Fairseq is a sequence modeling toolkit written in `PyTorch
<http://pytorch.org/>`_ that allows researchers and developers to
train custom models for translation, summarization, language modeling and other
text generation tasks.
.. toctree::
:maxdepth: 1
:caption: Getting Started
getting_started
command_line_tools
.. toctree::
:maxdepth: 1
:caption: Extending Fairseq
overview
tutorial_simple_lstm
tutorial_classifying_names
.. toctree::
:maxdepth: 2
:caption: Library Reference
tasks
models
criterions
optim
lr_scheduler
data
modules
Indices and tables
==================
* :ref:`genindex`
* :ref:`search`
.. role:: hidden
:class: hidden-section
.. _Learning Rate Schedulers:
Learning Rate Schedulers
========================
Learning Rate Schedulers update the learning rate over the course of training.
Learning rates can be updated after each update via :func:`step_update` or at
epoch boundaries via :func:`step`.
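For intuition, the toy class below (a standalone sketch, not fairseq's implementation) exposes both hooks for an inverse square root schedule with linear warmup:

.. code-block:: python

    class ToyInverseSqrtSchedule:
        def __init__(self, base_lr=5e-4, warmup_updates=4000, warmup_init_lr=1e-7):
            self.base_lr = base_lr
            self.warmup_updates = warmup_updates
            self.warmup_init_lr = warmup_init_lr
            self.lr = warmup_init_lr

        def step_update(self, num_updates):
            # called after every optimizer update
            if num_updates < self.warmup_updates:
                # linear warmup from warmup_init_lr up to base_lr
                frac = num_updates / self.warmup_updates
                self.lr = self.warmup_init_lr + frac * (self.base_lr - self.warmup_init_lr)
            else:
                # decay proportionally to the inverse square root of the update number
                self.lr = self.base_lr * (self.warmup_updates / num_updates) ** 0.5
            return self.lr

        def step(self, epoch, val_loss=None):
            # called at epoch boundaries; this schedule ignores the validation loss
            return self.lr
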
.. automodule:: fairseq.optim.lr_scheduler
:members:
.. autoclass:: fairseq.optim.lr_scheduler.FairseqLRScheduler
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.cosine_lr_scheduler.CosineSchedule
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.fixed_schedule.FixedSchedule
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.inverse_square_root_schedule.InverseSquareRootSchedule
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.reduce_lr_on_plateau.ReduceLROnPlateau
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.triangular_lr_scheduler.TriangularSchedule
:members:
:undoc-members:
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=python -msphinx
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=fairseq
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The Sphinx module was not found. Make sure you have Sphinx installed,
echo.then set the SPHINXBUILD environment variable to point to the full
echo.path of the 'sphinx-build' executable. Alternatively you may add the
echo.Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd