initial commit

18d27e00 · wangwei990215 · 541f4c7a · 18d27e00 · 18d27e00 · 18d27e00
Commit 18d27e00 authored Aug 27, 2024 by wangwei990215
20 changed files
--- a/fairseq/config/model/transformer_lm_big.yaml
+++ b/fairseq/config/model/transformer_lm_big.yaml
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.0
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/fairseq/config/model/transformer_lm_gbw.yaml
+++ b/fairseq/config/model/transformer_lm_gbw.yaml
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/fairseq/config/model/transformer_lm_gpt.yaml
+++ b/fairseq/config/model/transformer_lm_gpt.yaml
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 768
+decoder_output_dim: 768
+decoder_input_dim: 768
+decoder_ffn_embed_dim: 3072
+decoder_layers: 12
+decoder_attention_heads: 12
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/fairseq/config/model/transformer_lm_gpt2_big.yaml
+++ b/fairseq/config/model/transformer_lm_gpt2_big.yaml
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1600
+decoder_output_dim: 1600
+decoder_input_dim: 1600
+decoder_ffn_embed_dim: 6400
+decoder_layers: 48
+decoder_attention_heads: 25
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/fairseq/config/model/transformer_lm_gpt2_medium.yaml
+++ b/fairseq/config/model/transformer_lm_gpt2_medium.yaml
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1280
+decoder_output_dim: 1280
+decoder_input_dim: 1280
+decoder_ffn_embed_dim: 5120
+decoder_layers: 36
+decoder_attention_heads: 20
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/fairseq/config/model/transformer_lm_gpt2_small.yaml
+++ b/fairseq/config/model/transformer_lm_gpt2_small.yaml
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 24
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/fairseq/config/model/transformer_lm_wiki103.yaml
+++ b/fairseq/config/model/transformer_lm_wiki103.yaml
+# @package _group_
+activation_fn: "relu"
+dropout: 0.3
+attention_dropout: 0.1
+activation_dropout: 0.1
+relu_dropout: 0.1
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 16
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: "20000,60000"
+adaptive_softmax_dropout: 0.2
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: true
+adaptive_input_factor: 4
+adaptive_input_cutoff: "20000,60000"
+tie_adaptive_weights: true
+tie_adaptive_proj: true
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/fairseq/config/optimizer/adam.yaml
+++ b/fairseq/config/optimizer/adam.yaml
+# @package _group_
+adam_betas: "(0.9, 0.999)"
+adam_eps: 1.0e-8
+weight_decay: 0
+use_old_adam: false
--- a/fairseq/config/optimizer/nag.yaml
+++ b/fairseq/config/optimizer/nag.yaml
+# @package _group_
+momentum: 0.99
+weight_decay: 0.0
--- a/fairseq/config/params/eval_lm_params.yaml
+++ b/fairseq/config/params/eval_lm_params.yaml
+# @package _group_
+common:
+  no_progress_bar: false
+  log_interval: 100
+  log_format: null
+  tensorboard_logdir: null
+  seed: 1
+  cpu: false
+  fp16: false
+  memory_efficient_fp16: false
+  fp16_no_flatten_grads: false
+  fp16_init_scale: 128
+  fp16_scale_window: null
+  fp16_scale_tolerance: 0.0
+  min_loss_scale: 1.0e-4
+  threshold_loss_scale: null
+  user_dir: null
+  empty_cache_freq: 0
+  all_gather_list_size: 16384
+  model_parallel_size: 1
+  checkpoint_suffix: ""
+  quantization_config_path: null
+distributed_training:
+  distributed_rank: 0
+  distributed_backend: "nccl"
+  distributed_init_method: null
+  distributed_port: -1
+  device_id: 0
+  local_rank: 0
+  distributed_no_spawn: false
+  ddp_backend: "c10d"
+  bucket_cap_mb: 25
+  fix_batches_to_gpus: false
+  find_unused_parameters: false
+  fast_stat_sync: false
+  broadcast_buffers: false
+  distributed_wrapper: "DDP"
+  slowmo_momentum: null
+  slowmo_algorithm: "LocalSGD"
+  localsgd_frequency: 3
+dataset:
+  num_workers: 1
+  skip_invalid_size_inputs_valid_test: false
+  max_tokens: null
+  batch_size: ${params.dataset.batch_size}
+  required_batch_size_multiple: 8
+  dataset_impl: null
+  data_buffer_size: 10
+  train_subset: "train"
+  valid_subset: "valid"
+  validate_interval: 1
+  fixed_validation_seed: null
+  disable_validation: false
+  curriculum: 0
+  gen_subset: "test"
+  num_shards: 1
+  shard_id: 0
+  max_tokens_valid: ${params.dataset.max_tokens}
+  batch_size_valid: ${params.dataset.batch_size}
+optimization:
+  max_epoch: 0
+  max_update: 0
+  clip_norm: 25.0
+  sentence_avg: false
+  update_freq: [1]
+  lr: [0.25]
+  min_lr: -1.0
+  use_bmuf: false
+checkpoint:
+  save_dir: "checkpoints"
+  restore_file: "checkpoint_last.pt"
+  reset_dataloader: false
+  reset_lr_scheduler: false
+  reset_meters: false
+  reset_optimizer: false
+  optimizer_overrides: "{}"
+  save_interval: 1
+  save_interval_updates: 0
+  keep_interval_updates: -1
+  keep_last_epochs: -1
+  keep_best_checkpoints: -1
+  no_save: false
+  no_epoch_checkpoints: false
+  no_last_checkpoints: false
+  no_save_optimizer_state: false
+  best_checkpoint_metric: "loss"
+  maximize_best_checkpoint_metric: false
+  patience: -1
+common_eval:
+  path: null
+  remove_bpe: null
+  quiet: false
+  model_overrides: "{}"
+  results_path: null
+eval_lm:
+  output_word_probs: false
+  output_word_stats: false
+  context_window: 0
+bmuf:
+  block_lr: 1
+  block_momentum: 0.875
+  global_sync_iter: 50
+  warmup_iterations: 500
+  use_nbm: false
+  average_sync: false
--- a/fairseq/config/params/training_params.yaml
+++ b/fairseq/config/params/training_params.yaml
+# @package _group_
+common:
+  no_progress_bar: false
+  log_interval: 100
+  log_format: null
+  tensorboard_logdir: null
+  seed: 1
+  cpu: false
+  fp16: false
+  memory_efficient_fp16: false
+  fp16_no_flatten_grads: false
+  fp16_init_scale: 128
+  fp16_scale_window: null
+  fp16_scale_tolerance: 0.0
+  min_loss_scale: 1.0e-4
+  threshold_loss_scale: null
+  user_dir: null
+  empty_cache_freq: 0
+  all_gather_list_size: 16384
+  model_parallel_size: 1
+  checkpoint_suffix: ""
+  quantization_config_path: null
+distributed_training:
+  distributed_rank: 0
+  distributed_backend: "nccl"
+  distributed_init_method: null
+  distributed_port: -1
+  device_id: 0
+  local_rank: 0
+  distributed_no_spawn: false
+  ddp_backend: "c10d"
+  bucket_cap_mb: 25
+  fix_batches_to_gpus: false
+  find_unused_parameters: false
+  fast_stat_sync: false
+  broadcast_buffers: false
+  distributed_wrapper: "DDP"
+  slowmo_momentum: null
+  slowmo_algorithm: "LocalSGD"
+  localsgd_frequency: 3
+dataset:
+  num_workers: 1
+  skip_invalid_size_inputs_valid_test: false
+  max_tokens: null
+  batch_size: ${params.dataset.batch_size}
+  required_batch_size_multiple: 8
+  dataset_impl: null
+  data_buffer_size: 10
+  train_subset: "train"
+  valid_subset: "valid"
+  validate_interval: 1
+  fixed_validation_seed: null
+  disable_validation: false
+  curriculum: 0
+  gen_subset: "test"
+  num_shards: 1
+  shard_id: 0
+  max_tokens_valid: ${params.dataset.max_tokens}
+  batch_size_valid: ${params.dataset.batch_size}
+optimization:
+  max_epoch: 0
+  max_update: 0
+  clip_norm: 25.0
+  sentence_avg: false
+  update_freq: [1]
+  lr: [0.25]
+  min_lr: -1.0
+  use_bmuf: false
+checkpoint:
+  save_dir: "checkpoints"
+  restore_file: "checkpoint_last.pt"
+  reset_dataloader: false
+  reset_lr_scheduler: false
+  reset_meters: false
+  reset_optimizer: false
+  optimizer_overrides: "{}"
+  save_interval: 1
+  save_interval_updates: 0
+  keep_interval_updates: -1
+  keep_last_epochs: -1
+  keep_best_checkpoints: -1
+  no_save: false
+  no_epoch_checkpoints: false
+  no_last_checkpoints: false
+  no_save_optimizer_state: false
+  best_checkpoint_metric: "loss"
+  maximize_best_checkpoint_metric: false
+  patience: -1
+bmuf:
+  block_lr: 1
+  block_momentum: 0.875
+  global_sync_iter: 50
+  warmup_iterations: 500
+  use_nbm: false
+  average_sync: false
--- a/fairseq/config/task/language_modeling.yaml
+++ b/fairseq/config/task/language_modeling.yaml
+# @package _group_
+data: ???
+sample_break_mode: "none"
+tokens_per_sample: 1024
+output_dictionary_size: -1
+self_target: false
+future_target: false
+past_target: false
+add_bos_token: false
+max_target_positions: null
--- a/fairseq/docs/Makefile
+++ b/fairseq/docs/Makefile
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = python -msphinx
+SPHINXPROJ    = fairseq
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
--- a/fairseq/docs/_static/theme_overrides.css
+++ b/fairseq/docs/_static/theme_overrides.css
+.wy-table-responsive table td kbd {
+    white-space: nowrap;
+}
+.wy-table-responsive table td {
+    white-space: normal !important;
+}
+.wy-table-responsive {
+    overflow: visible !important;
+}
--- a/fairseq/docs/command_line_tools.rst
+++ b/fairseq/docs/command_line_tools.rst
+.. _Command-line Tools:
+
+Command-line Tools
+==================
+
+Fairseq provides several command-line tools for training and evaluating models:
+
+- :ref:`fairseq-preprocess`: Data pre-processing: build vocabularies and binarize training data
+- :ref:`fairseq-train`: Train a new model on one or multiple GPUs
+- :ref:`fairseq-generate`: Translate pre-processed data with a trained model
+- :ref:`fairseq-interactive`: Translate raw text with a trained model
+- :ref:`fairseq-score`: BLEU scoring of generated translations against reference translations
+- :ref:`fairseq-eval-lm`: Language model evaluation
+
+
+.. _fairseq-preprocess:
+
+fairseq-preprocess
+~~~~~~~~~~~~~~~~~~
+.. automodule:: fairseq_cli.preprocess
+
+    .. argparse::
+        :module: fairseq.options
+        :func: get_preprocessing_parser
+        :prog: fairseq-preprocess
+
+
+.. _fairseq-train:
+
+fairseq-train
+~~~~~~~~~~~~~
+.. automodule:: fairseq_cli.train
+
+    .. argparse::
+        :module: fairseq.options
+        :func: get_training_parser
+        :prog: fairseq-train
+
+
+.. _fairseq-generate:
+
+fairseq-generate
+~~~~~~~~~~~~~~~~
+.. automodule:: fairseq_cli.generate
+
+    .. argparse::
+        :module: fairseq.options
+        :func: get_generation_parser
+        :prog: fairseq-generate
+
+
+.. _fairseq-interactive:
+
+fairseq-interactive
+~~~~~~~~~~~~~~~~~~~
+.. automodule:: fairseq_cli.interactive
+
+    .. argparse::
+        :module: fairseq.options
+        :func: get_interactive_generation_parser
+        :prog: fairseq-interactive
+
+
+.. _fairseq-score:
+
+fairseq-score
+~~~~~~~~~~~~~
+.. automodule:: fairseq_cli.score
+
+    .. argparse::
+        :module: fairseq_cli.score
+        :func: get_parser
+        :prog: fairseq-score
+
+
+.. _fairseq-eval-lm:
+
+fairseq-eval-lm
+~~~~~~~~~~~~~~~
+.. automodule:: fairseq_cli.eval_lm
+
+    .. argparse::
+        :module: fairseq.options
+        :func: get_eval_lm_parser
+        :prog: fairseq-eval-lm
--- a/fairseq/docs/conf.py
+++ b/fairseq/docs/conf.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# fairseq documentation build configuration file, created by
+# sphinx-quickstart on Fri Aug 17 21:45:30 2018.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+
+import os
+import sys
+
+
+# source code directory, relative to this file, for sphinx-autobuild
+sys.path.insert(0, os.path.abspath(".."))
+
+source_suffix = [".rst"]
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.napoleon",
+    "sphinxarg.ext",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# The master toctree document.
+master_doc = "index"
+
+# General information about the project.
+project = "fairseq"
+copyright = "2019, Facebook AI Research (FAIR)"
+author = "Facebook AI Research (FAIR)"
+
+github_doc_root = "https://github.com/pytorch/fairseq/tree/master/docs/"
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = "0.10.2"
+# The full version, including alpha/beta/rc tags.
+release = "0.10.2"
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+highlight_language = "python"
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+html_context = {
+    "css_files": [
+        "_static/theme_overrides.css",  # override wide tables in RTD theme
+    ],
+}
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# This is required for the alabaster theme
+# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
+# html_sidebars = {
+#    '**': [
+#        'about.html',
+#        'navigation.html',
+#        'relations.html',  # needs 'show_related': True theme option to display
+#        'searchbox.html',
+#        'donate.html',
+#    ]
+# }
+
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {
+    "numpy": ("http://docs.scipy.org/doc/numpy/", None),
+    "python": ("https://docs.python.org/", None),
+    "torch": ("https://pytorch.org/docs/master/", None),
+}
--- a/fairseq/docs/criterions.rst
+++ b/fairseq/docs/criterions.rst
+.. role:: hidden
+    :class: hidden-section
+
+.. _Criterions:
+
+Criterions
+==========
+
+Criterions compute the loss function given the model and batch, roughly::
+
+  loss = criterion(model, batch)
+
+.. automodule:: fairseq.criterions
+    :members:
+
+.. autoclass:: fairseq.criterions.FairseqCriterion
+    :members:
+    :undoc-members:
+
+.. autoclass:: fairseq.criterions.adaptive_loss.AdaptiveLoss
+    :members:
+    :undoc-members:
+.. autoclass:: fairseq.criterions.composite_loss.CompositeLoss
+    :members:
+    :undoc-members:
+.. autoclass:: fairseq.criterions.cross_entropy.CrossEntropyCriterion
+    :members:
+    :undoc-members:
+.. autoclass:: fairseq.criterions.label_smoothed_cross_entropy.LabelSmoothedCrossEntropyCriterion
+    :members:
+    :undoc-members:
--- a/fairseq/docs/data.rst
+++ b/fairseq/docs/data.rst
+.. role:: hidden
+    :class: hidden-section
+
+.. module:: fairseq.data
+
+Data Loading and Utilities
+==========================
+
+.. _datasets:
+
+Datasets
+--------
+
+**Datasets** define the data format and provide helpers for creating
+mini-batches.
+
+.. autoclass:: fairseq.data.FairseqDataset
+    :members:
+.. autoclass:: fairseq.data.LanguagePairDataset
+    :members:
+.. autoclass:: fairseq.data.MonolingualDataset
+    :members:
+
+**Helper Datasets**
+
+These datasets wrap other :class:`fairseq.data.FairseqDataset` instances and
+provide additional functionality:
+
+.. autoclass:: fairseq.data.BacktranslationDataset
+    :members:
+.. autoclass:: fairseq.data.ConcatDataset
+    :members:
+.. autoclass:: fairseq.data.ResamplingDataset
+    :members:
+.. autoclass:: fairseq.data.RoundRobinZipDatasets
+    :members:
+.. autoclass:: fairseq.data.TransformEosDataset
+    :members:
+
+
+Dictionary
+----------
+
+.. autoclass:: fairseq.data.Dictionary
+    :members:
+
+
+Iterators
+---------
+
+.. autoclass:: fairseq.data.CountingIterator
+    :members:
+.. autoclass:: fairseq.data.EpochBatchIterator
+    :members:
+.. autoclass:: fairseq.data.GroupedIterator
+    :members:
+.. autoclass:: fairseq.data.ShardedIterator
+    :members:
--- a/fairseq/docs/docutils.conf
+++ b/fairseq/docs/docutils.conf
+[writers]
+option-limit=0
--- a/fairseq/docs/fairseq.gif
+++ b/fairseq/docs/fairseq.gif