Commit f2563c21 authored by Myle Ott, committed by Facebook Github Bot

Cleanup LM + Flake8

Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/720

Differential Revision: D15259091

Pulled By: myleott

fbshipit-source-id: 06a35996c06ccddb49fdc9e01e348ff3c9da334e
parent eddcdf08
@@ -10,6 +10,7 @@ from typing import Any, Dict

 from fairseq import checkpoint_utils
 from fairseq.data.masked_lm_dictionary import MaskedLMDictionary
+from fairseq.models import register_model, register_model_architecture
 from fairseq.models.transformer import (
     TransformerDecoder,
     TransformerEncoder,
@@ -17,8 +18,6 @@ from fairseq.models.transformer import (
     base_architecture as transformer_base_architecture,
 )

-from . import register_model, register_model_architecture

 @register_model("transformer_from_pretrained_xlm")
 class TransformerFromPretrainedXLMModel(TransformerModel):
...
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

from fairseq import options
from fairseq.models import (
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.transformer import (
    Embedding,
    TransformerDecoder,
)
from fairseq.modules import (
    AdaptiveInput,
    CharacterTokenEmbedder,
)


@register_model('transformer_lm')
class TransformerLanguageModel(FairseqLanguageModel):

    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
                            help='dropout probability for attention weights')
        parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
                            help='dropout probability after ReLU in FFN')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                            help='decoder output dimension')
        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                            help='decoder input dimension')
        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension for FFN')
        parser.add_argument('--decoder-layers', type=int, metavar='N',
                            help='num decoder layers')
        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                            help='num decoder attention heads')
        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
                            help='apply layernorm before each decoder block')
        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive softmax cutoff points. '
                                 'Must be used with adaptive_loss criterion')
        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                            help='sets adaptive softmax dropout for the tail projections')
        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                            help='adaptive softmax factor')
        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                            help='if set, disables positional embeddings (outside self attention)')
        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
                            help='share decoder input and output embeddings')
        parser.add_argument('--character-embeddings', default=False, action='store_true',
                            help='if set, uses character embedding convolutions to produce token embeddings')
        parser.add_argument('--character-filters', type=str, metavar='LIST',
                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                            help='sizes of character embedding convolution filters')
        parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
                            help='size of character embeddings')
        parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
                            help='number of highway layers for character token embedder')
        parser.add_argument('--adaptive-input', action='store_true',
                            help='if set, uses adaptive input')
        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                            help='adaptive input factor')
        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive input cutoff points.')
        parser.add_argument('--tie-adaptive-weights', action='store_true',
                            help='if set, ties the weights of adaptive softmax and adaptive input')
        parser.add_argument('--tie-adaptive-proj', action='store_true',
                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
        parser.add_argument('--decoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the decoder')
        # fmt: on

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if hasattr(args, 'no_tie_adaptive_proj') and args.no_tie_adaptive_proj is False:
            # backward compatibility
            args.tie_adaptive_proj = True

        if not hasattr(args, 'max_source_positions'):
            args.max_source_positions = args.tokens_per_sample
        if not hasattr(args, 'max_target_positions'):
            args.max_target_positions = args.tokens_per_sample

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.dictionary, eval(args.character_filters),
                args.character_embedding_dim, args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
                args.adaptive_input_factor, args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())

        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = TransformerDecoder(
            args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False,
        )
        return TransformerLanguageModel(decoder)


@register_model_architecture('transformer_lm', 'transformer_lm')
def base_lm_architecture(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
    args.decoder_layers = getattr(args, 'decoder_layers', 6)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
    args.activation_fn = getattr(args, 'activation_fn', 'relu')

    args.add_bos_token = getattr(args, 'add_bos_token', False)
    args.character_embeddings = getattr(args, 'character_embeddings', False)

    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)

    # The model training is not stable without this
    args.decoder_normalize_before = True

    args.adaptive_input = getattr(args, 'adaptive_input', False)
    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)

    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)


@register_model_architecture('transformer_lm', 'transformer_lm_big')
def transformer_lm_big(args):
    args.decoder_layers = getattr(args, 'decoder_layers', 12)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
    base_lm_architecture(args)


@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
def transformer_lm_wiki103(args):
    args.decoder_layers = getattr(args, 'decoder_layers', 16)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
    args.dropout = getattr(args, 'dropout', 0.3)
    args.adaptive_input = getattr(args, 'adaptive_input', True)
    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', True)
    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', '20000,60000')
    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '20000,60000')
    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0.2)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.activation_dropout = getattr(args, 'activation_dropout', 0.1)
    transformer_lm_big(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
def transformer_lm_gbw(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    transformer_lm_big(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gpt')
def transformer_lm_gpt(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072)
    args.decoder_layers = getattr(args, 'decoder_layers', 12)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 12)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
    base_lm_architecture(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_small')
def transformer_lm_gpt2_small(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
    args.decoder_layers = getattr(args, 'decoder_layers', 24)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
    base_lm_architecture(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_medium')
def transformer_lm_gpt2_medium(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1280)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 5120)
    args.decoder_layers = getattr(args, 'decoder_layers', 36)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 20)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
    base_lm_architecture(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_big')
def transformer_lm_gpt2_big(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1600)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 6400)
    args.decoder_layers = getattr(args, 'decoder_layers', 48)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 25)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
    base_lm_architecture(args)
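
The `@register_model_architecture` functions above form a cascade: each named architecture fills in a few hyperparameters with `getattr` (which only supplies a value when the user did not set one on the command line), then delegates to `base_lm_architecture` to backfill the rest. A minimal sketch of that defaulting behavior, using a bare `argparse.Namespace` with made-up values rather than fairseq's real CLI parser:

    # Sketch of the getattr-based defaulting used by the architecture functions above.
    # The Namespace contents are hypothetical; fairseq builds args from its own parser.
    from argparse import Namespace

    args = Namespace(decoder_layers=6)  # pretend the user passed --decoder-layers 6

    # An override only applies when the attribute is absent:
    args.decoder_layers = getattr(args, 'decoder_layers', 12)        # stays 6 (user-set)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)  # becomes 768 (default)

    print(args.decoder_layers, args.decoder_embed_dim)  # 6 768

This is why each architecture function calls its parent last: user-provided values win, then the named architecture's overrides, then the base defaults.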
@@ -6,7 +6,6 @@
 # can be found in the PATENTS file in the same directory.

 import torch
-import torch.nn.functional as F
 from torch import nn

@@ -50,6 +49,6 @@ class Highway(torch.nn.Module):
             projection = layer(x)
             proj_x, gate = projection.chunk(2, dim=-1)
             proj_x = self.activation(proj_x)
-            gate = F.sigmoid(gate)
+            gate = torch.sigmoid(gate)
             x = gate * x + (gate.new_tensor([1]) - gate) * proj_x
         return x
...
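
This hunk swaps the deprecated `F.sigmoid` for `torch.sigmoid`; the surrounding lines are the standard highway gating, which mixes the carried input with its transform as x = g * x + (1 - g) * proj_x. A self-contained sketch of that step under illustrative shapes (in the real module, `projection` comes from a `Linear(dim, 2 * dim)` layer):

    import torch

    torch.manual_seed(0)
    x = torch.randn(2, 8)                       # illustrative input
    projection = torch.randn(2, 16)             # stand-in for a Linear(dim, 2 * dim) output
    proj_x, gate = projection.chunk(2, dim=-1)  # split into transform and gate halves
    proj_x = torch.relu(proj_x)                 # nonlinearity on the transform half
    gate = torch.sigmoid(gate)                  # torch.sigmoid replaces deprecated F.sigmoid
    x = gate * x + (1 - gate) * proj_x          # highway mix: carry vs. transform

`(1 - gate)` here computes the same complement as the original's `gate.new_tensor([1]) - gate`.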
@@ -5,14 +5,12 @@
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.

-import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 from fairseq import utils
-from .unfold import unfold1d
+from fairseq.modules.unfold import unfold1d


 class LightweightConv1d(nn.Module):
...
@@ -5,12 +5,16 @@
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.

-import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

-from fairseq.modules import gelu, MultiheadAttention, BertLayerNorm, LayerNorm
+from fairseq.modules import (
+    BertLayerNorm,
+    gelu,
+    LayerNorm,
+    MultiheadAttention,
+)


 class TransformerSentenceEncoderLayer(nn.Module):
...
@@ -12,6 +12,13 @@ from .fairseq_optimizer import FairseqOptimizer
 from .fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer

+__all__ = [
+    'FairseqOptimizer',
+    'FP16Optimizer',
+    'MemoryEfficientFP16Optimizer',
+]
+

 OPTIMIZER_REGISTRY = {}
 OPTIMIZER_CLASS_NAMES = set()
...
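
The added `__all__` declares the re-exported optimizer classes explicitly, which keeps flake8 from flagging them as unused imports. The `OPTIMIZER_REGISTRY` in the context lines is a name-to-class map populated elsewhere in the package; a generic sketch of that decorator-based registry pattern (the `register_optimizer` body below is a simplified stand-in, not fairseq's actual implementation):

    # Simplified registry sketch; fairseq's real register_optimizer does more validation.
    OPTIMIZER_REGISTRY = {}

    def register_optimizer(name):
        def decorator(cls):
            OPTIMIZER_REGISTRY[name] = cls  # map CLI name to implementing class
            return cls
        return decorator

    @register_optimizer('sgd')
    class SGDOptimizer:
        pass

    print(OPTIMIZER_REGISTRY['sgd'])  # <class '__main__.SGDOptimizer'>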
@@ -78,7 +78,6 @@ class CrossLingualLMTask(FairseqTask):
             lang2id[lang] = id
         return lang2id

-
     @classmethod
     def load_dictionary(cls, filename):
         return MaskedLMDictionary.load(filename)
...
@@ -21,7 +21,7 @@ from fairseq.data import (
     TruncatedDictionary,
     indexed_dataset
 )

-from . import FairseqTask, register_task
+from fairseq.tasks import FairseqTask, register_task


 @register_task('language_modeling')
...
@@ -13,10 +13,8 @@ import torch

 from fairseq import options, utils
 from fairseq.data import (
-    BacktranslationDataset,
     Dictionary,
     LanguagePairDataset,
-    NoisingDataset,
     RoundRobinZipDatasets,
     TransformEosLangPairDataset,
     indexed_dataset,
...
@@ -12,9 +12,8 @@ from fairseq import options, utils
 from fairseq.data import (
     ConcatDataset,
     data_utils,
-    Dictionary,
+    indexed_dataset,
     LanguagePairDataset,
-    indexed_dataset
 )

 from . import FairseqTask, register_task
@@ -26,8 +25,8 @@ class TranslationTask(FairseqTask):
     Translate from one (source) language to another (target) language.

     Args:
-        src_dict (Dictionary): dictionary for the source language
-        tgt_dict (Dictionary): dictionary for the target language
+        src_dict (~fairseq.data.Dictionary): dictionary for the source language
+        tgt_dict (~fairseq.data.Dictionary): dictionary for the target language

     .. note::
...
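
The docstring change qualifies `Dictionary` with its full import path. In Sphinx (which fairseq uses to build its docs), a `~`-prefixed cross-reference links to the fully qualified target while rendering only the last component, so the docs still display `Dictionary`. A minimal docstring in the same style (the class and wording here are hypothetical, for illustration only):

    class ExampleTask:
        """A hypothetical task, for illustration only.

        Args:
            src_dict (~fairseq.data.Dictionary): dictionary for the source language
        """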
@@ -9,8 +9,8 @@ import contextlib
 import torch

 from fairseq import modules, utils
-from . import register_task
-from .translation import TranslationTask
+from fairseq.tasks import register_task
+from fairseq.tasks.translation import TranslationTask


 @contextlib.contextmanager
...
@@ -11,12 +11,11 @@ Translate raw text with a trained model. Batches data on-the-fly.
 from collections import namedtuple
 import fileinput
-import sys

 import torch

 from fairseq import checkpoint_utils, options, tasks, utils
-from fairseq.sequence_generator import SequenceGenerator


 Batch = namedtuple('Batch', 'ids src_tokens src_lengths')
 Translation = namedtuple('Translation', 'src_str hypos pos_scores alignments')
...
@@ -10,7 +10,6 @@ Train a new model on one or across multiple GPUs.
 """

 import collections
-import itertools
 import math
 import os
 import random
...