Commit f2563c21 authored by Myle Ott, committed by Facebook GitHub Bot

Cleanup LM + Flake8

Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/720

Differential Revision: D15259091

Pulled By: myleott

fbshipit-source-id: 06a35996c06ccddb49fdc9e01e348ff3c9da334e
parent eddcdf08
@@ -10,6 +10,7 @@ from typing import Any, Dict
from fairseq import checkpoint_utils
from fairseq.data.masked_lm_dictionary import MaskedLMDictionary
+from fairseq.models import register_model, register_model_architecture
from fairseq.models.transformer import (
    TransformerDecoder,
    TransformerEncoder,
@@ -17,8 +18,6 @@ from fairseq.models.transformer import (
    base_architecture as transformer_base_architecture,
)
-from . import register_model, register_model_architecture


@register_model("transformer_from_pretrained_xlm")
class TransformerFromPretrainedXLMModel(TransformerModel):
......
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

from fairseq import options
from fairseq.models import (
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.transformer import (
    Embedding,
    TransformerDecoder,
)
from fairseq.modules import (
    AdaptiveInput,
    CharacterTokenEmbedder,
)


@register_model('transformer_lm')
class TransformerLanguageModel(FairseqLanguageModel):

    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
                            help='dropout probability for attention weights')
        parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
                            help='dropout probability after ReLU in FFN')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                            help='decoder output dimension')
        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                            help='decoder input dimension')
        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension for FFN')
        parser.add_argument('--decoder-layers', type=int, metavar='N',
                            help='num decoder layers')
        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                            help='num decoder attention heads')
        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
                            help='apply layernorm before each decoder block')
        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive softmax cutoff points. '
                                 'Must be used with adaptive_loss criterion')
        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                            help='sets adaptive softmax dropout for the tail projections')
        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                            help='adaptive input factor')
        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                            help='if set, disables positional embeddings (outside self attention)')
        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
                            help='share decoder input and output embeddings')
        parser.add_argument('--character-embeddings', default=False, action='store_true',
                            help='if set, uses character embedding convolutions to produce token embeddings')
        parser.add_argument('--character-filters', type=str, metavar='LIST',
                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                            help='size of character embeddings')
        parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
                            help='size of character embeddings')
        parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
                            help='number of highway layers for character token embeddder')
        parser.add_argument('--adaptive-input', action='store_true',
                            help='if set, uses adaptive input')
        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                            help='adaptive input factor')
        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive input cutoff points.')
        parser.add_argument('--tie-adaptive-weights', action='store_true',
                            help='if set, ties the weights of adaptive softmax and adaptive input')
        parser.add_argument('--tie-adaptive-proj', action='store_true',
                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
        parser.add_argument('--decoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the decoder')
        # fmt: on

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if hasattr(args, 'no_tie_adaptive_proj') and args.no_tie_adaptive_proj is False:
            # backward compatibility
            args.tie_adaptive_proj = True

        if not hasattr(args, 'max_source_positions'):
            args.max_source_positions = args.tokens_per_sample
        if not hasattr(args, 'max_target_positions'):
            args.max_target_positions = args.tokens_per_sample

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.dictionary, eval(args.character_filters),
                args.character_embedding_dim, args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
                args.adaptive_input_factor, args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())

        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = TransformerDecoder(
            args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False,
        )
        return TransformerLanguageModel(decoder)


@register_model_architecture('transformer_lm', 'transformer_lm')
def base_lm_architecture(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
    args.decoder_layers = getattr(args, 'decoder_layers', 6)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
    args.activation_fn = getattr(args, 'activation_fn', 'relu')
    args.add_bos_token = getattr(args, 'add_bos_token', False)
    args.character_embeddings = getattr(args, 'character_embeddings', False)
    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
    # The model training is not stable without this
    args.decoder_normalize_before = True
    args.adaptive_input = getattr(args, 'adaptive_input', False)
    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)


@register_model_architecture('transformer_lm', 'transformer_lm_big')
def transformer_lm_big(args):
    args.decoder_layers = getattr(args, 'decoder_layers', 12)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
    base_lm_architecture(args)


@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
def transformer_lm_wiki103(args):
    args.decoder_layers = getattr(args, 'decoder_layers', 16)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
    args.dropout = getattr(args, 'dropout', 0.3)
    args.adaptive_input = getattr(args, 'adaptive_input', True)
    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', True)
    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', '20000,60000')
    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '20000,60000')
    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0.2)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.activation_dropout = getattr(args, 'activation_dropout', 0.1)
    transformer_lm_big(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
def transformer_lm_gbw(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    transformer_lm_big(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gpt')
def transformer_lm_gpt(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072)
    args.decoder_layers = getattr(args, 'decoder_layers', 12)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 12)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
    base_lm_architecture(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_small')
def transformer_lm_gpt2_small(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
    args.decoder_layers = getattr(args, 'decoder_layers', 24)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
    base_lm_architecture(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_medium')
def transformer_lm_gpt2_medium(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1280)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 5120)
    args.decoder_layers = getattr(args, 'decoder_layers', 36)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 20)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
    base_lm_architecture(args)


@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_big')
def transformer_lm_gpt2_big(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1600)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 6400)
    args.decoder_layers = getattr(args, 'decoder_layers', 48)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 25)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
    base_lm_architecture(args)
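Note: the architecture functions above only fill in defaults for attributes the user has not already set, so anything passed on the command line wins. A minimal sketch of that getattr-based layering, using a hypothetical helper and Namespace (illustrative only, not fairseq code):

from argparse import Namespace

def apply_defaults(args, **defaults):
    # Same pattern as base_lm_architecture & friends: keep any value already
    # present on args, otherwise fall back to the architecture default.
    for name, value in defaults.items():
        setattr(args, name, getattr(args, name, value))
    return args

args = Namespace(decoder_layers=6)                      # user-supplied override
apply_defaults(args, decoder_layers=12, decoder_embed_dim=768)
print(args.decoder_layers, args.decoder_embed_dim)      # -> 6 768

At training time the registered names are what --arch selects with the language_modeling task (e.g. --arch transformer_lm_gpt).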
@@ -150,7 +150,7 @@ class DynamicConv1dTBC(nn.Module):
        weight = F.dropout(weight, self.weight_dropout, training=self.training, inplace=False)
        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
        output = output.view(T, B, C)
        return output
@@ -195,7 +195,7 @@ class DynamicConv1dTBC(nn.Module):
        # turn the convolution filters into band matrices
        weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False)
        weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight)
        weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
        output = torch.bmm(weight_expanded, x)
        output = output.transpose(0, 1).contiguous().view(T, B, C)
......
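Aside on the band-matrix hunk above: as_strided scatters the K filter taps of row t into columns t..t+K-1 of a (B*H, T, T+K-1) buffer, and narrow then slices out a T x T band. A self-contained check with small, assumed sizes (BH=1, T=5, K=3, P=K-1), illustrative only:

import torch

BH, T, K = 1, 5, 3
P = K - 1  # left padding, as used for a causal convolution
weight = torch.arange(BH * T * K, dtype=torch.float).view(BH, T, K)

expanded = weight.new_zeros(BH, T, T + K - 1)
# view element (b, t, k) aliases buffer position (b, t, t + k)
expanded.as_strided((BH, T, K), (T * (T + K - 1), T + K, 1)).copy_(weight)
band = expanded.narrow(2, P, T)  # BH x T x T

# naive reference: tap k of row t lands at column t - P + k
ref = torch.zeros(BH, T, T)
for t in range(T):
    for k in range(K):
        col = t - P + k
        if 0 <= col < T:
            ref[0, t, col] = weight[0, t, k]
assert torch.equal(band, ref)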
@@ -6,7 +6,6 @@
# can be found in the PATENTS file in the same directory.

import torch
-import torch.nn.functional as F
from torch import nn
@@ -50,6 +49,6 @@ class Highway(torch.nn.Module):
            projection = layer(x)
            proj_x, gate = projection.chunk(2, dim=-1)
            proj_x = self.activation(proj_x)
-            gate = F.sigmoid(gate)
+            gate = torch.sigmoid(gate)
            x = gate * x + (gate.new_tensor([1]) - gate) * proj_x
        return x
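The Highway change above swaps the deprecated F.sigmoid for torch.sigmoid; the result is numerically identical. A minimal self-contained sketch of the gate computation (the dimensions and the ReLU activation are assumptions, not the module's exact code):

import torch
import torch.nn as nn

class TinyHighway(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.layer = nn.Linear(dim, dim * 2)  # one projection yields both value and gate

    def forward(self, x):
        proj_x, gate = self.layer(x).chunk(2, dim=-1)
        proj_x = torch.relu(proj_x)
        gate = torch.sigmoid(gate)            # was F.sigmoid (deprecated)
        return gate * x + (1 - gate) * proj_x

out = TinyHighway(8)(torch.randn(2, 8))      # (2, 8) -> (2, 8)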
@@ -5,14 +5,12 @@
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

-import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from fairseq import utils
-from .unfold import unfold1d
+from fairseq.modules.unfold import unfold1d


class LightweightConv1d(nn.Module):
@@ -182,7 +180,7 @@ class LightweightConv1dTBC(nn.Module):
        weight = weight.view(1, H, K).expand(T*B, H, K).contiguous().view(T*B*H, K, 1)
        weight = F.dropout(weight, self.weight_dropout, training=self.training)
        output = torch.bmm(x_unfold, weight)  # T*B*H x R x 1
        output = output.view(T, B, C)
        return output
......
@@ -5,12 +5,16 @@
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

-import math
import torch
import torch.nn as nn
import torch.nn.functional as F

-from fairseq.modules import gelu, MultiheadAttention, BertLayerNorm, LayerNorm
+from fairseq.modules import (
+    BertLayerNorm,
+    gelu,
+    LayerNorm,
+    MultiheadAttention,
+)


class TransformerSentenceEncoderLayer(nn.Module):
......
@@ -12,6 +12,13 @@ from .fairseq_optimizer import FairseqOptimizer
from .fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer

+__all__ = [
+    'FairseqOptimizer',
+    'FP16Optimizer',
+    'MemoryEfficientFP16Optimizer',
+]


OPTIMIZER_REGISTRY = {}
OPTIMIZER_CLASS_NAMES = set()
......
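The new __all__ only declares the package's public re-exports; OPTIMIZER_REGISTRY and OPTIMIZER_CLASS_NAMES are populated by a registration decorator defined elsewhere. A generic sketch of that pattern (illustrative, not fairseq's exact implementation):

OPTIMIZER_REGISTRY = {}
OPTIMIZER_CLASS_NAMES = set()

def register_optimizer(name):
    def register_optimizer_cls(cls):
        # refuse duplicate names, then record the class under its registry key
        if name in OPTIMIZER_REGISTRY:
            raise ValueError('Cannot register duplicate optimizer ({})'.format(name))
        OPTIMIZER_REGISTRY[name] = cls
        OPTIMIZER_CLASS_NAMES.add(cls.__name__)
        return cls
    return register_optimizer_cls

@register_optimizer('noop')
class NoOpOptimizer:
    def step(self):
        pass  # placeholder; a real optimizer updates parameters here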
@@ -78,7 +78,6 @@ class CrossLingualLMTask(FairseqTask):
            lang2id[lang] = id
        return lang2id

    @classmethod
    def load_dictionary(cls, filename):
        return MaskedLMDictionary.load(filename)
......
@@ -21,7 +21,7 @@ from fairseq.data import (
    TruncatedDictionary,
    indexed_dataset
)
-from . import FairseqTask, register_task
+from fairseq.tasks import FairseqTask, register_task


@register_task('language_modeling')
......
@@ -13,10 +13,8 @@ import torch
from fairseq import options, utils
from fairseq.data import (
-    BacktranslationDataset,
    Dictionary,
    LanguagePairDataset,
-    NoisingDataset,
    RoundRobinZipDatasets,
    TransformEosLangPairDataset,
    indexed_dataset,
......
@@ -12,9 +12,8 @@ from fairseq import options, utils
from fairseq.data import (
    ConcatDataset,
    data_utils,
-    Dictionary,
+    indexed_dataset,
    LanguagePairDataset,
-    indexed_dataset
)

from . import FairseqTask, register_task
@@ -26,8 +25,8 @@ class TranslationTask(FairseqTask):
    Translate from one (source) language to another (target) language.

    Args:
-        src_dict (Dictionary): dictionary for the source language
-        tgt_dict (Dictionary): dictionary for the target language
+        src_dict (~fairseq.data.Dictionary): dictionary for the source language
+        tgt_dict (~fairseq.data.Dictionary): dictionary for the target language

    .. note::
......
@@ -9,8 +9,8 @@ import contextlib
import torch

from fairseq import modules, utils
-from . import register_task
-from .translation import TranslationTask
+from fairseq.tasks import register_task
+from fairseq.tasks.translation import TranslationTask


@contextlib.contextmanager
......
@@ -11,12 +11,11 @@ Translate raw text with a trained model. Batches data on-the-fly.
from collections import namedtuple
import fileinput
-import sys

import torch

from fairseq import checkpoint_utils, options, tasks, utils
-from fairseq.sequence_generator import SequenceGenerator

Batch = namedtuple('Batch', 'ids src_tokens src_lengths')
Translation = namedtuple('Translation', 'src_str hypos pos_scores alignments')
......
@@ -10,7 +10,6 @@ Train a new model on one or across multiple GPUs.
"""

import collections
-import itertools
import math
import os
import random
@@ -140,7 +139,7 @@ def train(args, trainer, task, epoch_itr):
    """Train the model for one epoch."""
    # Update parameters every N batches
    update_freq = args.update_freq[epoch_itr.epoch - 1] \
        if epoch_itr.epoch <= len(args.update_freq) else args.update_freq[-1]

    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr(
......
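For reference, the update_freq expression touched above lets --update-freq be a per-epoch list; past the end of the list, the last value is reused. A tiny illustrative helper (hypothetical name, same logic):

def get_update_freq(update_freq, epoch):
    # epoch is 1-based, matching epoch_itr.epoch
    return update_freq[epoch - 1] if epoch <= len(update_freq) else update_freq[-1]

assert get_update_freq([4, 2, 1], epoch=2) == 2
assert get_update_freq([4, 2, 1], epoch=10) == 1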