Commit f2563c21 authored by Myle Ott, committed by Facebook Github Bot

Cleanup LM + Flake8

Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/720

Differential Revision: D15259091

Pulled By: myleott

fbshipit-source-id: 06a35996c06ccddb49fdc9e01e348ff3c9da334e
parent eddcdf08
@@ -47,4 +47,5 @@ __all__ = [
'TokenBlockDataset',
'TransformEosDataset',
'TransformEosLangPairDataset',
'TruncatedDictionary',
]
@@ -10,6 +10,7 @@ import os
import numpy as np
from collections import Iterable
def infer_language_pair(path):
"""Infer language pair from filename: <split>.<lang1>-<lang2>.(...).idx"""
src, dst = None, None
@@ -182,7 +183,7 @@ def batch_by_size(
def process_bpe_symbol(sentence: str, bpe_symbol: str):
if bpe_symbol == 'sentencepiece':
sentence = sentence.replace(' ','').replace('\u2581', ' ').strip()
sentence = sentence.replace(' ', '').replace('\u2581', ' ').strip()
elif bpe_symbol is not None:
sentence = (sentence + ' ').replace(bpe_symbol, '').rstrip()
return sentence
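For reference, this is how process_bpe_symbol behaves on the two branches above (the input strings are made up for illustration; '@@ ' is the continuation marker conventionally produced by subword-nmt style BPE):

# SentencePiece: drop token-internal spaces, then turn the '\u2581'
# word-boundary marker back into a space.
process_bpe_symbol('\u2581Hel lo \u2581wor ld', 'sentencepiece')  # -> 'Hello world'

# Subword BPE: strip the continuation marker and the space that follows it.
process_bpe_symbol('Hel@@ lo wor@@ ld', '@@ ')  # -> 'Hello world'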
@@ -18,6 +18,7 @@ from fairseq.data import data_utils
class Dictionary(object):
"""A mapping from symbols to consecutive integers"""
def __init__(self, pad='<pad>', eos='</s>', unk='<unk>', bos='<s>'):
self.unk_word, self.pad_word, self.eos_word = unk, pad, eos
self.symbols = []
@@ -282,6 +283,7 @@ class Dictionary(object):
else:
merge_result(Dictionary._add_file_to_dictionary_single_worker(filename, tokenize, dict.eos_word))
class TruncatedDictionary(object):
def __init__(self, wrapped_dict, length):
......
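The body of the class is collapsed in this diff. As a rough sketch of what such a wrapper does (illustrative only, not necessarily the exact fairseq implementation):

class TruncatedDictionary(object):
    """Expose only the first `length` symbols of a wrapped dictionary,
    mapping everything past the cutoff to the unknown symbol."""

    def __init__(self, wrapped_dict, length):
        self.wrapped_dict = wrapped_dict
        self.length = min(len(wrapped_dict), length)

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        if i < self.length:
            return self.wrapped_dict[i]
        return self.wrapped_dict.unk()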
@@ -7,8 +7,6 @@
import itertools
import math
import queue
import threading
import numpy as np
import torch
......
@@ -8,8 +8,6 @@
import numpy as np
import torch
from fairseq import utils
from . import data_utils, FairseqDataset
......
@@ -10,7 +10,7 @@ import math
import numpy as np
import torch
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Tuple
from . import FairseqDataset, data_utils
......
@@ -9,19 +9,33 @@ import argparse
import importlib
import os
from .fairseq_decoder import FairseqDecoder # noqa: F401
from .fairseq_encoder import FairseqEncoder # noqa: F401
from .fairseq_incremental_decoder import FairseqIncrementalDecoder # noqa: F401
from .fairseq_decoder import FairseqDecoder
from .fairseq_encoder import FairseqEncoder
from .fairseq_incremental_decoder import FairseqIncrementalDecoder
from .fairseq_model import (
BaseFairseqModel,
FairseqModel, # noqa: F401
FairseqMultiModel, # noqa: F401
FairseqLanguageModel, # noqa: F401
FairseqEncoderModel, # noqa: F401
FairseqModel,
FairseqMultiModel,
FairseqLanguageModel,
FairseqEncoderModel,
)
from .composite_encoder import CompositeEncoder # noqa: F401
from .distributed_fairseq_model import DistributedFairseqModel # noqa: F401
from .composite_encoder import CompositeEncoder
from .distributed_fairseq_model import DistributedFairseqModel
__all__ = [
'BaseFairseqModel',
'CompositeEncoder',
'DistributedFairseqModel',
'FairseqDecoder',
'FairseqEncoder',
'FairseqEncoderModel',
'FairseqIncrementalDecoder',
'FairseqLanguageModel',
'FairseqModel',
'FairseqMultiModel',
]
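Listing the re-exports in __all__ has two effects: pyflakes now treats the imported names as used, so the per-line '# noqa: F401' suppressions above can be dropped, and wildcard imports export exactly this set. For example:

from fairseq.models import *  # exports only the names listed in __all__
issubclass(FairseqLanguageModel, BaseFairseqModel)  # True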
MODEL_REGISTRY = {}
......
@@ -5,7 +5,7 @@
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from . import FairseqEncoder
from fairseq.models import FairseqEncoder
class CompositeEncoder(FairseqEncoder):
......
@@ -6,14 +6,11 @@
# can be found in the PATENTS file in the same directory.
import inspect
import socket
from torch.nn import parallel
from fairseq import distributed_utils
from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel
from . import BaseFairseqModel
from fairseq.models import BaseFairseqModel
def DistributedFairseqModel(args, model):
......
@@ -5,7 +5,7 @@
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from . import FairseqDecoder
from fairseq.models import FairseqDecoder
class FairseqIncrementalDecoder(FairseqDecoder):
@@ -25,7 +25,7 @@ class FairseqIncrementalDecoder(FairseqDecoder):
The :class:`FairseqIncrementalDecoder` interface also defines the
:func:`reorder_incremental_state` method, which is used during beam search
to select and reorder the incremental state based on the selection of beams.
To learn more about how incremental decoding works, refer to `this blog
<http://www.telesens.co/2019/04/21/understanding-incremental-decoding-in-fairseq/>`_.
"""
......
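A minimal sketch of what reorder_incremental_state amounts to (illustrative only, not fairseq's exact code): every cached buffer is re-indexed along the batch*beam dimension so the cache follows the hypotheses that survived this search step.

def reorder_incremental_state(incremental_state, new_order):
    # new_order: LongTensor of beam indices chosen at the current step
    for key, buffer in incremental_state.items():
        if buffer is not None:
            incremental_state[key] = buffer.index_select(0, new_order)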
@@ -4,14 +4,15 @@
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from typing import Dict, List, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from . import FairseqDecoder, FairseqEncoder
from fairseq.data import Dictionary
from fairseq.models import FairseqDecoder, FairseqEncoder
class BaseFairseqModel(nn.Module):
......
@@ -10,17 +10,19 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq import utils
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqModel,
register_model,
register_model_architecture,
)
from fairseq.modules import (
AdaptiveSoftmax, BeamableMM, GradMultiply, LearnedPositionalEmbedding,
LinearizedConvolution,
)
from . import (
FairseqEncoder, FairseqIncrementalDecoder, FairseqModel,
FairseqLanguageModel, register_model, register_model_architecture,
)
@register_model('fconv')
class FConvModel(FairseqModel):
@@ -111,58 +113,6 @@ class FConvModel(FairseqModel):
return FConvModel(encoder, decoder)
@register_model('fconv_lm')
class FConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if hasattr(args, 'max_target_positions') and not hasattr(args, 'tokens_per_sample'):
args.tokens_per_sample = args.max_target_positions
decoder = FConvDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.tokens_per_sample,
share_embed=False,
positional_embeddings=False,
adaptive_softmax_cutoff=(
options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
if args.criterion == 'adaptive_loss' else None
),
adaptive_softmax_dropout=args.adaptive_softmax_dropout,
)
return FConvLanguageModel(decoder)
class FConvEncoder(FairseqEncoder):
"""
Convolutional encoder consisting of `len(convolutions)` layers.
@@ -643,46 +593,6 @@ def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
return nn.utils.weight_norm(m, dim=2)
@register_model_architecture('fconv_lm', 'fconv_lm')
def base_lm_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
def fconv_lm_dauphin_wikitext103(args):
layers = '[(850, 6)] * 3'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 5)] * 4'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 4)] * 3'
layers += ' + [(1024, 4)] * 1'
layers += ' + [(2048, 4)] * 1'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
def fconv_lm_dauphin_gbw(args):
layers = '[(512, 5)]'
layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv', 'fconv')
def base_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
......
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from fairseq import options
from fairseq.models import (
FairseqLanguageModel,
register_model,
register_model_architecture,
)
from fairseq.models.fconv import FConvDecoder
@register_model('fconv_lm')
class FConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if hasattr(args, 'max_target_positions') and not hasattr(args, 'tokens_per_sample'):
args.tokens_per_sample = args.max_target_positions
decoder = FConvDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.tokens_per_sample,
share_embed=False,
positional_embeddings=False,
adaptive_softmax_cutoff=(
options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
if args.criterion == 'adaptive_loss' else None
),
adaptive_softmax_dropout=args.adaptive_softmax_dropout,
)
return FConvLanguageModel(decoder)
@register_model_architecture('fconv_lm', 'fconv_lm')
def base_lm_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
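The getattr(args, name, default) pattern used throughout these architecture functions fills in a default only when the attribute is absent, so values set on the command line (or stored in older checkpoints) win. A small illustration:

import argparse

args = argparse.Namespace(dropout=0.2)  # as if --dropout 0.2 was passed
args.dropout = getattr(args, 'dropout', 0.1)                      # stays 0.2
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)  # filled in: 128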
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
def fconv_lm_dauphin_wikitext103(args):
layers = '[(850, 6)] * 3'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 5)] * 4'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 4)] * 3'
layers += ' + [(1024, 4)] * 1'
layers += ' + [(2048, 4)] * 1'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
def fconv_lm_dauphin_gbw(args):
layers = '[(512, 5)]'
layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
base_lm_architecture(args)
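Because the moved class re-registers under the same 'fconv_lm' name, registry lookups resolve exactly as before the move; for instance (registries as defined in fairseq/models/__init__.py):

from fairseq.models import ARCH_MODEL_REGISTRY, MODEL_REGISTRY

MODEL_REGISTRY['fconv_lm']                           # FConvLanguageModel
ARCH_MODEL_REGISTRY['fconv_lm_dauphin_wikitext103']  # same class, preset args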
@@ -12,14 +12,20 @@ import torch.nn as nn
import torch.nn.functional as F
from fairseq import checkpoint_utils
from fairseq.modules import (
DownsampledMultiHeadAttention, GradMultiply, LayerNorm,
LearnedPositionalEmbedding, LinearizedConvolution,
from fairseq.models import (
CompositeEncoder,
FairseqDecoder,
FairseqEncoder,
FairseqModel,
register_model,
register_model_architecture,
)
from . import (
FairseqEncoder, CompositeEncoder, FairseqDecoder, FairseqModel,
register_model, register_model_architecture,
from fairseq.modules import (
DownsampledMultiHeadAttention,
GradMultiply,
LayerNorm,
LearnedPositionalEmbedding,
LinearizedConvolution,
)
......
@@ -12,15 +12,21 @@ import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.modules import (
AdaptiveInput, AdaptiveSoftmax, CharacterTokenEmbedder, LayerNorm,
LearnedPositionalEmbedding, MultiheadAttention, SinusoidalPositionalEmbedding,
DynamicConv1dTBC, LightweightConv1dTBC,
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqModel,
register_model,
register_model_architecture,
)
from . import (
FairseqIncrementalDecoder, FairseqEncoder, FairseqLanguageModel,
FairseqModel, register_model, register_model_architecture,
from fairseq.modules import (
AdaptiveSoftmax,
DynamicConv1dTBC,
LayerNorm,
LearnedPositionalEmbedding,
LightweightConv1dTBC,
MultiheadAttention,
SinusoidalPositionalEmbedding,
)
@@ -171,117 +177,6 @@ class LightConvModel(FairseqModel):
return LightConvModel(encoder, decoder)
@register_model('lightconv_lm')
class LightConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--input-dropout', type=float, metavar='D',
help='dropout probability of the inputs')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension')
parser.add_argument('--decoder-input-dim', type=int, metavar='N',
help='decoder input dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads or LightConv/DynamicConv heads')
parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                            help='adaptive softmax factor')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--character-embeddings', default=False, action='store_true',
help='if set, uses character embedding convolutions to produce token embeddings')
parser.add_argument('--character-filters', type=str, metavar='LIST',
default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
help='size of character embeddings')
parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
help='size of character embeddings')
parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
                            help='number of highway layers for character token embedder')
parser.add_argument('--adaptive-input', default=False, action='store_true',
help='if set, uses adaptive input')
parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
help='comma separated list of adaptive input cutoff points.')
parser.add_argument('--tie-adaptive-weights', action='store_true',
help='if set, ties the weights of adaptive softmax and adaptive input')
parser.add_argument('--tie-adaptive-proj', action='store_true',
help='if set, ties the projection weights of adaptive softmax and adaptive input')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
"""LightConv and DynamicConv arguments"""
parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
help='list of kernel size (default: "[3,7,15,31,31,31]")')
parser.add_argument('--decoder-glu', type=options.eval_bool,
help='glu after in proj')
parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
choices=['dynamic', 'lightweight'],
help='type of convolution')
parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
parser.add_argument('--weight-dropout', type=float, metavar='D',
help='dropout probability for conv weights')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = args.tokens_per_sample
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = args.tokens_per_sample
if args.character_embeddings:
embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters),
args.character_embedding_dim,
args.decoder_embed_dim,
args.char_embedder_highway_layers,
)
elif args.adaptive_input:
embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
args.adaptive_input_factor, args.decoder_embed_dim,
options.eval_str_list(args.adaptive_input_cutoff, type=int))
else:
embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
if args.tie_adaptive_weights:
assert args.adaptive_input
assert args.adaptive_input_factor == args.adaptive_softmax_factor
assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
assert args.decoder_input_dim == args.decoder_output_dim
decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False)
return LightConvLanguageModel(decoder)
class LightConvEncoder(FairseqEncoder):
"""
LightConv encoder consisting of *args.encoder_layers* layers. Each layer
@@ -786,47 +681,6 @@ def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, learned=Fals
return m
@register_model_architecture('lightconv_lm', 'lightconv_lm')
def base_lm_architecture(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
args.character_embeddings = getattr(args, 'character_embeddings', False)
args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
# The model training is not stable without this
args.decoder_normalize_before = True
args.adaptive_input = getattr(args, 'adaptive_input', False)
args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
if len(args.decoder_kernel_size_list) == 1:
args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers
@register_model_architecture('lightconv_lm', 'lightconv_lm_gbw')
def lightconv_lm_gbw(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
base_lm_architecture(args)
@register_model_architecture('lightconv', 'lightconv')
def base_architecture(args):
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
......
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from fairseq import options
from fairseq.models import (
FairseqLanguageModel,
register_model,
register_model_architecture,
)
from fairseq.models.lightconv import (
Embedding,
LightConvDecoder,
)
from fairseq.modules import (
AdaptiveInput,
CharacterTokenEmbedder,
)
@register_model('lightconv_lm')
class LightConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--input-dropout', type=float, metavar='D',
help='dropout probability of the inputs')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension')
parser.add_argument('--decoder-input-dim', type=int, metavar='N',
help='decoder input dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads or LightConv/DynamicConv heads')
parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                            help='adaptive softmax factor')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--character-embeddings', default=False, action='store_true',
help='if set, uses character embedding convolutions to produce token embeddings')
parser.add_argument('--character-filters', type=str, metavar='LIST',
default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
help='size of character embeddings')
parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
help='size of character embeddings')
parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
                            help='number of highway layers for character token embedder')
parser.add_argument('--adaptive-input', default=False, action='store_true',
help='if set, uses adaptive input')
parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
help='comma separated list of adaptive input cutoff points.')
parser.add_argument('--tie-adaptive-weights', action='store_true',
help='if set, ties the weights of adaptive softmax and adaptive input')
parser.add_argument('--tie-adaptive-proj', action='store_true',
help='if set, ties the projection weights of adaptive softmax and adaptive input')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
"""LightConv and DynamicConv arguments"""
parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
help='list of kernel size (default: "[3,7,15,31,31,31]")')
parser.add_argument('--decoder-glu', type=options.eval_bool,
help='glu after in proj')
parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
choices=['dynamic', 'lightweight'],
help='type of convolution')
parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
parser.add_argument('--weight-dropout', type=float, metavar='D',
help='dropout probability for conv weights')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = args.tokens_per_sample
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = args.tokens_per_sample
if args.character_embeddings:
embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters),
args.character_embedding_dim,
args.decoder_embed_dim,
args.char_embedder_highway_layers,
)
elif args.adaptive_input:
embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
args.adaptive_input_factor, args.decoder_embed_dim,
options.eval_str_list(args.adaptive_input_cutoff, type=int))
else:
embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
if args.tie_adaptive_weights:
assert args.adaptive_input
assert args.adaptive_input_factor == args.adaptive_softmax_factor
assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
assert args.decoder_input_dim == args.decoder_output_dim
decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False)
return LightConvLanguageModel(decoder)
@register_model_architecture('lightconv_lm', 'lightconv_lm')
def base_lm_architecture(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
args.character_embeddings = getattr(args, 'character_embeddings', False)
args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
# The model training is not stable without this
args.decoder_normalize_before = True
args.adaptive_input = getattr(args, 'adaptive_input', False)
args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
if len(args.decoder_kernel_size_list) == 1:
args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers
@register_model_architecture('lightconv_lm', 'lightconv_lm_gbw')
def lightconv_lm_gbw(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
base_lm_architecture(args)
@@ -10,11 +10,14 @@ import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.modules import AdaptiveSoftmax
from . import (
FairseqEncoder, FairseqIncrementalDecoder, FairseqModel, register_model,
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqModel,
register_model,
register_model_architecture,
)
from fairseq.modules import AdaptiveSoftmax
@register_model('lstm')
@@ -299,7 +302,7 @@ class AttentionLayer(nn.Module):
# sum weighted sources
x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
x = F.tanh(self.output_proj(torch.cat((x, input), dim=1)))
x = torch.tanh(self.output_proj(torch.cat((x, input), dim=1)))
return x, attn_scores
......
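torch.tanh is the supported tensor-level op; F.tanh (torch.nn.functional.tanh) was deprecated in then-current PyTorch releases and only forwards to it with a warning. The two are numerically identical:

import torch
import torch.nn.functional as F

x = torch.randn(4)
assert torch.allclose(torch.tanh(x), F.tanh(x))  # F.tanh merely emits a deprecation warning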
@@ -9,8 +9,11 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from . import (
BaseFairseqModel, FairseqEncoder, register_model, register_model_architecture,
from fairseq.models import (
BaseFairseqModel,
FairseqEncoder,
register_model,
register_model_architecture,
)
from fairseq.modules import (
SinusoidalPositionalEmbedding,
......
@@ -8,17 +8,19 @@
from collections import OrderedDict
from fairseq import utils
from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
from . import FairseqMultiModel, register_model, register_model_architecture
from .transformer import (
from fairseq.models import (
FairseqMultiModel,
register_model,
register_model_architecture,
)
from fairseq.models.transformer import (
base_architecture,
Embedding,
TransformerModel,
TransformerEncoder,
TransformerDecoder,
)
from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
@register_model('multilingual_transformer')
......
@@ -12,14 +12,19 @@ import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.modules import (
AdaptiveInput, AdaptiveSoftmax, CharacterTokenEmbedder, LayerNorm,
MultiheadAttention, PositionalEmbedding, SinusoidalPositionalEmbedding,
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqModel,
register_model,
register_model_architecture,
)
from . import (
FairseqIncrementalDecoder, FairseqEncoder, FairseqLanguageModel,
FairseqModel, register_model, register_model_architecture,
from fairseq.modules import (
AdaptiveSoftmax,
LayerNorm,
MultiheadAttention,
PositionalEmbedding,
SinusoidalPositionalEmbedding,
)
@@ -149,113 +154,6 @@ class TransformerModel(FairseqModel):
return TransformerModel(encoder, decoder)
@register_model('transformer_lm')
class TransformerLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
# fmt: off
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension')
parser.add_argument('--decoder-input-dim', type=int, metavar='N',
help='decoder input dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                            help='adaptive softmax factor')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--character-embeddings', default=False, action='store_true',
help='if set, uses character embedding convolutions to produce token embeddings')
parser.add_argument('--character-filters', type=str, metavar='LIST',
default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
help='size of character embeddings')
parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
help='size of character embeddings')
parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
                            help='number of highway layers for character token embedder')
parser.add_argument('--adaptive-input', action='store_true',
help='if set, uses adaptive input')
parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
help='comma separated list of adaptive input cutoff points.')
parser.add_argument('--tie-adaptive-weights', action='store_true',
help='if set, ties the weights of adaptive softmax and adaptive input')
parser.add_argument('--tie-adaptive-proj', action='store_true',
help='if set, ties the projection weights of adaptive softmax and adaptive input')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
# fmt: on
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if hasattr(args, 'no_tie_adaptive_proj') and args.no_tie_adaptive_proj is False:
# backward compatibility
args.tie_adaptive_proj = True
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = args.tokens_per_sample
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = args.tokens_per_sample
if args.character_embeddings:
embed_tokens = CharacterTokenEmbedder(
task.dictionary, eval(args.character_filters),
args.character_embedding_dim, args.decoder_embed_dim,
args.char_embedder_highway_layers,
)
elif args.adaptive_input:
embed_tokens = AdaptiveInput(
len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
args.adaptive_input_factor, args.decoder_embed_dim,
options.eval_str_list(args.adaptive_input_cutoff, type=int),
)
else:
embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
if args.tie_adaptive_weights:
assert args.adaptive_input
assert args.adaptive_input_factor == args.adaptive_softmax_factor
assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
assert args.decoder_input_dim == args.decoder_output_dim
decoder = TransformerDecoder(
args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False,
)
return TransformerLanguageModel(decoder)
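The adaptive-softmax and adaptive-input cutoffs arrive as comma-separated strings and are parsed with options.eval_str_list (a real fairseq helper; the inputs below are illustrative):

from fairseq import options

options.eval_str_list('20000,60000', type=int)   # -> [20000, 60000]
options.eval_str_list(None, type=int)            # -> None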
class TransformerEncoder(FairseqEncoder):
"""
Transformer encoder consisting of *args.encoder_layers* layers. Each layer
@@ -804,67 +702,6 @@ def Linear(in_features, out_features, bias=True):
return m
@register_model_architecture('transformer_lm', 'transformer_lm')
def base_lm_architecture(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
args.activation_fn = getattr(args, 'activation_fn', 'relu')
args.add_bos_token = getattr(args, 'add_bos_token', False)
args.character_embeddings = getattr(args, 'character_embeddings', False)
args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
# The model training is not stable without this
args.decoder_normalize_before = True
args.adaptive_input = getattr(args, 'adaptive_input', False)
args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
@register_model_architecture('transformer_lm', 'transformer_lm_big')
def transformer_lm_big(args):
args.decoder_layers = getattr(args, 'decoder_layers', 12)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
def transformer_lm_wiki103(args):
args.decoder_layers = getattr(args, 'decoder_layers', 16)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.dropout = getattr(args, 'dropout', 0.3)
args.adaptive_input = getattr(args, 'adaptive_input', True)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', True)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', '20000,60000')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '20000,60000')
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0.2)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.activation_dropout = getattr(args, 'activation_dropout', 0.1)
transformer_lm_big(args)
@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
def transformer_lm_gbw(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
transformer_lm_big(args)
@register_model_architecture('transformer', 'transformer')
def base_architecture(args):
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
......