"docs/vscode:/vscode.git/clone" did not exist on "57f7d2593427bd2cbf7d15d32844cfc5f7717d3f"
Commit f2563c21 authored by Myle Ott, committed by Facebook GitHub Bot

Cleanup LM + Flake8

Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/720

Differential Revision: D15259091

Pulled By: myleott

fbshipit-source-id: 06a35996c06ccddb49fdc9e01e348ff3c9da334e
parent eddcdf08
@@ -47,4 +47,5 @@ __all__ = [
     'TokenBlockDataset',
     'TransformEosDataset',
     'TransformEosLangPairDataset',
+    'TruncatedDictionary',
 ]
@@ -10,6 +10,7 @@ import os
 import numpy as np
 from collections import Iterable
 
+
 def infer_language_pair(path):
     """Infer language pair from filename: <split>.<lang1>-<lang2>.(...).idx"""
     src, dst = None, None
@@ -182,7 +183,7 @@ def batch_by_size(
 def process_bpe_symbol(sentence: str, bpe_symbol: str):
     if bpe_symbol == 'sentencepiece':
-        sentence = sentence.replace(' ','').replace('\u2581', ' ').strip()
+        sentence = sentence.replace(' ', '').replace('\u2581', ' ').strip()
     elif bpe_symbol is not None:
         sentence = (sentence + ' ').replace(bpe_symbol, '').rstrip()
     return sentence
@@ -18,6 +18,7 @@ from fairseq.data import data_utils
 
 class Dictionary(object):
     """A mapping from symbols to consecutive integers"""
+
     def __init__(self, pad='<pad>', eos='</s>', unk='<unk>', bos='<s>'):
         self.unk_word, self.pad_word, self.eos_word = unk, pad, eos
         self.symbols = []
@@ -282,6 +283,7 @@ class Dictionary(object):
         else:
             merge_result(Dictionary._add_file_to_dictionary_single_worker(filename, tokenize, dict.eos_word))
 
+
 class TruncatedDictionary(object):
 
     def __init__(self, wrapped_dict, length):
...
@@ -7,8 +7,6 @@
 import itertools
 import math
-import queue
-import threading
 
 import numpy as np
 import torch
...
@@ -8,8 +8,6 @@
 import numpy as np
 import torch
 
-from fairseq import utils
-
 from . import data_utils, FairseqDataset
...
@@ -10,7 +10,7 @@ import math
 import numpy as np
 import torch
 
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Tuple
 
 from . import FairseqDataset, data_utils
...
@@ -9,19 +9,33 @@ import argparse
 import importlib
 import os
 
-from .fairseq_decoder import FairseqDecoder  # noqa: F401
-from .fairseq_encoder import FairseqEncoder  # noqa: F401
-from .fairseq_incremental_decoder import FairseqIncrementalDecoder  # noqa: F401
+from .fairseq_decoder import FairseqDecoder
+from .fairseq_encoder import FairseqEncoder
+from .fairseq_incremental_decoder import FairseqIncrementalDecoder
 from .fairseq_model import (
     BaseFairseqModel,
-    FairseqModel,  # noqa: F401
-    FairseqMultiModel,  # noqa: F401
-    FairseqLanguageModel,  # noqa: F401
-    FairseqEncoderModel,  # noqa: F401
+    FairseqModel,
+    FairseqMultiModel,
+    FairseqLanguageModel,
+    FairseqEncoderModel,
 )
-from .composite_encoder import CompositeEncoder  # noqa: F401
-from .distributed_fairseq_model import DistributedFairseqModel  # noqa: F401
+from .composite_encoder import CompositeEncoder
+from .distributed_fairseq_model import DistributedFairseqModel
+
+__all__ = [
+    'BaseFairseqModel',
+    'CompositeEncoder',
+    'DistributedFairseqModel',
+    'FairseqDecoder',
+    'FairseqEncoder',
+    'FairseqEncoderModel',
+    'FairseqIncrementalDecoder',
+    'FairseqLanguageModel',
+    'FairseqModel',
+    'FairseqMultiModel',
+]
 
 MODEL_REGISTRY = {}
...
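For context: the `__all__` list and de-noqa'd imports above make fairseq.models the canonical place to import these names from, and the hunks below convert each model file to that convention. A minimal sketch of a model registered under the new style; MyLanguageModel and 'my_lm' are hypothetical names, while the decorators and base class are the real ones re-exported above:

    from fairseq.models import (
        FairseqLanguageModel,
        register_model,
        register_model_architecture,
    )


    @register_model('my_lm')
    class MyLanguageModel(FairseqLanguageModel):

        @classmethod
        def build_model(cls, args, task):
            # build a decoder for the task and wrap it in cls(decoder),
            # as the LM classes in this diff do
            raise NotImplementedError


    @register_model_architecture('my_lm', 'my_lm_base')
    def my_lm_base(args):
        # set a default only where the user supplied nothing
        args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)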
@@ -5,7 +5,7 @@
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.
 
-from . import FairseqEncoder
+from fairseq.models import FairseqEncoder
 
 
 class CompositeEncoder(FairseqEncoder):
...
@@ -6,14 +6,11 @@
 # can be found in the PATENTS file in the same directory.
 
 import inspect
-import socket
 
 from torch.nn import parallel
 
-from fairseq import distributed_utils
 from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel
-
-from . import BaseFairseqModel
+from fairseq.models import BaseFairseqModel
 
 
 def DistributedFairseqModel(args, model):
...
@@ -5,7 +5,7 @@
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.
 
-from . import FairseqDecoder
+from fairseq.models import FairseqDecoder
 
 
 class FairseqIncrementalDecoder(FairseqDecoder):
...
@@ -4,14 +4,15 @@
 # This source code is licensed under the license found in the LICENSE file in
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.
 
 from typing import Dict, List, Optional
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from . import FairseqDecoder, FairseqEncoder
 from fairseq.data import Dictionary
+from fairseq.models import FairseqDecoder, FairseqEncoder
 
 
 class BaseFairseqModel(nn.Module):
...
@@ -10,17 +10,19 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from fairseq import options, utils
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqModel,
+    register_model,
+    register_model_architecture,
+)
 from fairseq.modules import (
     AdaptiveSoftmax, BeamableMM, GradMultiply, LearnedPositionalEmbedding,
     LinearizedConvolution,
 )
-from . import (
-    FairseqEncoder, FairseqIncrementalDecoder, FairseqModel,
-    FairseqLanguageModel, register_model, register_model_architecture,
-)
 
 
 @register_model('fconv')
 class FConvModel(FairseqModel):
@@ -111,58 +113,6 @@ class FConvModel(FairseqModel):
         return FConvModel(encoder, decoder)
 
-
-@register_model('fconv_lm')
-class FConvLanguageModel(FairseqLanguageModel):
-    def __init__(self, decoder):
-        super().__init__(decoder)
-
-    @staticmethod
-    def add_args(parser):
-        """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', type=float, metavar='D',
-                            help='dropout probability')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension')
-        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
-                            help='decoder layers [(dim, kernel_size), ...]')
-        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
-                            help='decoder output embedding dimension')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive softmax cutoff points. '
-                                 'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
-                            help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
-                            help='decoder attention [True, ...]')
-
-    @classmethod
-    def build_model(cls, args, task):
-        """Build a new model instance."""
-        # make sure all arguments are present in older models
-        base_lm_architecture(args)
-
-        if hasattr(args, 'max_target_positions') and not hasattr(args, 'tokens_per_sample'):
-            args.tokens_per_sample = args.max_target_positions
-
-        decoder = FConvDecoder(
-            dictionary=task.target_dictionary,
-            embed_dim=args.decoder_embed_dim,
-            convolutions=eval(args.decoder_layers),
-            out_embed_dim=args.decoder_embed_dim,
-            attention=eval(args.decoder_attention),
-            dropout=args.dropout,
-            max_positions=args.tokens_per_sample,
-            share_embed=False,
-            positional_embeddings=False,
-            adaptive_softmax_cutoff=(
-                options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
-                if args.criterion == 'adaptive_loss' else None
-            ),
-            adaptive_softmax_dropout=args.adaptive_softmax_dropout,
-        )
-        return FConvLanguageModel(decoder)
-
 class FConvEncoder(FairseqEncoder):
     """
     Convolutional encoder consisting of `len(convolutions)` layers.
...
@@ -643,46 +593,6 @@ def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
     return nn.utils.weight_norm(m, dim=2)
 
-
-@register_model_architecture('fconv_lm', 'fconv_lm')
-def base_lm_architecture(args):
-    args.dropout = getattr(args, 'dropout', 0.1)
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
-    args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
-    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
-    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
-
-
-@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
-def fconv_lm_dauphin_wikitext103(args):
-    layers = '[(850, 6)] * 3'
-    layers += ' + [(850, 1)] * 1'
-    layers += ' + [(850, 5)] * 4'
-    layers += ' + [(850, 1)] * 1'
-    layers += ' + [(850, 4)] * 3'
-    layers += ' + [(1024, 4)] * 1'
-    layers += ' + [(2048, 4)] * 1'
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
-    args.decoder_layers = getattr(args, 'decoder_layers', layers)
-    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
-    base_lm_architecture(args)
-
-
-@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
-def fconv_lm_dauphin_gbw(args):
-    layers = '[(512, 5)]'
-    layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
-    layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
-    layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
-    layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
-    args.decoder_layers = getattr(args, 'decoder_layers', layers)
-    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
-    base_lm_architecture(args)
-
 @register_model_architecture('fconv', 'fconv')
 def base_architecture(args):
     args.dropout = getattr(args, 'dropout', 0.1)
...
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

from fairseq import options
from fairseq.models import (
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.fconv import FConvDecoder


@register_model('fconv_lm')
class FConvLanguageModel(FairseqLanguageModel):
    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
                            help='decoder layers [(dim, kernel_size), ...]')
        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
                            help='decoder output embedding dimension')
        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive softmax cutoff points. '
                                 'Must be used with adaptive_loss criterion')
        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                            help='sets adaptive softmax dropout for the tail projections')
        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
                            help='decoder attention [True, ...]')

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if hasattr(args, 'max_target_positions') and not hasattr(args, 'tokens_per_sample'):
            args.tokens_per_sample = args.max_target_positions

        decoder = FConvDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            convolutions=eval(args.decoder_layers),
            out_embed_dim=args.decoder_embed_dim,
            attention=eval(args.decoder_attention),
            dropout=args.dropout,
            max_positions=args.tokens_per_sample,
            share_embed=False,
            positional_embeddings=False,
            adaptive_softmax_cutoff=(
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == 'adaptive_loss' else None
            ),
            adaptive_softmax_dropout=args.adaptive_softmax_dropout,
        )
        return FConvLanguageModel(decoder)


@register_model_architecture('fconv_lm', 'fconv_lm')
def base_lm_architecture(args):
    args.dropout = getattr(args, 'dropout', 0.1)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
    args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)


@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
def fconv_lm_dauphin_wikitext103(args):
    layers = '[(850, 6)] * 3'
    layers += ' + [(850, 1)] * 1'
    layers += ' + [(850, 5)] * 4'
    layers += ' + [(850, 1)] * 1'
    layers += ' + [(850, 4)] * 3'
    layers += ' + [(1024, 4)] * 1'
    layers += ' + [(2048, 4)] * 1'
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
    args.decoder_layers = getattr(args, 'decoder_layers', layers)
    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
    base_lm_architecture(args)


@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
def fconv_lm_dauphin_gbw(args):
    layers = '[(512, 5)]'
    layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
    layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
    layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
    layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
    args.decoder_layers = getattr(args, 'decoder_layers', layers)
    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
    base_lm_architecture(args)
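A note on the architecture functions in the new module above (which receives the FConvLanguageModel code removed from the fconv hunk earlier): each @register_model_architecture function mutates args in place via getattr(args, name, default), so user-supplied values always win and named variants like fconv_lm_dauphin_gbw layer overrides on top of base_lm_architecture. A minimal sketch of that default-filling pattern, using SimpleNamespace as a stand-in for fairseq's parsed args:

    from types import SimpleNamespace

    # stand-in for parsed args; decoder_embed_dim was set by the "user"
    args = SimpleNamespace(decoder_embed_dim=512)

    # the getattr pattern only fills values that are still missing
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
    args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')

    print(args.decoder_embed_dim)  # 512 -- the user value survives
    print(args.decoder_layers)     # default string, eval'd later in build_model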
@@ -12,14 +12,20 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from fairseq import checkpoint_utils
-from fairseq.modules import (
-    DownsampledMultiHeadAttention, GradMultiply, LayerNorm,
-    LearnedPositionalEmbedding, LinearizedConvolution,
-)
-
-from . import (
-    FairseqEncoder, CompositeEncoder, FairseqDecoder, FairseqModel,
-    register_model, register_model_architecture,
-)
+from fairseq.models import (
+    CompositeEncoder,
+    FairseqDecoder,
+    FairseqEncoder,
+    FairseqModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    DownsampledMultiHeadAttention,
+    GradMultiply,
+    LayerNorm,
+    LearnedPositionalEmbedding,
+    LinearizedConvolution,
+)
...
@@ -12,15 +12,21 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from fairseq import options, utils
-from fairseq.modules import (
-    AdaptiveInput, AdaptiveSoftmax, CharacterTokenEmbedder, LayerNorm,
-    LearnedPositionalEmbedding, MultiheadAttention, SinusoidalPositionalEmbedding,
-    DynamicConv1dTBC, LightweightConv1dTBC,
-)
-
-from . import (
-    FairseqIncrementalDecoder, FairseqEncoder, FairseqLanguageModel,
-    FairseqModel, register_model, register_model_architecture,
-)
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    DynamicConv1dTBC,
+    LayerNorm,
+    LearnedPositionalEmbedding,
+    LightweightConv1dTBC,
+    MultiheadAttention,
+    SinusoidalPositionalEmbedding,
+)
@@ -171,117 +177,6 @@ class LightConvModel(FairseqModel):
         return LightConvModel(encoder, decoder)
 
-
-@register_model('lightconv_lm')
-class LightConvLanguageModel(FairseqLanguageModel):
-    def __init__(self, decoder):
-        super().__init__(decoder)
-
-    @staticmethod
-    def add_args(parser):
-        """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
-                            help='dropout probability')
-        parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
-                            help='dropout probability for attention weights')
-        parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
-                            help='dropout probability after ReLU in FFN')
-        parser.add_argument('--input-dropout', type=float, metavar='D',
-                            help='dropout probability of the inputs')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
-                            help='decoder output dimension')
-        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
-                            help='decoder input dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
-                            help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
-                            help='num decoder attention heads or LightConv/DynamicConv heads')
-        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
-                            help='apply layernorm before each decoder block')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive softmax cutoff points. '
-                                 'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
-                            help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
-                            help='adaptive input factor')
-        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
-                            help='if set, disables positional embeddings (outside self attention)')
-        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
-                            help='share decoder input and output embeddings')
-        parser.add_argument('--character-embeddings', default=False, action='store_true',
-                            help='if set, uses character embedding convolutions to produce token embeddings')
-        parser.add_argument('--character-filters', type=str, metavar='LIST',
-                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
-                            help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
-                            help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
-                            help='number of highway layers for character token embeddder')
-        parser.add_argument('--adaptive-input', default=False, action='store_true',
-                            help='if set, uses adaptive input')
-        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
-                            help='adaptive input factor')
-        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive input cutoff points.')
-        parser.add_argument('--tie-adaptive-weights', action='store_true',
-                            help='if set, ties the weights of adaptive softmax and adaptive input')
-        parser.add_argument('--tie-adaptive-proj', action='store_true',
-                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
-        parser.add_argument('--decoder-learned-pos', action='store_true',
-                            help='use learned positional embeddings in the decoder')
-
-        """LightConv and DynamicConv arguments"""
-        parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
-                            help='list of kernel size (default: "[3,7,15,31,31,31]")')
-        parser.add_argument('--decoder-glu', type=options.eval_bool,
-                            help='glu after in proj')
-        parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
-                            choices=['dynamic', 'lightweight'],
-                            help='type of convolution')
-        parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
-        parser.add_argument('--weight-dropout', type=float, metavar='D',
-                            help='dropout probability for conv weights')
-
-    @classmethod
-    def build_model(cls, args, task):
-        """Build a new model instance."""
-
-        # make sure all arguments are present in older models
-        base_lm_architecture(args)
-
-        if not hasattr(args, 'max_source_positions'):
-            args.max_source_positions = args.tokens_per_sample
-        if not hasattr(args, 'max_target_positions'):
-            args.max_target_positions = args.tokens_per_sample
-
-        if args.character_embeddings:
-            embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters),
-                                                  args.character_embedding_dim,
-                                                  args.decoder_embed_dim,
-                                                  args.char_embedder_highway_layers,
-                                                  )
-        elif args.adaptive_input:
-            embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
-                                         args.adaptive_input_factor, args.decoder_embed_dim,
-                                         options.eval_str_list(args.adaptive_input_cutoff, type=int))
-        else:
-            embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
-
-        if args.tie_adaptive_weights:
-            assert args.adaptive_input
-            assert args.adaptive_input_factor == args.adaptive_softmax_factor
-            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
-                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
-            assert args.decoder_input_dim == args.decoder_output_dim
-
-        decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False)
-        return LightConvLanguageModel(decoder)
-
 class LightConvEncoder(FairseqEncoder):
     """
     LightConv encoder consisting of *args.encoder_layers* layers. Each layer
...
@@ -786,47 +681,6 @@ def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, learned=False):
     return m
 
-
-@register_model_architecture('lightconv_lm', 'lightconv_lm')
-def base_lm_architecture(args):
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
-    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
-    args.decoder_layers = getattr(args, 'decoder_layers', 6)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
-    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
-    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
-    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
-    args.character_embeddings = getattr(args, 'character_embeddings', False)
-    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
-    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
-    # The model training is not stable without this
-    args.decoder_normalize_before = True
-    args.adaptive_input = getattr(args, 'adaptive_input', False)
-    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
-    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
-    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
-    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
-    args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
-    if len(args.decoder_kernel_size_list) == 1:
-        args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers
-
-
-@register_model_architecture('lightconv_lm', 'lightconv_lm_gbw')
-def lightconv_lm_gbw(args):
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
-    args.dropout = getattr(args, 'dropout', 0.1)
-    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
-    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
-    base_lm_architecture(args)
-
 @register_model_architecture('lightconv', 'lightconv')
 def base_architecture(args):
     args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
...
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

from fairseq import options
from fairseq.models import (
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.lightconv import (
    Embedding,
    LightConvDecoder,
)
from fairseq.modules import (
    AdaptiveInput,
    CharacterTokenEmbedder,
)


@register_model('lightconv_lm')
class LightConvLanguageModel(FairseqLanguageModel):
    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
                            help='dropout probability for attention weights')
        parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
                            help='dropout probability after ReLU in FFN')
        parser.add_argument('--input-dropout', type=float, metavar='D',
                            help='dropout probability of the inputs')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                            help='decoder output dimension')
        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                            help='decoder input dimension')
        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension for FFN')
        parser.add_argument('--decoder-layers', type=int, metavar='N',
                            help='num decoder layers')
        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                            help='num decoder attention heads or LightConv/DynamicConv heads')
        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
                            help='apply layernorm before each decoder block')
        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive softmax cutoff points. '
                                 'Must be used with adaptive_loss criterion')
        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                            help='sets adaptive softmax dropout for the tail projections')
        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                            help='adaptive input factor')
        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                            help='if set, disables positional embeddings (outside self attention)')
        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
                            help='share decoder input and output embeddings')
        parser.add_argument('--character-embeddings', default=False, action='store_true',
                            help='if set, uses character embedding convolutions to produce token embeddings')
        parser.add_argument('--character-filters', type=str, metavar='LIST',
                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                            help='size of character embeddings')
        parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
                            help='size of character embeddings')
        parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
                            help='number of highway layers for character token embeddder')
        parser.add_argument('--adaptive-input', default=False, action='store_true',
                            help='if set, uses adaptive input')
        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                            help='adaptive input factor')
        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive input cutoff points.')
        parser.add_argument('--tie-adaptive-weights', action='store_true',
                            help='if set, ties the weights of adaptive softmax and adaptive input')
        parser.add_argument('--tie-adaptive-proj', action='store_true',
                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
        parser.add_argument('--decoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the decoder')

        """LightConv and DynamicConv arguments"""
        parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
                            help='list of kernel size (default: "[3,7,15,31,31,31]")')
        parser.add_argument('--decoder-glu', type=options.eval_bool,
                            help='glu after in proj')
        parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
                            choices=['dynamic', 'lightweight'],
                            help='type of convolution')
        parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
        parser.add_argument('--weight-dropout', type=float, metavar='D',
                            help='dropout probability for conv weights')

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if not hasattr(args, 'max_source_positions'):
            args.max_source_positions = args.tokens_per_sample
        if not hasattr(args, 'max_target_positions'):
            args.max_target_positions = args.tokens_per_sample

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters),
                                                  args.character_embedding_dim,
                                                  args.decoder_embed_dim,
                                                  args.char_embedder_highway_layers,
                                                  )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
                                         args.adaptive_input_factor, args.decoder_embed_dim,
                                         options.eval_str_list(args.adaptive_input_cutoff, type=int))
        else:
            embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())

        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False)
        return LightConvLanguageModel(decoder)


@register_model_architecture('lightconv_lm', 'lightconv_lm')
def base_lm_architecture(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
    args.decoder_layers = getattr(args, 'decoder_layers', 6)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
    args.character_embeddings = getattr(args, 'character_embeddings', False)
    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
    # The model training is not stable without this
    args.decoder_normalize_before = True
    args.adaptive_input = getattr(args, 'adaptive_input', False)
    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
    args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
    if len(args.decoder_kernel_size_list) == 1:
        args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers


@register_model_architecture('lightconv_lm', 'lightconv_lm_gbw')
def lightconv_lm_gbw(args):
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
    base_lm_architecture(args)
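One detail worth calling out from base_lm_architecture above: --decoder-kernel-size-list is parsed with options.eval_str_list, and a single-element list is broadcast to one kernel size per decoder layer. A self-contained sketch of that behavior; eval_str_list here is a simplified stand-in, not fairseq's implementation:

    import ast

    def eval_str_list(x, t=int):
        # simplified stand-in: parse a Python-literal string into a list of t
        vals = ast.literal_eval(x)
        if not isinstance(vals, (list, tuple)):
            vals = [vals]
        return [t(v) for v in vals]

    decoder_layers = 6
    kernel_sizes = eval_str_list('[3]')  # user passed a single kernel size
    if len(kernel_sizes) == 1:
        kernel_sizes = kernel_sizes * decoder_layers
    print(kernel_sizes)  # [3, 3, 3, 3, 3, 3] -- one size per decoder layer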
@@ -10,11 +10,14 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from fairseq import options, utils
-from fairseq.modules import AdaptiveSoftmax
-
-from . import (
-    FairseqEncoder, FairseqIncrementalDecoder, FairseqModel, register_model,
-    register_model_architecture,
-)
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import AdaptiveSoftmax
 
 
 @register_model('lstm')
@@ -299,7 +302,7 @@ class AttentionLayer(nn.Module):
         # sum weighted sources
         x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
 
-        x = F.tanh(self.output_proj(torch.cat((x, input), dim=1)))
+        x = torch.tanh(self.output_proj(torch.cat((x, input), dim=1)))
         return x, attn_scores
...
@@ -9,8 +9,11 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from . import (
-    BaseFairseqModel, FairseqEncoder, register_model, register_model_architecture,
-)
+from fairseq.models import (
+    BaseFairseqModel,
+    FairseqEncoder,
+    register_model,
+    register_model_architecture,
+)
 from fairseq.modules import (
     SinusoidalPositionalEmbedding,
...
@@ -8,17 +8,19 @@
 from collections import OrderedDict
 
 from fairseq import utils
-from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
-
-from . import FairseqMultiModel, register_model, register_model_architecture
-
-from .transformer import (
+from fairseq.models import (
+    FairseqMultiModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import (
     base_architecture,
     Embedding,
     TransformerModel,
     TransformerEncoder,
     TransformerDecoder,
 )
+from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
 
 
 @register_model('multilingual_transformer')
...
@@ -12,14 +12,19 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from fairseq import options, utils
-from fairseq.modules import (
-    AdaptiveInput, AdaptiveSoftmax, CharacterTokenEmbedder, LayerNorm,
-    MultiheadAttention, PositionalEmbedding, SinusoidalPositionalEmbedding,
-)
-
-from . import (
-    FairseqIncrementalDecoder, FairseqEncoder, FairseqLanguageModel,
-    FairseqModel, register_model, register_model_architecture,
-)
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    LayerNorm,
+    MultiheadAttention,
+    PositionalEmbedding,
+    SinusoidalPositionalEmbedding,
+)
@@ -149,113 +154,6 @@ class TransformerModel(FairseqModel):
         return TransformerModel(encoder, decoder)
 
-
-@register_model('transformer_lm')
-class TransformerLanguageModel(FairseqLanguageModel):
-    def __init__(self, decoder):
-        super().__init__(decoder)
-
-    @staticmethod
-    def add_args(parser):
-        """Add model-specific arguments to the parser."""
-        # fmt: off
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
-                            help='dropout probability')
-        parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
-                            help='dropout probability for attention weights')
-        parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
-                            help='dropout probability after ReLU in FFN')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
-                            help='decoder output dimension')
-        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
-                            help='decoder input dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
-                            help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
-                            help='num decoder attention heads')
-        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
-                            help='apply layernorm before each decoder block')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive softmax cutoff points. '
-                                 'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
-                            help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
-                            help='adaptive input factor')
-        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
-                            help='if set, disables positional embeddings (outside self attention)')
-        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
-                            help='share decoder input and output embeddings')
-        parser.add_argument('--character-embeddings', default=False, action='store_true',
-                            help='if set, uses character embedding convolutions to produce token embeddings')
-        parser.add_argument('--character-filters', type=str, metavar='LIST',
-                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
-                            help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
-                            help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
-                            help='number of highway layers for character token embeddder')
-        parser.add_argument('--adaptive-input', action='store_true',
-                            help='if set, uses adaptive input')
-        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
-                            help='adaptive input factor')
-        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive input cutoff points.')
-        parser.add_argument('--tie-adaptive-weights', action='store_true',
-                            help='if set, ties the weights of adaptive softmax and adaptive input')
-        parser.add_argument('--tie-adaptive-proj', action='store_true',
-                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
-        parser.add_argument('--decoder-learned-pos', action='store_true',
-                            help='use learned positional embeddings in the decoder')
-        # fmt: on
-
-    @classmethod
-    def build_model(cls, args, task):
-        """Build a new model instance."""
-
-        # make sure all arguments are present in older models
-        base_lm_architecture(args)
-
-        if hasattr(args, 'no_tie_adaptive_proj') and args.no_tie_adaptive_proj is False:
-            # backward compatibility
-            args.tie_adaptive_proj = True
-
-        if not hasattr(args, 'max_source_positions'):
-            args.max_source_positions = args.tokens_per_sample
-        if not hasattr(args, 'max_target_positions'):
-            args.max_target_positions = args.tokens_per_sample
-
-        if args.character_embeddings:
-            embed_tokens = CharacterTokenEmbedder(
-                task.dictionary, eval(args.character_filters),
-                args.character_embedding_dim, args.decoder_embed_dim,
-                args.char_embedder_highway_layers,
-            )
-        elif args.adaptive_input:
-            embed_tokens = AdaptiveInput(
-                len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
-                args.adaptive_input_factor, args.decoder_embed_dim,
-                options.eval_str_list(args.adaptive_input_cutoff, type=int),
-            )
-        else:
-            embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
-
-        if args.tie_adaptive_weights:
-            assert args.adaptive_input
-            assert args.adaptive_input_factor == args.adaptive_softmax_factor
-            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
-                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
-            assert args.decoder_input_dim == args.decoder_output_dim
-
-        decoder = TransformerDecoder(
-            args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False,
-        )
-        return TransformerLanguageModel(decoder)
-
 class TransformerEncoder(FairseqEncoder):
     """
     Transformer encoder consisting of *args.encoder_layers* layers. Each layer
...
@@ -804,67 +702,6 @@ def Linear(in_features, out_features, bias=True):
     return m
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm')
-def base_lm_architecture(args):
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
-    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
-    args.decoder_layers = getattr(args, 'decoder_layers', 6)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
-    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
-    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
-    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
-    args.activation_fn = getattr(args, 'activation_fn', 'relu')
-    args.add_bos_token = getattr(args, 'add_bos_token', False)
-    args.character_embeddings = getattr(args, 'character_embeddings', False)
-    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
-    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
-    # The model training is not stable without this
-    args.decoder_normalize_before = True
-    args.adaptive_input = getattr(args, 'adaptive_input', False)
-    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
-    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
-    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
-    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
-
-
-@register_model_architecture('transformer_lm', 'transformer_lm_big')
-def transformer_lm_big(args):
-    args.decoder_layers = getattr(args, 'decoder_layers', 12)
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
-    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
-    base_lm_architecture(args)
-
-
-@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
-def transformer_lm_wiki103(args):
-    args.decoder_layers = getattr(args, 'decoder_layers', 16)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
-    args.dropout = getattr(args, 'dropout', 0.3)
-    args.adaptive_input = getattr(args, 'adaptive_input', True)
-    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', True)
-    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', '20000,60000')
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '20000,60000')
-    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0.2)
-    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
-    args.activation_dropout = getattr(args, 'activation_dropout', 0.1)
-    transformer_lm_big(args)
-
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
-def transformer_lm_gbw(args):
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
-    args.dropout = getattr(args, 'dropout', 0.1)
-    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
-    transformer_lm_big(args)
-
 @register_model_architecture('transformer', 'transformer')
 def base_architecture(args):
     args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
...
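The removed transformer_lm architectures show the chaining idiom used throughout this commit: transformer_lm_wiki103 pins its own defaults, then delegates to transformer_lm_big, which delegates to base_lm_architecture, and each stage only fills attributes that are still unset. A compact sketch with simplified stand-ins (not the fairseq functions):

    from types import SimpleNamespace

    def base_lm(args):
        args.decoder_layers = getattr(args, 'decoder_layers', 6)

    def big(args):
        args.decoder_layers = getattr(args, 'decoder_layers', 12)
        base_lm(args)

    def wiki103(args):
        args.decoder_layers = getattr(args, 'decoder_layers', 16)
        big(args)

    args = SimpleNamespace()
    wiki103(args)
    print(args.decoder_layers)  # 16: the most specific default wins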