"git@developer.sourcefind.cn:one/TransferBench.git" did not exist on "38f89e595b56c0bbea6e993c3c3705ca502bf884"
Commit 4e1ec2d8 authored by myleott, committed by Myle Ott

Merge OSS + internal changes

parent d4816034
@@ -82,6 +82,19 @@ class Dictionary(object):
             self.count.append(n)
             return idx
 
+    def update(self, new_dict):
+        """Updates counts from new dictionary."""
+        for word in new_dict.symbols:
+            idx2 = new_dict.indices[word]
+            if word in self.indices:
+                idx = self.indices[word]
+                self.count[idx] = self.count[idx] + new_dict.count[idx2]
+            else:
+                idx = len(self.symbols)
+                self.indices[word] = idx
+                self.symbols.append(word)
+                self.count.append(new_dict.count[idx2])
+
     def finalize(self, threshold=1, nwords=-1, padding_factor=8):
         """Sort symbols by frequency in descending order, ignoring special ones.
...
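The new update() method merges another dictionary's counts into this one: counts of shared symbols are summed, unseen symbols are appended with their counts. A minimal usage sketch (the import path and the toy symbols are assumptions, not part of this commit):

from fairseq.dictionary import Dictionary  # import path assumed for this version of fairseq

d1 = Dictionary()
d1.add_symbol('hello', n=3)

d2 = Dictionary()
d2.add_symbol('hello', n=2)   # shared symbol: counts are summed -> 5
d2.add_symbol('world', n=1)   # new symbol: appended with its count

d1.update(d2)
print(d1.count[d1.indices['hello']], 'world' in d1.indices)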
@@ -51,19 +51,12 @@ class FConvModel(FairseqModel):
     @classmethod
     def build_model(cls, args, src_dict, dst_dict):
+        """Build a new model instance."""
         # make sure that all args are properly defaulted (in case there are any new ones)
         base_architecture(args)
-        """Build a new model instance."""
         if not hasattr(args, 'max_source_positions'):
             args.max_source_positions = args.max_positions
             args.max_target_positions = args.max_positions
-        if not hasattr(args, 'share_input_output_embed'):
-            args.share_input_output_embed = False
-        if not hasattr(args, 'encoder_embed_path'):
-            args.encoder_embed_path = None
-        if not hasattr(args, 'decoder_embed_path'):
-            args.decoder_embed_path = None
 
         encoder_embed_dict = None
         if args.encoder_embed_path:
@@ -464,8 +457,10 @@ def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
 @register_model_architecture('fconv', 'fconv')
 def base_architecture(args):
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
+    args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
     args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 20')
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
+    args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
     args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 20')
     args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
     args.decoder_attention = getattr(args, 'decoder_attention', 'True')
...
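Moving these defaults into base_architecture is what lets the per-call hasattr checks be dropped from build_model above: older args namespaces still get the new attributes filled in. A small sketch of the getattr-defaulting pattern (argparse.Namespace is used here only for illustration):

from argparse import Namespace

# args loaded from an older checkpoint, missing the newer options
args = Namespace(encoder_embed_dim=512, decoder_embed_dim=512)

# getattr-based defaulting, as in base_architecture: fill in only what is missing
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)

print(args.encoder_embed_path)  # None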
@@ -30,6 +30,8 @@ class LSTMModel(FairseqModel):
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-embed-path', default=None, type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
+        parser.add_argument('--encoder-hidden-size', type=int, metavar='N',
+                            help='encoder hidden size')
         parser.add_argument('--encoder-layers', type=int, metavar='N',
                             help='number of encoder layers')
         parser.add_argument('--encoder-bidirectional', action='store_true',
@@ -38,6 +40,8 @@ class LSTMModel(FairseqModel):
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-embed-path', default=None, type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
+                            help='decoder hidden size')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='number of decoder layers')
         parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
@@ -57,29 +61,31 @@ class LSTMModel(FairseqModel):
     @classmethod
     def build_model(cls, args, src_dict, dst_dict):
+        """Build a new model instance."""
         # make sure that all args are properly defaulted (in case there are any new ones)
         base_architecture(args)
-        """Build a new model instance."""
-        if not hasattr(args, 'encoder_embed_path'):
-            args.encoder_embed_path = None
-        if not hasattr(args, 'decoder_embed_path'):
-            args.decoder_embed_path = None
 
-        encoder_embed_dict = None
+        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+            embed_dict = utils.parse_embedding(embed_path)
+            utils.print_embed_overlap(embed_dict, dictionary)
+            return utils.load_embedding(embed_dict, dictionary, embed_tokens)
+
+        pretrained_encoder_embed = None
         if args.encoder_embed_path:
-            encoder_embed_dict = utils.parse_embedding(args.encoder_embed_path)
-            utils.print_embed_overlap(encoder_embed_dict, src_dict)
-
-        decoder_embed_dict = None
+            pretrained_encoder_embed = load_pretrained_embedding_from_file(
+                args.encoder_embed_path, src_dict, args.encoder_embed_dim)
+        pretrained_decoder_embed = None
         if args.decoder_embed_path:
-            decoder_embed_dict = utils.parse_embedding(args.decoder_embed_path)
-            utils.print_embed_overlap(decoder_embed_dict, dst_dict)
+            pretrained_decoder_embed = load_pretrained_embedding_from_file(
+                args.decoder_embed_path, dst_dict, args.decoder_embed_dim)
 
         encoder = LSTMEncoder(
             dictionary=src_dict,
             embed_dim=args.encoder_embed_dim,
-            embed_dict=encoder_embed_dict,
+            hidden_size=args.encoder_hidden_size,
             num_layers=args.encoder_layers,
             dropout_in=args.encoder_dropout_in,
             dropout_out=args.encoder_dropout_out,
@@ -93,7 +99,7 @@ class LSTMModel(FairseqModel):
         decoder = LSTMDecoder(
             dictionary=dst_dict,
             embed_dim=args.decoder_embed_dim,
-            embed_dict=decoder_embed_dict,
+            hidden_size=args.decoder_hidden_size,
             out_embed_dim=args.decoder_out_embed_dim,
             num_layers=args.decoder_layers,
             dropout_in=args.decoder_dropout_in,
@@ -108,8 +114,13 @@ class LSTMModel(FairseqModel):
 class LSTMEncoder(FairseqEncoder):
     """LSTM encoder."""
-    def __init__(self, dictionary, embed_dim=512, embed_dict=None,
-                 num_layers=1, dropout_in=0.1, dropout_out=0.1):
+    def __init__(
+        self, dictionary, embed_dim=512, hidden_size=512, num_layers=1,
+        dropout_in=0.1, dropout_out=0.1, bidirectional=False,
+        left_pad_source=LanguagePairDataset.LEFT_PAD_SOURCE,
+        pretrained_embed=None,
+        padding_value=0.,
+    ):
         super().__init__(dictionary)
         self.num_layers = num_layers
         self.dropout_in = dropout_in
@@ -119,9 +130,10 @@ class LSTMEncoder(FairseqEncoder):
         num_embeddings = len(dictionary)
         self.padding_idx = dictionary.pad()
-        self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
-        if embed_dict:
-            self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)
+        if pretrained_embed is None:
+            self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
+        else:
+            self.embed_tokens = pretrained_embed
 
         self.lstm = LSTM(
             input_size=embed_dim,
@@ -236,10 +248,12 @@ class AttentionLayer(nn.Module):
 class LSTMDecoder(FairseqIncrementalDecoder):
     """LSTM decoder."""
-    def __init__(self, dictionary, encoder_embed_dim=512,
-                 embed_dim=512, embed_dict=None,
-                 out_embed_dim=512, num_layers=1, dropout_in=0.1,
-                 dropout_out=0.1, attention=True):
+    def __init__(
+        self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
+        num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
+        encoder_embed_dim=512, encoder_output_units=512,
+        pretrained_embed=None,
+    ):
         super().__init__(dictionary)
         self.dropout_in = dropout_in
         self.dropout_out = dropout_out
@@ -247,9 +261,15 @@ class LSTMDecoder(FairseqIncrementalDecoder):
         num_embeddings = len(dictionary)
         padding_idx = dictionary.pad()
-        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
-        if embed_dict:
-            self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)
+        if pretrained_embed is None:
+            self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+        else:
+            self.embed_tokens = pretrained_embed
+        self.encoder_output_units = encoder_output_units
+        assert encoder_output_units == hidden_size, \
+            '{} {}'.format(encoder_output_units, hidden_size)
+        # TODO another Linear layer if not equal
 
         self.layers = nn.ModuleList([
             LSTMCell(
@@ -408,13 +428,15 @@ def Linear(in_features, out_features, bias=True, dropout=0):
 @register_model_architecture('lstm', 'lstm')
 def base_architecture(args):
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
-    args.encoder_hidden_size = getattr(args, 'encoder_hidden_size', 512)
+    args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
+    args.encoder_hidden_size = getattr(args, 'encoder_hidden_size', args.encoder_embed_dim)
     args.encoder_layers = getattr(args, 'encoder_layers', 1)
     args.encoder_bidirectional = getattr(args, 'encoder_bidirectional', False)
     args.encoder_dropout_in = getattr(args, 'encoder_dropout_in', args.dropout)
     args.encoder_dropout_out = getattr(args, 'encoder_dropout_out', args.dropout)
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
-    args.decoder_hidden_size = getattr(args, 'decoder_hidden_size', 512)
+    args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
+    args.decoder_hidden_size = getattr(args, 'decoder_hidden_size', args.decoder_embed_dim)
     args.decoder_layers = getattr(args, 'decoder_layers', 1)
     args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
     args.decoder_attention = getattr(args, 'decoder_attention', '1')
...
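Taken together, the LSTM changes decouple the hidden size from the embedding size and pass a pre-built embedding module (pretrained_embed) into the encoder and decoder instead of a raw embed_dict. A rough sketch of the new encoder wiring, mirroring build_model above (args and src_dict stand in for a real argument namespace and Dictionary):

pretrained_encoder_embed = None
if args.encoder_embed_path:
    # helper defined inside build_model in this commit
    pretrained_encoder_embed = load_pretrained_embedding_from_file(
        args.encoder_embed_path, src_dict, args.encoder_embed_dim)

encoder = LSTMEncoder(
    dictionary=src_dict,
    embed_dim=args.encoder_embed_dim,
    hidden_size=args.encoder_hidden_size,       # defaults to encoder_embed_dim via base_architecture
    num_layers=args.encoder_layers,
    bidirectional=args.encoder_bidirectional,
    pretrained_embed=pretrained_encoder_embed,  # an nn.Embedding with loaded vectors, or None
)

Note that LSTMDecoder currently asserts encoder_output_units == hidden_size; the TODO in the diff points at adding a projection layer when they differ.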
@@ -275,7 +275,7 @@ def parse_embedding(embed_path):
     the -0.0230 -0.0264 0.0287 0.0171 0.1403
     at -0.0395 -0.1286 0.0275 0.0254 -0.0932
     """
-    embed_dict = dict()
+    embed_dict = {}
     with open(embed_path) as f_embed:
         _ = next(f_embed)  # skip header
         for line in f_embed:
@@ -353,7 +353,6 @@ def buffered_arange(max):
 def convert_padding_direction(
     src_tokens,
-    src_lengths,
     padding_idx,
     right_to_left=False,
     left_to_right=False,
...
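parse_embedding reads a word2vec-style text file: a header line that it skips, then one token and its vector per line, as in the docstring excerpt above. A hedged usage sketch (the file name is made up):

from fairseq import utils

# embeddings.txt (header line, then "token v1 v2 ..."):
#   2 5
#   the -0.0230 -0.0264 0.0287 0.0171 0.1403
#   at -0.0395 -0.1286 0.0275 0.0254 -0.0932
embed_dict = utils.parse_embedding('embeddings.txt')  # dict: token -> tensor
print(len(embed_dict), embed_dict['the'])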