Unverified commit 66415206, authored by Myle Ott and committed by GitHub

fairseq-py goes distributed (#106)

This PR includes breaking API changes to modularize fairseq-py and adds support for distributed training across multiple nodes.

Changes:
- c7033ef: add support for distributed training! See updated README for usage.
- e016299: modularize fairseq-py, adding support for register_model, register_criterion, register_optimizer, etc. (see the sketch after this list)
- 154e440: update the LSTM implementation to use PackedSequence objects in the encoder, which better follows best practices and improves performance
- 90c2973 and 1da6265: improve unit test coverage
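
A minimal sketch of the new registry API, modeled on the register_optimizer usage visible in the diff below. The criterion interface is an assumption inferred from how Trainer invokes criterions in this PR (loss, sample_size, logging_output = criterion(model, sample)); the name 'my_criterion' and the --my-weight flag are hypothetical.

from fairseq.criterions import FairseqCriterion, register_criterion

@register_criterion('my_criterion')  # hypothetical criterion name
class MyCriterion(FairseqCriterion):

    @staticmethod
    def add_args(parser):
        # picked up by options.parse_args_and_arch on its second pass
        parser.add_argument('--my-weight', default=1.0, type=float,
                            help='hypothetical criterion-specific flag')

    def forward(self, model, sample):
        # assumed contract, mirroring Trainer._forward in this diff;
        # grad_denom/aggregate_logging_outputs are assumed inherited
        net_output = model(**sample['net_input'])
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        # simple NLL over the gold targets (padding ignored for brevity)
        loss = -lprobs.gather(dim=-1, index=sample['target'].unsqueeze(-1)).sum()
        return loss, sample['ntokens'], {
            'loss': loss.data[0],
            'ntokens': sample['ntokens'],
            'nsentences': sample['target'].size(0),
        }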
parent 7e86e30c
......@@ -4,10 +4,32 @@
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from torch.optim.optimizer import Optimizer, required
from . import FairseqOptimizer, register_optimizer
@register_optimizer('nag')
class FairseqNAG(FairseqOptimizer):
def __init__(self, args, params):
super().__init__(args, params)
self._optimizer = NAG(params, **self.optimizer_config)
@property
def optimizer_config(self):
"""
Return a kwarg dictionary that will be used to override optimizer
args stored in checkpoints. This allows us to load a checkpoint and
resume training using a different set of optimizer args, e.g., with a
different learning rate.
"""
return {
'lr': self.args.lr[0],
'momentum': self.args.momentum,
'weight_decay': self.args.weight_decay,
}
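
The property above is what makes checkpoint resumption flexible; a hedged sketch of the flow, using optim.build_optimizer as it appears later in this diff (Trainer.__init__), with args, model and ckpt assumed to come from the surrounding training script:

from fairseq import optim

args.lr = [0.1]  # new command-line value, overriding the checkpointed lr
optimizer = optim.build_optimizer(args, model.parameters())
optimizer.load_state_dict(ckpt['last_optimizer_state'])
# per the docstring above, optimizer_config supplies lr/momentum/weight_decay,
# so training resumes with lr=0.1 regardless of what the checkpoint stored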
class NAG(Optimizer):
def __init__(self, params, lr=required, momentum=0, weight_decay=0):
......
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch.optim
from . import FairseqOptimizer, register_optimizer
@register_optimizer('sgd')
class SGD(FairseqOptimizer):
def __init__(self, args, params):
super().__init__(args, params)
self._optimizer = torch.optim.SGD(params, **self.optimizer_config)
@property
def optimizer_config(self):
"""
Return a kwarg dictionary that will be used to override optimizer
args stored in checkpoints. This allows us to load a checkpoint and
resume training using a different set of optimizer args, e.g., with a
different learning rate.
"""
return {
'lr': self.args.lr[0],
'momentum': self.args.momentum,
'weight_decay': self.args.weight_decay,
}
......@@ -4,12 +4,64 @@
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import argparse
from fairseq import models
from fairseq.multiprocessing_trainer import MultiprocessingTrainer
from fairseq.criterions import CRITERION_REGISTRY
from fairseq.models import ARCH_MODEL_REGISTRY, ARCH_CONFIG_REGISTRY
from fairseq.optim import OPTIMIZER_REGISTRY
from fairseq.optim.lr_scheduler import LR_SCHEDULER_REGISTRY
def get_training_parser():
parser = get_parser('Trainer')
add_dataset_args(parser, train=True)
add_distributed_training_args(parser)
add_model_args(parser)
add_optimization_args(parser)
add_checkpoint_args(parser)
return parser
def get_generation_parser():
parser = get_parser('Generation')
add_dataset_args(parser, gen=True)
add_generation_args(parser)
return parser
def parse_args_and_arch(parser, _args=None):
# The parser doesn't know about model/criterion/optimizer-specific args, so
# we parse twice. First we parse the model/criterion/optimizer, then we
# parse a second time after adding the *-specific arguments.
args, _ = parser.parse_known_args(_args)
# Add model-specific args to parser.
model_specific_group = parser.add_argument_group(
'Model-specific configuration',
# Only include attributes which are explicitly given as command-line
# arguments or which have default values.
argument_default=argparse.SUPPRESS,
)
ARCH_MODEL_REGISTRY[args.arch].add_args(model_specific_group)
# Add *-specific args to parser.
CRITERION_REGISTRY[args.criterion].add_args(parser)
OPTIMIZER_REGISTRY[args.optimizer].add_args(parser)
LR_SCHEDULER_REGISTRY[args.lr_scheduler].add_args(parser)
# Parse a second time.
args = parser.parse_args(_args)
# Post-process args.
args.lr = list(map(float, args.lr.split(',')))
if args.max_sentences_valid is None:
args.max_sentences_valid = args.max_sentences
# Apply architecture configuration.
ARCH_CONFIG_REGISTRY[args.arch](args)
return args
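
A usage sketch of the two-pass scheme (flag values illustrative):

parser = get_training_parser()
args = parse_args_and_arch(parser)
# e.g. with `--arch fconv --optimizer nag --lr 0.25,0.1` the first pass
# resolves the fconv and nag registrations, the second pass re-parses with
# their add_args flags installed, and post-processing leaves
# args.lr == [0.25, 0.1]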
def get_parser(desc):
......@@ -17,7 +69,7 @@ def get_parser(desc):
description='Facebook AI Research Sequence-to-Sequence Toolkit -- ' + desc)
parser.add_argument('--no-progress-bar', action='store_true', help='disable progress bar')
parser.add_argument('--log-interval', type=int, default=1000, metavar='N',
help='log progress every N updates (when progress bar is disabled)')
help='log progress every N batches (when progress bar is disabled)')
parser.add_argument('--log-format', default=None, help='log format to use',
choices=['json', 'none', 'simple', 'tqdm'])
parser.add_argument('--seed', default=1, type=int, metavar='N',
......@@ -25,7 +77,7 @@ def get_parser(desc):
return parser
def add_dataset_args(parser):
def add_dataset_args(parser, train=False, gen=False):
group = parser.add_argument_group('Dataset and data loading')
group.add_argument('data', metavar='DIR',
help='path to data directory')
......@@ -33,49 +85,91 @@ def add_dataset_args(parser):
help='source language')
group.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
help='target language')
group.add_argument('-j', '--workers', default=1, type=int, metavar='N',
help='number of data loading workers (default: 1)')
group.add_argument('--max-source-positions', default=1024, type=int, metavar='N',
help='max number of tokens in the source sequence')
group.add_argument('--max-target-positions', default=1024, type=int, metavar='N',
help='max number of tokens in the target sequence')
group.add_argument('--skip-invalid-size-inputs-valid-test', action='store_true',
help='Ignore too long or too short lines in valid and test set')
group.add_argument('--max-tokens', default=6000, type=int, metavar='N',
help='maximum number of tokens in a batch')
group.add_argument('--max-sentences', '--batch-size', type=int, metavar='N',
help='maximum number of sentences in a batch')
if train:
group.add_argument('--train-subset', default='train', metavar='SPLIT',
choices=['train', 'valid', 'test'],
help='data subset to use for training (train, valid, test)')
group.add_argument('--valid-subset', default='valid', metavar='SPLIT',
help='comma-separated list of data subsets to use for validation'
' (train, valid, valid1, test, test1)')
group.add_argument('--max-sentences-valid', type=int, metavar='N',
help='maximum number of sentences in a validation batch'
' (defaults to --max-sentences)')
if gen:
group.add_argument('--gen-subset', default='test', metavar='SPLIT',
help='data subset to generate (train, valid, test)')
group.add_argument('--num-shards', default=1, type=int, metavar='N',
help='shard generation over N shards')
group.add_argument('--shard-id', default=0, type=int, metavar='ID',
help='id of the shard to generate (id < num_shards)')
return group
def add_distributed_training_args(parser):
group = parser.add_argument_group('Distributed training')
group.add_argument('--distributed-world-size', default=1, type=int, metavar='N',
help='total number of GPUs across all nodes, default: 1 GPU')
group.add_argument('--distributed-rank', default=0, type=int,
help='rank of the current worker')
group.add_argument('--distributed-backend', default='nccl', type=str,
help='distributed backend')
group.add_argument('--distributed-init-method', default=None, type=str,
help='typically tcp://hostname:port that will be used to '
'establish initial connection')
group.add_argument('--distributed-port', default=-1, type=int,
help='port number (not required if using --distributed-init-method)')
group.add_argument('--device-id', default=0, type=int,
help='which GPU to use (usually configured automatically)')
return group
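
These flags feed torch.distributed initialization. The direct call below is an illustrative sketch only; the real wiring goes through distributed_utils.distributed_init (see multiprocessing_train.py later in this diff), and args is assumed parsed:

import torch
import torch.distributed

torch.cuda.set_device(args.device_id)
torch.distributed.init_process_group(
    backend=args.distributed_backend,          # e.g. 'nccl'
    init_method=args.distributed_init_method,  # e.g. 'tcp://hostname:port'
    world_size=args.distributed_world_size,
    rank=args.distributed_rank,
)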
def add_optimization_args(parser):
group = parser.add_argument_group('Optimization')
group.add_argument('--optimizer', default='nag', metavar='OPT',
choices=MultiprocessingTrainer.OPTIMIZERS,
help='optimizer ({})'.format(', '.join(MultiprocessingTrainer.OPTIMIZERS)))
group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR1,LR2,...,LRn',
help='learning rate for the first n epochs with all epochs >n using LRn')
group.add_argument('--min-lr', metavar='LR', default=1e-5, type=float,
help='minimum learning rate')
group.add_argument('--force-anneal', '--fa', default=0, type=int, metavar='N',
help='force annealing at specified epoch')
group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
help='force stop training at specified epoch')
group.add_argument('--lrshrink', default=0.1, type=float, metavar='LS',
help='learning rate shrink factor for annealing, lr_new = (lr * lrshrink)')
group.add_argument('--momentum', default=0.99, type=float, metavar='M',
help='momentum factor')
group.add_argument('--adam-betas', default='(0.9, 0.999)', metavar='B',
help='betas for Adam optimizer')
group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
help='clip threshold of gradients')
group.add_argument('--sentence-avg', action='store_true',
help='normalize gradients by the number of sentences in a batch'
' (default is to normalize by number of tokens)')
# Optimizer definitions can be found under fairseq/optim/
group.add_argument('--optimizer', default='nag', metavar='OPT',
choices=OPTIMIZER_REGISTRY.keys(),
help='optimizer: {} (default: nag)'.format(', '.join(OPTIMIZER_REGISTRY.keys())))
group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
help='learning rate for the first N epochs; all epochs >N using LR_N'
' (note: this may be interpreted differently depending on --lr-scheduler)')
group.add_argument('--momentum', default=0.99, type=float, metavar='M',
help='momentum factor')
group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
help='weight decay')
# Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
', '.join(LR_SCHEDULER_REGISTRY.keys())))
group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
help='minimum learning rate')
group.add_argument('--sample-without-replacement', default=0, type=int, metavar='N',
help='if greater than 0, use that number of mini-batches for each epoch,'
' where each sample is drawn randomly without replacement from the'
' dataset')
group.add_argument('--curriculum', default=0, type=int, metavar='N',
help='sort batches by source length for first N epochs')
group.add_argument('--sentence-avg', action='store_true',
help='normalize gradients by the number of sentences in a batch'
' (default is to normalize by number of tokens)')
return group
......@@ -85,8 +179,8 @@ def add_checkpoint_args(parser):
help='path to save checkpoints')
group.add_argument('--restore-file', default='checkpoint_last.pt',
help='filename in save-dir from which to load checkpoint')
group.add_argument('--save-interval', type=int, default=-1,
help='checkpoint every this many batches')
group.add_argument('--save-interval', type=int, default=-1, metavar='N',
help='save a checkpoint every N updates')
group.add_argument('--no-save', action='store_true',
help='don\'t save models and checkpoints')
group.add_argument('--no-epoch-checkpoints', action='store_true',
......@@ -96,6 +190,8 @@ def add_checkpoint_args(parser):
def add_generation_args(parser):
group = parser.add_argument_group('Generation')
group.add_argument('--path', metavar='FILE', required=True, action='append',
help='path(s) to model file(s)')
group.add_argument('--beam', default=5, type=int, metavar='N',
help='beam size')
group.add_argument('--nbest', default=1, type=int, metavar='N',
......@@ -124,57 +220,35 @@ def add_generation_args(parser):
group.add_argument('--replace-unk', nargs='?', const=True, default=None,
help='perform unknown replacement (optionally with alignment dictionary)')
group.add_argument('--quiet', action='store_true',
help='Only print final scores')
help='only print final scores')
group.add_argument('--score-reference', action='store_true',
help='just score the reference translation')
return group
def add_model_args(parser):
group = parser.add_argument_group(
'Model configuration',
# Only include attributes which are explicitly given as command-line
# arguments or which have model-independent default values.
argument_default=argparse.SUPPRESS,
)
group = parser.add_argument_group('Model configuration')
# Model definitions can be found under fairseq/models/
#
# The model architecture can be specified in several ways.
# In increasing order of priority:
# 1) model defaults (lowest priority)
# 2) --arch argument
# 3) --encoder/decoder-* arguments (highest priority)
# Note: --arch cannot be combined with --encoder/decoder-* arguments.
group.add_argument('--arch', '-a', default='fconv', metavar='ARCH', choices=models.arch_model_map.keys(),
help='model architecture ({})'.format(', '.join(models.arch_model_map.keys())))
group.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
group.add_argument('--encoder-layers', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
group.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
group.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
group.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
group.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
# Granular dropout settings for models that support them (e.g., LSTM):
group.add_argument('--encoder-dropout-in', type=float, metavar='D',
help='dropout probability for encoder input embedding')
group.add_argument('--encoder-dropout-out', type=float, metavar='D',
help='dropout probability for encoder output')
group.add_argument('--decoder-dropout-in', type=float, metavar='D',
help='dropout probability for decoder input embedding')
group.add_argument('--decoder-dropout-out', type=float, metavar='D',
help='dropout probability for decoder output')
# These arguments have default values independent of the model:
group.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
group.add_argument('--label-smoothing', default=0, type=float, metavar='D',
help='epsilon for label smoothing, 0 means no label smoothing')
group.add_argument('--share-input-output-embed', action='store_true',
help="Share input and output embeddings, "
"requires --decoder-out-embed-dim and --decoder-embed-dim be equal ")
group.add_argument(
'--arch', '-a', default='fconv', metavar='ARCH', required=True,
choices=ARCH_MODEL_REGISTRY.keys(),
help='model architecture: {} (default: fconv)'.format(
', '.join(ARCH_MODEL_REGISTRY.keys())),
)
# Criterion definitions can be found under fairseq/criterions/
group.add_argument(
'--criterion', default='cross_entropy', metavar='CRIT',
choices=CRITERION_REGISTRY.keys(),
help='training criterion: {} (default: cross_entropy)'.format(
', '.join(CRITERION_REGISTRY.keys())),
)
return group
......@@ -4,7 +4,6 @@
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
"""
Wrapper around various loggers and progress bars (e.g., tqdm).
......@@ -13,12 +12,33 @@ Wrapper around various loggers and progress bars (e.g., tqdm).
from collections import OrderedDict
import json
from numbers import Number
import sys
from tqdm import tqdm
from fairseq.meters import AverageMeter
def build_progress_bar(args, iterator, epoch=None, prefix=None, default='tqdm', no_progress_bar='none'):
if args.log_format is None:
args.log_format = no_progress_bar if args.no_progress_bar else default
if args.log_format == 'tqdm' and not sys.stderr.isatty():
args.log_format = 'simple'
if args.log_format == 'json':
bar = json_progress_bar(iterator, epoch, prefix, args.log_interval)
elif args.log_format == 'none':
bar = noop_progress_bar(iterator, epoch, prefix)
elif args.log_format == 'simple':
bar = simple_progress_bar(iterator, epoch, prefix, args.log_interval)
elif args.log_format == 'tqdm':
bar = tqdm_progress_bar(iterator, epoch, prefix)
else:
raise ValueError('Unknown log format: {}'.format(args.log_format))
return bar
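
A usage sketch of the helper above, with args assumed parsed; generate.py below consumes it the same way, and the log() call is an assumed hook on the bar classes:

itr = range(100)  # stands in for a real batch iterator
with build_progress_bar(args, itr, epoch=1) as t:
    for sample in t:
        t.log({'loss': 1.23})  # assumed: periodic mid-epoch stats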
class progress_bar(object):
"""Abstract class for progress bars."""
def __init__(self, iterable, epoch=None, prefix=None):
......
......@@ -4,13 +4,10 @@
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from contextlib import ExitStack
import math
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from fairseq import utils
from fairseq.models import FairseqIncrementalDecoder
......@@ -54,27 +51,31 @@ class SequenceGenerator(object):
return self
def generate_batched_itr(self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
cuda_device=None, timer=None):
cuda=False, timer=None):
"""Iterate over a batched dataset and yield individual translations.
Args:
maxlen_a/b: generate sequences of maximum length ax + b,
where x is the source sentence length.
cuda_device: GPU on which to do generation.
cuda: use GPU for generation
timer: StopwatchMeter for timing generations.
"""
if maxlen_b is None:
maxlen_b = self.maxlen
for sample in data_itr:
s = utils.make_variable(sample, volatile=True, cuda_device=cuda_device)
s = utils.make_variable(sample, volatile=True, cuda=cuda)
input = s['net_input']
srclen = input['src_tokens'].size(1)
if timer is not None:
timer.start()
with utils.maybe_no_grad():
hypos = self.generate(input['src_tokens'], beam_size=beam_size,
maxlen=int(maxlen_a*srclen + maxlen_b))
hypos = self.generate(
input['src_tokens'],
input['src_lengths'],
beam_size=beam_size,
maxlen=int(maxlen_a*srclen + maxlen_b),
)
if timer is not None:
timer.stop(s['ntokens'])
for i, id in enumerate(s['id'].data):
......@@ -83,15 +84,15 @@ class SequenceGenerator(object):
ref = utils.strip_pad(s['target'].data[i, :], self.pad)
yield id, src, ref, hypos[i]
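
A sketch of consuming the iterator above, with models, itr and use_cuda assumed from the surrounding script; field names follow the yield statement:

translator = SequenceGenerator(models, beam_size=5)
if use_cuda:
    translator.cuda()
gen = translator.generate_batched_itr(itr, maxlen_a=0, maxlen_b=200, cuda=use_cuda)
for sample_id, src_tokens, ref_tokens, hypos in gen:
    best = hypos[0]  # assumed: hypotheses sorted by score, best first
    print(sample_id, best['score'])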
def generate(self, src_tokens, beam_size=None, maxlen=None):
def generate(self, src_tokens, src_lengths, beam_size=None, maxlen=None):
"""Generate a batch of translations."""
with ExitStack() as stack:
for model in self.models:
if isinstance(model.decoder, FairseqIncrementalDecoder):
stack.enter_context(model.decoder.incremental_inference())
return self._generate(src_tokens, beam_size, maxlen)
return self._generate(src_tokens, src_lengths, beam_size, maxlen)
def _generate(self, src_tokens, beam_size=None, maxlen=None):
def _generate(self, src_tokens, src_lengths, beam_size=None, maxlen=None):
bsz, srclen = src_tokens.size()
maxlen = min(maxlen, self.maxlen) if maxlen is not None else self.maxlen
......@@ -107,11 +108,15 @@ class SequenceGenerator(object):
model.decoder.set_beam_size(beam_size)
# compute the encoder output for each beam
encoder_out = model.encoder(src_tokens.repeat(1, beam_size).view(-1, srclen))
encoder_out = model.encoder(
src_tokens.repeat(1, beam_size).view(-1, srclen),
src_lengths.repeat(beam_size),
)
encoder_outs.append(encoder_out)
# initialize buffers
scores = encoder_outs[0][0].data.new(bsz * beam_size).fill_(0)
scores = src_tokens.data.new(bsz * beam_size, maxlen + 1).float().fill_(0)
scores_buf = scores.clone()
tokens = src_tokens.data.new(bsz * beam_size, maxlen + 2).fill_(self.pad)
tokens_buf = tokens.clone()
tokens[:, 0] = self.eos
......@@ -121,7 +126,7 @@ class SequenceGenerator(object):
# list of completed sentences
finalized = [[] for i in range(bsz)]
finished = [False for i in range(bsz)]
worst_finalized = [{'idx': None, 'score': float('Inf')} for i in range(bsz)]
worst_finalized = [{'idx': None, 'score': -math.inf} for i in range(bsz)]
num_remaining_sent = bsz
# number of candidate hypos per step
......@@ -138,7 +143,7 @@ class SequenceGenerator(object):
buffers[name] = type_of.new()
return buffers[name]
def is_finished(sent):
def is_finished(sent, step, unfinalized_scores=None):
"""
Check whether we've finished generation for a given sentence, by
comparing the worst score among finalized hypotheses to the best
......@@ -146,19 +151,18 @@ class SequenceGenerator(object):
"""
assert len(finalized[sent]) <= beam_size
if len(finalized[sent]) == beam_size:
if self.stop_early:
if self.stop_early or step == maxlen or unfinalized_scores is None:
return True
# stop if the best unfinalized score is worse than the worst
# finalized one
bbsz = sent*beam_size
best_unfinalized_score = scores[bbsz:bbsz+beam_size].max()
best_unfinalized_score = unfinalized_scores[sent].max()
if self.normalize_scores:
best_unfinalized_score /= maxlen
if worst_finalized[sent]['score'] >= best_unfinalized_score:
return True
return False
def finalize_hypos(step, bbsz_idx, scores):
def finalize_hypos(step, bbsz_idx, eos_scores, unfinalized_scores=None):
"""
Finalize the given hypotheses at this step, while keeping the total
number of finalized hypotheses per sentence <= beam_size.
......@@ -171,34 +175,51 @@ class SequenceGenerator(object):
step: current time step
bbsz_idx: A vector of indices in the range [0, bsz*beam_size),
indicating which hypotheses to finalize
scores: A vector of the same size as bbsz_idx containing scores
for each hypothesis
eos_scores: A vector of the same size as bbsz_idx containing
scores for each hypothesis
unfinalized_scores: A vector containing scores for all
unfinalized hypotheses
"""
assert bbsz_idx.numel() == scores.numel()
norm_scores = scores/math.pow(step+1, self.len_penalty) if self.normalize_scores else scores
assert bbsz_idx.numel() == eos_scores.numel()
# clone relevant token and attention tensors
tokens_clone = tokens.index_select(0, bbsz_idx)
tokens_clone = tokens_clone[:, 1:step+2] # skip the first index, which is EOS
tokens_clone[:, step] = self.eos
attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step+2]
# compute scores per token position
pos_scores = scores.index_select(0, bbsz_idx)[:, :step+1]
pos_scores[:, step] = eos_scores
# convert from cumulative to per-position scores
pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]
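# e.g. (illustrative numbers): cumulative scores [-0.5, -1.2, -2.0]
# become per-position scores [-0.5, -0.7, -0.8] after this subtraction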
# normalize sentence-level scores
if self.normalize_scores:
eos_scores /= (step+1)**self.len_penalty
sents_seen = set()
for idx, score in zip(bbsz_idx.cpu(), norm_scores.cpu()):
for i, (idx, score) in enumerate(zip(bbsz_idx.tolist(), eos_scores.tolist())):
sent = idx // beam_size
sents_seen.add(sent)
def get_hypo():
hypo = tokens[idx, 1:step+2].clone() # skip the first index, which is EOS
hypo[step] = self.eos
attention = attn[idx, :, 1:step+2].clone()
_, alignment = attention.max(dim=0)
_, alignment = attn_clone[i].max(dim=0)
return {
'tokens': hypo,
'tokens': tokens_clone[i],
'score': score,
'attention': attention,
'attention': attn_clone[i], # src_len x tgt_len
'alignment': alignment,
'positional_scores': pos_scores[i],
}
if len(finalized[sent]) < beam_size:
finalized[sent].append(get_hypo())
elif score > worst_finalized[sent]['score']:
elif not self.stop_early and score > worst_finalized[sent]['score']:
# replace worst hypo for this sentence with new/better one
worst_idx = worst_finalized[sent]['idx']
finalized[sent][worst_idx] = get_hypo()
if worst_idx is not None:
finalized[sent][worst_idx] = get_hypo()
# find new worst finalized hypo for this sentence
idx, s = min(enumerate(finalized[sent]), key=lambda r: r[1]['score'])
......@@ -211,7 +232,7 @@ class SequenceGenerator(object):
num_finished = 0
for sent in sents_seen:
# check termination conditions for this sentence
if not finished[sent] and is_finished(sent):
if not finished[sent] and is_finished(sent, step, unfinalized_scores):
finished[sent] = True
num_finished += 1
return num_finished
......@@ -229,25 +250,44 @@ class SequenceGenerator(object):
# at the first step all hypotheses are equally likely, so use
# only the first beam
probs = probs.unfold(0, 1, beam_size).squeeze(2).contiguous()
scores = scores.type_as(probs)
scores_buf = scores_buf.type_as(probs)
else:
# make probs contain cumulative scores for each hypothesis
probs.add_(scores.view(-1, 1))
probs.add_(scores[:, step-1].view(-1, 1))
probs[:, self.pad] = -math.inf # never select pad
probs[:, self.unk] -= self.unk_penalty # apply unk penalty
# Record attention scores
attn[:, :, step+1].copy_(avg_attn_scores)
# take the best 2 x beam_size predictions. We'll choose the first
# beam_size of these which don't predict eos to continue with.
cand_scores = buffer('cand_scores', type_of=scores)
cand_indices = buffer('cand_indices')
cand_beams = buffer('cand_beams')
probs.view(bsz, -1).topk(
min(cand_size, probs.view(bsz, -1).size(1) - 1), # -1 so we never select pad
out=(cand_scores, cand_indices))
torch.div(cand_indices, self.vocab_size, out=cand_beams)
cand_indices.fmod_(self.vocab_size)
eos_bbsz_idx = buffer('eos_bbsz_idx')
eos_scores = buffer('eos_scores', type_of=scores)
if step < maxlen:
# take the best 2 x beam_size predictions. We'll choose the first
# beam_size of these which don't predict eos to continue with.
torch.topk(
probs.view(bsz, -1),
k=min(cand_size, probs.view(bsz, -1).size(1) - 1), # -1 so we never select pad
out=(cand_scores, cand_indices),
)
torch.div(cand_indices, self.vocab_size, out=cand_beams)
cand_indices.fmod_(self.vocab_size)
else:
# finalize all active hypotheses once we hit maxlen
# pick the hypothesis with the highest prob of EOS right now
torch.sort(
probs[:, self.eos],
descending=True,
out=(eos_scores, eos_bbsz_idx),
)
num_remaining_sent -= finalize_hypos(
step, eos_bbsz_idx, eos_scores)
assert num_remaining_sent == 0
break
# cand_bbsz_idx contains beam indices for the top candidate
# hypotheses, with a range of values: [0, bsz*beam_size),
......@@ -257,58 +297,87 @@ class SequenceGenerator(object):
# finalize hypotheses that end in eos
eos_mask = cand_indices.eq(self.eos)
if step >= self.minlen:
eos_bbsz_idx = buffer('eos_bbsz_idx')
# only consider eos when it's among the top beam_size indices
cand_bbsz_idx[:, :beam_size].masked_select(eos_mask[:, :beam_size], out=eos_bbsz_idx)
torch.masked_select(
cand_bbsz_idx[:, :beam_size],
mask=eos_mask[:, :beam_size],
out=eos_bbsz_idx,
)
if eos_bbsz_idx.numel() > 0:
eos_scores = buffer('eos_scores', type_of=scores)
cand_scores[:, :beam_size].masked_select(eos_mask[:, :beam_size], out=eos_scores)
num_remaining_sent -= finalize_hypos(step, eos_bbsz_idx, eos_scores)
torch.masked_select(
cand_scores[:, :beam_size],
mask=eos_mask[:, :beam_size],
out=eos_scores,
)
num_remaining_sent -= finalize_hypos(
step, eos_bbsz_idx, eos_scores, cand_scores)
assert num_remaining_sent >= 0
if num_remaining_sent == 0:
break
assert step < maxlen
# set active_mask so that values > cand_size indicate eos hypos
# and values < cand_size indicate candidate active hypos.
# Afterwards, the min values per row are the top candidate active hypos
active_mask = buffer('active_mask')
torch.add(eos_mask.type_as(cand_offsets)*cand_size, cand_offsets[:eos_mask.size(1)],
out=active_mask)
torch.add(
eos_mask.type_as(cand_offsets)*cand_size,
cand_offsets[:eos_mask.size(1)],
out=active_mask,
)
# get the top beam_size active hypotheses, which are just the hypos
# with the smallest values in active_mask
active_hypos, _ignore = buffer('active_hypos'), buffer('_ignore')
active_mask.topk(beam_size, 1, largest=False, out=(_ignore, active_hypos))
active_mask.topk(
k=beam_size, dim=1, largest=False,
out=(_ignore, active_hypos),
)
active_bbsz_idx = buffer('active_bbsz_idx')
cand_bbsz_idx.gather(1, active_hypos, out=active_bbsz_idx)
active_scores = cand_scores.gather(1, active_hypos,
out=scores.view(bsz, beam_size))
cand_bbsz_idx.gather(
dim=1, index=active_hypos,
out=active_bbsz_idx,
)
active_scores = cand_scores.gather(
dim=1, index=active_hypos,
out=scores[:, step].view(bsz, beam_size),
)
active_bbsz_idx = active_bbsz_idx.view(-1)
active_scores = active_scores.view(-1)
# finalize all active hypotheses once we hit maxlen
# finalize_hypos will take care of adding the EOS markers
if step == maxlen:
num_remaining_sent -= finalize_hypos(step, active_bbsz_idx, active_scores)
assert num_remaining_sent == 0
break
# copy tokens for active hypotheses
torch.index_select(tokens[:, :step+1], dim=0, index=active_bbsz_idx,
out=tokens_buf[:, :step+1])
cand_indices.gather(1, active_hypos,
out=tokens_buf.view(bsz, beam_size, -1)[:, :, step+1])
# copy tokens and scores for active hypotheses
torch.index_select(
tokens[:, :step+1], dim=0, index=active_bbsz_idx,
out=tokens_buf[:, :step+1],
)
torch.gather(
cand_indices, dim=1, index=active_hypos,
out=tokens_buf.view(bsz, beam_size, -1)[:, :, step+1],
)
if step > 0:
torch.index_select(
scores[:, :step], dim=0, index=active_bbsz_idx,
out=scores_buf[:, :step],
)
torch.gather(
cand_scores, dim=1, index=active_hypos,
out=scores_buf.view(bsz, beam_size, -1)[:, :, step],
)
# copy attention for active hypotheses
torch.index_select(attn[:, :, :step+2], dim=0, index=active_bbsz_idx,
out=attn_buf[:, :, :step+2])
torch.index_select(
attn[:, :, :step+2], dim=0, index=active_bbsz_idx,
out=attn_buf[:, :, :step+2],
)
# swap buffers
old_tokens = tokens
tokens = tokens_buf
tokens_buf = old_tokens
old_scores = scores
scores = scores_buf
scores_buf = old_scores
old_attn = attn
attn = attn_buf
attn_buf = old_attn
......
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from fairseq import utils
class SequenceScorer(object):
"""Scores the target for a given source sentence."""
def __init__(self, models):
self.models = models
self.pad = models[0].dst_dict.pad()
assert all(m.dst_dict.pad() == self.pad for m in self.models[1:])
def cuda(self):
for model in self.models:
model.cuda()
return self
def score_batched_itr(self, data_itr, cuda=False, timer=None):
"""Iterate over a batched dataset and yield scored translations."""
for sample in data_itr:
s = utils.make_variable(sample, volatile=True, cuda=cuda)
if timer is not None:
timer.start()
pos_scores, attn = self.score(s)
if timer is not None:
timer.stop(s['ntokens'])
for i, id in enumerate(s['id'].data):
src = s['net_input']['src_tokens'].data[i, :]
# remove padding from ref
ref = utils.strip_pad(s['target'].data[i, :], self.pad)
tgt_len = ref.numel()
pos_scores_i = pos_scores[i][:tgt_len]
score_i = pos_scores_i.sum() / tgt_len
attn_i = attn[i]
_, alignment = attn_i.max(dim=0)
hypos = [{
'tokens': ref,
'score': score_i,
'attention': attn_i,
'alignment': alignment,
'positional_scores': pos_scores_i,
}]
# return results in the same format as SequenceGenerator
yield id, src, ref, hypos
def score(self, sample):
"""Score a batch of translations."""
net_input = sample['net_input']
# compute scores for each model in the ensemble
avg_probs = None
avg_attn = None
for model in self.models:
with utils.maybe_no_grad():
model.eval()
encoder_out = model.encoder(
net_input['src_tokens'],
net_input['src_lengths'],
)
decoder_out, attn = model.decoder(
net_input['prev_output_tokens'],
encoder_out,
)
probs = model.get_normalized_probs(decoder_out, log_probs=False).data
if avg_probs is None:
avg_probs = probs
else:
avg_probs.add_(probs)
if attn is not None:
attn = attn.data
if avg_attn is None:
avg_attn = attn
else:
avg_attn.add_(attn)
avg_probs.div_(len(self.models))
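# ensemble averaging happens in probability space and the log is taken
# after the mean, i.e. log of the mean rather than the mean of the logs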
avg_probs.log_()
if avg_attn is not None:
avg_attn.div_(len(self.models))
avg_probs = avg_probs.gather(
dim=2,
index=sample['target'].data.unsqueeze(-1),
)
return avg_probs.squeeze(2), avg_attn
......@@ -4,7 +4,6 @@
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from collections import Counter
import re
......@@ -13,8 +12,10 @@ import torch
from fairseq import dictionary
SPACE_NORMALIZER = re.compile(r"\s+")
def tokenize_line(line):
line = SPACE_NORMALIZER.sub(" ", line)
line = line.strip()
......
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
"""
Train a network on multiple GPUs.
"""
from collections import OrderedDict
import math
import torch
from fairseq import distributed_utils, optim, utils
from fairseq.meters import AverageMeter, TimeMeter
from fairseq.optim import lr_scheduler
class Trainer(object):
"""Main class for multi-GPU training.
Each GPU has a full copy of the model and is assigned to its own Python
process. Gradients are accumulated with torch.distributed.all_reduce and all
model replicas are updated synchronously after each batch.
"""
def __init__(self, args, model, criterion):
if not torch.cuda.is_available():
raise NotImplementedError('Training on CPU is not supported')
self.args = args
# copy model and criterion to current device
self.model = model.cuda()
self.criterion = criterion.cuda()
# initialize optimizer and LR scheduler
self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
# initialize meters
self.meters = OrderedDict()
self.meters['train_loss'] = AverageMeter()
self.meters['train_nll_loss'] = AverageMeter()
self.meters['valid_loss'] = AverageMeter()
self.meters['valid_nll_loss'] = AverageMeter()
self.meters['wps'] = TimeMeter() # words per second
self.meters['ups'] = TimeMeter() # updates per second
self.meters['wpb'] = AverageMeter() # words per batch
self.meters['bsz'] = AverageMeter() # sentences per batch
self.meters['gnorm'] = AverageMeter() # gradient norm
self.meters['clip'] = AverageMeter() # % of updates clipped
self.meters['oom'] = AverageMeter() # out of memory
self._max_bsz_seen = 0
self._num_updates = 0
self._optim_history = []  # updated by load_checkpoint when resuming
def save_checkpoint(self, filename, extra_state):
"""Save all training state in a checkpoint file."""
if self.args.distributed_rank == 0: # only save one checkpoint
utils.save_state(filename, self.args, self.model, self.criterion, self.optimizer,
self.lr_scheduler, self._num_updates, self._optim_history, extra_state)
def load_checkpoint(self, filename):
"""Load all training state from a checkpoint file."""
extra_state, self._optim_history, last_optim_state = utils.load_model_state(
filename, self.model, cuda_device=torch.cuda.current_device())
if last_optim_state is not None:
# rebuild optimizer after loading model, since params may have changed
self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
# only reload optimizer and lr_scheduler if they match
last_optim = self._optim_history[-1]
if last_optim['criterion_name'] == self.criterion.__class__.__name__:
self.lr_scheduler.load_state_dict(last_optim['lr_scheduler_state'])
if last_optim['optimizer_name'] == self.optimizer.__class__.__name__:
self.optimizer.load_state_dict(last_optim_state)
self._num_updates = last_optim['num_updates']
return extra_state
def train_step(self, sample):
"""Do forward, backward and parameter update."""
sample = self._prepare_sample(sample, volatile=False)
# forward pass
loss, sample_sizes, logging_outputs, ooms_fwd = self._forward(sample)
# aggregate stats and logging outputs
ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
nsentences = sum(log.get('nsentences', 0) for log in logging_outputs)
grad_denom = self.criterion.__class__.grad_denom(sample_sizes)
agg_logging_output = self.criterion.__class__.aggregate_logging_outputs(logging_outputs)
# backward pass, all-reduce gradients and take an optimization step
grad_norm, ooms_bwd = self._backward_and_opt(loss, grad_denom)
# update meters
self.meters['wps'].update(ntokens)
self.meters['ups'].update(1.)
self.meters['wpb'].update(ntokens)
self.meters['bsz'].update(nsentences)
self.meters['gnorm'].update(grad_norm)
self.meters['clip'].update(1. if grad_norm > self.args.clip_norm else 0.)
self.meters['oom'].update(ooms_fwd + ooms_bwd)
# update loss meters for training
if 'loss' in agg_logging_output:
self.meters['train_loss'].update(agg_logging_output['loss'], grad_denom)
# criterions can optionally log the NLL loss too
if 'nll_loss' in agg_logging_output:
self.meters['train_nll_loss'].update(agg_logging_output['nll_loss'], ntokens)
return agg_logging_output
def _forward(self, sample, eval=False):
# prepare model and optimizer
if eval:
self.model.eval()
else:
self.model.train()
self.optimizer.zero_grad()
loss = None
sample_size = 0
logging_output = {
'ntokens': sample['ntokens'] if sample is not None else 0,
'nsentences': sample['target'].size(0) if sample is not None else 0,
}
oom = 0
if sample is not None:
try:
with utils.maybe_no_grad(eval):
# calculate loss and sample size
loss, sample_size, logging_output_ = self.criterion(self.model, sample)
logging_output.update(logging_output_)
except RuntimeError as e:
if not eval and 'out of memory' in str(e):
print('| WARNING: ran out of memory, skipping batch')
oom = 1
loss = None
if hasattr(torch.cuda, 'empty_cache'):
torch.cuda.empty_cache()
else:
raise e
# synchronize logging outputs for multi-GPU training
if self.args.distributed_world_size > 1:
sample_sizes, logging_outputs, ooms = zip(*list(
distributed_utils.all_gather_list((sample_size, logging_output, oom))))
ooms = sum(ooms)
else:
sample_sizes = [sample_size]
logging_outputs = [logging_output]
ooms = oom
return loss, sample_sizes, logging_outputs, ooms
def _backward_and_opt(self, loss, grad_denom):
oom = 0
if loss is not None:
try:
# backward pass
loss.backward()
except RuntimeError as e:
if 'out of memory' in str(e):
print('| WARNING: ran out of memory, skipping batch')
oom = 1
if hasattr(torch.cuda, 'empty_cache'):
torch.cuda.empty_cache()
self.optimizer.zero_grad()
else:
raise e
# all-reduce grads and rescale by grad_denom
if self.args.distributed_world_size > 1:
grads = [p.grad.data for p in self.model.parameters() if p.requires_grad]
distributed_utils.all_reduce_and_rescale_tensors(grads, grad_denom)
else:
for p in self.model.parameters():
if p.requires_grad:
p.grad.data.div_(grad_denom)
# clip grads
if self.args.clip_norm > 0:
grad_norm = torch.nn.utils.clip_grad_norm(self.model.parameters(), self.args.clip_norm)
else:
grad_norm = math.sqrt(sum(p.grad.data.norm()**2 for p in self.model.parameters()))
# take an optimization step
self.optimizer.step()
self._num_updates += 1
# update learning rate
self.lr_scheduler.step_update(self._num_updates)
return grad_norm, oom
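
all_reduce_and_rescale_tensors itself is not shown in this diff; a hedged sketch of the synchronous update it implies, ignoring the gradient bucketing a real implementation would likely perform:

import torch.distributed

def all_reduce_and_rescale_sketch(tensors, rescale_denom):
    for t in tensors:
        torch.distributed.all_reduce(t)  # sum gradients across workers
        t.div_(rescale_denom)  # token or sentence count, per --sentence-avg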
def valid_step(self, sample):
"""Do forward pass in evaluation mode."""
sample = self._prepare_sample(sample, volatile=True)
# forward pass
loss, sample_sizes, logging_outputs, ooms_fwd = self._forward(sample, eval=True)
assert not ooms_fwd, 'Ran out of memory during validation'
# aggregate stats and logging outputs
ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
grad_denom = self.criterion.__class__.grad_denom(sample_sizes)
agg_logging_output = self.criterion.__class__.aggregate_logging_outputs(logging_outputs)
# update loss meters for validation
if 'loss' in agg_logging_output:
self.meters['valid_loss'].update(agg_logging_output['loss'], grad_denom)
# criterions can optionally log the NLL loss too
if 'nll_loss' in agg_logging_output:
self.meters['valid_nll_loss'].update(agg_logging_output['nll_loss'], ntokens)
return agg_logging_output
def lr_step(self, epoch, val_loss=None):
"""Adjust the learning rate based on the validation loss."""
return self.lr_scheduler.step(epoch, val_loss)
def get_lr(self):
"""Get the current learning rate."""
return self.optimizer.get_lr()
def get_model(self):
"""Get the model replica."""
return self.model
def get_meter(self, name):
"""Get a specific meter by name."""
if name not in self.meters:
return None
return self.meters[name]
def get_num_updates(self):
"""Get the number of parameters updates."""
return self._num_updates
def _prepare_sample(self, sample, volatile):
if sample is None or len(sample) == 0:
return None
if hasattr(torch.cuda, 'empty_cache'):
# clear the caching allocator if this is the largest sample we've seen
if sample['target'].size(0) > self._max_bsz_seen:
self._max_bsz_seen = sample['target'].size(0)
torch.cuda.empty_cache()
return utils.make_variable(sample, volatile=volatile, cuda=True)
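
A hedged sketch of the loop the Trainer API implies; the real loop lives in singleprocess_train.py (not shown in this excerpt), and epoch_batches, checkpoint_path and validate() are hypothetical:

import os

trainer = Trainer(args, model, criterion)
trainer.load_checkpoint(os.path.join(args.save_dir, args.restore_file))
for epoch in range(1, args.max_epoch + 1):
    for sample in epoch_batches:  # assumed batch iterator
        trainer.train_step(sample)
    val_loss = validate(trainer)  # hypothetical validation helper
    trainer.lr_step(epoch, val_loss)
    trainer.save_checkpoint(checkpoint_path, {'epoch': epoch})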
......@@ -4,57 +4,17 @@
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import contextlib
import logging
import os
import torch
import traceback
import sys
from torch.autograd import Variable
from torch.serialization import default_restore_location
from fairseq import criterions, progress_bar, tokenizer
def parse_args_and_arch(parser):
from fairseq import models
args = parser.parse_args()
args.model = models.arch_model_map[args.arch]
args = getattr(models, args.model).parse_arch(args)
return args
def build_model(args, src_dict, dst_dict):
from fairseq import models
assert hasattr(models, args.model), 'Missing model type'
return getattr(models, args.model).build_model(args, src_dict, dst_dict)
def build_criterion(args, src_dict, dst_dict):
if args.label_smoothing > 0:
return criterions.LabelSmoothedCrossEntropyCriterion(args, dst_dict)
else:
return criterions.CrossEntropyCriterion(args, dst_dict)
def build_progress_bar(args, iterator, epoch=None, prefix=None):
if args.log_format is None:
args.log_format = 'tqdm' if sys.stderr.isatty() else 'simple'
if args.log_format == 'json':
bar = progress_bar.json_progress_bar(iterator, epoch, prefix, args.log_interval)
elif args.log_format == 'none':
bar = progress_bar.noop_progress_bar(iterator, epoch, prefix)
elif args.log_format == 'simple':
bar = progress_bar.simple_progress_bar(iterator, epoch, prefix, args.log_interval)
elif args.log_format == 'tqdm':
bar = progress_bar.tqdm_progress_bar(iterator, epoch, prefix)
else:
raise ValueError('Unknown log format: {}'.format(args.log_format))
return bar
from fairseq import tokenizer
def torch_persistent_save(*args, **kwargs):
......@@ -66,7 +26,8 @@ def torch_persistent_save(*args, **kwargs):
logging.error(traceback.format_exc())
def save_state(filename, args, model, criterion, optimizer, lr_scheduler, optim_history=None, extra_state=None):
def save_state(filename, args, model, criterion, optimizer, lr_scheduler,
num_updates, optim_history=None, extra_state=None):
if optim_history is None:
optim_history = []
if extra_state is None:
......@@ -77,7 +38,9 @@ def save_state(filename, args, model, criterion, optimizer, lr_scheduler, optim_
'optimizer_history': optim_history + [
{
'criterion_name': criterion.__class__.__name__,
'best_loss': lr_scheduler.best,
'optimizer_name': optimizer.__class__.__name__,
'lr_scheduler_state': lr_scheduler.state_dict(),
'num_updates': num_updates,
}
],
'last_optimizer_state': optimizer.state_dict(),
......@@ -102,7 +65,7 @@ def load_model_state(filename, model, cuda_device=None):
# load model parameters
try:
model.load_state_dict(state['model'])
except:
except Exception:
raise Exception('Cannot load model parameters from checkpoint, '
'please ensure that the architectures match')
......@@ -115,7 +78,7 @@ def _upgrade_state_dict(state):
if 'optimizer_history' not in state:
state['optimizer_history'] = [
{
'criterion_name': criterions.CrossEntropyCriterion.__name__,
'criterion_name': 'CrossEntropyCriterion',
'best_loss': state['best_loss'],
},
]
......@@ -137,6 +100,18 @@ def _upgrade_state_dict(state):
state['last_optimizer_state'] = state['optimizer_history'][-1]['optimizer']
for optim_hist in state['optimizer_history']:
del optim_hist['optimizer']
# record the optimizer class name
if 'optimizer_name' not in state['optimizer_history'][-1]:
state['optimizer_history'][-1]['optimizer_name'] = 'FairseqNAG'
# move best_loss into lr_scheduler_state
if 'lr_scheduler_state' not in state['optimizer_history'][-1]:
state['optimizer_history'][-1]['lr_scheduler_state'] = {
'best': state['optimizer_history'][-1]['best_loss'],
}
del state['optimizer_history'][-1]['best_loss']
# keep track of number of updates
if 'num_updates' not in state['optimizer_history'][-1]:
state['optimizer_history'][-1]['num_updates'] = 0
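# after these upgrades an old checkpoint's last history entry has the same
# shape save_state now writes: criterion_name, optimizer_name,
# lr_scheduler_state (with the old best_loss under 'best') and num_updates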
return state
......@@ -146,7 +121,7 @@ def load_ensemble_for_inference(filenames, src_dict=None, dst_dict=None, data_di
The source and target dictionaries can be given explicitly, or loaded from
the `data_dir` directory.
"""
from fairseq import data
from fairseq import data, models
# load model architectures and weights
states = []
......@@ -166,8 +141,7 @@ def load_ensemble_for_inference(filenames, src_dict=None, dst_dict=None, data_di
# build ensemble
ensemble = []
for state in states:
model = build_model(args, src_dict, dst_dict)
state['model'] = model.upgrade_state_dict(state['model'])
model = models.build_model(args, src_dict, dst_dict)
model.load_state_dict(state['model'])
ensemble.append(model)
return ensemble, args
......@@ -197,13 +171,16 @@ def volatile_variable(*args, **kwargs):
return Variable(*args, **kwargs, volatile=True)
def make_variable(sample, volatile=False, cuda_device=None):
def make_variable(sample, volatile=False, cuda=False):
"""Wrap input tensors in Variable class."""
if len(sample) == 0:
return {}
def _make_variable(maybe_tensor):
if torch.is_tensor(maybe_tensor):
if cuda_device is not None and torch.cuda.is_available():
maybe_tensor = maybe_tensor.cuda(async=True, device=cuda_device)
if cuda and torch.cuda.is_available():
maybe_tensor = maybe_tensor.cuda()
if volatile:
return volatile_variable(maybe_tensor)
else:
......@@ -229,8 +206,8 @@ def load_align_dict(replace_unk):
align_dict = {}
with open(replace_unk, 'r') as f:
for line in f:
l = line.split()
align_dict[l[0]] = l[1]
cols = line.split()
align_dict[cols[0]] = cols[1]
else:
# No alignment dictionary provided but we still want to perform unknown word replacement by copying the
# original source word.
......@@ -262,20 +239,37 @@ def post_process_prediction(hypo_tokens, src_str, alignment, align_dict, dst_dic
return hypo_tokens, hypo_str, alignment
def lstrip_pad(tensor, pad):
return tensor[tensor.eq(pad).long().sum():]
def rstrip_pad(tensor, pad):
strip = tensor.eq(pad).long().sum()
if strip > 0:
return tensor[:-strip]
return tensor
def strip_pad(tensor, pad):
if tensor[0] == pad:
tensor = lstrip_pad(tensor, pad)
if tensor[-1] == pad:
tensor = rstrip_pad(tensor, pad)
return tensor
return tensor[tensor.ne(pad)]
def buffered_arange(max):
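# cache one growing LongTensor so repeated calls only reallocate when a
# larger range is requested; callers receive a view into the buffer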
if not hasattr(buffered_arange, 'buf'):
buffered_arange.buf = torch.LongTensor()
if max > buffered_arange.buf.numel():
torch.arange(max, out=buffered_arange.buf)
return buffered_arange.buf[:max]
def convert_padding_direction(
src_tokens,
src_lengths,
padding_idx,
right_to_left=False,
left_to_right=False,
):
assert not isinstance(src_tokens, Variable)
assert not isinstance(src_lengths, Variable)
assert right_to_left ^ left_to_right
pad_mask = src_tokens.eq(padding_idx)
if pad_mask.max() == 0:
# no padding, return early
return src_tokens
max_len = src_tokens.size(1)
    pos = buffered_arange(max_len).type_as(src_tokens).expand_as(src_tokens)
    num_pads = pad_mask.long().sum(dim=1, keepdim=True)
    if right_to_left:
        index = torch.remainder(pos - num_pads, max_len)
    else:
        index = torch.remainder(pos + num_pads, max_len)
return src_tokens.gather(1, index)
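
A worked sketch of the remainder trick above (values illustrative; 0 is the padding index):

import torch

src_tokens = torch.LongTensor([[0, 0, 7, 8],
                               [0, 5, 6, 9]])  # left-padded, pad=0
src_lengths = torch.LongTensor([2, 3])
out = convert_padding_direction(src_tokens, src_lengths, padding_idx=0,
                                left_to_right=True)
# out == [[7, 8, 0, 0],
#         [5, 6, 9, 0]]: pads moved to the right, token order preserved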
#!/usr/bin/env python3
#!/usr/bin/env python3 -u
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import torch
from fairseq import bleu, data, options, tokenizer, utils
from fairseq import bleu, data, options, progress_bar, tokenizer, utils
from fairseq.meters import StopwatchMeter, TimeMeter
from fairseq.sequence_generator import SequenceGenerator
from fairseq.sequence_scorer import SequenceScorer
def main():
parser = options.get_parser('Generation')
parser.add_argument('--path', metavar='FILE', required=True, action='append',
help='path(s) to model file(s)')
dataset_args = options.add_dataset_args(parser)
dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
help='batch size')
dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT',
help='data subset to generate (train, valid, test)')
dataset_args.add_argument('--num-shards', default=1, type=int, metavar='N',
help='shard generation over N shards')
dataset_args.add_argument('--shard-id', default=0, type=int, metavar='ID',
help='id of the shard to generate (id < num_shards)')
options.add_generation_args(parser)
args = parser.parse_args()
if args.no_progress_bar and args.log_format is None:
args.log_format = 'none'
def main(args):
print(args)
use_cuda = torch.cuda.is_available() and not args.cpu
......@@ -40,9 +23,19 @@ def main():
# Load dataset
if args.replace_unk is None:
dataset = data.load_dataset(args.data, [args.gen_subset], args.source_lang, args.target_lang)
dataset = data.load_dataset(
args.data,
[args.gen_subset],
args.source_lang,
args.target_lang,
)
else:
dataset = data.load_raw_text_dataset(args.data, [args.gen_subset], args.source_lang, args.target_lang)
dataset = data.load_raw_text_dataset(
args.data,
[args.gen_subset],
args.source_lang,
args.target_lang,
)
if args.source_lang is None or args.target_lang is None:
# record inferred languages in args
args.source_lang, args.target_lang = dataset.src, dataset.dst
......@@ -58,37 +51,49 @@ def main():
# Optimize ensemble for generation
for model in models:
model.make_generation_fast_(
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)
# Initialize generator
translator = SequenceGenerator(
models, beam_size=args.beam, stop_early=(not args.no_early_stop),
normalize_scores=(not args.unnormalized), len_penalty=args.lenpen,
unk_penalty=args.unkpen)
if use_cuda:
translator.cuda()
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
)
# Load alignment dictionary for unknown word replacement
# (None if no unknown word replacement, empty if no path to align dictionary)
align_dict = utils.load_align_dict(args.replace_unk)
# Generate and compute BLEU score
scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(), dataset.dst_dict.unk())
# Load dataset (possibly sharded)
max_positions = min(model.max_encoder_positions() for model in models)
itr = dataset.eval_dataloader(
args.gen_subset, max_sentences=args.batch_size, max_positions=max_positions,
skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test)
args.gen_subset,
max_sentences=args.max_sentences,
max_positions=max_positions,
skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test,
)
if args.num_shards > 1:
if args.shard_id < 0 or args.shard_id >= args.num_shards:
raise ValueError('--shard-id must be between 0 and num_shards')
itr = data.sharded_iterator(itr, args.num_shards, args.shard_id)
# Initialize generator
gen_timer = StopwatchMeter()
if args.score_reference:
translator = SequenceScorer(models)
else:
translator = SequenceGenerator(
models, beam_size=args.beam, stop_early=(not args.no_early_stop),
normalize_scores=(not args.unnormalized), len_penalty=args.lenpen,
unk_penalty=args.unkpen)
if use_cuda:
translator.cuda()
# Generate and compute BLEU score
scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(), dataset.dst_dict.unk())
num_sentences = 0
with utils.build_progress_bar(args, itr) as t:
with progress_bar.build_progress_bar(args, itr) as t:
if args.score_reference:
translations = translator.score_batched_itr(t, cuda=use_cuda, timer=gen_timer)
else:
translations = translator.generate_batched_itr(
t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
cuda=use_cuda, timer=gen_timer)
wps_meter = TimeMeter()
gen_timer = StopwatchMeter()
translations = translator.generate_batched_itr(
t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
cuda_device=0 if use_cuda else None, timer=gen_timer)
for sample_id, src_tokens, target_tokens, hypos in translations:
# Process input and ground truth
target_tokens = target_tokens.int().cpu()
......@@ -112,19 +117,26 @@ def main():
alignment=hypo['alignment'].int().cpu(),
align_dict=align_dict,
dst_dict=dataset.dst_dict,
remove_bpe=args.remove_bpe)
remove_bpe=args.remove_bpe,
)
if not args.quiet:
print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
print('P-{}\t{}'.format(
sample_id,
' '.join(map(
lambda x: '{:.4f}'.format(x),
hypo['positional_scores'].tolist(),
))
))
print('A-{}\t{}'.format(sample_id, ' '.join(map(str, alignment))))
# Score only the top hypothesis
if i == 0:
if align_dict is not None or args.remove_bpe is not None:
# Convert back to tokens for evaluation with unk replacement and/or without BPE
target_tokens = tokenizer.Tokenizer.tokenize(target_str,
dataset.dst_dict,
add_if_not_exist=True)
target_tokens = tokenizer.Tokenizer.tokenize(
target_str, dataset.dst_dict, add_if_not_exist=True)
scorer.add(target_tokens, hypo_tokens)
wps_meter.update(src_tokens.size(0))
......@@ -137,4 +149,6 @@ def main():
if __name__ == '__main__':
main()
parser = options.get_generation_parser()
args = parser.parse_args()
main(args)
#!/usr/bin/env python3
#!/usr/bin/env python3 -u
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import sys
import torch
from torch.autograd import Variable
......@@ -14,17 +14,12 @@ from fairseq import options, tokenizer, utils
from fairseq.sequence_generator import SequenceGenerator
def main():
parser = options.get_parser('Generation')
parser.add_argument('--path', metavar='FILE', required=True, action='append',
help='path(s) to model file(s)')
options.add_dataset_args(parser)
options.add_generation_args(parser)
args = parser.parse_args()
def main(args):
print(args)
use_cuda = torch.cuda.is_available() and not args.cpu
if hasattr(torch, 'set_grad_enabled'):
torch.set_grad_enabled(False)
# Load ensemble
print('| loading model(s) from {}'.format(', '.join(args.path)))
......@@ -37,7 +32,8 @@ def main():
# Optimize ensemble for generation
for model in models:
model.make_generation_fast_(
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
)
# Initialize generator
translator = SequenceGenerator(
......@@ -57,7 +53,11 @@ def main():
src_tokens = tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
if use_cuda:
src_tokens = src_tokens.cuda()
translations = translator.generate(Variable(src_tokens.view(1, -1)))
src_lengths = src_tokens.new([src_tokens.numel()])
translations = translator.generate(
Variable(src_tokens.view(1, -1)),
Variable(src_lengths.view(-1)),
)
hypos = translations[0]
print('O\t{}'.format(src_str))
......@@ -69,10 +69,13 @@ def main():
alignment=hypo['alignment'].int().cpu(),
align_dict=align_dict,
dst_dict=dst_dict,
remove_bpe=args.remove_bpe)
remove_bpe=args.remove_bpe,
)
print('H\t{}\t{}'.format(hypo['score'], hypo_str))
print('A\t{}'.format(' '.join(map(str, alignment))))
if __name__ == '__main__':
main()
parser = options.get_generation_parser()
args = parser.parse_args()
main(args)
#!/usr/bin/env python3 -u
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import os
import random
import signal
import torch
from fairseq import distributed_utils, options
from singleprocess_train import main as single_process_main
def main(args):
# Set distributed training parameters for a single node.
args.distributed_world_size = torch.cuda.device_count()
args.distributed_init_method = 'tcp://localhost:{port}'.format(
port=random.randint(10000, 20000))
mp = torch.multiprocessing.get_context('spawn')
# Create a thread to listen for errors in the child processes.
error_queue = mp.SimpleQueue()
error_handler = ErrorHandler(error_queue)
# Train with multiprocessing.
procs = []
for i in range(args.distributed_world_size):
args.distributed_rank = i
args.device_id = i
procs.append(mp.Process(target=run, args=(args, error_queue), daemon=True))
procs[i].start()
error_handler.add_child(procs[i].pid)
for p in procs:
p.join()
def run(args, error_queue):
try:
args.distributed_rank = distributed_utils.distributed_init(args)
single_process_main(args)
except KeyboardInterrupt:
pass # killed by parent, do nothing
except Exception:
# propagate exception to parent process, keeping original traceback
import traceback
error_queue.put((args.distributed_rank, traceback.format_exc()))
class ErrorHandler(object):
"""A class that listens for exceptions in children processes and propagates
the tracebacks to the parent process."""
def __init__(self, error_queue):
import threading
self.error_queue = error_queue
self.children_pids = []
self.error_thread = threading.Thread(target=self.error_listener, daemon=True)
self.error_thread.start()
signal.signal(signal.SIGUSR1, self.signal_handler)
def add_child(self, pid):
self.children_pids.append(pid)
def error_listener(self):
(rank, original_trace) = self.error_queue.get()
self.error_queue.put((rank, original_trace))
os.kill(os.getpid(), signal.SIGUSR1)
def signal_handler(self, signalnum, stackframe):
for pid in self.children_pids:
os.kill(pid, signal.SIGINT) # kill children processes
(rank, original_trace) = self.error_queue.get()
msg = "\n\n-- Tracebacks above this line can probably be ignored --\n\n"
msg += original_trace
raise Exception(msg)
if __name__ == '__main__':
parser = options.get_training_parser()
args = options.parse_args_and_arch(parser)
main(args)
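The ErrorHandler above is a small, reusable pattern: each worker catches its own exception and puts the traceback on a shared queue; a listener thread in the parent blocks on that queue and raises SIGUSR1; the signal handler then interrupts the children and re-raises with the original traceback. A stripped-down, POSIX-only sketch of the same mechanism (the failing worker is a placeholder, not fairseq code):

import os
import signal
import threading
import traceback

import torch.multiprocessing


def worker(rank, error_queue):
    # Placeholder worker that always fails, to exercise the pattern.
    try:
        raise RuntimeError('failure in rank {}'.format(rank))
    except Exception:
        error_queue.put((rank, traceback.format_exc()))


def main():
    mp = torch.multiprocessing.get_context('spawn')
    error_queue = mp.SimpleQueue()

    def listener():
        # Wait for the first child traceback, then signal the parent.
        rank, trace = error_queue.get()
        print('| rank {} failed:\n{}'.format(rank, trace))
        os.kill(os.getpid(), signal.SIGUSR1)

    def handler(signalnum, stackframe):
        raise SystemExit(1)

    threading.Thread(target=listener, daemon=True).start()
    signal.signal(signal.SIGUSR1, handler)
    procs = [mp.Process(target=worker, args=(i, error_queue), daemon=True)
             for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()


if __name__ == '__main__':
    main()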
@@ -13,10 +13,10 @@ import os
import shutil
from fairseq import dictionary, indexed_dataset
from fairseq.tokenizer import Tokenizer
from fairseq.tokenizer import Tokenizer, tokenize_line
def main():
def get_parser():
parser = argparse.ArgumentParser(
description='Data pre-processing: Create dictionary and store data in binary format')
parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language')
@@ -36,22 +36,37 @@ def main():
parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)')
parser.add_argument('--output-format', metavar='FORMAT', default='binary', choices=['binary', 'raw'],
help='output format (optional)')
parser.add_argument('--joined-dictionary', action='store_true', help='generate a joined (shared) source/target dictionary')
return parser
args = parser.parse_args()
def main(args):
print(args)
os.makedirs(args.destdir, exist_ok=True)
if args.srcdict:
src_dict = dictionary.Dictionary.load(args.srcdict)
if args.joined_dictionary:
assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
src_dict = dictionary.Dictionary()
for lang in [args.source_lang, args.target_lang]:
Tokenizer.add_file_to_dictionary(
filename='{}.{}'.format(args.trainpref, lang),
dict=src_dict,
tokenize=tokenize_line,
)
src_dict.finalize()
tgt_dict = src_dict
else:
src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang))
if args.srcdict:
src_dict = dictionary.Dictionary.load(args.srcdict)
else:
src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang))
if args.tgtdict:
tgt_dict = dictionary.Dictionary.load(args.tgtdict)
else:
tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang))
src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)),
threshold=args.thresholdsrc, nwords=args.nwordssrc)
if args.tgtdict:
tgt_dict = dictionary.Dictionary.load(args.tgtdict)
else:
tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang))
tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)),
threshold=args.thresholdtgt, nwords=args.nwordstgt)
@@ -136,4 +151,6 @@ def main():
if __name__ == '__main__':
main()
parser = get_parser()
args = parser.parse_args()
main(args)
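With --joined-dictionary, both training files feed a single Dictionary, which is then used for the source and target sides alike; this is the natural choice when the two languages share a vocabulary (e.g., with shared BPE codes). A hedged usage sketch, with file prefixes that are illustrative assumptions (the usual train/valid files are expected to exist under these prefixes):

import preprocess

parser = preprocess.get_parser()
args = parser.parse_args([
    '--source-lang', 'de',
    '--target-lang', 'en',
    '--trainpref', 'data/train',   # expects data/train.de and data/train.en
    '--validpref', 'data/valid',
    '--destdir', 'data-bin',
    '--joined-dictionary',
])
preprocess.main(args)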
@@ -54,13 +54,14 @@ class build_py_hook(build_py):
setup(
name='fairseq',
version='0.3.0',
version='0.4.0',
description='Facebook AI Research Sequence-to-Sequence Toolkit',
long_description=readme,
license=license,
install_requires=reqs.strip().split('\n'),
packages=find_packages(),
ext_modules=[bleu],
test_suite='tests',
# build and install PyTorch extensions
package_data={
#!/usr/bin/env python3 -u
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import collections
import itertools
import os
import math
import torch
from fairseq import criterions, data, models, options, progress_bar
from fairseq.meters import AverageMeter, StopwatchMeter
from fairseq.trainer import Trainer
def main(args):
print(args)
if not torch.cuda.is_available():
raise NotImplementedError('Training on CPU is not supported')
torch.cuda.set_device(args.device_id)
torch.manual_seed(args.seed)
# Load dataset
splits = ['train', 'valid']
if data.has_binary_files(args.data, splits):
dataset = data.load_dataset(
args.data, splits, args.source_lang, args.target_lang)
else:
dataset = data.load_raw_text_dataset(
args.data, splits, args.source_lang, args.target_lang)
if args.source_lang is None or args.target_lang is None:
# record inferred languages in args, so that it's saved in checkpoints
args.source_lang, args.target_lang = dataset.src, dataset.dst
print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
for split in splits:
print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))
# Build model and criterion
model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
print('| num. model params: {}'.format(sum(p.data.numel() for p in model.parameters())))
# Build trainer
trainer = Trainer(args, model, criterion)
print('| training on {} GPUs'.format(args.distributed_world_size))
print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
args.max_tokens,
args.max_sentences,
))
# Load the latest checkpoint if one is available
os.makedirs(args.save_dir, exist_ok=True)
checkpoint_path = os.path.join(args.save_dir, args.restore_file)
extra_state = trainer.load_checkpoint(checkpoint_path)
if extra_state is not None:
epoch = extra_state['epoch']
batch_offset = extra_state['batch_offset']
print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
if batch_offset == 0:
trainer.lr_step(epoch)
epoch += 1
else:
epoch, batch_offset = 1, 0
# Train until the learning rate gets too small
max_epoch = args.max_epoch or math.inf
lr = trainer.get_lr()
train_meter = StopwatchMeter()
train_meter.start()
while lr > args.min_lr and epoch <= max_epoch:
# train for one epoch
train(args, trainer, dataset, epoch, batch_offset)
# evaluate on the validation set
for k, subset in enumerate(args.valid_subset.split(',')):
val_loss = validate(args, trainer, dataset, subset, epoch)
if k == 0:
# only use first validation loss to update the learning schedule
lr = trainer.lr_step(epoch, val_loss)
# save checkpoint
if not args.no_save:
save_checkpoint(trainer, args, epoch, 0, val_loss)
epoch += 1
batch_offset = 0
train_meter.stop()
print('| done training in {:.1f} seconds'.format(train_meter.sum))
def train(args, trainer, dataset, epoch, batch_offset):
"""Train the model for one epoch."""
# Set seed based on args.seed and the epoch number so that we get
# reproducible results when resuming from checkpoints
seed = args.seed + epoch
torch.manual_seed(seed)
# The max number of positions can be different for train and valid
# e.g., RNNs may support more positions at test time than seen in training
max_positions_train = (
min(args.max_source_positions, trainer.get_model().max_encoder_positions()),
min(args.max_target_positions, trainer.get_model().max_decoder_positions())
)
# Initialize dataloader, starting at batch_offset
itr = dataset.train_dataloader(
args.train_subset,
max_tokens=args.max_tokens,
max_sentences=args.max_sentences,
max_positions=max_positions_train,
seed=seed,
epoch=epoch,
sample_without_replacement=args.sample_without_replacement,
sort_by_source_size=(epoch <= args.curriculum),
shard_id=args.distributed_rank,
num_shards=args.distributed_world_size,
)
progress = progress_bar.build_progress_bar(args, itr, epoch, no_progress_bar='simple')
itr = itertools.islice(progress, batch_offset, None)
# reset training meters
for k in ['train_loss', 'train_nll_loss', 'wps', 'ups', 'wpb', 'bsz', 'clip']:
meter = trainer.get_meter(k)
if meter is not None:
meter.reset()
extra_meters = collections.defaultdict(lambda: AverageMeter())
for i, sample in enumerate(itr, start=batch_offset):
log_output = trainer.train_step(sample)
# log mid-epoch stats
stats = get_training_stats(trainer)
for k, v in log_output.items():
if k in ['loss', 'nll_loss']:
continue # these are already logged above
extra_meters[k].update(v)
stats[k] = extra_meters[k].avg
progress.log(stats)
# save mid-epoch checkpoints
if i == batch_offset:
# ignore the first mini-batch in words-per-second calculation
trainer.get_meter('wps').reset()
if args.save_interval > 0 and trainer.get_num_updates() % args.save_interval == 0:
save_checkpoint(trainer, args, epoch, i + 1)
# log end-of-epoch stats
stats = get_training_stats(trainer)
for k, meter in extra_meters.items():
stats[k] = meter.avg
progress.print(stats)
def get_training_stats(trainer):
stats = collections.OrderedDict()
stats['loss'] = '{:.3f}'.format(trainer.get_meter('train_loss').avg)
if trainer.get_meter('train_nll_loss').count > 0:
nll_loss = trainer.get_meter('train_nll_loss').avg
stats['nll_loss'] = '{:.3f}'.format(nll_loss)
else:
nll_loss = trainer.get_meter('train_loss').avg
stats['ppl'] = get_perplexity(nll_loss)
stats['wps'] = round(trainer.get_meter('wps').avg)
stats['ups'] = '{:.1f}'.format(trainer.get_meter('ups').avg)
stats['wpb'] = round(trainer.get_meter('wpb').avg)
stats['bsz'] = round(trainer.get_meter('bsz').avg)
stats['num_updates'] = trainer.get_num_updates()
stats['lr'] = trainer.get_lr()
stats['gnorm'] = '{:.3f}'.format(trainer.get_meter('gnorm').avg)
stats['clip'] = '{:.0%}'.format(trainer.get_meter('clip').avg)
return stats
def validate(args, trainer, dataset, subset, epoch):
"""Evaluate the model on the validation set and return the average loss."""
# Initialize dataloader
max_positions_valid = (
trainer.get_model().max_encoder_positions(),
trainer.get_model().max_decoder_positions(),
)
itr = dataset.eval_dataloader(
subset,
max_tokens=args.max_tokens,
max_sentences=args.max_sentences_valid,
max_positions=max_positions_valid,
skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test,
descending=True, # largest batch first to warm the caching allocator
shard_id=args.distributed_rank,
num_shards=args.distributed_world_size,
)
progress = progress_bar.build_progress_bar(
args, itr, epoch,
prefix='valid on \'{}\' subset'.format(subset),
no_progress_bar='simple'
)
# reset validation loss meters
for k in ['valid_loss', 'valid_nll_loss']:
meter = trainer.get_meter(k)
if meter is not None:
meter.reset()
extra_meters = collections.defaultdict(lambda: AverageMeter())
for sample in progress:
log_output = trainer.valid_step(sample)
# log mid-validation stats
stats = get_valid_stats(trainer)
for k, v in log_output.items():
if k in ['loss', 'nll_loss']:
continue
extra_meters[k].update(v)
stats[k] = extra_meters[k].avg
progress.log(stats)
# log validation stats
stats = get_valid_stats(trainer)
for k, meter in extra_meters.items():
stats[k] = meter.avg
progress.print(stats)
return stats['valid_loss']
def get_valid_stats(trainer):
stats = collections.OrderedDict()
stats['valid_loss'] = trainer.get_meter('valid_loss').avg
if trainer.get_meter('valid_nll_loss').count > 0:
nll_loss = trainer.get_meter('valid_nll_loss').avg
stats['valid_nll_loss'] = nll_loss
else:
nll_loss = trainer.get_meter('valid_loss').avg
stats['valid_ppl'] = get_perplexity(nll_loss)
return stats
def get_perplexity(loss):
try:
return '{:.2f}'.format(math.pow(2, loss))
except OverflowError:
return 'inf'
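Perplexity here is computed base-2, matching the base-2 NLL meters: ppl = 2**loss, formatted as a string for the stats table (the except branch now returns 'inf' so both branches have the same type). A quick worked example under that reading:

import math

# a per-token NLL of 3.0 bits corresponds to a perplexity of 8
assert '{:.2f}'.format(math.pow(2, 3.0)) == '8.00'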
def save_checkpoint(trainer, args, epoch, batch_offset, val_loss=None):
extra_state = {
'epoch': epoch,
'batch_offset': batch_offset,
'val_loss': val_loss,
}
if batch_offset == 0:
if not args.no_epoch_checkpoints:
epoch_filename = os.path.join(args.save_dir, 'checkpoint{}.pt'.format(epoch))
trainer.save_checkpoint(epoch_filename, extra_state)
assert val_loss is not None
if not hasattr(save_checkpoint, 'best') or val_loss < save_checkpoint.best:
save_checkpoint.best = val_loss
best_filename = os.path.join(args.save_dir, 'checkpoint_best.pt')
trainer.save_checkpoint(best_filename, extra_state)
elif not args.no_epoch_checkpoints:
epoch_filename = os.path.join(
args.save_dir, 'checkpoint{}_{}.pt'.format(epoch, batch_offset))
trainer.save_checkpoint(epoch_filename, extra_state)
last_filename = os.path.join(args.save_dir, 'checkpoint_last.pt')
trainer.save_checkpoint(last_filename, extra_state)
if __name__ == '__main__':
parser = options.get_training_parser()
args = options.parse_args_and_arch(parser)
main(args)
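Because main() now takes args, training can be launched from tests or scripts through parse_args_and_arch, which also fills in architecture-specific defaults for the chosen --arch. A sketch mirroring the binary tests below; the data and save directories are illustrative assumptions:

from fairseq import options
import train

parser = options.get_training_parser()
args = options.parse_args_and_arch(parser, [
    'data-bin',                      # hypothetical preprocessed data dir
    '--arch', 'fconv_iwslt_de_en',
    '--optimizer', 'nag',
    '--lr', '0.05',
    '--max-tokens', '500',
    '--max-epoch', '1',
    '--save-dir', 'checkpoints',     # hypothetical checkpoint dir
    '--no-progress-bar',
])
train.main(args)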
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from io import StringIO
import os
import random
import sys
import tempfile
import unittest
import torch
from fairseq import options
import preprocess, train, generate, interactive
class TestBinaries(unittest.TestCase):
def test_binaries(self):
# comment this out to debug the unittest if it's failing
self.mock_stdout()
with tempfile.TemporaryDirectory() as data_dir:
self.create_dummy_data(data_dir)
self.preprocess_data(data_dir)
self.train_model(data_dir)
self.generate(data_dir)
self.unmock_stdout()
def create_dummy_data(self, data_dir, num_examples=1000, maxlen=20):
def _create_dummy_data(filename):
data = torch.rand(num_examples * maxlen)
data = 97 + torch.floor(26 * data).int()
with open(os.path.join(data_dir, filename), 'w') as h:
offset = 0
for i in range(num_examples):
ex_len = random.randint(1, maxlen)
ex_str = ' '.join(map(chr, data[offset:offset+ex_len]))
print(ex_str, file=h)
offset += ex_len
_create_dummy_data('train.in')
_create_dummy_data('train.out')
_create_dummy_data('valid.in')
_create_dummy_data('valid.out')
_create_dummy_data('test.in')
_create_dummy_data('test.out')
def preprocess_data(self, data_dir):
preprocess_parser = preprocess.get_parser()
preprocess_args = preprocess_parser.parse_args([
'--source-lang', 'in',
'--target-lang', 'out',
'--trainpref', os.path.join(data_dir, 'train'),
'--validpref', os.path.join(data_dir, 'valid'),
'--testpref', os.path.join(data_dir, 'test'),
'--thresholdtgt', '0',
'--thresholdsrc', '0',
'--destdir', data_dir,
])
preprocess.main(preprocess_args)
def train_model(self, data_dir):
train_parser = options.get_training_parser()
train_args = options.parse_args_and_arch(
train_parser,
[
data_dir,
'--arch', 'fconv_iwslt_de_en',
'--optimizer', 'nag',
'--lr', '0.05',
'--max-tokens', '500',
'--save-dir', data_dir,
'--max-epoch', '1',
'--no-progress-bar',
],
)
train.main(train_args)
def generate(self, data_dir):
generate_parser = options.get_generation_parser()
generate_args = generate_parser.parse_args([
data_dir,
'--path', os.path.join(data_dir, 'checkpoint_best.pt'),
'--beam', '5',
'--batch-size', '32',
'--gen-subset', 'valid',
'--no-progress-bar',
])
# evaluate model in batch mode
generate.main(generate_args)
# evaluate model interactively
orig_stdin = sys.stdin
sys.stdin = StringIO('h e l l o\n')
interactive.main(generate_args)
sys.stdin = orig_stdin
def mock_stdout(self):
self._orig_stdout = sys.stdout
sys.stdout = StringIO()
def unmock_stdout(self):
if hasattr(self, '_orig_stdout'):
sys.stdout = self._orig_stdout
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import argparse
import unittest
import torch
from torch.autograd import Variable
from fairseq.sequence_generator import SequenceGenerator
import tests.utils as test_utils
class TestSequenceGenerator(unittest.TestCase):
def setUp(self):
# construct dummy dictionary
d = test_utils.dummy_dictionary(vocab_size=2)
self.assertEqual(d.pad(), 1)
self.assertEqual(d.eos(), 2)
self.assertEqual(d.unk(), 3)
self.eos = d.eos()
self.w1 = 4
self.w2 = 5
# construct source data
self.src_tokens = Variable(torch.LongTensor([
[self.w1, self.w2, self.eos],
[self.w1, self.w2, self.eos],
]))
self.src_lengths = Variable(torch.LongTensor([2, 2]))
args = argparse.Namespace()
unk = 0.
args.beam_probs = [
# step 0:
torch.FloatTensor([
# eos w1 w2
# sentence 1:
[0.0, unk, 0.9, 0.1], # beam 1
[0.0, unk, 0.9, 0.1], # beam 2
# sentence 2:
[0.0, unk, 0.7, 0.3],
[0.0, unk, 0.7, 0.3],
]),
# step 1:
torch.FloatTensor([
# eos w1 w2 prefix
# sentence 1:
[1.0, unk, 0.0, 0.0], # w1: 0.9 (emit: w1 <eos>: 0.9*1.0)
[0.0, unk, 0.9, 0.1], # w2: 0.1
# sentence 2:
[0.25, unk, 0.35, 0.4], # w1: 0.7 (don't emit: w1 <eos>: 0.7*0.25)
[0.00, unk, 0.10, 0.9], # w2: 0.3
]),
# step 2:
torch.FloatTensor([
# eos w1 w2 prefix
# sentence 1:
[0.0, unk, 0.1, 0.9], # w2 w1: 0.1*0.9
[0.6, unk, 0.2, 0.2], # w2 w2: 0.1*0.1 (emit: w2 w2 <eos>: 0.1*0.1*0.6)
# sentence 2:
[0.60, unk, 0.4, 0.00], # w1 w2: 0.7*0.4 (emit: w1 w2 <eos>: 0.7*0.4*0.6)
[0.01, unk, 0.0, 0.99], # w2 w2: 0.3*0.9
]),
# step 3:
torch.FloatTensor([
# eos w1 w2 prefix
# sentence 1:
[1.0, unk, 0.0, 0.0], # w2 w1 w2: 0.1*0.9*0.9 (emit: w2 w1 w2 <eos>: 0.1*0.9*0.9*1.0)
[1.0, unk, 0.0, 0.0], # w2 w1 w1: 0.1*0.9*0.1 (emit: w2 w1 w1 <eos>: 0.1*0.9*0.1*1.0)
# sentence 2:
[0.1, unk, 0.5, 0.4], # w2 w2 w2: 0.3*0.9*0.99 (emit: w2 w2 w2 <eos>: 0.3*0.9*0.99*0.1)
[1.0, unk, 0.0, 0.0], # w1 w2 w1: 0.7*0.4*0.4 (emit: w1 w2 w1 <eos>: 0.7*0.4*0.4*1.0)
]),
]
self.model = test_utils.TestModel.build_model(args, d, d)
def test_with_normalization(self):
generator = SequenceGenerator([self.model])
hypos = generator.generate(self.src_tokens, self.src_lengths, beam_size=2)
eos, w1, w2 = self.eos, self.w1, self.w2
# sentence 1, beam 1
self.assertHypoTokens(hypos[0][0], [w1, eos])
self.assertHypoScore(hypos[0][0], [0.9, 1.0])
# sentence 1, beam 2
self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0])
# sentence 2, beam 1
self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos])
self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0])
# sentence 2, beam 2
self.assertHypoTokens(hypos[1][1], [w1, w2, eos])
self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6])
def test_without_normalization(self):
# Sentence 1: unchanged from the normalized case
# Sentence 2: beams swap order
generator = SequenceGenerator([self.model], normalize_scores=False)
hypos = generator.generate(self.src_tokens, self.src_lengths, beam_size=2)
eos, w1, w2 = self.eos, self.w1, self.w2
# sentence 1, beam 1
self.assertHypoTokens(hypos[0][0], [w1, eos])
self.assertHypoScore(hypos[0][0], [0.9, 1.0], normalized=False)
# sentence 1, beam 2
self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0], normalized=False)
# sentence 2, beam 1
self.assertHypoTokens(hypos[1][0], [w1, w2, eos])
self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6], normalized=False)
# sentence 2, beam 2
self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos])
self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0], normalized=False)
def test_with_lenpen_favoring_short_hypos(self):
lenpen = 0.6
generator = SequenceGenerator([self.model], len_penalty=lenpen)
hypos = generator.generate(self.src_tokens, self.src_lengths, beam_size=2)
eos, w1, w2 = self.eos, self.w1, self.w2
# sentence 1, beam 1
self.assertHypoTokens(hypos[0][0], [w1, eos])
self.assertHypoScore(hypos[0][0], [0.9, 1.0], lenpen=lenpen)
# sentence 1, beam 2
self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0], lenpen=lenpen)
# sentence 2, beam 1
self.assertHypoTokens(hypos[1][0], [w1, w2, eos])
self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6], lenpen=lenpen)
# sentence 2, beam 2
self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos])
self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0], lenpen=lenpen)
def test_with_lenpen_favoring_long_hypos(self):
lenpen = 5.0
generator = SequenceGenerator([self.model], len_penalty=lenpen)
hypos = generator.generate(self.src_tokens, self.src_lengths, beam_size=2)
eos, w1, w2 = self.eos, self.w1, self.w2
# sentence 1, beam 1
self.assertHypoTokens(hypos[0][0], [w2, w1, w2, eos])
self.assertHypoScore(hypos[0][0], [0.1, 0.9, 0.9, 1.0], lenpen=lenpen)
# sentence 1, beam 2
self.assertHypoTokens(hypos[0][1], [w1, eos])
self.assertHypoScore(hypos[0][1], [0.9, 1.0], lenpen=lenpen)
# sentence 2, beam 1
self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos])
self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0], lenpen=lenpen)
# sentence 2, beam 2
self.assertHypoTokens(hypos[1][1], [w1, w2, eos])
self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6], lenpen=lenpen)
def test_maxlen(self):
generator = SequenceGenerator([self.model], maxlen=2)
hypos = generator.generate(self.src_tokens, self.src_lengths, beam_size=2)
eos, w1, w2 = self.eos, self.w1, self.w2
# sentence 1, beam 1
self.assertHypoTokens(hypos[0][0], [w1, eos])
self.assertHypoScore(hypos[0][0], [0.9, 1.0])
# sentence 1, beam 2
self.assertHypoTokens(hypos[0][1], [w2, w2, eos])
self.assertHypoScore(hypos[0][1], [0.1, 0.1, 0.6])
# sentence 2, beam 1
self.assertHypoTokens(hypos[1][0], [w1, w2, eos])
self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6])
# sentence 2, beam 2
self.assertHypoTokens(hypos[1][1], [w2, w2, eos])
self.assertHypoScore(hypos[1][1], [0.3, 0.9, 0.01])
def test_no_stop_early(self):
generator = SequenceGenerator([self.model], stop_early=False)
hypos = generator.generate(self.src_tokens, self.src_lengths, beam_size=2)
eos, w1, w2 = self.eos, self.w1, self.w2
# sentence 1, beam 1
self.assertHypoTokens(hypos[0][0], [w1, eos])
self.assertHypoScore(hypos[0][0], [0.9, 1.0])
# sentence 1, beam 2
self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos])
self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0])
# sentence 2, beam 1
self.assertHypoTokens(hypos[1][0], [w2, w2, w2, w2, eos])
self.assertHypoScore(hypos[1][0], [0.3, 0.9, 0.99, 0.4, 1.0])
# sentence 2, beam 2
self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos])
self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0])
def assertHypoTokens(self, hypo, tokens):
self.assertTensorEqual(hypo['tokens'], torch.LongTensor(tokens))
def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.):
pos_scores = torch.FloatTensor(pos_probs).log()
self.assertAlmostEqual(hypo['positional_scores'], pos_scores)
self.assertEqual(pos_scores.numel(), hypo['tokens'].numel())
score = pos_scores.sum()
if normalized:
score /= pos_scores.numel()**lenpen
self.assertLess(abs(score - hypo['score']), 1e-6)
def assertAlmostEqual(self, t1, t2):
self.assertEqual(t1.size(), t2.size(), "size mismatch")
self.assertLess((t1 - t2).abs().max(), 1e-4)
def assertTensorEqual(self, t1, t2):
self.assertEqual(t1.size(), t2.size(), "size mismatch")
self.assertEqual(t1.ne(t2).long().sum(), 0)
if __name__ == '__main__':
unittest.main()
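The score assertions above encode the generator's scoring rule: a hypothesis score is the sum of per-step log-probabilities, divided by len**lenpen when normalize_scores is on, so lenpen < 1 favors short hypotheses and lenpen > 1 favors long ones (as the two lenpen tests demonstrate). A direct transcription of assertHypoScore's arithmetic as a standalone helper:

import torch

def hypo_score(pos_probs, normalized=True, lenpen=1.0):
    # sum of per-position log-probs, optionally length-normalized
    pos_scores = torch.FloatTensor(pos_probs).log()
    score = pos_scores.sum()
    if normalized:
        score /= pos_scores.numel() ** lenpen
    return score

# sentence 1, beam 1 from the tests: [w1, eos] with probs [0.9, 1.0]
print(hypo_score([0.9, 1.0]))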
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import unittest
import torch
from torch.autograd import Variable
from fairseq import utils
class TestUtils(unittest.TestCase):
def test_convert_padding_direction(self):
pad = 1
left_pad = torch.LongTensor([
[2, 3, 4, 5, 6],
[1, 7, 8, 9, 10],
[1, 1, 1, 11, 12],
])
right_pad = torch.LongTensor([
[2, 3, 4, 5, 6],
[7, 8, 9, 10, 1],
[11, 12, 1, 1, 1],
])
lengths = torch.LongTensor([5, 4, 2])
self.assertAlmostEqual(
right_pad,
utils.convert_padding_direction(
left_pad,
lengths,
pad,
left_to_right=True,
),
)
self.assertAlmostEqual(
left_pad,
utils.convert_padding_direction(
right_pad,
lengths,
pad,
right_to_left=True,
),
)
def test_make_variable(self):
t = [{'k': torch.rand(5, 5)}]
v = utils.make_variable(t)[0]['k']
self.assertTrue(isinstance(v, Variable))
self.assertFalse(v.data.is_cuda)
v = utils.make_variable(t, cuda=True)[0]['k']
self.assertEqual(v.data.is_cuda, torch.cuda.is_available())
def assertAlmostEqual(self, t1, t2):
self.assertEqual(t1.size(), t2.size(), "size mismatch")
self.assertLess((t1 - t2).abs().max(), 1e-4)
if __name__ == '__main__':
unittest.main()