Commit e734b0fa authored by Sergey Edunov

Initial commit
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from .fconv import *
__all__ = [
'fconv', 'fconv_iwslt_de_en', 'fconv_wmt_en_ro', 'fconv_wmt_en_de',
'fconv_wmt_en_fr',
]
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq.modules import BeamableMM, LinearizedConvolution
class FConvModel(nn.Module):
def __init__(self, encoder, decoder, padding_idx=1):
super(FConvModel, self).__init__()
self.encoder = encoder
self.decoder = decoder
self.encoder.num_attention_layers = sum([layer is not None for layer in decoder.attention])
self.padding_idx = padding_idx
self._is_generation_fast = False
def forward(self, src_tokens, src_positions, input_tokens, input_positions):
encoder_out = self.encoder(src_tokens, src_positions)
decoder_out = self.decoder(input_tokens, input_positions, encoder_out)
return decoder_out.view(-1, decoder_out.size(-1))
def make_generation_fast_(self, beam_size, use_beamable_mm=False):
"""Optimize model for faster generation.
Optimizations include:
- remove WeightNorm
- (optionally) use BeamableMM in attention layers
The optimized model should not be used again for training.
Note: this can be combined with incremental inference in the Decoder for
even faster generation.
"""
if self._is_generation_fast:
return # only apply once
self._is_generation_fast = True
# remove weight norm from all modules in the network
def remove_weight_norm(m):
try:
nn.utils.remove_weight_norm(m)
except ValueError: # this module didn't have weight norm
return
self.apply(remove_weight_norm)
# use BeamableMM in attention layers
if use_beamable_mm:
self.decoder._use_beamable_mm(beam_size)
        def train(mode=True):
if mode:
raise RuntimeError('cannot train after make_generation_fast')
# this model should no longer be used for training
self.eval()
self.train = train
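
# --- Added usage sketch (exposition only, not part of the original file) ---
# Assuming `model` is a trained FConvModel, the optimized generation mode
# described in make_generation_fast_ would typically be enabled once before
# beam search:
#
#     model.make_generation_fast_(beam_size=5, use_beamable_mm=True)
#     # weight norm is removed, attention uses BeamableMM instead of torch.bmm,
#     # and any later call to model.train(True) raises RuntimeError
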
class Encoder(nn.Module):
"""Convolutional encoder"""
def __init__(self, num_embeddings, embed_dim=512, max_positions=1024,
convolutions=((512, 3),) * 20, dropout=0.1, padding_idx=1):
super(Encoder, self).__init__()
self.dropout = dropout
self.num_attention_layers = None
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
self.embed_positions = Embedding(max_positions, embed_dim, padding_idx)
in_channels = convolutions[0][0]
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
for (out_channels, kernel_size) in convolutions:
pad = (kernel_size - 1) // 2
self.projections.append(Linear(in_channels, out_channels)
if in_channels != out_channels else None)
self.convolutions.append(
ConvTBC(in_channels, out_channels * 2, kernel_size, padding=pad,
dropout=dropout))
in_channels = out_channels
self.fc2 = Linear(in_channels, embed_dim)
def forward(self, tokens, positions):
# embed tokens and positions
x = self.embed_tokens(tokens) + self.embed_positions(positions)
x = F.dropout(x, p=self.dropout, training=self.training)
input_embedding = x
# project to size of convolution
x = self.fc1(x)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# temporal convolutions
for proj, conv in zip(self.projections, self.convolutions):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
x = F.glu(x, dim=-1)
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# project back to size of embedding
x = self.fc2(x)
# scale gradients (this only affects backward, not forward)
x = grad_multiply(x, 1.0 / (2.0 * self.num_attention_layers))
# add output to input embedding for attention
y = (x + input_embedding) * math.sqrt(0.5)
return x, y
class AttentionLayer(nn.Module):
def __init__(self, conv_channels, embed_dim, bmm=None):
super(AttentionLayer, self).__init__()
# projects from output of convolution to embedding dimension
self.in_projection = Linear(conv_channels, embed_dim)
# projects from embedding dimension to convolution size
self.out_projection = Linear(embed_dim, conv_channels)
self.bmm = bmm if bmm is not None else torch.bmm
def forward(self, x, target_embedding, encoder_out):
residual = x
# attention
x = (self.in_projection(x) + target_embedding) * math.sqrt(0.5)
x = self.bmm(x, encoder_out[0])
# softmax over last dim
sz = x.size()
x = F.softmax(x.view(sz[0] * sz[1], sz[2]))
x = x.view(sz)
attn_scores = x
x = self.bmm(x, encoder_out[1])
# scale attention output
s = encoder_out[1].size(1)
x = x * (s * math.sqrt(1.0 / s))
# project back
x = (self.out_projection(x) + residual) * math.sqrt(0.5)
return x, attn_scores
class Decoder(nn.Module):
"""Convolutional decoder"""
def __init__(self, num_embeddings, embed_dim=512, out_embed_dim=256,
max_positions=1024, convolutions=((512, 3),) * 20,
attention=True, dropout=0.1, padding_idx=1):
super(Decoder, self).__init__()
self.dropout = dropout
in_channels = convolutions[0][0]
if isinstance(attention, bool):
# expand True into [True, True, ...] and do the same with False
attention = [attention] * len(convolutions)
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
self.embed_positions = Embedding(max_positions, embed_dim, padding_idx)
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.attention = nn.ModuleList()
for i, (out_channels, kernel_size) in enumerate(convolutions):
pad = kernel_size - 1
self.projections.append(Linear(in_channels, out_channels)
if in_channels != out_channels else None)
self.convolutions.append(
LinearizedConv1d(in_channels, out_channels * 2, kernel_size,
padding=pad, dropout=dropout))
self.attention.append(AttentionLayer(out_channels, embed_dim)
if attention[i] else None)
in_channels = out_channels
self.fc2 = Linear(in_channels, out_embed_dim)
self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
self._is_inference_incremental = False
def forward(self, tokens, positions, encoder_out):
# embed tokens and positions
x = self.embed_tokens(tokens) + self.embed_positions(positions)
x = F.dropout(x, p=self.dropout, training=self.training)
target_embedding = x
# project to size of convolution
x = self.fc1(x)
# transpose only once to speed up attention layers
encoder_a, encoder_b = encoder_out
encoder_a = encoder_a.transpose(1, 2).contiguous()
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# temporal convolutions
for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
x = conv.remove_future_timesteps(x)
x = F.glu(x)
# attention
if attention is not None:
x = x.transpose(1, 0)
x, _ = attention(x, target_embedding, (encoder_a, encoder_b))
x = x.transpose(1, 0)
# residual
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# project back to size of vocabulary
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = self.fc3(x)
return x
def context_size(self):
"""Maximum number of input elements each output element depends on"""
context = 1
for conv in self.convolutions:
context += conv.kernel_size[0] - 1
return context
def incremental_inference(self):
"""Context manager for incremental inference.
This provides an optimized forward pass for incremental inference
(i.e., it predicts one time step at a time). If the input order changes
between time steps, call model.decoder.reorder_incremental_state to
update the relevant buffers. To generate a fresh sequence, first call
model.decoder.clear_incremental_state.
Usage:
```
with model.decoder.incremental_inference():
for step in range(maxlen):
out = model.decoder(tokens[:, :step], positions[:, :step],
encoder_out)
probs = F.log_softmax(out[:, -1, :])
```
"""
class IncrementalInference(object):
def __init__(self, decoder):
self.decoder = decoder
def __enter__(self):
self.decoder._start_incremental_inference()
def __exit__(self, *args):
self.decoder._stop_incremental_inference()
return IncrementalInference(self)
def _start_incremental_inference(self):
assert not self._is_inference_incremental, \
'already performing incremental inference'
self._is_inference_incremental = True
# save original forward and convolution layers
self._orig_forward = self.forward
self._orig_conv = self.convolutions
# switch to incremental forward
self.forward = self._incremental_forward
# start a fresh sequence
self.clear_incremental_state()
def _stop_incremental_inference(self):
# restore original forward and convolution layers
self.forward = self._orig_forward
self.convolutions = self._orig_conv
self._is_inference_incremental = False
def _incremental_forward(self, tokens, positions, encoder_out):
assert self._is_inference_incremental
# setup initial state
if self.prev_state is None:
# transpose encoder output once to speed up attention layers
encoder_a, encoder_b = encoder_out
encoder_a = encoder_a.transpose(1, 2).contiguous()
self.prev_state = {
'encoder_out': (encoder_a, encoder_b),
}
# load previous state
encoder_a, encoder_b = self.prev_state['encoder_out']
# keep only the last token for incremental forward pass
tokens = tokens[:, -1:]
positions = positions[:, -1:]
# embed tokens and positions
x = self.embed_tokens(tokens) + self.embed_positions(positions)
target_embedding = x
# project to size of convolution
x = self.fc1(x)
# temporal convolutions
avg_attn_scores = None
num_attn_layers = len(self.attention)
for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
residual = x if proj is None else proj(x)
x = conv.incremental_forward(x)
x = F.glu(x)
# attention
if attention is not None:
x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b))
attn_scores = attn_scores / num_attn_layers
if avg_attn_scores is None:
avg_attn_scores = attn_scores
else:
avg_attn_scores += attn_scores
# residual
x = (x + residual) * math.sqrt(0.5)
# project back to size of vocabulary
x = self.fc2(x)
x = self.fc3(x)
return x, avg_attn_scores
def clear_incremental_state(self):
"""Clear all state used for incremental generation.
**For incremental inference only**
This should be called before generating a fresh sequence.
"""
if self._is_inference_incremental:
self.prev_state = None
for conv in self.convolutions:
conv.clear_buffer()
def reorder_incremental_state(self, new_order):
"""Reorder buffered internal state (for incremental generation).
**For incremental inference only**
This should be called when the order of the input has changed from the
previous time step. A typical use case is beam search, where the input
order changes between time steps based on the choice of beams.
"""
if self._is_inference_incremental:
for conv in self.convolutions:
conv.reorder_buffer(new_order)
def _use_beamable_mm(self, beam_size):
"""Replace torch.bmm with BeamableMM in attention layers."""
beamable_mm = BeamableMM(beam_size)
for attn in self.attention:
attn.bmm = beamable_mm
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
m.weight.data.normal_(0, 0.1)
return m
def Linear(in_features, out_features, dropout=0):
"""Weight-normalized Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features)
m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
m.bias.data.zero_()
return nn.utils.weight_norm(m)
def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer optimized for decoding"""
m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
m.weight.data.normal_(mean=0, std=std)
m.bias.data.zero_()
return nn.utils.weight_norm(m)
def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer"""
from fairseq.modules import ConvTBC
m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
m.weight.data.normal_(mean=0, std=std)
m.bias.data.zero_()
return nn.utils.weight_norm(m, dim=2)
def grad_multiply(x, scale):
return GradMultiply.apply(x, scale)
class GradMultiply(torch.autograd.Function):
@staticmethod
def forward(ctx, x, scale):
ctx.scale = scale
res = x.new(x)
ctx.mark_shared_storage((x, res))
return res
@staticmethod
def backward(ctx, grad):
return grad * ctx.scale, None
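
# Added example (illustrative sketch, not part of the original file). It checks
# the behaviour the Encoder relies on: grad_multiply passes values through
# unchanged in the forward pass and scales gradients in the backward pass.
# Uses the Variable API targeted by this code base.
def _example_grad_multiply():
    from torch.autograd import Variable
    x = Variable(torch.ones(2, 3), requires_grad=True)
    y = grad_multiply(x, 0.5)
    y.sum().backward()
    # forward output is identical to the input
    assert (y.data - x.data).abs().max() == 0
    # the gradient of sum() is 1 everywhere, scaled down to 0.5
    assert (x.grad.data - 0.5).abs().max() < 1e-6
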
def fconv_iwslt_de_en(dataset, dropout, **kwargs):
encoder_convs = [(256, 3)] * 4
decoder_convs = [(256, 3)] * 3
return fconv(dataset, dropout, 256, encoder_convs, 256, decoder_convs, **kwargs)
def fconv_wmt_en_ro(dataset, dropout, **kwargs):
convs = [(512, 3)] * 20
return fconv(dataset, dropout, 512, convs, 512, convs, **kwargs)
def fconv_wmt_en_de(dataset, dropout, **kwargs):
    convs = [(512, 3)] * 9    # first 9 layers have 512 units
    convs += [(1024, 3)] * 4  # next 4 layers have 1024 units
    convs += [(2048, 1)] * 2  # final 2 layers have 2048 units and kernel width 1
return fconv(dataset, dropout, 768, convs, 768, convs,
decoder_out_embed_dim=512,
**kwargs)
def fconv_wmt_en_fr(dataset, dropout, **kwargs):
    convs = [(512, 3)] * 6    # first 6 layers have 512 units
    convs += [(768, 3)] * 4   # next 4 layers have 768 units
    convs += [(1024, 3)] * 3  # next 3 layers have 1024 units
    convs += [(2048, 1)] * 1  # one layer with 2048 units and kernel width 1
    convs += [(4096, 1)] * 1  # final layer has 4096 units and kernel width 1
return fconv(dataset, dropout, 768, convs, 768, convs,
decoder_out_embed_dim=512,
**kwargs)
def fconv(dataset, dropout, encoder_embed_dim, encoder_convolutions,
decoder_embed_dim, decoder_convolutions, attention=True,
decoder_out_embed_dim=256, max_positions=1024):
padding_idx = dataset.dst_dict.pad()
encoder = Encoder(
len(dataset.src_dict),
embed_dim=encoder_embed_dim,
convolutions=encoder_convolutions,
dropout=dropout,
padding_idx=padding_idx,
max_positions=max_positions)
decoder = Decoder(
len(dataset.dst_dict),
embed_dim=decoder_embed_dim,
convolutions=decoder_convolutions,
out_embed_dim=decoder_out_embed_dim,
attention=attention,
dropout=dropout,
padding_idx=padding_idx,
max_positions=max_positions)
return FConvModel(encoder, decoder, padding_idx)
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from .beamable_mm import *
from .linearized_convolution import *
from .conv_tbc import ConvTBC
__all__ = [
'BeamableMM', 'LinearizedConvolution', 'ConvTBC',
]
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import torch
import torch.nn as nn
class BeamableMM(nn.Module):
"""This module provides an optimized MM for beam decoding with attention.
It leverage the fact that the source-side of the input is replicated beam
times and the target-side of the input is of width one. This layer speeds up
inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)}
with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}.
"""
def __init__(self, beam_size):
super(BeamableMM, self).__init__()
self.beam_size = beam_size
def forward(self, input1, input2):
if (
not self.training and # test mode
self.beam_size > 0 and # beam size is set
input1.dim() == 3 and # only support batched input
input1.size(1) == 1 # single time step update
):
bsz, beam = input1.size(0), self.beam_size
# bsz x 1 x nhu --> bsz/beam x beam x nhu
input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1)
# bsz x sz2 x nhu --> bsz/beam x sz2 x nhu
input2 = input2.unfold(0, beam, beam)[:, :, :, 0]
# use non batched operation if bsz = beam
if input1.size(0) == 1:
output = torch.mm(input1[0, :, :], input2[0, :, :])
else:
output = input1.bmm(input2)
return output.view(bsz, 1, -1)
else:
return input1.bmm(input2)
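
# Added example (illustrative sketch, not part of the original file). It mirrors
# how the sequence generator expands encoder outputs: the source-side input is
# replicated beam_size times along the batch dimension, so BeamableMM in eval
# mode must match a plain batched matmul.
def _example_beamable_mm():
    beam, sents, nhu, src_len = 5, 2, 8, 7
    bsz = sents * beam
    input1 = torch.randn(bsz, 1, nhu)  # one decoder step per hypothesis
    # beam identical copies of each sentence, laid out consecutively
    input2 = torch.randn(sents, nhu, src_len).repeat(1, beam, 1).view(bsz, nhu, src_len)
    mm = BeamableMM(beam)
    mm.eval()  # the optimized path is only taken at test time
    out = mm(input1, input2)
    ref = input1.bmm(input2)
    assert (out - ref).abs().max() < 1e-5
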
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import torch
from torch.autograd import Variable, Function
from torch.nn.modules.utils import _single
try:
from fairseq import temporal_convolution_tbc
except ImportError as e:
import sys
sys.stderr.write('ERROR: missing temporal_convolution_tbc, run `python setup.py install`\n')
raise e
class ConvTBC(torch.nn.Module):
"""1D convolution over an input of shape (time x batch x channel)
The implementation uses gemm to perform the convolution. This implementation
is faster than cuDNN for small kernel sizes.
"""
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
padding=0):
super(ConvTBC, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _single(kernel_size)
self.stride = _single(stride)
self.padding = _single(padding)
assert self.stride == (1,)
self.weight = torch.nn.Parameter(torch.Tensor(
self.kernel_size[0], in_channels, out_channels))
self.bias = torch.nn.Parameter(torch.Tensor(out_channels))
def forward(self, input):
return ConvTBCFunction.apply(
input.contiguous(), self.weight, self.bias, self.padding[0])
def __repr__(self):
s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}'
', padding={padding}')
if self.bias is None:
s += ', bias=False'
s += ')'
return s.format(name=self.__class__.__name__, **self.__dict__)
class ConvTBCFunction(Function):
@staticmethod
def forward(ctx, input, weight, bias, pad):
input_size = input.size()
weight_size = weight.size()
kernel_size = weight_size[0]
output = input.new(
input_size[0] - kernel_size + 1 + pad * 2,
input_size[1],
weight_size[2])
ctx.input_size = input_size
ctx.weight_size = weight_size
ctx.save_for_backward(input, weight)
temporal_convolution_tbc.TemporalConvolutionTBC_forward(
input.type().encode('utf-8'),
input,
output,
weight,
bias)
return output
@staticmethod
def backward(ctx, grad_output):
input, weight = ctx.saved_tensors
grad_output = grad_output.data.contiguous()
grad_input = grad_output.new(ctx.input_size).zero_()
grad_weight = grad_output.new(ctx.weight_size).zero_()
grad_bias = grad_output.new(ctx.weight_size[2])
temporal_convolution_tbc.TemporalConvolutionTBC_backward(
input.type().encode('utf-8'),
grad_output,
grad_input,
grad_weight,
grad_bias,
input,
weight)
grad_input = Variable(grad_input, volatile=True)
grad_weight = Variable(grad_weight, volatile=True)
grad_bias = Variable(grad_bias, volatile=True)
return grad_input, grad_weight, grad_bias, None
def conv_tbc(input, weight, bias=None, stride=1, padding=0):
    return ConvTBCFunction.apply(
        input.contiguous(), weight, bias, _single(padding)[0])
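
# Added example (illustrative sketch, not part of the original file; it assumes
# the compiled temporal_convolution_tbc extension is installed). ConvTBC
# consumes input laid out as (time, batch, channel), unlike nn.Conv1d, which
# expects (batch, channel, time).
def _example_conv_tbc():
    conv = ConvTBC(4, 6, kernel_size=3, padding=1)
    conv.weight.data.normal_(0, 0.1)
    conv.bias.data.zero_()
    x = Variable(torch.randn(10, 2, 4))  # T x B x C_in
    y = conv(x)                          # T x B x C_out
    # with padding = (kernel_size - 1) // 2 the time dimension is preserved
    assert y.size() == (10, 2, 6)
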
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import torch
import torch.nn.functional as F
from .conv_tbc import ConvTBC
class LinearizedConvolution(ConvTBC):
"""An optimized version of nn.Conv1d.
This module replaces convolutions with linear layers as appropriate
and supports optimizations for incremental inference.
"""
def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
super().__init__(in_channels, out_channels, kernel_size, **kwargs)
self.clear_buffer()
self._linearized_weight = None
self.register_backward_hook(self._clear_linearized_weight)
def remove_future_timesteps(self, x):
"""Remove future time steps created by padding."""
if self.kernel_size[0] > 1 and self.padding[0] > 0:
x = x[:-self.padding[0], :, :]
return x
def incremental_forward(self, input):
"""Forward convolution one time step at a time.
This function maintains an internal state to buffer signal and
accepts a single frame as input. If the input order changes
between time steps, call reorder_buffer. To apply to fresh
inputs, call clear_buffer.
"""
if self.training:
raise RuntimeError('LinearizedConvolution only supports inference')
# run forward pre hooks (e.g., weight norm)
for hook in self._forward_pre_hooks.values():
hook(self, input)
# reshape weight
weight = self._get_linearized_weight()
kw = self.kernel_size[0]
bsz = input.size(0) # input: bsz x len x dim
if kw > 1:
input = input.data
if self.input_buffer is None:
self.input_buffer = input.new(bsz, kw, input.size(2))
self.input_buffer.zero_()
else:
# shift buffer
self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone()
# append next input
self.input_buffer[:, -1, :] = input[:, -1, :]
input = torch.autograd.Variable(self.input_buffer, volatile=True)
output = F.linear(input.view(bsz, -1), weight, self.bias)
return output.view(bsz, 1, -1)
def clear_buffer(self):
self.input_buffer = None
def reorder_buffer(self, new_order):
if self.input_buffer is not None:
self.input_buffer = self.input_buffer.index_select(0, new_order)
def _get_linearized_weight(self):
if self._linearized_weight is None:
kw = self.kernel_size[0]
weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous()
assert weight.size() == (self.out_channels, kw, self.in_channels)
self._linearized_weight = weight.view(self.out_channels, -1)
return self._linearized_weight
def _clear_linearized_weight(self, *args):
self._linearized_weight = None
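
# Added example (illustrative sketch, not part of the original file; like the
# rest of this module it needs the compiled ConvTBC extension importable).
# incremental_forward buffers the last kernel_size input frames and applies a
# single F.linear with the flattened kernel, so each new frame costs one matrix
# multiply instead of re-running the convolution over the whole prefix.
def _example_linearized_convolution():
    conv = LinearizedConvolution(4, 6, kernel_size=3, padding=2)
    conv.weight.data.normal_(0, 0.1)
    conv.bias.data.zero_()
    conv.eval()  # incremental_forward is inference-only
    for step in range(5):
        frame = torch.autograd.Variable(torch.randn(2, 1, 4), volatile=True)
        out = conv.incremental_forward(frame)  # bsz x 1 x out_channels
        assert out.size() == (2, 1, 6)
    conv.clear_buffer()  # call before starting a fresh sequence
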
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import os
import signal
import threading
from torch import multiprocessing
class MultiprocessingEventLoop(object):
"""Start a multiprocessing event loop."""
def __init__(self, device_ids=None, multiprocessing_method='spawn'):
super().__init__()
self.device_ids = tuple(device_ids)
self.num_replicas = len(device_ids)
self.rank = None
self._mp = multiprocessing.get_context(multiprocessing_method)
self._start_error_handler()
self._start_multiprocessing()
def call_async(self, rank, action, **kwargs):
"""Asynchronously call a function in each child process.
Call a function named `action` on the rank'th process and return
a Future with the result.
"""
def result_generator():
yield self.return_pipes[rank].recv()
assert not self.return_pipes[rank].poll(), \
'return pipe must be consumed before calling another function'
self.input_pipes[rank].send((action, kwargs))
return Future(result_generator())
def stop(self, interrupt_children=False):
"""Stop multiprocessing."""
for rank in range(self.num_replicas):
self.input_pipes[rank].close()
self.return_pipes[rank].close()
if interrupt_children:
# send KeyboardInterrupt to children
os.kill(self.procs[rank].pid, signal.SIGINT)
else:
self.procs[rank].join()
self.error_queue.put((None, None)) # poison pill
def _start_error_handler(self):
"""Error handler to catch exceptions in child processes."""
# create a thread to listen for errors in the child processes
self.error_queue = self._mp.SimpleQueue()
error_thread = threading.Thread(target=self._error_listener,
daemon=True)
error_thread.start()
# create signal handler that executes in the main process/thread and
# handles errors from child processes
signal.signal(signal.SIGUSR1, self._signal_handler)
def _error_listener(self):
"""A thread that listens for errors in the child processes.
Errors are handled in a signal handler in the main thread.
"""
(rank, original_trace) = self.error_queue.get()
if rank is None: # poison pill, return
return
# requeue error and switch to main thread for handling the error
self.error_queue.put((rank, original_trace))
os.kill(os.getpid(), signal.SIGUSR1)
def _signal_handler(self, signal, frame):
"""Signal handler that handles errors from child processes.
        This signal handler executes in the main process/thread.
"""
self.stop(interrupt_children=True)
(rank, original_trace) = self.error_queue.get()
msg = "\n\n-- Tracebacks above this line can probably be ignored --\n\n"
msg += original_trace
raise Exception(msg)
def _start_multiprocessing(self):
"""Create child processes to run async event loop.
Each process reads input from a Pipe, performs some computation,
and returns its output to another Pipe.
"""
# create child processes
input_pipes = []
return_pipes = []
procs = []
for rank, id in enumerate(self.device_ids):
recv_input_pipe, send_input_pipe = self._mp.Pipe(duplex=False)
recv_return_pipe, send_return_pipe = self._mp.Pipe(duplex=False)
proc = self._mp.Process(
target=self._process_event_loop,
args=(rank, id, recv_input_pipe, send_return_pipe),
daemon=True)
proc.start()
input_pipes.append(send_input_pipe)
return_pipes.append(recv_return_pipe)
procs.append(proc)
self.input_pipes = input_pipes
self.return_pipes = return_pipes
self.procs = procs
def _process_event_loop(self, rank, device_id, input_pipe, return_pipe):
"""Event loop that runs in each child process.
Event loop:
- take an action from the input pipe
- call the corresponding function in this process
- put the return value in the return pipe
Any exceptions are put in the error queue.
"""
self.rank = rank
try:
# event loop
while True:
action, kwargs = input_pipe.recv()
action_fn = getattr(self, action)
return_pipe.send(action_fn(rank, device_id, **kwargs))
except EOFError:
# input pipe was closed, do nothing
pass
except KeyboardInterrupt:
# killed by parent, do nothing
pass
except Exception:
# propagate exception from child to parent process, keeping
# original traceback
import traceback
self.error_queue.put((rank, traceback.format_exc()))
finally:
# cleanup pipes
input_pipe.close()
return_pipe.close()
class Future(object):
"""A wrapper around a Python generator, with syntactic sugar."""
def __init__(self, generator):
self.generator = generator
def gen(self):
return next(self.generator)
@staticmethod
def gen_list(gens):
return [g.gen() for g in gens]
@staticmethod
def gen_tuple_list(gens):
list = [g.gen() for g in gens]
return zip(*list)
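
# Added usage sketch (exposition only, not part of the original file). A
# subclass defines `_async_*` methods taking (rank, device_id, **kwargs); the
# parent process dispatches them with call_async and collects the results
# through Futures, e.g. (mirroring MultiprocessingTrainer):
#
#     futures = [loop.call_async(rank, '_async_train_step', sample=samples[rank])
#                for rank in range(loop.num_replicas)]
#     results = Future.gen_list(futures)  # blocks until every rank has replied
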
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import multiprocessing
import os
import pdb
import sys
class MultiprocessingPdb(pdb.Pdb):
"""A Pdb wrapper that works in a multiprocessing environment.
Usage: `from fairseq import pdb; pdb.set_trace()`
"""
_stdin_fd = sys.stdin.fileno()
_stdin = None
_stdin_lock = multiprocessing.Lock()
def __init__(self):
pdb.Pdb.__init__(self, nosigint=True)
def _cmdloop(self):
stdin_bak = sys.stdin
with self._stdin_lock:
try:
if not self._stdin:
self._stdin = os.fdopen(self._stdin_fd)
sys.stdin = self._stdin
self.cmdloop()
finally:
sys.stdin = stdin_bak
pdb = MultiprocessingPdb()
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
"""
Train a network on multiple GPUs using multiprocessing.
"""
import torch
from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
from fairseq import nccl, utils
from fairseq.criterions import FairseqCriterion
from fairseq.multiprocessing_event_loop import MultiprocessingEventLoop, Future
from fairseq.nag import NAG
class MultiprocessingTrainer(MultiprocessingEventLoop):
"""Main class for multi-GPU training.
Each GPU has a full copy of the model and is assigned to its own Python
process. Gradients are accumulated with all-reduce and all model replicas
are updated synchronously after each batch.
The methods in this class are divided into synchronous functions, which
prepare and dispatch the input to each process, and asynchronous functions
(prefixed with `_async_`), which run on each process in parallel.
"""
def __init__(self, args, model, device_ids=None,
multiprocessing_method='spawn'):
if device_ids is None:
device_ids = tuple(range(torch.cuda.device_count()))
super().__init__(device_ids, multiprocessing_method)
if not torch.cuda.is_available():
raise NotImplementedError('Training on CPU is not supported')
model = model.share_memory()
nccl_uid = nccl.get_unique_id()
Future.gen_list([
self.call_async(rank, '_async_init', args=args, model=model,
nccl_uid=nccl_uid)
for rank in range(self.num_replicas)
])
def _async_init(self, rank, device_id, args, model, nccl_uid):
"""Initialize child processes."""
self.args = args
# set torch.seed in this process
torch.manual_seed(args.seed)
# set CUDA device
torch.cuda.set_device(device_id)
# initialize NCCL
nccl.initialize(self.num_replicas, nccl_uid, device_id)
# copy model to current device
self.model = model.cuda()
# initialize optimizer
self.optimizer = NAG(self.model.parameters(), lr=self.args.lr,
momentum=self.args.momentum,
weight_decay=self.args.weight_decay)
self.flat_grads = None
# initialize LR scheduler
self.lr_scheduler = self._build_lr_scheduler()
def _build_lr_scheduler(self):
if self.args.force_anneal > 0:
def anneal(e):
if e < self.args.force_anneal:
return 1
else:
return self.args.lrshrink ** (e + 1 - self.args.force_anneal)
lr_scheduler = LambdaLR(self.optimizer, anneal)
lr_scheduler.best = None
else:
# decay the LR by 0.1 every time the validation loss plateaus
lr_scheduler = ReduceLROnPlateau(self.optimizer, patience=0)
return lr_scheduler
def get_model(self):
"""Get one of the model replicas."""
# just return the first model, since all replicas are the same
return self.call_async(0, '_async_get_model').gen()
def _async_get_model(self, rank, device_id):
return self.model
def save_checkpoint(self, args, epoch, batch_offset, val_loss=None):
"""Save a checkpoint for the current model."""
self.call_async(0, '_async_save_checkpoint', args=args, epoch=epoch,
batch_offset=batch_offset, val_loss=val_loss).gen()
def _async_save_checkpoint(self, rank, device_id, args, epoch, batch_offset, val_loss):
utils.save_checkpoint(args, epoch, batch_offset, self.model,
self.optimizer, self.lr_scheduler, val_loss)
def load_checkpoint(self, filename):
"""Load a checkpoint into the model replicas in each process."""
results = Future.gen_list([
self.call_async(rank, '_async_load_checkpoint', filename=filename)
for rank in range(self.num_replicas)
])
epoch, batch_offset = results[0]
return epoch, batch_offset
def _async_load_checkpoint(self, rank, device_id, filename):
return utils.load_checkpoint(filename, self.model, self.optimizer,
self.lr_scheduler, cuda_device=device_id)
def train_step(self, samples, criterion):
"""Do forward, backward and gradient step in parallel."""
assert isinstance(criterion, FairseqCriterion)
# scatter sample across GPUs
samples, data_events = self._scatter_samples(samples)
criterion.prepare(samples)
# forward pass, backward pass and gradient step
losses = [
self.call_async(rank, '_async_train_step', sample=samples[rank],
criterion=criterion, data_event=event)
for rank, event in enumerate(data_events)
]
# aggregate losses and gradient norms
losses, grad_norms = Future.gen_tuple_list(losses)
loss = criterion.aggregate(losses)
return loss, grad_norms[0]
def _async_train_step(self, rank, device_id, sample, criterion, data_event):
data_event.wait()
self.model.train()
# zero grads even if net_input is None, since we will all-reduce them
self.optimizer.zero_grad()
# calculate loss and grads
loss = 0
if sample is not None:
net_output = self.model(**sample['net_input'])
loss_ = criterion(net_output, sample)
loss_.backward()
loss = loss_.data[0]
# flatten grads into a contiguous block of memory
if self.flat_grads is None:
self.flat_grads = self._flatten_grads_(self.model)
# all-reduce grads
nccl.all_reduce(self.flat_grads)
# clip grads
grad_norm = self._clip_grads_(self.flat_grads, self.args.clip_norm)
# take an optimization step
self.optimizer.step()
return loss, grad_norm
def _flatten_grads_(self, model):
num_params = sum(p.data.numel() for p in model.parameters())
flat_grads = next(model.parameters()).data.new(num_params)
offset = 0
for p in model.parameters():
grad = p.grad.data
numel, sz = grad.numel(), grad.size()
flat_grads[offset:offset+numel] = grad.view(-1)
grad.set_(flat_grads[offset:offset+numel])
grad.resize_(sz) # preserve original shape
offset += numel
return flat_grads
def _clip_grads_(self, flat_grads, clipv):
norm = flat_grads.norm()
if clipv > 0 and norm > clipv:
coef = max(norm, 1e-6) / clipv
flat_grads.div_(coef)
return norm
def valid_step(self, samples, criterion):
"""Do forward pass in parallel."""
# scatter sample across GPUs
samples, data_events = self._scatter_samples(samples, volatile=True)
criterion.prepare(samples)
# forward pass
losses = [
self.call_async(rank, '_async_valid_step', sample=samples[rank],
criterion=criterion, data_event=event)
for rank, event in enumerate(data_events)
]
# aggregate losses
loss = criterion.aggregate(Future.gen_list(losses))
return loss
def _async_valid_step(self, rank, device_id, sample, criterion, data_event):
if sample is None:
return 0
data_event.wait()
self.model.eval()
net_output = self.model(**sample['net_input'])
loss = criterion(net_output, sample)
return loss.data[0]
def get_lr(self):
"""Get the current learning rate."""
return self.call_async(0, '_async_get_lr').gen()
def _async_get_lr(self, rank, device_id):
return self.optimizer.param_groups[0]['lr']
def lr_step(self, val_loss=None, epoch=None):
"""Adjust the learning rate depending on the validation loss."""
lr = Future.gen_list([
self.call_async(rank, '_async_lr_step', val_loss=val_loss, epoch=epoch)
for rank in range(self.num_replicas)
])
return lr[0]
def _async_lr_step(self, rank, device_id, epoch, val_loss):
# update the learning rate
if self.args.force_anneal > 0:
self.lr_scheduler.step(epoch)
else:
self.lr_scheduler.step(val_loss, epoch)
return self.optimizer.param_groups[0]['lr']
def _scatter_samples(self, samples, volatile=False):
"""Split and distribute a sample across GPUs."""
res = [utils.prepare_sample(sample, volatile=volatile,
cuda_device=device_id)
for sample, device_id in zip(samples, self.device_ids)]
# Pad with None until its size is equal to the number of replicas.
res = res + [None]*(self.num_replicas - len(samples))
# Synchronize GPU devices after data is sent to prevent
# race conditions.
events = []
for d in self.device_ids:
with torch.cuda.device(d):
event = torch.cuda.Event(interprocess=True)
event.record()
events.append(event)
return res, events
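
# Added usage sketch (exposition only, not part of the original file). A typical
# training loop feeds the trainer one sample per GPU and lets it handle
# scattering, all-reduce and the optimizer step; the names below
# (`train_batches`, `valid_batches`, `criterion`) are placeholders:
#
#     trainer = MultiprocessingTrainer(args, model)
#     for samples in train_batches:            # one sample per GPU
#         loss, grad_norm = trainer.train_step(samples, criterion)
#     for samples in valid_batches:
#         val_loss = trainer.valid_step(samples, criterion)
#     trainer.lr_step(val_loss, epoch)
#     trainer.save_checkpoint(args, epoch, batch_offset=0, val_loss=val_loss)
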
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from torch.optim.optimizer import Optimizer, required
class NAG(Optimizer):
def __init__(self, params, lr=required, momentum=0, weight_decay=0):
defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay)
super(NAG, self).__init__(params, defaults)
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = group['momentum']
lr = group['lr']
for p in group['params']:
if p.grad is None:
continue
d_p = p.grad.data
if weight_decay != 0:
d_p.add_(weight_decay, p.data)
param_state = self.state[p]
if 'momentum_buffer' not in param_state:
param_state['momentum_buffer'] = d_p.clone().zero_()
buf = param_state['momentum_buffer']
p.data.add_(momentum * momentum, buf)
p.data.add_(-(1 + momentum) * lr, d_p)
buf.mul_(momentum).add_(-lr, d_p)
return loss
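
# Added example (illustrative sketch, not part of the original file). With
# momentum m, buffer v and gradient g, step() applies
#     p <- p + m*m*v - (1 + m)*lr*g    followed by    v <- m*v - lr*g,
# which is one common way of writing Nesterov accelerated gradient directly on
# the parameters.
def _example_nag():
    import torch
    from torch.autograd import Variable
    p = Variable(torch.Tensor([1.0]), requires_grad=True)
    opt = NAG([p], lr=0.1, momentum=0.9)
    (p * p).sum().backward()  # gradient is 2*p = 2.0
    opt.step()
    # first step: v is 0, so p <- 1.0 - (1 + 0.9) * 0.1 * 2.0 = 0.62
    assert abs(p.data[0] - 0.62) < 1e-6
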
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
"""
A modified version of torch.cuda.nccl.all_reduce for launching kernels on each
GPU separately.
"""
import ctypes
import warnings
lib = None
_uid = None
_rank = None
_num_devices = None
_comm = None
__all__ = ['all_reduce', 'initialize', 'get_unique_id']
def _libnccl():
global lib
if lib is None:
lib = ctypes.cdll.LoadLibrary(None)
if hasattr(lib, 'ncclCommDestroy'):
lib.ncclCommDestroy.restype = None
lib.ncclGetErrorString.restype = ctypes.c_char_p
else:
lib = None
return lib
def is_available(tensors):
devices = set()
for tensor in tensors:
if not tensor.is_contiguous():
return False
if not tensor.is_cuda:
return False
device = tensor.get_device()
if device in devices:
return False
devices.add(device)
if _libnccl() is None:
warnings.warn('NCCL library not found. Check your LD_LIBRARY_PATH')
return False
return True
_communicators = {}
# ncclDataType_t
ncclChar = 0
ncclInt = 1
ncclHalf = 2
ncclFloat = 3
ncclDouble = 4
ncclInt64 = 5
ncclUint64 = 6
# ncclRedOp_t
SUM = 0
PROD = 1
MAX = 2
MIN = 3
nccl_types = {
'torch.cuda.ByteTensor': ncclChar,
'torch.cuda.CharTensor': ncclChar,
'torch.cuda.IntTensor': ncclInt,
'torch.cuda.HalfTensor': ncclHalf,
'torch.cuda.FloatTensor': ncclFloat,
'torch.cuda.DoubleTensor': ncclDouble,
'torch.cuda.LongTensor': ncclInt64,
}
class NcclError(RuntimeError):
def __init__(self, status):
self.status = status
msg = '{0} ({1})'.format(lib.ncclGetErrorString(status), status)
super(NcclError, self).__init__(msg)
class NcclComm(ctypes.c_void_p):
def __del__(self):
lib.ncclCommDestroy(self)
class NcclUniqueId(ctypes.Structure):
_fields_ = [
('internal', ctypes.c_uint8 * 128)
]
def check_error(status):
if status != 0:
raise NcclError(status)
_uids = []
def get_unique_id():
if _libnccl() is None:
raise RuntimeError('Unable to load NCCL library')
uid = NcclUniqueId()
check_error(lib.ncclGetUniqueId(ctypes.byref(uid)))
_uids.append(uid) # Don't allow UIDs to be collected
return uid
def initialize(num_devices, uid, rank):
global _num_devices, _uid, _rank
if _libnccl() is None:
raise RuntimeError('Unable to load NCCL library')
_num_devices = num_devices
if rank != 0:
_uid = NcclUniqueId.from_buffer_copy(uid)
else:
_uid = uid
_rank = rank
def communicator():
global _comm
if _uid is None:
raise RuntimeError('NCCL not initialized')
if _comm is None:
comm = ctypes.c_void_p()
check_error(lib.ncclCommInitRank(
ctypes.byref(comm),
ctypes.c_int(_num_devices),
_uid,
ctypes.c_int(_rank)))
_comm = comm
return _comm
def all_reduce(input, output=None, op=SUM, stream=None):
comm = communicator()
if output is None:
output = input
if stream is not None:
stream = stream.cuda_stream
data_type = nccl_types[input.type()]
check_error(lib.ncclAllReduce(
ctypes.c_void_p(input.data_ptr()),
ctypes.c_void_p(output.data_ptr()),
ctypes.c_size_t(input.numel()),
data_type,
op,
comm,
ctypes.c_void_p(stream)))
return output
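
# Added usage sketch (exposition only, not part of the original file), mirroring
# how multiprocessing_trainer.py drives this module:
#
#     uid = get_unique_id()               # once, in the parent process
#     initialize(num_devices, uid, rank)  # once per child process / GPU
#     all_reduce(flat_grads)              # in-place sum across all GPUs
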
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import argparse
from fairseq import models
def get_parser(desc):
parser = argparse.ArgumentParser(
description='Facebook AI Research Sequence-to-Sequence Toolkit -- ' + desc)
parser.add_argument('--no-progress-bar', action='store_true', help='disable progress bar')
parser.add_argument('--log-interval', type=int, default=1000, metavar='N',
help='log progress every N updates (when progress bar is disabled)')
parser.add_argument('--seed', default=1, type=int, metavar='N',
help='pseudo random number generator seed')
return parser
def add_dataset_args(parser):
group = parser.add_argument_group('Dataset and data loading')
group.add_argument('data', metavar='DIR',
help='path to data directory')
group.add_argument('-s', '--source-lang', default=None, metavar='SRC',
help='source language')
group.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
help='target language')
group.add_argument('-j', '--workers', default=1, type=int, metavar='N',
help='number of data loading workers (default: 1)')
group.add_argument('--max-positions', default=1024, type=int, metavar='N',
help='max number of tokens in the sequence')
return group
def add_optimization_args(parser):
group = parser.add_argument_group('Optimization')
group.add_argument('--lr', '--learning-rate', default=0.25, type=float, metavar='LR',
help='initial learning rate')
group.add_argument('--min-lr', metavar='LR', default=1e-5, type=float,
help='minimum learning rate')
group.add_argument('--force-anneal', '--fa', default=0, type=int, metavar='N',
help='force annealing at specified epoch')
group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
help='force stop training at specified epoch')
group.add_argument('--lrshrink', default=0.1, type=float, metavar='LS',
help='learning rate shrink factor for annealing, lr_new = (lr * lrshrink)')
group.add_argument('--momentum', default=0.99, type=float, metavar='M',
help='momentum factor')
group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
help='clip threshold of gradients')
group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
help='weight decay')
group.add_argument('--sample-without-replacement', default=0, type=int, metavar='N',
help='If bigger than 0, use that number of mini-batches for each epoch,'
                            ' where each sample is drawn randomly without replacement from the'
' dataset')
return group
def add_checkpoint_args(parser):
group = parser.add_argument_group('Checkpointing')
group.add_argument('--save-dir', metavar='DIR', default='checkpoints',
help='path to save checkpoints')
group.add_argument('--restore-file', default='checkpoint_last.pt',
help='filename in save-dir from which to load checkpoint')
group.add_argument('--save-interval', type=int, default=-1,
help='checkpoint every this many batches')
group.add_argument('--no-save', action='store_true',
help='don\'t save models and checkpoints')
group.add_argument('--no-epoch-checkpoints', action='store_true',
help='only store last and best checkpoints')
return group
def add_generation_args(parser):
group = parser.add_argument_group('Generation')
group.add_argument('--beam', default=5, type=int, metavar='N',
help='beam size')
group.add_argument('--nbest', default=1, type=int, metavar='N',
help='number of hypotheses to output')
group.add_argument('--max-len-a', default=0, type=int, metavar='N',
help=('generate sequence of maximum length ax + b, '
'where x is the source length'))
group.add_argument('--max-len-b', default=200, type=int, metavar='N',
help=('generate sequence of maximum length ax + b, '
'where x is the source length'))
group.add_argument('--remove-bpe', action='store_true',
help='remove BPE tokens before scoring')
group.add_argument('--no-early-stop', action='store_true',
help=('continue searching even after finalizing k=beam '
'hypotheses; this is more correct, but increases '
'generation time by 50%%'))
group.add_argument('--unnormalized', action='store_true',
help='compare unnormalized hypothesis scores')
group.add_argument('--cpu', action='store_true', help='generate on CPU')
group.add_argument('--no-beamable-mm', action='store_true',
help='don\'t use BeamableMM in attention layers')
group.add_argument('--lenpen', default=1, type=float,
help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences')
group.add_argument('--unk-replace-dict', default='', type=str,
help='performs unk word replacement')
return group
def add_model_args(parser):
group = parser.add_argument_group('Model configuration')
group.add_argument('--arch', '-a', default='fconv', metavar='ARCH',
choices=models.__all__,
help='model architecture ({})'.format(', '.join(models.__all__)))
group.add_argument('--encoder-embed-dim', default=512, type=int, metavar='N',
help='encoder embedding dimension')
group.add_argument('--encoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
group.add_argument('--decoder-embed-dim', default=512, type=int, metavar='N',
help='decoder embedding dimension')
group.add_argument('--decoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
group.add_argument('--decoder-attention', default='True', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
group.add_argument('--decoder-out-embed-dim', default=256, type=int, metavar='N',
help='decoder output embedding dimension')
group.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
group.add_argument('--label-smoothing', default=0, type=float, metavar='D',
help='epsilon for label smoothing, 0 means no label smoothing')
return group
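
# Added example (illustrative sketch, not part of the original file). The
# helpers above compose a single argparse parser; calling scripts pick the
# groups they need. The data directory below is just a placeholder.
def _example_parser():
    parser = get_parser('Trainer')
    add_dataset_args(parser)
    add_optimization_args(parser)
    add_checkpoint_args(parser)
    add_model_args(parser)
    args = parser.parse_args(['data-bin', '--arch', 'fconv_iwslt_de_en', '--lr', '0.25'])
    print(args.arch, args.lr, args.max_positions)
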
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
"""
Progress bar wrapper around tqdm that handles non-tty outputs.
"""
import sys
from tqdm import tqdm
class progress_bar(tqdm):
enabled = sys.stderr.isatty()
print_interval = 1000
def __new__(cls, *args, **kwargs):
if cls.enabled:
return tqdm(*args, **kwargs)
else:
return simple_progress_bar(cls.print_interval, *args, **kwargs)
class simple_progress_bar(tqdm):
def __init__(self, print_interval, *args, **kwargs):
super(simple_progress_bar, self).__init__(*args, **kwargs)
self.print_interval = print_interval
def __iter__(self):
size = len(self.iterable)
for i, obj in enumerate(self.iterable):
yield obj
if i > 0 and i % self.print_interval == 0:
msg = '{} {:5d} / {:d} {}\n'.format(self.desc, i, size, self.postfix)
sys.stdout.write(msg)
sys.stdout.flush()
@classmethod
def write(cls, s, file=None, end="\n"):
fp = file if file is not None else sys.stdout
fp.write(s)
fp.write(end)
fp.flush()
@staticmethod
def status_printer(file):
def print_status(s):
pass
return print_status
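
# Added example (illustrative sketch, not part of the original file). The
# wrapper is a drop-in replacement for tqdm: on a tty it returns a real tqdm
# bar, otherwise it falls back to simple_progress_bar, which prints a status
# line every print_interval iterations instead of redrawing a bar.
def _example_progress_bar():
    for batch in progress_bar(range(2000), desc='epoch 1'):
        pass  # train on batch
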
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from contextlib import ExitStack
import math
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from fairseq import utils
class SequenceGenerator(object):
def __init__(self, models, dst_dict, beam_size=1, minlen=1, maxlen=200,
stop_early=True, normalize_scores=True, len_penalty=1):
"""Generates translations of a given source sentence.
Args:
min/maxlen: The length of the generated output will be bounded by
minlen and maxlen (not including the end-of-sentence marker).
stop_early: Stop generation immediately after we finalize beam_size
hypotheses, even though longer hypotheses might have better
normalized scores.
normalize_scores: Normalize scores by the length of the output.
"""
self.models = models
self.dict = dst_dict
self.pad = dst_dict.pad()
self.eos = dst_dict.eos()
self.vocab_size = len(dst_dict)
self.beam_size = beam_size
self.minlen = minlen
self.maxlen = maxlen
self.positions = torch.LongTensor(range(self.pad + 1, self.pad + maxlen + 2))
self.decoder_context = models[0].decoder.context_size()
self.stop_early = stop_early
self.normalize_scores = normalize_scores
self.len_penalty = len_penalty
def cuda(self):
for model in self.models:
model.cuda()
self.positions = self.positions.cuda()
return self
def generate_batched_itr(self, data_itr, maxlen_a=0, maxlen_b=200,
cuda_device=None, timer=None):
"""Iterate over a batched dataset and yield individual translations.
Args:
maxlen_a/b: generate sequences of maximum length ax + b,
where x is the source sentence length.
cuda_device: GPU on which to do generation.
timer: StopwatchMeter for timing generations.
"""
def lstrip_pad(tensor):
return tensor[tensor.eq(self.pad).sum():]
for sample in data_itr:
s = utils.prepare_sample(sample, volatile=True, cuda_device=cuda_device)
input = s['net_input']
srclen = input['src_tokens'].size(1)
if timer is not None:
timer.start()
hypos = self.generate(input['src_tokens'], input['src_positions'],
maxlen=(maxlen_a*srclen + maxlen_b))
if timer is not None:
timer.stop(s['ntokens'])
for i, id in enumerate(s['id']):
src = input['src_tokens'].data[i, :]
# remove padding from ref, which appears at the beginning
ref = lstrip_pad(s['target'].data[i, :])
yield id, src, ref, hypos[i]
def generate(self, src_tokens, src_positions, beam_size=None, maxlen=None):
"""Generate a batch of translations."""
with ExitStack() as stack:
for model in self.models:
stack.enter_context(model.decoder.incremental_inference())
return self._generate(src_tokens, src_positions, beam_size, maxlen)
def _generate(self, src_tokens, src_positions, beam_size=None, maxlen=None):
bsz = src_tokens.size(0)
beam_size = beam_size if beam_size is not None else self.beam_size
maxlen = min(maxlen, self.maxlen) if maxlen is not None else self.maxlen
encoder_outs = []
for model in self.models:
model.eval()
model.decoder.clear_incremental_state() # start a fresh sequence
# compute the encoder output and expand to beam size
encoder_out = model.encoder(src_tokens, src_positions)
encoder_out = self._expand_encoder_out(encoder_out, beam_size)
encoder_outs.append(encoder_out)
# initialize buffers
scores = encoder_outs[0][0].data.new(bsz * beam_size).fill_(0)
tokens = src_tokens.data.new(bsz * beam_size, maxlen + 2).fill_(self.pad)
tokens_buf = tokens.clone()
tokens[:, 0] = self.eos
align = src_tokens.data.new(bsz * beam_size, maxlen + 2).fill_(-1)
align_buf = align.clone()
# list of completed sentences
finalized = [[] for i in range(bsz)]
finished = [False for i in range(bsz)]
worst_finalized = [{'idx': None, 'score': float('Inf')} for i in range(bsz)]
num_remaining_sent = bsz
# number of candidate hypos per step
cand_size = 2 * beam_size # 2 x beam size in case half are EOS
# offset arrays for converting between different indexing schemes
bbsz_offsets = (torch.arange(0, bsz)*beam_size).unsqueeze(1).type_as(tokens)
cand_offsets = torch.arange(0, cand_size).type_as(tokens)
# helper function for allocating buffers on the fly
buffers = {}
def buffer(name, type_of=tokens):
if name not in buffers:
buffers[name] = type_of.new()
return buffers[name]
def is_finished(sent):
"""
Check whether we've finished generation for a given sentence, by
comparing the worst score among finalized hypotheses to the best
possible score among unfinalized hypotheses.
"""
assert len(finalized[sent]) <= beam_size
if len(finalized[sent]) == beam_size:
if self.stop_early:
return True
# stop if the best unfinalized score is worse than the worst
# finalized one
bbsz = sent*beam_size
best_unfinalized_score = scores[bbsz:bbsz+beam_size].max()
if self.normalize_scores:
best_unfinalized_score /= maxlen
if worst_finalized[sent]['score'] >= best_unfinalized_score:
return True
return False
def finalize_hypos(step, bbsz_idx, scores):
"""
Finalize the given hypotheses at this step, while keeping the total
number of finalized hypotheses per sentence <= beam_size.
Note: the input must be in the desired finalization order, so that
hypotheses that appear earlier in the input are preferred to those
that appear later.
Args:
step: current time step
bbsz_idx: A vector of indices in the range [0, bsz*beam_size),
indicating which hypotheses to finalize
scores: A vector of the same size as bbsz_idx containing scores
for each hypothesis
"""
assert bbsz_idx.numel() == scores.numel()
norm_scores = scores/math.pow(step+1, self.len_penalty) if self.normalize_scores else scores
sents_seen = set()
for idx, score in zip(bbsz_idx.cpu(), norm_scores.cpu()):
sent = idx // beam_size
sents_seen.add(sent)
def get_hypo():
hypo = tokens[idx, 1:step+2].clone()
hypo[step] = self.eos
alignment = align[idx, 1:step+2].clone()
return {
'tokens': hypo,
'score': score,
'alignment': alignment,
}
if len(finalized[sent]) < beam_size:
finalized[sent].append(get_hypo())
elif score > worst_finalized[sent]['score']:
# replace worst hypo for this sentence with new/better one
worst_idx = worst_finalized[sent]['idx']
finalized[sent][worst_idx] = get_hypo()
# find new worst finalized hypo for this sentence
idx, s = min(enumerate(finalized[sent]), key=lambda r: r[1]['score'])
worst_finalized[sent] = {
'score': s['score'],
'idx': idx,
}
# return number of hypotheses finished this step
num_finished = 0
for sent in sents_seen:
# check termination conditions for this sentence
if not finished[sent] and is_finished(sent):
finished[sent] = True
num_finished += 1
return num_finished
reorder_state = None
for step in range(maxlen + 1): # one extra step for EOS marker
# reorder decoder internal states based on the prev choice of beams
if reorder_state is not None:
for model in self.models:
model.decoder.reorder_incremental_state(reorder_state)
probs, avg_attn_scores = self._decode(tokens[:, :step+1], encoder_outs)
if step == 0:
# at the first step all hypotheses are equally likely, so use
# only the first beam
probs = probs.unfold(0, 1, beam_size).squeeze(2).contiguous()
else:
# make probs contain cumulative scores for each hypothesis
probs.add_(scores.view(-1, 1))
# record alignment to source tokens, based on attention
_ignore_scores = buffer('_ignore_scores', type_of=scores)
avg_attn_scores.topk(1, out=(_ignore_scores, align[:, step+1].unsqueeze(1)))
# take the best 2 x beam_size predictions. We'll choose the first
# beam_size of these which don't predict eos to continue with.
cand_scores = buffer('cand_scores', type_of=scores)
cand_indices = buffer('cand_indices')
cand_beams = buffer('cand_beams')
probs.view(bsz, -1).topk(cand_size, out=(cand_scores, cand_indices))
torch.div(cand_indices, self.vocab_size, out=cand_beams)
cand_indices.fmod_(self.vocab_size)
# cand_bbsz_idx contains beam indices for the top candidate
# hypotheses, with a range of values: [0, bsz*beam_size),
# and dimensions: [bsz, cand_size]
cand_bbsz_idx = cand_beams.add_(bbsz_offsets)
# finalize hypotheses that end in eos
eos_mask = cand_indices.eq(self.eos)
if step >= self.minlen:
eos_bbsz_idx = buffer('eos_bbsz_idx')
cand_bbsz_idx.masked_select(eos_mask, out=eos_bbsz_idx)
if eos_bbsz_idx.numel() > 0:
eos_scores = buffer('eos_scores', type_of=scores)
cand_scores.masked_select(eos_mask, out=eos_scores)
num_remaining_sent -= finalize_hypos(step, eos_bbsz_idx, eos_scores)
assert num_remaining_sent >= 0
if num_remaining_sent == 0:
break
# set active_mask so that values > cand_size indicate eos hypos
# and values < cand_size indicate candidate active hypos.
# After, the min values per row are the top candidate active hypos
active_mask = buffer('active_mask')
torch.add((eos_mask*cand_size).type_as(cand_offsets), cand_offsets,
out=active_mask)
# get the top beam_size active hypotheses, which are just the hypos
# with the smallest values in active_mask
active_hypos, _ignore = buffer('active_hypos'), buffer('_ignore')
active_mask.topk(beam_size, 1, largest=False, out=(_ignore, active_hypos))
active_bbsz_idx = buffer('active_bbsz_idx')
cand_bbsz_idx.gather(1, active_hypos, out=active_bbsz_idx)
active_scores = cand_scores.gather(1, active_hypos,
out=scores.view(bsz, beam_size))
active_bbsz_idx = active_bbsz_idx.view(-1)
active_scores = active_scores.view(-1)
# finalize all active hypotheses once we hit maxlen
# finalize_hypos will take care of adding the EOS markers
if step == maxlen:
num_remaining_sent -= finalize_hypos(step, active_bbsz_idx, active_scores)
assert num_remaining_sent == 0
break
# copy tokens for active hypotheses
torch.index_select(tokens[:, :step+1], dim=0, index=active_bbsz_idx,
out=tokens_buf[:, :step+1])
cand_indices.gather(1, active_hypos,
out=tokens_buf.view(bsz, beam_size, -1)[:, :, step+1])
# copy attention/alignment for active hypotheses
torch.index_select(align[:, :step+2], dim=0, index=active_bbsz_idx,
out=align_buf[:, :step+2])
# swap buffers
old_tokens = tokens
tokens = tokens_buf
tokens_buf = old_tokens
old_align = align
align = align_buf
align_buf = old_align
# reorder incremental state in decoder
reorder_state = active_bbsz_idx
# sort by score descending
for sent in range(bsz):
finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True)
return finalized
def _decode(self, tokens, encoder_outs):
length = tokens.size(1)
# repeat the first length positions to fill batch
        positions = self.positions[:length].view(1, length).expand_as(tokens)
# wrap in Variables
tokens = Variable(tokens, volatile=True)
positions = Variable(positions, volatile=True)
avg_probs = None
avg_attn = None
for model, encoder_out in zip(self.models, encoder_outs):
decoder_out, attn = model.decoder(tokens, positions, encoder_out)
probs = F.softmax(decoder_out[:, -1, :]).data
attn = attn[:, -1, :].data
if avg_probs is None or avg_attn is None:
avg_probs = probs
avg_attn = attn
else:
avg_probs.add_(probs)
avg_attn.add_(attn)
avg_probs.div_(len(self.models))
avg_probs.log_()
avg_attn.div_(len(self.models))
return avg_probs, avg_attn
def _expand_encoder_out(self, encoder_out, beam_size):
res = []
for tensor in encoder_out:
res.append(
# repeat beam_size times along second dimension
                tensor.repeat(1, beam_size, *[1 for i in range(tensor.dim() - 2)])
                # then collapse into [bsz*beam, ...original dims...]
                .view(-1, *tensor.size()[1:])
)
return tuple(res)
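# A minimal standalone sketch (toy shapes, illustrative only) of the repeat-then-view
# expansion performed in _expand_encoder_out above: each batch row is tiled beam_size
# times along the batch dimension so that every beam hypothesis sees its own copy of
# the encoder output.
if __name__ == '__main__':
    bsz, src_len, dim, beam_size = 2, 5, 4, 3
    enc = torch.randn(bsz, src_len, dim)
    expanded = enc.repeat(1, beam_size, 1).view(bsz * beam_size, src_len, dim)
    assert torch.equal(expanded[0], enc[0])          # copies of sentence 0 come first ...
    assert torch.equal(expanded[beam_size], enc[1])  # ... followed by copies of sentence 1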
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import re
import torch
from fairseq import dictionary
def tokenize_line(line):
line = re.sub(r"\t", "", line)
line = re.sub(r"^\s+", "", line)
line = re.sub(r"\s+$", "", line)
line = re.sub(r"\s+", " ", line)
return line.split()
class Tokenizer:
@staticmethod
def build_dictionary(filename, tokenize=tokenize_line):
dict = dictionary.Dictionary()
Tokenizer.add_file_to_dictionary(filename, dict, tokenize)
dict.finalize()
return dict
@staticmethod
def add_file_to_dictionary(filename, dict, tokenize):
with open(filename, 'r') as f:
for line in f.readlines():
for word in tokenize(line):
dict.add_symbol(word)
dict.add_symbol(dict.eos_word)
@staticmethod
def binarize(filename, dict, consumer, tokenize=tokenize_line):
nseq, ntok, nunk = 0, 0, 0
replaced = {}
with open(filename, 'r') as f:
for line in f.readlines():
words = tokenize(line)
nwords = len(words)
ids = torch.IntTensor(nwords + 1)
nseq = nseq + 1
for i in range(0, len(words)):
word = words[i]
idx = dict.index(word)
if idx == dict.unk_index and word != dict.unk_word:
nunk = nunk + 1
if word in replaced:
replaced[word] = replaced[word] + 1
else:
replaced[word] = 1
ids[i] = idx
ids[nwords] = dict.eos_index
consumer(ids)
ntok = ntok + len(ids)
return {'nseq': nseq, 'nunk': nunk, 'ntok': ntok, 'replaced': len(replaced)}
@staticmethod
def tokenize(line, dict, tokenize=tokenize_line, add_if_not_exist=True):
words = tokenize(line)
nwords = len(words)
ids = torch.IntTensor(nwords + 1)
for i in range(0, len(words)):
if add_if_not_exist:
ids[i] = dict.add_symbol(words[i])
else:
ids[i] = dict.index(words[i])
ids[nwords] = dict.eos_index
return ids
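# A small usage sketch (illustrative only; the input string is made up): tokenize_line
# normalizes whitespace, and Tokenizer.tokenize maps each word to a dictionary index
# and appends EOS, so the result has one more entry than there are words.
if __name__ == '__main__':
    d = dictionary.Dictionary()
    words = tokenize_line('hello   world \t !')
    ids = Tokenizer.tokenize('hello   world \t !', d, add_if_not_exist=True)
    assert len(ids) == len(words) + 1       # trailing slot holds EOS
    assert ids[len(words)] == d.eos_index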
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import os
import torch
from torch.autograd import Variable
from torch.serialization import default_restore_location
from fairseq import criterions, data, models
def build_model(args, dataset):
if args.arch == 'fconv':
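        # the layer/attention arguments arrive as Python-literal strings (e.g.
        # '[(512, 3)] * 20' for a list of (out_channels, kernel_size) tuples) and are
        # evaluated here into the structures expected by models.fconv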
encoder_layers = eval(args.encoder_layers)
decoder_layers = eval(args.decoder_layers)
decoder_attention = eval(args.decoder_attention)
model = models.fconv(
dataset, args.dropout, args.encoder_embed_dim, encoder_layers,
args.decoder_embed_dim, decoder_layers, decoder_attention,
decoder_out_embed_dim=args.decoder_out_embed_dim,
max_positions=args.max_positions)
else:
model = models.__dict__[args.arch](dataset, args.dropout,
max_positions=args.max_positions)
return model
def build_criterion(args, dataset):
padding_idx = dataset.dst_dict.pad()
if args.label_smoothing > 0:
return criterions.LabelSmoothedCrossEntropyCriterion(args.label_smoothing, padding_idx)
else:
return criterions.CrossEntropyCriterion(padding_idx)
def torch_persistent_save(*args, **kwargs):
    # retry a few times to survive transient I/O failures; re-raise on the final attempt
    for i in range(3):
        try:
            return torch.save(*args, **kwargs)
        except Exception:
            if i == 2:
                raise
def save_checkpoint(args, epoch, batch_offset, model, optimizer, lr_scheduler, val_loss=None):
state_dict = {
'args': args,
'epoch': epoch,
'batch_offset': batch_offset,
'model': model.state_dict(),
'optimizer': optimizer.state_dict(),
'best_loss': lr_scheduler.best,
'val_loss': val_loss,
}
if batch_offset == 0:
if not args.no_epoch_checkpoints:
epoch_filename = os.path.join(args.save_dir, 'checkpoint{}.pt'.format(epoch))
torch_persistent_save(state_dict, epoch_filename)
assert val_loss is not None
if not hasattr(save_checkpoint, 'best') or val_loss < save_checkpoint.best:
save_checkpoint.best = val_loss
best_filename = os.path.join(args.save_dir, 'checkpoint_best.pt')
torch_persistent_save(state_dict, best_filename)
last_filename = os.path.join(args.save_dir, 'checkpoint_last.pt')
torch_persistent_save(state_dict, last_filename)
def load_checkpoint(filename, model, optimizer, lr_scheduler, cuda_device=None):
if not os.path.exists(filename):
return 1, 0
if cuda_device is None:
state = torch.load(filename)
else:
state = torch.load(
filename,
map_location=lambda s, l: default_restore_location(s, 'cuda:{}'.format(cuda_device))
)
model.load_state_dict(state['model'])
optimizer.load_state_dict(state['optimizer'])
lr_scheduler.best = state['best_loss']
epoch = state['epoch'] + 1
batch_offset = state['batch_offset']
gpu_str = ' on GPU #{}'.format(cuda_device) if cuda_device is not None else ''
print('| loaded checkpoint {} (epoch {}){}'.format(filename, epoch, gpu_str))
return epoch, batch_offset
def load_ensemble_for_inference(models, data_path):
# load model architectures and weights
states = []
for model in models:
if not os.path.exists(model):
raise IOError('Model file not found: ' + model)
states.append(
torch.load(model, map_location=lambda s, l: default_restore_location(s, 'cpu'))
)
# load dataset
args = states[0]['args']
dataset = data.load(data_path, args.source_lang, args.target_lang)
# build models
models = []
for state in states:
model = build_model(args, dataset)
model.load_state_dict(state['model'])
models.append(model)
return models, dataset
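# For reference, the checkpoints written by save_checkpoint (and read back by
# load_checkpoint and load_ensemble_for_inference) are ordinary torch.save dicts with
# the keys below; the scalar values shown are illustrative:
#
#     {
#         'args': args,                         # argparse.Namespace with model settings
#         'epoch': 3,
#         'batch_offset': 0,
#         'model': model.state_dict(),
#         'optimizer': optimizer.state_dict(),
#         'best_loss': lr_scheduler.best,
#         'val_loss': 1.23,
#     }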
def prepare_sample(sample, volatile=False, cuda_device=None):
"""Wrap input tensors in Variable class."""
def make_variable(tensor):
if cuda_device is not None and torch.cuda.is_available():
tensor = tensor.cuda(async=True, device=cuda_device)
return Variable(tensor, volatile=volatile)
return {
'id': sample['id'],
'ntokens': sample['ntokens'],
'target': make_variable(sample['target']),
'net_input': {
key: make_variable(sample[key])
for key in ['src_tokens', 'src_positions', 'input_tokens', 'input_positions']
},
}
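# Illustrative call sites for prepare_sample (variable names are hypothetical):
# volatile=True disables autograd history and suits validation/generation, while
# training keeps the default volatile=False:
#
#     sample = prepare_sample(batch, volatile=False, cuda_device=0)      # training on GPU 0
#     sample = prepare_sample(batch, volatile=True, cuda_device=None)    # evaluation on CPU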
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import sys
import torch
from torch.autograd import Variable
from fairseq import bleu, options, utils, tokenizer
from fairseq.meters import StopwatchMeter, TimeMeter
from fairseq.progress_bar import progress_bar
from fairseq.sequence_generator import SequenceGenerator
def main():
parser = options.get_parser('Generation')
parser.add_argument('--path', metavar='FILE', required=True, action='append',
help='path(s) to model file(s)')
dataset_args = options.add_dataset_args(parser)
dataset_args.add_argument('-i', '--interactive', action='store_true',
help='generate translations in interactive mode')
dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
help='batch size')
dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT',
help='data subset to generate (train, valid, test)')
options.add_generation_args(parser)
args = parser.parse_args()
print(args)
if args.no_progress_bar:
progress_bar.enabled = False
use_cuda = torch.cuda.is_available() and not args.cpu
# Load model and dataset
print('| loading model(s) from {}'.format(', '.join(args.path)))
models, dataset = utils.load_ensemble_for_inference(args.path, args.data)
print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
if not args.interactive:
print('| {} {} {} examples'.format(args.data, args.gen_subset, len(dataset.splits[args.gen_subset])))
# Optimize model for generation
for model in models:
model.make_generation_fast_(args.beam, not args.no_beamable_mm)
# Initialize generator
translator = SequenceGenerator(models, dataset.dst_dict, beam_size=args.beam,
stop_early=(not args.no_early_stop),
normalize_scores=(not args.unnormalized),
len_penalty=args.lenpen)
align_dict = {}
if args.unk_replace_dict != '':
        assert args.interactive, "Unknown word replacement requires access to the original source " \
                                 "and is only supported in interactive mode"
with open(args.unk_replace_dict, 'r') as f:
for line in f:
l = line.split()
align_dict[l[0]] = l[1]
def replace_unk(hypo_str, align_str, src, unk):
hypo_tokens = hypo_str.split()
src_tokens = tokenizer.tokenize_line(src)
align_idx = [int(i) for i in align_str.split()]
for i, ht in enumerate(hypo_tokens):
if ht == unk:
src_token = src_tokens[align_idx[i]]
if src_token in align_dict:
hypo_tokens[i] = align_dict[src_token]
else:
hypo_tokens[i] = src_token
return ' '.join(hypo_tokens)
if use_cuda:
translator.cuda()
bpe_symbol = '@@ ' if args.remove_bpe else None
def display_hypotheses(id, src, orig, ref, hypos):
id_str = '' if id is None else '-{}'.format(id)
src_str = to_sentence(dataset.src_dict, src, bpe_symbol)
print('S{}\t{}'.format(id_str, src_str))
if orig is not None:
print('O{}\t{}'.format(id_str, orig.strip()))
if ref is not None:
print('T{}\t{}'.format(id_str, to_sentence(dataset.dst_dict, ref, bpe_symbol, ref_unk=True)))
for hypo in hypos:
hypo_str = to_sentence(dataset.dst_dict, hypo['tokens'], bpe_symbol)
align_str = ' '.join(map(str, hypo['alignment']))
if args.unk_replace_dict != '':
hypo_str = replace_unk(hypo_str, align_str, orig, unk_symbol(dataset.dst_dict))
print('H{}\t{}\t{}'.format(
id_str, hypo['score'], hypo_str))
print('A{}\t{}'.format(id_str, align_str))
if args.interactive:
for line in sys.stdin:
tokens = tokenizer.Tokenizer.tokenize(line, dataset.src_dict, add_if_not_exist=False).long()
start = dataset.src_dict.pad() + 1
positions = torch.arange(start, start + len(tokens)).type_as(tokens)
if use_cuda:
positions = positions.cuda()
tokens = tokens.cuda()
translations = translator.generate(Variable(tokens.view(1, -1)), Variable(positions.view(1, -1)))
hypos = translations[0]
display_hypotheses(None, tokens, line, None, hypos[:min(len(hypos), args.nbest)])
else:
non_bpe_dict = {}
def maybe_remove_bpe_and_reindex(tokens):
"""Helper for removing BPE symbols from a tensor of indices.
If BPE removal is enabled, the returned tensor is reindexed
using a new dictionary that is created on-the-fly."""
if not args.remove_bpe:
return tokens
assert (tokens == dataset.dst_dict.pad()).sum() == 0
return torch.IntTensor([
non_bpe_dict.setdefault(w, len(non_bpe_dict))
for w in to_sentence(dataset.dst_dict, tokens, bpe_symbol).split(' ')
])
# Generate and compute BLEU score
scorer = bleu.Scorer(
dataset.dst_dict.pad() if not args.remove_bpe else -1,
dataset.dst_dict.eos() if not args.remove_bpe else -1,
dataset.dst_dict.unk())
itr = dataset.dataloader(args.gen_subset, batch_size=args.batch_size, max_positions=args.max_positions)
num_sentences = 0
with progress_bar(itr, smoothing=0, leave=False) as t:
wps_meter = TimeMeter()
gen_timer = StopwatchMeter()
translations = translator.generate_batched_itr(
t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
cuda_device=0 if use_cuda else None, timer=gen_timer)
for id, src, ref, hypos in translations:
ref = ref.int().cpu()
top_hypo = hypos[0]['tokens'].int().cpu()
scorer.add(maybe_remove_bpe_and_reindex(ref), maybe_remove_bpe_and_reindex(top_hypo))
display_hypotheses(id, src, None, ref, hypos[:min(len(hypos), args.nbest)])
wps_meter.update(src.size(0))
t.set_postfix(wps='{:5d}'.format(round(wps_meter.avg)))
num_sentences += 1
print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.format(
num_sentences, gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))
def to_token(dict, i, runk):
return runk if i == dict.unk() else dict[i]
def unk_symbol(dict, ref_unk=False):
return '<{}>'.format(dict.unk_word) if ref_unk else dict.unk_word
def to_sentence(dict, tokens, bpe_symbol=None, ref_unk=False):
if torch.is_tensor(tokens) and tokens.dim() == 2:
        sentences = [to_sentence(dict, token, bpe_symbol, ref_unk) for token in tokens]
return '\n'.join(sentences)
eos = dict.eos()
runk = unk_symbol(dict, ref_unk=ref_unk)
sent = ' '.join([to_token(dict, i, runk) for i in tokens if i != eos])
if bpe_symbol is not None:
sent = sent.replace(bpe_symbol, '')
return sent
if __name__ == '__main__':
main()
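# Example invocation (paths are hypothetical; the data-directory and beam-size options
# come from options.add_dataset_args / options.add_generation_args, which are defined
# elsewhere, so their exact flag names are assumed here):
#
#     python generate.py data-bin/iwslt14.de-en \
#         --path checkpoints/checkpoint_best.pt \
#         --gen-subset test --batch-size 32 --beam 5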
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import argparse
import os
from itertools import zip_longest
from fairseq import dictionary, indexed_dataset
from fairseq.tokenizer import Tokenizer
def main():
parser = argparse.ArgumentParser(
description='Data pre-processing: Create dictionary and store data in binary format')
parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language')
parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language')
    parser.add_argument('--trainpref', metavar='FP', default='train', help='train file prefix')
    parser.add_argument('--validpref', metavar='FP', default='valid', help='comma-separated valid file prefixes')
    parser.add_argument('--testpref', metavar='FP', default='test', help='comma-separated test file prefixes')
parser.add_argument('--destdir', metavar='DIR', default='data-bin', help='destination dir')
parser.add_argument('--thresholdtgt', metavar='N', default=0, type=int,
help='map words appearing less than threshold times to unknown')
parser.add_argument('--thresholdsrc', metavar='N', default=0, type=int,
help='map words appearing less than threshold times to unknown')
parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int, help='number of target words to retain')
parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int, help='number of source words to retain')
parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)')
args = parser.parse_args()
print(args)
os.makedirs(args.destdir, exist_ok=True)
src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang))
src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)),
threshold=args.thresholdsrc, nwords=args.nwordssrc)
tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang))
tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)),
threshold=args.thresholdtgt, nwords=args.nwordstgt)
def make_dataset(input_prefix, output_prefix, lang):
dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
ds = indexed_dataset.IndexedDatasetBuilder(
'{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang,
args.target_lang, lang)
)
def consumer(tensor):
ds.add_item(tensor)
input_file = '{}.{}'.format(input_prefix, lang)
res = Tokenizer.binarize(input_file, dict, consumer)
print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
lang, input_file, res['nseq'], res['ntok'],
100 * res['nunk'] / res['ntok'], dict.unk_word))
ds.finalize('{}/{}.{}-{}.{}.idx'.format(
args.destdir, output_prefix,
args.source_lang, args.target_lang, lang))
make_dataset(args.trainpref, 'train', args.source_lang)
make_dataset(args.trainpref, 'train', args.target_lang)
for k, validpref in enumerate(args.validpref.split(',')):
outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
make_dataset(validpref, outprefix, args.source_lang)
make_dataset(validpref, outprefix, args.target_lang)
for k, testpref in enumerate(args.testpref.split(',')):
outprefix = 'test{}'.format(k) if k > 0 else 'test'
make_dataset(testpref, outprefix, args.source_lang)
make_dataset(testpref, outprefix, args.target_lang)
print('| Wrote preprocessed data to {}'.format(args.destdir))
if args.alignfile:
src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
freq_map = {}
with open(args.alignfile, 'r') as align_file:
with open(src_file_name, 'r') as src_file:
with open(tgt_file_name, 'r') as tgt_file:
for a, s, t in zip_longest(align_file, src_file, tgt_file):
si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
ai = list(map(lambda x: tuple(x.split('-')), a.split()))
for sai, tai in ai:
srcidx = si[int(sai)]
tgtidx = ti[int(tai)]
if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
assert srcidx != src_dict.pad()
assert srcidx != src_dict.eos()
assert tgtidx != tgt_dict.pad()
assert tgtidx != tgt_dict.eos()
if srcidx not in freq_map:
freq_map[srcidx] = {}
if tgtidx not in freq_map[srcidx]:
freq_map[srcidx][tgtidx] = 1
else:
freq_map[srcidx][tgtidx] += 1
align_dict = {}
for srcidx in freq_map.keys():
align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)
with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
args.source_lang, args.target_lang)), 'w') as f:
for k, v in align_dict.items():
print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
if __name__ == '__main__':
main()
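# For a hypothetical run with --source-lang de --target-lang en, the script writes the
# following files into --destdir:
#
#     dict.de.txt, dict.en.txt                            # source / target dictionaries
#     train.de-en.de.bin / .idx, train.de-en.en.bin / .idx
#     valid.de-en.{de,en}.bin / .idx, test.de-en.{de,en}.bin / .idx
#     alignment.de-en.txt                                 # only when --alignfile is given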
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import argparse
import os
import sys
from fairseq import bleu, dictionary, tokenizer
def main():
parser = argparse.ArgumentParser(description='Command-line script for BLEU scoring.')
parser.add_argument('-s', '--sys', default='-', help='system output')
parser.add_argument('-r', '--ref', default='', help='references')
parser.add_argument('-o', '--order', default=4, metavar='N',
type=int, help='consider ngrams up to this order')
parser.add_argument('--ignore-case', action='store_true',
help='case-insensitive scoring')
args = parser.parse_args()
print(args)
assert args.sys == '-' or os.path.exists(args.sys), \
"System output file {} does not exist".format(args.sys)
assert os.path.exists(args.ref), \
"Reference file {} does not exist".format(args.ref)
dict = dictionary.Dictionary()
    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line
def score(fdsys):
with open(args.ref) as fdref:
scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
sys_tok = tokenizer.Tokenizer.tokenize(sys_tok, dict)
ref_tok = tokenizer.Tokenizer.tokenize(ref_tok, dict)
scorer.add(ref_tok, sys_tok)
print(scorer.result_string(args.order))
if args.sys == '-':
score(sys.stdin)
else:
with open(args.sys, 'r') as f:
score(f)
if __name__ == '__main__':
main()
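# Example invocations (the script and file names are hypothetical):
#
#     python score.py --sys generate.sys.txt --ref generate.ref.txt
#     cat generate.sys.txt | python score.py --ref generate.ref.txt --ignore-case --order 4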