Commit c394d7d1 authored by “change”'s avatar “change”
Browse files

init

parents
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
from fairseq.search import Search
class NoisyChannelBeamSearch(Search):
    """Beam-search step that ranks candidates by a combined model score.

    Every call to :meth:`step` merges the cumulative forward (direct) model
    scores with the scores passed as ``bw_lprobs`` according to
    ``combine_method`` and keeps the best candidates of every sentence.
    """

    def __init__(self, tgt_dict):
        super().__init__(tgt_dict)
        # Created lazily by ``_init_buffers``; ``None`` means "not created yet".
        self.fw_scores_buf = None
        self.lm_scores_buf = None

    def _init_buffers(self, t):
        """Create the empty output buffers the first time they are needed.

        Args:
            t: tensor used as a template (``t.new()``) for the score buffers;
                the index buffers are ``LongTensor`` objects on ``t.device``.
        """
        # The base-class ``_init_buffers`` is intentionally not invoked here;
        # every buffer used by this class is created below.
        if self.fw_scores_buf is None:
            self.scores_buf = t.new()
            self.indices_buf = torch.LongTensor().to(device=t.device)
            self.beams_buf = torch.LongTensor().to(device=t.device)
            self.fw_scores_buf = t.new()
            self.lm_scores_buf = t.new()

    def combine_fw_bw(self, combine_method, fw_cum, bw, step):
        """Combine cumulative forward scores with the ``bw`` scores.

        Args:
            combine_method: ``"noisy_channel"`` (forward scores are divided by
                ``step + 1`` before being added) or ``"lm_only"`` (forward
                scores are added unchanged).
            fw_cum: cumulative forward-model scores.
            bw: scores to add to the forward scores.
            step: current decoding step (0-based).

        Returns:
            The combined score tensor.

        Raises:
            ValueError: if ``combine_method`` is not one of the two supported
                values (previously this surfaced as an ``UnboundLocalError``).
        """
        if combine_method == "noisy_channel":
            return bw + fw_cum.div(step + 1)
        if combine_method == "lm_only":
            return bw + fw_cum
        raise ValueError(
            "unsupported combine_method: {!r} "
            "(expected 'noisy_channel' or 'lm_only')".format(combine_method)
        )

    def step(self, step, fw_lprobs, scores, bw_lprobs, lm_lprobs, combine_method):
        """Select the best candidates for one decoding step.

        Args:
            step: current decoding step (0-based).
            fw_lprobs: forward-model scores of size
                ``(bsz, beam_size, vocab_size)``.
            scores: cumulative forward scores of earlier steps, indexed as
                ``scores[:, :, step - 1]``; not read when ``step == 0``.
            bw_lprobs: scores combined with the forward scores; sliced the
                same way as ``fw_lprobs`` at step 0.
            lm_lprobs: language-model scores, viewed as ``(bsz, -1)`` and
                gathered at the selected candidates.
            combine_method: forwarded to :meth:`combine_fw_bw`.

        Returns:
            Tuple ``(scores_buf, fw_scores_buf, lm_scores_buf, indices_buf,
            beams_buf)``: combined scores, cumulative forward scores and LM
            scores of the selected candidates, their token indices within the
            vocabulary, and the beam each candidate extends.
        """
        self._init_buffers(fw_lprobs)
        bsz, beam_size, vocab_size = fw_lprobs.size()

        if step == 0:
            # at the first step all hypotheses are equally likely, so use
            # only the first beam
            fw_lprobs = fw_lprobs[:, ::beam_size, :].contiguous()
            bw_lprobs = bw_lprobs[:, ::beam_size, :].contiguous()
            # nothing to add since we are at the first step
            fw_lprobs_cum = fw_lprobs
        else:
            # make probs contain cumulative scores for each hypothesis
            raw_scores = scores[:, :, step - 1].unsqueeze(-1)
            fw_lprobs_cum = fw_lprobs.add(raw_scores)

        combined_lprobs = self.combine_fw_bw(
            combine_method, fw_lprobs_cum, bw_lprobs, step
        )

        # choose the top k according to the combined noisy channel model score
        flat_combined = combined_lprobs.view(bsz, -1)
        torch.topk(
            flat_combined,
            k=min(
                # Take the best 2 x beam_size predictions. We'll choose the
                # first beam_size of these which don't predict eos to
                # continue with.
                beam_size * 2,
                flat_combined.size(1) - 1,  # -1 so we never select pad
            ),
            out=(self.scores_buf, self.indices_buf),
        )

        # save corresponding fw and lm scores
        self.fw_scores_buf = torch.gather(
            fw_lprobs_cum.view(bsz, -1), 1, self.indices_buf
        )
        # lm_lprobs is not sliced at step 0; the selected indices are then all
        # smaller than vocab_size, so they address the first beam's entries.
        self.lm_scores_buf = torch.gather(
            lm_lprobs.view(bsz, -1), 1, self.indices_buf
        )

        # Project back into relative indices and beams
        self.beams_buf = self.indices_buf // vocab_size
        self.indices_buf.fmod_(vocab_size)
        return (
            self.scores_buf,
            self.fw_scores_buf,
            self.lm_scores_buf,
            self.indices_buf,
            self.beams_buf,
        )
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from typing import Dict, List, Optional
import math
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from .noisy_channel_beam_search import NoisyChannelBeamSearch
from fairseq.sequence_generator import EnsembleModel
class NoisyChannelSequenceGenerator(object):
def __init__(
    self,
    combine_method,
    tgt_dict,
    src_dict=None,
    beam_size=1,
    max_len_a=0,
    max_len_b=200,
    min_len=1,
    len_penalty=1.0,
    unk_penalty=0.0,
    retain_dropout=False,
    temperature=1.0,
    match_source_len=False,
    no_repeat_ngram_size=0,
    normalize_scores=True,
    channel_models=None,
    k2=10,
    ch_weight=1.0,
    channel_scoring_type='log_norm',
    top_k_vocab=0,
    lm_models=None,
    lm_dict=None,
    lm_weight=1.0,
    normalize_lm_scores_by_tgt_len=False,
):
    """Set up beam search with noisy channel decoding.

    Args:
        combine_method (str): how direct, LM and channel model scores are
            combined (the code checks for ``"noisy_channel"`` and
            ``"lm_only"``)
        tgt_dict (~fairseq.data.Dictionary): target dictionary
        src_dict (~fairseq.data.Dictionary): source dictionary
        beam_size (int, optional): beam width; capped at
            ``len(tgt_dict) - 1`` because pad is never selected
            (default: 1)
        max_len_a/b (int, optional): generate sequences of maximum length
            ax + b, where x is the source length
        min_len (int, optional): minimum length of the generated output
            (not including end-of-sentence)
        len_penalty (float, optional): length penalty, where <1.0 favors
            shorter, >1.0 favors longer sentences (default: 1.0)
        unk_penalty (float, optional): unknown word penalty, where <0
            produces more unks, >0 produces fewer (default: 0.0)
        retain_dropout (bool, optional): use dropout when generating
            (default: False)
        temperature (float, optional): must be > 0; values >1.0 produce
            more uniform samples and values <1.0 sharper ones
            (default: 1.0)
        match_source_len (bool, optional): outputs should match the source
            length (default: False)
        no_repeat_ngram_size (int, optional): size of n-grams that must
            not be repeated in the generation (default: 0)
        normalize_scores (bool, optional): normalize scores by the length
            of the output (default: True)
        channel_models (List[~fairseq.models.FairseqModel]): ensemble of
            models translating from the target to the source
        k2 (int, optional): number of top candidates rescored per beam at
            each step (default: 10)
        ch_weight (float, optional): weight of the channel model score,
            the direct model score having weight 1.0 (default: 1.0)
        channel_scoring_type (str, optional): how the channel model is
            scored (default: 'log_norm')
        top_k_vocab (int, optional): for the `'src_vocab'` and
            `'src_vocab_batched'` scoring types, number of most frequent
            tokens added to the channel model output vocabulary besides
            the source tokens of the batch (default: 0)
        lm_models (List[~fairseq.models.FairseqModel]): ensemble of models
            generating text in the target language
        lm_dict (~fairseq.data.Dictionary): LM dictionary
        lm_weight (float, optional): weight of the LM score, the direct
            model score having weight 1.0 (default: 1.0)
        normalize_lm_scores_by_tgt_len (bool, optional): normalize LM
            scores by the target length; otherwise the combined LM and
            channel score is normalized by the source length
    """
    # special symbols and size of the target vocabulary
    self.pad = tgt_dict.pad()
    self.unk = tgt_dict.unk()
    self.eos = tgt_dict.eos()
    self.vocab_size = len(tgt_dict)

    # the max beam size is the dictionary size - 1, since we never select pad
    self.beam_size = min(beam_size, self.vocab_size - 1)

    # dictionaries and models
    self.tgt_dict = tgt_dict
    self.src_dict = src_dict
    self.lm_dict = lm_dict
    self.channel_models = channel_models
    self.lm_models = lm_models

    # length handling
    self.max_len_a = max_len_a
    self.max_len_b = max_len_b
    self.min_len = min_len
    self.match_source_len = match_source_len

    # scoring options of the direct model
    self.normalize_scores = normalize_scores
    self.len_penalty = len_penalty
    self.unk_penalty = unk_penalty
    self.retain_dropout = retain_dropout
    self.temperature = temperature
    self.no_repeat_ngram_size = no_repeat_ngram_size

    # noisy channel options
    self.combine_method = combine_method
    self.k2 = k2
    self.ch_weight = ch_weight
    self.channel_scoring_type = channel_scoring_type
    self.top_k_vocab = top_k_vocab
    self.lm_weight = lm_weight
    self.normalize_lm_scores_by_tgt_len = normalize_lm_scores_by_tgt_len
    self.log_softmax_fn = torch.nn.LogSoftmax(dim=1)
    # number of rows scored per chunk by the 'src_vocab_batched' scoring type
    self.ch_scoring_bsz = 3072

    # target -> LM index translation, needed when the dictionaries differ
    self.share_tgt_dict = (self.lm_dict == self.tgt_dict)
    self.tgt_to_lm = make_dict2dict(tgt_dict, lm_dict)

    assert temperature > 0, '--temperature must be greater than 0'

    self.search = NoisyChannelBeamSearch(tgt_dict)
@torch.no_grad()
def generate(
self,
models,
sample,
prefix_tokens=None,
bos_token=None,
**kwargs
):
"""Generate a batch of translations.
Args:
models (List[~fairseq.models.FairseqModel]): ensemble of models
sample (dict): batch
prefix_tokens (torch.LongTensor, optional): force decoder to begin
with these tokens
bos_token (int, optional): token written to position 0 of every
hypothesis; when None the EOS index is used instead
Returns:
A list with one entry per input sentence. Each entry is a list of
hypothesis dicts (keys 'tokens', 'score', 'attention', 'alignment',
'positional_scores') sorted by 'score' in descending order.
"""
model = EnsembleModel(models)
# one (initially empty) incremental-state dict per model of the ensemble
incremental_states = torch.jit.annotate(
List[Dict[str, Dict[str, Optional[Tensor]]]],
[
torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
for i in range(model.models_size)
],
)
if not self.retain_dropout:
model.eval()
# model.forward normally channels prev_output_tokens into the decoder
# separately, but SequenceGenerator directly calls model.encoder
encoder_input = {
k: v for k, v in sample['net_input'].items()
if k != 'prev_output_tokens'
}
src_tokens = encoder_input['src_tokens']
# per sentence: number of source tokens that are neither EOS nor pad
src_lengths_no_eos = (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1)
input_size = src_tokens.size()
# batch dimension goes first followed by source lengths
bsz = input_size[0]
src_len = input_size[1]
beam_size = self.beam_size
if self.match_source_len:
max_len = src_lengths_no_eos.max().item()
else:
max_len = min(
int(self.max_len_a * src_len + self.max_len_b),
# exclude the EOS marker
model.max_decoder_positions() - 1,
)
# compute the encoder output for each beam
encoder_outs = model.forward_encoder(encoder_input)
# every sentence index is repeated beam_size times, so the encoder
# output of a sentence is replicated once per beam
new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
new_order = new_order.to(src_tokens.device).long()
encoder_outs = model.reorder_encoder_out(encoder_outs, new_order)
src_lengths = encoder_input['src_lengths']
# initialize buffers
scores = src_tokens.new(bsz * beam_size, max_len + 1).float().fill_(0)
# LM score of each hypothesis prefix; added to the LM score of every
# candidate in noisy_channel_rescoring and refreshed at the end of each step
lm_prefix_scores = src_tokens.new(bsz * beam_size).float().fill_(0)
scores_buf = scores.clone()
tokens = src_tokens.new(bsz * beam_size, max_len + 2).long().fill_(self.pad)
tokens_buf = tokens.clone()
tokens[:, 0] = self.eos if bos_token is None else bos_token
# reorder source tokens so they may be used as a reference in generating P(S|T)
src_tokens = reorder_all_tokens(src_tokens, src_lengths, self.src_dict.eos_index)
# replicate every source row (and its length) once per beam
src_tokens = src_tokens.repeat(1, beam_size).view(-1, src_len)
src_lengths = src_lengths.view(bsz, -1).repeat(1, beam_size).view(bsz*beam_size, -1)
attn, attn_buf = None, None
nonpad_idxs = None
# The cands_to_ignore indicates candidates that should be ignored.
# For example, suppose we're sampling and have already finalized 2/5
# samples. Then the cands_to_ignore would mark 2 positions as being ignored,
# so that we only finalize the remaining 3 samples.
cands_to_ignore = src_tokens.new_zeros(bsz, beam_size).eq(-1) # forward and backward-compatible False mask
# list of completed sentences
finalized = [[] for i in range(bsz)]
finished = [False for i in range(bsz)]
num_remaining_sent = bsz
# number of candidate hypos per step
cand_size = 2 * beam_size # 2 x beam size in case half are EOS
# offset arrays for converting between different indexing schemes
bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens)
cand_offsets = torch.arange(0, cand_size).type_as(tokens)
# helper function for allocating buffers on the fly
buffers = {}
def buffer(name, type_of=tokens): # noqa
if name not in buffers:
buffers[name] = type_of.new()
return buffers[name]
def is_finished(sent, step, unfin_idx):
"""
Check whether we've finished generation for a given sentence: it is
finished once beam_size hypotheses have been finalized for it.
`step` and `unfin_idx` are accepted but not used.
"""
assert len(finalized[sent]) <= beam_size
if len(finalized[sent]) == beam_size:
return True
return False
def finalize_hypos(step, bbsz_idx, eos_scores, combined_noisy_channel_eos_scores):
"""
Finalize the given hypotheses at this step, while keeping the total
number of finalized hypotheses per sentence <= beam_size.
Note: the input must be in the desired finalization order, so that
hypotheses that appear earlier in the input are preferred to those
that appear later.
Args:
step: current time step
bbsz_idx: A vector of indices in the range [0, bsz*beam_size),
indicating which hypotheses to finalize
eos_scores: A vector of the same size as bbsz_idx containing
fw scores for each hypothesis
combined_noisy_channel_eos_scores: A vector of the same size as bbsz_idx containing
combined noisy channel scores for each hypothesis
(divided in place by the length normalizer when
self.normalize_scores is set)
Returns:
A list with the indices (among the still unfinished sentences) of the
sentences that became finished through this call.
"""
assert bbsz_idx.numel() == eos_scores.numel()
# clone relevant token and attention tensors
tokens_clone = tokens.index_select(0, bbsz_idx)
tokens_clone = tokens_clone[:, 1:step + 2] # skip the first index, which is EOS
assert not tokens_clone.eq(self.eos).any()
tokens_clone[:, step] = self.eos
attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step+2] if attn is not None else None
# compute scores per token position
pos_scores = scores.index_select(0, bbsz_idx)[:, :step+1]
pos_scores[:, step] = eos_scores
# convert from cumulative to per-position scores
pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]
# normalize sentence-level scores
if self.normalize_scores:
combined_noisy_channel_eos_scores /= (step + 1) ** self.len_penalty
# cum_unfin[j] = number of finished sentences that precede the j-th
# unfinished one; it converts an index among the unfinished sentences
# back into an index of the original batch
cum_unfin = []
prev = 0
for f in finished:
if f:
prev += 1
else:
cum_unfin.append(prev)
sents_seen = set()
for i, (idx, score) in enumerate(zip(bbsz_idx.tolist(), combined_noisy_channel_eos_scores.tolist())):
unfin_idx = idx // beam_size
sent = unfin_idx + cum_unfin[unfin_idx]
sents_seen.add((sent, unfin_idx))
if self.match_source_len and step > src_lengths_no_eos[unfin_idx]:
score = -math.inf
def get_hypo():
if attn_clone is not None:
# remove padding tokens from attn scores
hypo_attn = attn_clone[i][nonpad_idxs[sent]]
_, alignment = hypo_attn.max(dim=0)
else:
hypo_attn = None
alignment = None
return {
'tokens': tokens_clone[i],
'score': score,
'attention': hypo_attn, # src_len x tgt_len
'alignment': alignment,
'positional_scores': pos_scores[i],
}
# hypotheses beyond the first beam_size of a sentence are dropped
if len(finalized[sent]) < beam_size:
finalized[sent].append(get_hypo())
newly_finished = []
for sent, unfin_idx in sents_seen:
# check termination conditions for this sentence
if not finished[sent] and is_finished(sent, step, unfin_idx):
finished[sent] = True
newly_finished.append(unfin_idx)
return newly_finished
def noisy_channel_rescoring(lprobs, beam_size, bsz, src_tokens, tokens, k):
"""Rescore the top k hypothesis from each beam using noisy channel modeling
Besides its arguments this closure reads `step`, `prefix_tokens`,
`channel_model`, `lm`, `lm_incremental_states` and `lm_prefix_scores`
from the enclosing scope.
Returns:
new_fw_lprobs: the direct model probabilities after pruning the top k
new_ch_lm_lprobs: the combined channel and language model probabilities
new_lm_lprobs: the language model probabilities after pruning the top k
"""
with torch.no_grad():
lprobs_size = lprobs.size()
if prefix_tokens is not None and step < prefix_tokens.size(1):
probs_slice = lprobs.view(bsz, -1, lprobs.size(-1))[:, 0, :]
cand_scores = torch.gather(
probs_slice, dim=1,
index=prefix_tokens[:, step].view(-1, 1).data
).expand(-1, beam_size).contiguous().view(bsz*beam_size, 1)
cand_indices = prefix_tokens[:, step].view(-1, 1).expand(bsz, beam_size).data.contiguous().view(bsz*beam_size, 1)
# need to calculate and save fw and lm probs for prefix tokens
fw_top_k = cand_scores
fw_top_k_idx = cand_indices
# the forced prefix token is the only candidate of each hypothesis
k = 1
else:
# take the top k best words for every sentence in batch*beam
fw_top_k, fw_top_k_idx = torch.topk(lprobs.view(beam_size*bsz, -1), k=k)
# flattened candidate rows whose candidate token is EOS
eos_idx = torch.nonzero(fw_top_k_idx.view(bsz*beam_size*k, -1) == self.eos)[:, 0]
ch_scores = fw_top_k.new_full((beam_size*bsz*k, ), 0)
# number of non-pad source tokens per row, used as normalizer in combine_ch_lm
src_size = torch.sum(src_tokens[:, :] != self.src_dict.pad_index, dim=1, keepdim=True, dtype=fw_top_k.dtype)
if self.combine_method != "lm_only":
# one copy of the source row per candidate
temp_src_tokens_full = src_tokens[:, :].repeat(1, k).view(bsz*beam_size*k, -1)
not_padding = temp_src_tokens_full[:, 1:] != self.src_dict.pad_index
cur_tgt_size = step+2
# add eos to all candidate sentences except those that already end in eos
eos_tokens = tokens[:, 0].repeat(1, k).view(-1, 1)
eos_tokens[eos_idx] = self.tgt_dict.pad_index
if step == 0:
channel_input = torch.cat((fw_top_k_idx.view(-1, 1), eos_tokens), 1)
else:
# move eos from beginning to end of target sentence
channel_input = torch.cat((tokens[:, 1:step + 1].repeat(1, k).view(-1, step), fw_top_k_idx.view(-1, 1), eos_tokens), 1)
ch_input_lengths = torch.tensor(np.full(channel_input.size(0), cur_tgt_size))
ch_input_lengths[eos_idx] = cur_tgt_size-1
# every branch below fills ch_scores with one channel score per candidate row
if self.channel_scoring_type == "unnormalized":
ch_encoder_output = channel_model.encoder(channel_input, src_lengths=ch_input_lengths)
ch_decoder_output, _ = channel_model.decoder(temp_src_tokens_full, encoder_out=ch_encoder_output, features_only=True)
del ch_encoder_output
ch_intermed_scores = channel_model.decoder.unnormalized_scores_given_target(ch_decoder_output, target_ids=temp_src_tokens_full[:, 1:])
ch_intermed_scores = ch_intermed_scores.float()
ch_intermed_scores *= not_padding.float()
ch_scores = torch.sum(ch_intermed_scores, dim=1)
elif self.channel_scoring_type == "k2_separate":
# score the k candidates of every hypothesis in k separate channel-model calls
for k_idx in range(k):
k_eos_tokens = eos_tokens[k_idx::k, :]
if step == 0:
k_ch_input = torch.cat((fw_top_k_idx[:, k_idx:k_idx+1], k_eos_tokens), 1)
else:
# move eos from beginning to end of target sentence
k_ch_input = torch.cat((tokens[:, 1:step + 1], fw_top_k_idx[:, k_idx:k_idx+1], k_eos_tokens), 1)
k_ch_input_lengths = ch_input_lengths[k_idx::k]
k_ch_output = channel_model(k_ch_input, k_ch_input_lengths, src_tokens)
k_ch_lprobs = channel_model.get_normalized_probs(k_ch_output, log_probs=True)
k_ch_intermed_scores = torch.gather(k_ch_lprobs[:, :-1, :], 2, src_tokens[:, 1:].unsqueeze(2)).squeeze(2)
k_ch_intermed_scores *= not_padding.float()
ch_scores[k_idx::k] = torch.sum(k_ch_intermed_scores, dim=1)
elif self.channel_scoring_type == "src_vocab":
ch_encoder_output = channel_model.encoder(channel_input, src_lengths=ch_input_lengths)
ch_decoder_output, _ = channel_model.decoder(temp_src_tokens_full, encoder_out=ch_encoder_output, features_only=True)
del ch_encoder_output
ch_lprobs = normalized_scores_with_batch_vocab(
channel_model.decoder,
ch_decoder_output, src_tokens, k, bsz, beam_size,
self.src_dict.pad_index, top_k=self.top_k_vocab)
ch_scores = torch.sum(ch_lprobs, dim=1)
elif self.channel_scoring_type == "src_vocab_batched":
# same scoring as "src_vocab", processed in chunks of self.ch_scoring_bsz rows
ch_bsz_size = temp_src_tokens_full.shape[0]
ch_lprobs_list = [None] * len(range(0, ch_bsz_size, self.ch_scoring_bsz))
for i, start_idx in enumerate(range(0, ch_bsz_size, self.ch_scoring_bsz)):
end_idx = min(start_idx + self.ch_scoring_bsz, ch_bsz_size)
temp_src_tokens_full_batch = temp_src_tokens_full[start_idx:end_idx, :]
channel_input_batch = channel_input[start_idx:end_idx, :]
ch_input_lengths_batch = ch_input_lengths[start_idx:end_idx]
ch_encoder_output_batch = channel_model.encoder(channel_input_batch, src_lengths=ch_input_lengths_batch)
ch_decoder_output_batch, _ = channel_model.decoder(temp_src_tokens_full_batch, encoder_out=ch_encoder_output_batch, features_only=True)
ch_lprobs_list[i] = normalized_scores_with_batch_vocab(
channel_model.decoder,
ch_decoder_output_batch, src_tokens, k, bsz, beam_size,
self.src_dict.pad_index, top_k=self.top_k_vocab,
start_idx=start_idx, end_idx=end_idx)
ch_lprobs = torch.cat(ch_lprobs_list, dim=0)
ch_scores = torch.sum(ch_lprobs, dim=1)
else:
# any other scoring type: full forward pass of the channel model
# and normalized log-probabilities over its whole output vocabulary
ch_output = channel_model(channel_input, ch_input_lengths, temp_src_tokens_full)
ch_lprobs = channel_model.get_normalized_probs(ch_output, log_probs=True)
ch_intermed_scores = torch.gather(ch_lprobs[:, :-1, :], 2, temp_src_tokens_full[:, 1:].unsqueeze(2)).squeeze().view(bsz*beam_size*k, -1)
ch_intermed_scores *= not_padding.float()
ch_scores = torch.sum(ch_intermed_scores, dim=1)
else:
# "lm_only": the channel model is not consulted, ch_scores stays all zero
cur_tgt_size = 0
ch_scores = ch_scores.view(bsz*beam_size, k)
# LM prefix score of each hypothesis, repeated for each of its k candidates
expanded_lm_prefix_scores = lm_prefix_scores.unsqueeze(1).expand(-1, k).flatten()
if self.share_tgt_dict:
lm_scores = get_lm_scores(lm, tokens[:, :step + 1].view(-1, step+1), lm_incremental_states, fw_top_k_idx.view(-1, 1), torch.tensor(np.full(tokens.size(0), step+1)), k)
else:
# translate target-dictionary indices into LM-dictionary indices first
new_lm_input = dict2dict(tokens[:, :step + 1].view(-1, step+1), self.tgt_to_lm)
new_cands = dict2dict(fw_top_k_idx.view(-1, 1), self.tgt_to_lm)
lm_scores = get_lm_scores(lm, new_lm_input, lm_incremental_states, new_cands, torch.tensor(np.full(tokens.size(0), step+1)), k)
lm_scores.add_(expanded_lm_prefix_scores)
ch_lm_scores = combine_ch_lm(self.combine_method, ch_scores, lm_scores, src_size, cur_tgt_size)
# initialize all as min value
new_fw_lprobs = ch_scores.new(lprobs_size).fill_(-1e17).view(bsz*beam_size, -1)
new_ch_lm_lprobs = ch_scores.new(lprobs_size).fill_(-1e17).view(bsz*beam_size, -1)
new_lm_lprobs = ch_scores.new(lprobs_size).fill_(-1e17).view(bsz*beam_size, -1)
new_fw_lprobs[:, self.pad] = -math.inf
new_ch_lm_lprobs[:, self.pad] = -math.inf
new_lm_lprobs[:, self.pad] = -math.inf
# only the k rescored candidates receive real scores; every other
# vocabulary entry keeps the filler value set above
new_fw_lprobs.scatter_(1, fw_top_k_idx, fw_top_k)
new_ch_lm_lprobs.scatter_(1, fw_top_k_idx, ch_lm_scores)
new_lm_lprobs.scatter_(1, fw_top_k_idx, lm_scores.view(-1, k))
return new_fw_lprobs, new_ch_lm_lprobs, new_lm_lprobs
def combine_ch_lm(combine_type, ch_scores, lm_scores1, src_size, tgt_size):
# Weight the channel and LM scores and merge them into one score per
# candidate. In the "noisy_channel" branch the weighted channel scores are
# updated in place (add_/div_) before being returned.
if self.channel_scoring_type == "unnormalized":
ch_scores = self.log_softmax_fn(
ch_scores.view(-1, self.beam_size * self.k2)
).view(ch_scores.shape)
ch_scores = ch_scores * self.ch_weight
lm_scores1 = lm_scores1 * self.lm_weight
if combine_type == "lm_only":
# log P(T|S) + log P(T)
ch_scores = lm_scores1.view(ch_scores.size())
elif combine_type == "noisy_channel":
# 1/t log P(T|S) + 1/s log P(S|T) + 1/t log P(T)
if self.normalize_lm_scores_by_tgt_len:
ch_scores.div_(src_size)
lm_scores_norm = lm_scores1.view(ch_scores.size()).div(tgt_size)
ch_scores.add_(lm_scores_norm)
# 1/t log P(T|S) + 1/s log P(S|T) + 1/s log P(T)
else:
ch_scores.add_(lm_scores1.view(ch_scores.size()))
ch_scores.div_(src_size)
return ch_scores
if self.channel_models is not None:
channel_model = self.channel_models[0] # assume only one channel_model model
else:
channel_model = None
lm = EnsembleModel(self.lm_models)
lm_incremental_states = torch.jit.annotate(
List[Dict[str, Dict[str, Optional[Tensor]]]],
[
torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
for i in range(lm.models_size)
],
)
reorder_state = None
batch_idxs = None
for step in range(max_len + 1): # one extra step for EOS marker
# reorder decoder internal states based on the prev choice of beams
if reorder_state is not None:
if batch_idxs is not None:
# update beam indices to take into account removed sentences
corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(batch_idxs)
reorder_state.view(-1, beam_size).add_(corr.unsqueeze(-1) * beam_size)
model.reorder_incremental_state(incremental_states, reorder_state)
encoder_outs = model.reorder_encoder_out(encoder_outs, reorder_state)
lm.reorder_incremental_state(lm_incremental_states, reorder_state)
fw_lprobs, avg_attn_scores = model.forward_decoder(
tokens[:, :step + 1], encoder_outs, incremental_states, temperature=self.temperature,
)
fw_lprobs[:, self.pad] = -math.inf # never select pad
fw_lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty
# keep the k2 best direct-model candidates of every hypothesis and obtain
# their combined channel+LM scores and their LM scores
fw_lprobs, ch_lm_lprobs, lm_lprobs = noisy_channel_rescoring(fw_lprobs, beam_size, bsz, src_tokens, tokens, self.k2)
# handle min and max length constraints
# NOTE(review): only fw_lprobs is masked here; ch_lm_lprobs and lm_lprobs
# are left unchanged - confirm this is intended
if step >= max_len:
fw_lprobs[:, :self.eos] = -math.inf
fw_lprobs[:, self.eos + 1:] = -math.inf
elif step < self.min_len:
fw_lprobs[:, self.eos] = -math.inf
# handle prefix tokens (possibly with different lengths)
if prefix_tokens is not None and step < prefix_tokens.size(1):
prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1)
prefix_mask = prefix_toks.ne(self.pad)
# in each of the three score tensors, rows with a (non-pad) prefix token
# keep only the score of that token; all other entries become -inf
prefix_fw_lprobs = fw_lprobs.gather(-1, prefix_toks.unsqueeze(-1))
fw_lprobs[prefix_mask] = -math.inf
fw_lprobs[prefix_mask] = fw_lprobs[prefix_mask].scatter_(
-1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_fw_lprobs
)
prefix_ch_lm_lprobs = ch_lm_lprobs.gather(-1, prefix_toks.unsqueeze(-1))
ch_lm_lprobs[prefix_mask] = -math.inf
ch_lm_lprobs[prefix_mask] = ch_lm_lprobs[prefix_mask].scatter_(
-1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_ch_lm_lprobs
)
prefix_lm_lprobs = lm_lprobs.gather(-1, prefix_toks.unsqueeze(-1))
lm_lprobs[prefix_mask] = -math.inf
lm_lprobs[prefix_mask] = lm_lprobs[prefix_mask].scatter_(
-1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lm_lprobs
)
# if prefix includes eos, then we should make sure tokens and
# scores are the same across all beams
eos_mask = prefix_toks.eq(self.eos)
if eos_mask.any():
# validate that the first beam matches the prefix
first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[:, 0, 1:step + 1]
eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0]
target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step]
assert (first_beam == target_prefix).all()
def replicate_first_beam(tensor, mask):
tensor = tensor.view(-1, beam_size, tensor.size(-1))
tensor[mask] = tensor[mask][:, :1, :]
return tensor.view(-1, tensor.size(-1))
# copy tokens, scores and lprobs from the first beam to all beams
tokens = replicate_first_beam(tokens, eos_mask_batch_dim)
scores = replicate_first_beam(scores, eos_mask_batch_dim)
fw_lprobs = replicate_first_beam(fw_lprobs, eos_mask_batch_dim)
ch_lm_lprobs = replicate_first_beam(ch_lm_lprobs, eos_mask_batch_dim)
lm_lprobs = replicate_first_beam(lm_lprobs, eos_mask_batch_dim)
if self.no_repeat_ngram_size > 0:
# for each beam and batch sentence, generate a list of previous ngrams
gen_ngrams = [{} for bbsz_idx in range(bsz * beam_size)]
for bbsz_idx in range(bsz * beam_size):
gen_tokens = tokens[bbsz_idx].tolist()
for ngram in zip(*[gen_tokens[i:] for i in range(self.no_repeat_ngram_size)]):
gen_ngrams[bbsz_idx][tuple(ngram[:-1])] = \
gen_ngrams[bbsz_idx].get(tuple(ngram[:-1]), []) + [ngram[-1]]
# Record attention scores
if avg_attn_scores is not None:
if attn is None:
attn = scores.new(bsz * beam_size, src_tokens.size(1), max_len + 2)
attn_buf = attn.clone()
nonpad_idxs = src_tokens.ne(self.pad)
attn[:, :, step + 1].copy_(avg_attn_scores)
scores = scores.type_as(fw_lprobs)
scores_buf = scores_buf.type_as(fw_lprobs)
self.search.set_src_lengths(src_lengths_no_eos)
if self.no_repeat_ngram_size > 0:
def calculate_banned_tokens(bbsz_idx):
# before decoding the next token, prevent decoding of ngrams that have already appeared
ngram_index = tuple(tokens[bbsz_idx, step + 2 - self.no_repeat_ngram_size:step + 1].tolist())
return gen_ngrams[bbsz_idx].get(ngram_index, [])
if step + 2 - self.no_repeat_ngram_size >= 0:
# no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
banned_tokens = [calculate_banned_tokens(bbsz_idx) for bbsz_idx in range(bsz * beam_size)]
else:
banned_tokens = [[] for bbsz_idx in range(bsz * beam_size)]
# NOTE(review): the ban is applied to fw_lprobs only, not to ch_lm_lprobs
# or lm_lprobs - confirm this is intended
for bbsz_idx in range(bsz * beam_size):
fw_lprobs[bbsz_idx, banned_tokens[bbsz_idx]] = -math.inf
# pick the 2 x beam_size best candidates per sentence; the returned tensors
# hold their combined scores, cumulative fw scores, LM scores, token
# indices and originating beams
combined_noisy_channel_scores, fw_lprobs_top_k, lm_lprobs_top_k, cand_indices, cand_beams = self.search.step(
step,
fw_lprobs.view(bsz, -1, self.vocab_size),
scores.view(bsz, beam_size, -1)[:, :, :step], ch_lm_lprobs.view(bsz, -1, self.vocab_size),
lm_lprobs.view(bsz, -1, self.vocab_size), self.combine_method
)
# cand_bbsz_idx contains beam indices for the top candidate
# hypotheses, with a range of values: [0, bsz*beam_size),
# and dimensions: [bsz, cand_size]
cand_bbsz_idx = cand_beams.add(bbsz_offsets)
# finalize hypotheses that end in eos (except for candidates to be ignored)
eos_mask = cand_indices.eq(self.eos)
eos_mask[:, :beam_size] &= ~cands_to_ignore
# only consider eos when it's among the top beam_size indices
eos_bbsz_idx = torch.masked_select(
cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size]
)
finalized_sents = set()
if eos_bbsz_idx.numel() > 0:
eos_scores = torch.masked_select(
fw_lprobs_top_k[:, :beam_size], mask=eos_mask[:, :beam_size]
)
combined_noisy_channel_eos_scores = torch.masked_select(
combined_noisy_channel_scores[:, :beam_size],
mask=eos_mask[:, :beam_size],
)
# finalize hypo using channel model score
finalized_sents = finalize_hypos(
step, eos_bbsz_idx, eos_scores, combined_noisy_channel_eos_scores)
num_remaining_sent -= len(finalized_sents)
assert num_remaining_sent >= 0
if num_remaining_sent == 0:
break
if len(finalized_sents) > 0:
# shrink every per-sentence tensor so that it only holds the
# sentences that are still unfinished
new_bsz = bsz - len(finalized_sents)
# construct batch_idxs which holds indices of batches to keep for the next pass
batch_mask = cand_indices.new_ones(bsz)
batch_mask[cand_indices.new(finalized_sents)] = 0
batch_idxs = torch.nonzero(batch_mask).squeeze(-1)
eos_mask = eos_mask[batch_idxs]
cand_beams = cand_beams[batch_idxs]
bbsz_offsets.resize_(new_bsz, 1)
cand_bbsz_idx = cand_beams.add(bbsz_offsets)
lm_lprobs_top_k = lm_lprobs_top_k[batch_idxs]
fw_lprobs_top_k = fw_lprobs_top_k[batch_idxs]
cand_indices = cand_indices[batch_idxs]
if prefix_tokens is not None:
prefix_tokens = prefix_tokens[batch_idxs]
src_lengths_no_eos = src_lengths_no_eos[batch_idxs]
cands_to_ignore = cands_to_ignore[batch_idxs]
scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
scores_buf.resize_as_(scores)
tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
tokens_buf.resize_as_(tokens)
src_tokens = src_tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
src_lengths = src_lengths.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
lm_prefix_scores = lm_prefix_scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1).squeeze()
if attn is not None:
attn = attn.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, attn.size(1), -1)
attn_buf.resize_as_(attn)
bsz = new_bsz
else:
batch_idxs = None
# Set active_mask so that values > cand_size indicate eos or
# ignored hypos and values < cand_size indicate candidate
# active hypos. After this, the min values per row are the top
# candidate active hypos.
eos_mask[:, :beam_size] |= cands_to_ignore
active_mask = torch.add(
eos_mask.type_as(cand_offsets) * cand_size,
cand_offsets[: eos_mask.size(1)],
)
# get the top beam_size active hypotheses, which are just the hypos
# with the smallest values in active_mask
active_hypos, new_cands_to_ignore = buffer('active_hypos'), buffer('new_cands_to_ignore')
torch.topk(
active_mask, k=beam_size, dim=1, largest=False,
out=(new_cands_to_ignore, active_hypos)
)
# update cands_to_ignore to ignore any finalized hypos
cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size]
assert (~cands_to_ignore).any(dim=1).all()
active_bbsz_idx = buffer('active_bbsz_idx')
torch.gather(
cand_bbsz_idx, dim=1, index=active_hypos,
out=active_bbsz_idx,
)
active_scores = torch.gather(
fw_lprobs_top_k, dim=1, index=active_hypos,
out=scores[:, step].view(bsz, beam_size),
)
active_bbsz_idx = active_bbsz_idx.view(-1)
active_scores = active_scores.view(-1)
# copy tokens and scores for active hypotheses
torch.index_select(
tokens[:, :step + 1], dim=0, index=active_bbsz_idx,
out=tokens_buf[:, :step + 1],
)
torch.gather(
cand_indices, dim=1, index=active_hypos,
out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1],
)
if step > 0:
torch.index_select(
scores[:, :step], dim=0, index=active_bbsz_idx,
out=scores_buf[:, :step],
)
torch.gather(
fw_lprobs_top_k, dim=1, index=active_hypos,
out=scores_buf.view(bsz, beam_size, -1)[:, :, step],
)
# store the LM score of every surviving hypothesis; it becomes the LM
# prefix score used by noisy_channel_rescoring at the next step
torch.gather(
lm_lprobs_top_k, dim=1, index=active_hypos,
out=lm_prefix_scores.view(bsz, beam_size)
)
# copy attention for active hypotheses
if attn is not None:
torch.index_select(
attn[:, :, :step + 2], dim=0, index=active_bbsz_idx,
out=attn_buf[:, :, :step + 2],
)
# swap buffers
tokens, tokens_buf = tokens_buf, tokens
scores, scores_buf = scores_buf, scores
if attn is not None:
attn, attn_buf = attn_buf, attn
# reorder incremental state in decoder
reorder_state = active_bbsz_idx
# sort by score descending
for sent in range(len(finalized)):
finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True)
return finalized
def get_lm_scores(model, input_tokens, incremental_states, cand_tokens, input_len, k):
    """Look up the language-model score of every candidate token.

    Args:
        model: object exposing ``forward_decoder(tokens, encoder_outs=...,
            incremental_states=...)`` that returns ``(lprobs, attn)``.
        input_tokens: token prefixes fed to the language model.
        incremental_states: incremental decoding state passed to the model.
        cand_tokens: ``(num_rows * k, 1)`` indices of the candidates whose
            score is wanted; the k candidates of a row are consecutive.
        input_len: accepted for interface compatibility; not used.
        k: number of candidates per row of the model output.

    Returns:
        1-D tensor holding one score per candidate.
    """
    with torch.no_grad():
        lm_lprobs, _ = model.forward_decoder(
            input_tokens,
            encoder_outs=None,
            incremental_states=incremental_states,
        )
        num_rows = lm_lprobs.size(0)
        # repeat each row of scores k times, once per candidate of that row
        per_candidate = lm_lprobs.repeat(1, k).view(num_rows * k, -1)
        picked = torch.gather(per_candidate, 1, cand_tokens)
        return picked.view(-1)
def make_dict2dict(old_dict, new_dict):
    """Build a mapping from indices in ``old_dict`` to indices in ``new_dict``.

    Every symbol listed in ``old_dict.symbols`` is looked up with ``index()``
    in both dictionaries; the result maps the old index to the new one.
    """
    return {
        old_dict.index(symbol): new_dict.index(symbol)
        for symbol in old_dict.symbols
    }
def dict2dict(tokens, dict2dict_map):
    """Translate every index in ``tokens`` through ``dict2dict_map``.

    Args:
        tokens: tensor of dictionary indices (any device).
        dict2dict_map: mapping from old index to new index, e.g. as built by
            ``make_dict2dict``.

    Returns:
        A new tensor on the device of ``tokens`` holding the translated
        indices. ``tokens`` itself is left untouched.
    """
    # ``map_`` works in place and only on CPU tensors, so always operate on a
    # private CPU copy. Reusing a CPU input directly (as was done before)
    # overwrote the caller's tensor, including any tensor it is a view of.
    if tokens.device == torch.device('cpu'):
        tokens_tmp = tokens.clone()
    else:
        tokens_tmp = tokens.cpu()
    return tokens_tmp.map_(
        tokens_tmp,
        lambda _, val, dict2dict_map=dict2dict_map: dict2dict_map[float(val)]
    ).to(tokens.device)
def reorder_tokens(tokens, lengths, eos):
    """Rearrange one source sentence so it can serve as reference for P(S|T).

    The result is ``eos`` followed by the last ``lengths`` tokens without
    their final element, followed by the leading ``len(tokens) - lengths``
    tokens.
    """
    leading_eos = tokens.new([eos])
    words = tokens[-lengths:-1]
    remainder = tokens[:-lengths]
    return torch.cat((leading_eos, words, remainder), 0)
def reorder_all_tokens(tokens, lengths, eos):
    """Apply ``reorder_tokens`` to every row of a batch and stack the results.

    Used to turn source tokens laid out as ``[<pad> <w1> <w2> .. <eos>]``
    into ``[<eos> <w1> <w2> ... <pad>]`` so they can be used to predict
    P(S|T).

    Args:
        tokens: iterable of 1-D token tensors (e.g. the rows of a 2-D tensor).
        lengths: per-row sentence lengths, iterated in lockstep with ``tokens``.
        eos: value placed at the front of every reordered row.

    Returns:
        The reordered rows stacked with ``torch.stack``.
    """
    reordered_rows = []
    for row, row_length in zip(tokens, lengths):
        reordered_rows.append(reorder_tokens(row, row_length, eos))
    return torch.stack(reordered_rows)
def normalized_scores_with_batch_vocab(
    model_decoder, features, target_ids, k, bsz, beam_size,
    pad_idx, top_k=0, vocab_size_meter=None, start_idx=None,
    end_idx=None, **kwargs):
    """
    Get normalized log-probabilities from a net's output w.r.t. a reduced
    vocabulary made of the target IDs in the batch plus IDs ``0..top_k-1``.

    The output projection is restricted to that reduced vocabulary, a
    log-softmax (computed in float32) is taken over it, and the score of each
    next target token is gathered. Scores at padding positions are zeroed.

    Args:
        model_decoder: decoder exposing ``adaptive_softmax`` (must be
            ``None``) and ``output_projection.weight``.
        features: decoder features fed to the restricted projection; indexed
            as a 3-D tensor (rows, time, dim).
        target_ids: 2-D tensor of target token IDs in the full vocabulary.
            It is never modified by this function.
        k: number of times each target row is repeated.
        bsz: batch size.
        beam_size: beam size; the repeated targets are viewed as
            ``(bsz * beam_size * k, -1)``.
        pad_idx: padding ID in the full vocabulary.
        top_k: IDs ``0..top_k-1`` are always added to the reduced vocabulary.
        vocab_size_meter: unused here; kept in the signature.
        start_idx: with ``end_idx``, selects a row slice of the expanded targets.
        end_idx: see ``start_idx``.
        **kwargs: ignored.

    Returns:
        Tensor of shape (rows, time - 1) with per-token log-probabilities,
        0 at padding positions.

    Raises:
        ValueError: if the decoder uses adaptive softmax.
    """
    if model_decoder.adaptive_softmax is not None:
        raise ValueError("adaptive softmax doesn't work with " +
                         "`normalized_scores_with_batch_vocab()`")

    weight = model_decoder.output_projection.weight
    # torch.unique returns sorted values, so positions in vocab_ids define
    # the reduced-vocabulary index of every kept ID.
    vocab_ids = torch.unique(
        torch.cat(
            (torch.unique(target_ids), torch.arange(top_k, device=target_ids.device))
        )
    )
    id_map = {vocab_id: pos for pos, vocab_id in enumerate(vocab_ids.tolist())}

    # apply_ works in place and only on CPU tensors. Force a copy so that a
    # CPU-resident target_ids is not overwritten for the caller (Tensor.cpu()
    # returns the very same tensor when it already lives on the CPU).
    mapped_target_ids = target_ids.to(device="cpu", copy=True).apply_(
        lambda x: id_map[x]
    ).to(target_ids.device)

    # Padding has to be detected in the reduced-vocabulary index space: the
    # remapped targets no longer contain the original pad_idx value. -1 can
    # never match, which is right when the batch holds no padding at all.
    mapped_pad_idx = id_map.get(pad_idx, -1)

    expanded_target_ids = mapped_target_ids.repeat(1, k).view(bsz * beam_size * k, -1)
    if start_idx is not None and end_idx is not None:
        expanded_target_ids = expanded_target_ids[start_idx:end_idx, :]

    logits = F.linear(features, weight[vocab_ids, :])
    log_softmax = F.log_softmax(logits, dim=-1, dtype=torch.float32)

    # Position t of the output predicts target token t + 1.
    next_token_ids = expanded_target_ids[:, 1:]
    # Squeeze only the gathered vocab axis; a bare squeeze() would also drop
    # a size-1 row or time axis and break the masking below.
    intermed_scores = torch.gather(
        log_softmax[:, :-1, :],
        2,
        next_token_ids.unsqueeze(2),
    ).squeeze(2)

    not_padding = next_token_ids != mapped_pad_idx
    intermed_scores *= not_padding.float()
    return intermed_scores
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from fairseq.tasks.translation import TranslationTask
from fairseq.tasks.language_modeling import LanguageModelingTask
from fairseq import checkpoint_utils
import argparse
from fairseq.tasks import register_task
import torch
@register_task("noisy_channel_translation")
class NoisyChannelTranslation(TranslationTask):
    """
    Translation task that rescores the top k candidates from each beam
    using noisy channel modeling.
    """

    @staticmethod
    def add_args(parser):
        """Add the TranslationTask arguments plus the noisy-channel options."""
        TranslationTask.add_args(parser)
        # fmt: off
        parser.add_argument(
            '--channel-model', metavar='FILE',
            help='path to P(S|T) model. P(S|T) and P(T|S) must share source and target dictionaries.',
        )
        parser.add_argument(
            '--combine-method', default='lm_only',
            choices=['lm_only', 'noisy_channel'],
            help="method for combining direct and channel model scores.\n"
                 "lm_only: decode with P(T|S)P(T)\n"
                 "noisy_channel: decode with 1/t P(T|S) + 1/s(P(S|T)P(T))",
        )
        parser.add_argument(
            '--normalize-lm-scores-by-tgt-len', action='store_true', default=False,
            help='normalize lm score by target length instead of source length',
        )
        parser.add_argument(
            '--channel-scoring-type', default='log_norm',
            choices=['unnormalized', 'log_norm', 'k2_separate', 'src_vocab', 'src_vocab_batched'],
            help="Normalize bw scores with log softmax or return bw scores without log softmax",
        )
        parser.add_argument(
            '--top-k-vocab', default=0, type=int,
            help='top k vocab IDs to use with `src_vocab` in channel model scoring',
        )
        parser.add_argument(
            '--k2', default=50, type=int,
            help='the top k2 candidates to rescore with the noisy channel model for each beam',
        )
        parser.add_argument(
            '--ch-wt', default=1, type=float,
            help='weight for the channel model',
        )
        parser.add_argument(
            '--lm-model', metavar='FILE',
            help='path to lm model file, to model P(T). P(T) must share the same vocab as the direct model on the target side',
        )
        parser.add_argument(
            '--lm-data', metavar='FILE',
            help='path to lm model training data for target language, used to properly load LM with correct dictionary',
        )
        parser.add_argument(
            '--lm-wt', default=1, type=float,
            help='the weight of the lm in joint decoding',
        )
        # fmt: on

    def build_generator(
        self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None
    ):
        """Create the NoisyChannelSequenceGenerator for this task.

        Loads the language-model ensemble (required) and, when
        ``--channel-model`` is given, the channel-model ensemble, prepares
        them for generation and hands them to the generator together with
        the decoding options.

        ``models``, ``seq_gen_cls`` and ``extra_gen_cls_kwargs`` are accepted
        but not used by this method.

        Raises:
            NotImplementedError: if ``args.score_reference`` is set.
        """
        if getattr(args, "score_reference", False):
            raise NotImplementedError()

        from .noisy_channel_sequence_generator import NoisyChannelSequenceGenerator

        use_cuda = torch.cuda.is_available() and not self.args.cpu
        assert self.args.lm_model is not None, '--lm-model required for noisy channel generation!'
        assert self.args.lm_data is not None, '--lm-data required for noisy channel generation to map between LM and bitext vocabs'

        def _prepare_for_generation(ensemble):
            # Identical per-model setup for the channel and LM ensembles.
            for model in ensemble:
                model.make_generation_fast_(
                    beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                    need_attn=args.print_alignment,
                )
                if self.args.fp16:
                    model.half()
                if use_cuda:
                    model.cuda()

        has_channel_model = self.args.channel_model is not None

        if has_channel_model:
            import copy

            # The channel model translates target -> source, so its task is
            # set up on a copy of the args with the two languages swapped.
            ch_args_task = copy.deepcopy(self.args)
            ch_args_task.source_lang, ch_args_task.target_lang = (
                ch_args_task.target_lang,
                ch_args_task.source_lang,
            )
            ch_args_task._name = 'translation'
            channel_task = TranslationTask.setup_task(ch_args_task)

        # A language-modeling task is set up on --lm-data to obtain the LM's
        # own output dictionary.
        lm_args = argparse.Namespace(
            task='language_modeling',
            sample_break_mode='eos',
            data=self.args.lm_data,
            output_dictionary_size=-1,
        )
        lm_task = LanguageModelingTask.setup_task(lm_args)
        lm_dict = lm_task.output_dictionary

        if has_channel_model:
            channel_models, _ = checkpoint_utils.load_model_ensemble(
                self.args.channel_model.split(':'), task=channel_task
            )
            _prepare_for_generation(channel_models)
        else:
            channel_models = None

        lm_models, _ = checkpoint_utils.load_model_ensemble(
            self.args.lm_model.split(':'), task=lm_task
        )
        _prepare_for_generation(lm_models)

        return NoisyChannelSequenceGenerator(
            combine_method=self.args.combine_method,
            tgt_dict=self.target_dictionary,
            src_dict=self.source_dictionary,
            beam_size=getattr(args, 'beam', 5),
            max_len_a=getattr(args, 'max_len_a', 0),
            max_len_b=getattr(args, 'max_len_b', 200),
            min_len=getattr(args, 'min_len', 1),
            len_penalty=getattr(args, 'lenpen', 1),
            unk_penalty=getattr(args, 'unkpen', 0),
            temperature=getattr(args, 'temperature', 1.),
            match_source_len=getattr(args, 'match_source_len', False),
            no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0),
            normalize_scores=(not getattr(args, 'unnormalized', False)),
            channel_models=channel_models,
            k2=getattr(self.args, 'k2', 50),
            ch_weight=getattr(self.args, 'ch_wt', 1),
            channel_scoring_type=self.args.channel_scoring_type,
            top_k_vocab=self.args.top_k_vocab,
            lm_models=lm_models,
            lm_dict=lm_dict,
            lm_weight=getattr(self.args, 'lm_wt', 1),
            normalize_lm_scores_by_tgt_len=getattr(self.args, 'normalize_lm_scores_by_tgt_len', False),
        )
<p align="center">
<img src="flores_logo.png" width="500">
</p>
# Flores101: Large-Scale Multilingual Machine Translation
## Introduction
Baseline pretrained models for small and large tracks of WMT 21 Large-Scale Multilingual Machine Translation competition.
Flores Task at WMT 21: http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html
Flores announcement blog post: https://ai.facebook.com/blog/flores-researchers-kick-off-multilingual-translation-challenge-at-wmt-and-call-for-compute-grants/
## Pretrained models
Model | Num layers | Embed dimension | FFN dimension| Vocab Size | #params | Download
---|---|---|---|---|---|---
`flores101_mm100_615M` | 12 | 1024 | 4096 | 256,000 | 615M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz
`flores101_mm100_175M` | 6 | 512 | 2048 | 256,000 | 175M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz
These models are trained similarly to [M2M-100](https://arxiv.org/abs/2010.11125), with additional support for the languages that are part of the WMT Large-Scale Multilingual Machine Translation track. The full list of languages can be found at the bottom.
## Example Generation code
### Download model, sentencepiece vocab
```bash
fairseq=/path/to/fairseq
cd $fairseq
# Download 615M param model.
wget https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz
# Extract
tar -xvzf flores101_mm100_615M.tar.gz
```
### Encode using our SentencePiece Model
Note: Install SentencePiece from [here](https://github.com/google/sentencepiece)
```bash
fairseq=/path/to/fairseq
cd $fairseq
# Download example dataset From German to French
sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de
sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr
for lang in de fr ; do
python scripts/spm_encode.py \
--model flores101_mm100_615M/sentencepiece.bpe.model \
--output_format=piece \
--inputs=raw_input.de-fr.${lang} \
--outputs=spm.de-fr.${lang}
done
```
### Binarization
```bash
fairseq-preprocess \
--source-lang de --target-lang fr \
--testpref spm.de-fr \
--thresholdsrc 0 --thresholdtgt 0 \
--destdir data_bin \
--srcdict flores101_mm100_615M/dict.txt --tgtdict flores101_mm100_615M/dict.txt
```
### Generation
```bash
fairseq-generate \
data_bin \
--batch-size 1 \
--path flores101_mm100_615M/model.pt \
--fixed-dictionary flores101_mm100_615M/dict.txt \
-s de -t fr \
--remove-bpe 'sentencepiece' \
--beam 5 \
--task translation_multi_simple_epoch \
--lang-pairs flores101_mm100_615M/language_pairs.txt \
--decoder-langtok --encoder-langtok src \
--gen-subset test \
--fp16 \
--dataset-impl mmap \
--distributed-world-size 1 --distributed-no-spawn
```
### Supported Languages and lang code
Language | lang code
---|---
Afrikaans | af
Amharic | am
Arabic | ar
Assamese | as
Asturian | ast
Aymara | ay
Azerbaijani | az
Bashkir | ba
Belarusian | be
Bulgarian | bg
Bengali | bn
Breton | br
Bosnian | bs
Catalan | ca
Cebuano | ceb
Chokwe | cjk
Czech | cs
Welsh | cy
Danish | da
German | de
Dyula| dyu
Greek | el
English | en
Spanish | es
Estonian | et
Persian | fa
Fulah | ff
Finnish | fi
French | fr
Western Frisian | fy
Irish | ga
Scottish Gaelic | gd
Galician | gl
Gujarati | gu
Hausa | ha
Hebrew | he
Hindi | hi
Croatian | hr
Haitian Creole | ht
Hungarian | hu
Armenian | hy
Indonesian | id
Igbo | ig
Iloko | ilo
Icelandic | is
Italian | it
Japanese | ja
Javanese | jv
Georgian | ka
Kachin | kac
Kamba | kam
Kabuverdianu | kea
Kongo | kg
Kazakh | kk
Central Khmer | km
Kimbundu | kmb
Northern Kurdish | kmr
Kannada | kn
Korean | ko
Kurdish | ku
Kyrgyz | ky
Luxembourgish | lb
Ganda | lg
Lingala | ln
Lao | lo
Lithuanian | lt
Luo | luo
Latvian | lv
Malagasy | mg
Maori | mi
Macedonian | mk
Malayalam | ml
Mongolian | mn
Marathi | mr
Malay | ms
Maltese | mt
Burmese | my
Nepali | ne
Dutch | nl
Norwegian | no
Northern Sotho | ns
Nyanja | ny
Occitan | oc
Oromo | om
Oriya | or
Punjabi | pa
Polish | pl
Pashto | ps
Portuguese | pt
Quechua | qu
Romanian | ro
Russian | ru
Sindhi | sd
Shan | shn
Sinhala | si
Slovak | sk
Slovenian | sl
Shona | sn
Somali | so
Albanian | sq
Serbian | sr
Swati | ss
Sundanese | su
Swedish | sv
Swahili | sw
Tamil | ta
Telugu | te
Tajik | tg
Thai | th
Tigrinya | ti
Tagalog | tl
Tswana | tn
Turkish | tr
Ukrainian | uk
Umbundu | umb
Urdu | ur
Uzbek | uz
Vietnamese | vi
Wolof | wo
Xhosa | xh
Yiddish | yi
Yoruba | yo
Chinese| zh
Zulu | zu
# Fully Sharded Data Parallel (FSDP)
## Overview
Recent work by [Microsoft](https://arxiv.org/abs/1910.02054) and
[Google](https://arxiv.org/abs/2004.13336) has shown that data parallel
training can be made significantly more efficient by sharding the model
parameters and optimizer state across data parallel workers. These ideas are
encapsulated in the new **`FullyShardedDataParallel` (FSDP)** wrapper provided
by [fairscale](https://github.com/facebookresearch/fairscale/).
Compared to PyTorch DDP:
* FSDP produces identical results as PyTorch DDP (it's still synchronous data parallel training)
* FSDP shards parameters (FP16 + FP32) and optimizer state across data parallel GPUs
* FSDP is faster than PyTorch DDP because the optimizer step is sharded, and the communication can be overlapped with the forward pass
* FSDP enables training 13B parameter models on 8 GPUs and 175B parameter models on 128 GPUs
FSDP is fully supported in fairseq via the following new arguments:
* `--ddp-backend=fully_sharded`: enables full sharding via FSDP
* `--cpu-offload`: offloads the optimizer state and FP32 model copy to CPU (combine with `--optimizer=cpu_adam`)
* `--no-reshard-after-forward`: increases training speed for large models (1B+ params) and is similar to ZeRO stage 2
* other popular options (`--fp16`, `--update-freq`, `--checkpoint-activations`, `--offload-activations`, etc.) continue to work as normal
<details><summary>Limitations</summary><p>
FSDP currently has several limitations compared to fairseq's default DDP backend (PyTorch DDP):
* while FSDP is fully compatible with pointwise Optimizers (e.g., Adam, AdamW, Adadelta, Adamax, SGD, etc.), it is not currently compatible with non-pointwise Optimizers (e.g., Adagrad, Adafactor, LAMB, etc.)
* FSDP depends on flattening the parameters, so models that currently require `--fp16-no-flatten-grads` may not be supported
See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed
explanation of these and other limitations.
</p></details>
<details><summary>How it works</summary><p>
<img width="800" alt="Fully Sharded Data Parallel" src="https://user-images.githubusercontent.com/231798/110406775-c2de0000-8050-11eb-9718-fbfc4510a76a.png">
See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed
explanation of how FSDP works.
</p></details>
## Example usage
The following examples illustrate how to train a very large language model with
13 billion parameters on 1 GPU by offloading parameters and optimizer states to
CPU, or on 8 GPUs by fully sharding the params and optimizer states across GPUs.
These examples use the WikiText-103 dataset for demonstration purposes, but
in practice a much larger dataset will be needed to achieve good results.
Follow the [instructions here](https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.pretraining.md#1-preprocess-the-data)
to preprocess the WikiText-103 dataset using the GPT-2/RoBERTa vocabulary.
### 13B params on 1 V100 GPU (with CPU offloading)
The following command trains a 13B parameter GPT-3 model on a single V100 GPU
using the `--cpu-offload` feature to offload parameters and optimizer states to
CPU. In this setting, the optimizer step (Adam) happens on CPU. We also use the
`--checkpoint-activations` feature (sometimes called [gradient checkpointing](https://pytorch.org/docs/stable/checkpoint.html)),
which further saves memory in exchange for a small increase in computation.
**Requirements:**
- Install the latest master version of fairscale: `pip install git+https://github.com/facebookresearch/fairscale.git@master`
- You'll need 32GB of GPU memory and ~256GB of system memory to train the 13B param model.
- If you have less system memory, the 6.7B param model can be trained with ~128GB of system memory, just set `--arch transformer_lm_gpt3_6_7`
- We use the CPU Adam optimizer from [DeepSpeed](https://github.com/microsoft/DeepSpeed), so you'll need to `pip install deepspeed` before running the command.
**Notes:**
- The command will take ~5 minutes to start training, during which time it will appear to be hung, since randomly initializing 13B weights can be slow.
- The `--cpu-offload` feature requires training in mixed precision (`--fp16`).
- Tune the `OMP_NUM_THREADS` env variable for best performance with CPU offloading.
- The example command below stops training after 10 steps (`--max-update 10`) and does not save checkpoints (`--no-save`).
```bash
OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0 \
fairseq-train data-bin/wikitext-103-roberta-bpe-bin \
--ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
--cpu-offload --checkpoint-activations \
--task language_modeling --tokens-per-sample 2048 --batch-size 8 \
--arch transformer_lm_gpt3_13 \
--optimizer cpu_adam --adam-betas "(0.9,0.98)" \
--lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
--max-update 10 --no-save --log-format json --log-interval 1
```
<details><summary>Example output</summary><p>
```
(...)
2021-03-08 12:29:51 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920)
(...)
2021-03-08 12:29:51 | INFO | fairseq_cli.train | training on 1 devices (GPUs/TPUs)
2021-03-08 12:29:51 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8
(...)
Adam Optimizer #0 is created with AVX2 arithmetic capability.
Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1
(...)
2021-03-08 12:31:36 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.475", "ppl": "91120.8", "wps": "0", "ups": "0", "wpb": "16384", "bsz": "8", "num_updates": "1", "lr": "2e-05", "gnorm": "20.751", "loss_scale": "4", "train_wall": "99", "gb_free": "9.3", "wall": "105"}
2021-03-08 12:32:33 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.446", "ppl": "89281.6", "wps": "288.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "2", "lr": "4e-05", "gnorm": "19.777", "loss_scale": "4", "train_wall": "57", "gb_free": "9.3", "wall": "161"}
2021-03-08 12:33:12 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0
2021-03-08 12:33:51 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0
2021-03-08 12:34:45 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "25.22", "ppl": "3.90691e+07", "wps": "123.4", "ups": "0.01", "wpb": "16384", "bsz": "8", "num_updates": "3", "lr": "6e-05", "gnorm": "131.281", "loss_scale": "1", "train_wall": "133", "gb_free": "9.3", "wall": "294"}
2021-03-08 12:35:43 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.079", "ppl": "276809", "wps": "285.5", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "4", "lr": "8e-05", "gnorm": "13.776", "loss_scale": "1", "train_wall": "57", "gb_free": "9.3", "wall": "351"}
2021-03-08 12:36:35 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "23.729", "ppl": "1.39088e+07", "wps": "316.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "5", "lr": "0.0001", "gnorm": "72.774", "loss_scale": "1", "train_wall": "52", "gb_free": "9.3", "wall": "403"}
2021-03-08 12:37:28 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "20.429", "ppl": "1.41203e+06", "wps": "307.6", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "6", "lr": "8e-05", "gnorm": "60.846", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "456"}
2021-03-08 12:38:27 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.965", "ppl": "511684", "wps": "279.4", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "7", "lr": "6e-05", "gnorm": "22.687", "loss_scale": "1", "train_wall": "59", "gb_free": "9.3", "wall": "515"}
2021-03-08 12:39:18 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.345", "ppl": "332887", "wps": "319.1", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "8", "lr": "4e-05", "gnorm": "8.451", "loss_scale": "1", "train_wall": "51", "gb_free": "9.3", "wall": "566"}
2021-03-08 12:40:11 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "18.262", "ppl": "314336", "wps": "305.9", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "9", "lr": "2e-05", "gnorm": "6.457", "loss_scale": "1", "train_wall": "54", "gb_free": "9.3", "wall": "620"}
2021-03-08 12:41:04 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "17.556", "ppl": "192686", "wps": "311.8", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "10", "lr": "0", "gnorm": "5.796", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "673"}
2021-03-08 12:41:04 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10
2021-03-08 12:41:04 | INFO | fairseq_cli.train | begin validation on "valid" subset
2021-03-08 12:43:15 | INFO | valid | {"epoch": 1, "valid_loss": "17.953", "valid_ppl": "253807", "valid_wps": "1868.4", "valid_wpb": "15400.2", "valid_bsz": "7.6", "valid_num_updates": "10"}
2021-03-08 12:43:15 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below)
2021-03-08 12:43:15 | INFO | train | {"epoch": 1, "train_loss": "19.351", "train_ppl": "668509", "train_wps": "210.9", "train_ups": "0.01", "train_wpb": "16384", "train_bsz": "8", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "36.26", "train_loss_scale": "1", "train_train_wall": "667", "train_gb_free": "9.3", "train_wall": "804"}
2021-03-08 12:43:15 | INFO | fairseq_cli.train | done training in 798.6 seconds
```
</p></details>
### 13B params on 8 V100 GPUs (with full parameter + optimizer state sharding)
FSDP can also shard the parameters and optimizer states across multiple GPUs,
reducing memory requirements significantly. On 8 x 32GB GPUs, sharding enables
training the same 13B parameter model *without offloading the parameters to
CPU*. However, without CPU offloading we'd only be able to fit a batch size of
1 per GPU, which would cause training speed to suffer.
We obtain the best performance on 8 GPUs by combining full sharding and CPU
offloading. The following command trains the same 13B parameter GPT-3 model as
before on 8 x 32GB V100 GPUs; training speed increases superlinearly from ~310
words per second to ~3200 words per second.
```bash
OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
fairseq-train data-bin/wikitext-103-roberta-bpe-bin \
--ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
--cpu-offload --checkpoint-activations \
--task language_modeling --tokens-per-sample 2048 --batch-size 8 \
--arch transformer_lm_gpt3_13 \
--optimizer cpu_adam --adam-betas "(0.9,0.98)" \
--lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
--max-update 10 --no-save --log-format json --log-interval 1
```
<details><summary>Example output</summary><p>
```
(...)
2021-03-08 18:04:09 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920)
(...)
2021-03-08 18:04:09 | INFO | fairseq_cli.train | training on 8 devices (GPUs/TPUs)
2021-03-08 18:04:09 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8
(...)
Adam Optimizer #0 is created with AVX2 arithmetic capability.
Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1
(...)
2021-03-08 18:05:06 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "16.408", "ppl": "86945.6", "wps": "0", "ups": "0", "wpb": "131072", "bsz": "64", "num_updates": "1", "lr": "2e-05", "gnorm": "18.27", "loss_scale": "4", "train_wall": "47", "gb_free": "9.3", "wall": "56"}
2021-03-08 18:05:45 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "16.352", "ppl": "83644.3", "wps": "3283.4", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "2", "lr": "4e-05", "gnorm": "18.411", "loss_scale": "4", "train_wall": "40", "gb_free": "9.3", "wall": "96"}
2021-03-08 18:06:21 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0
2021-03-08 18:06:56 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0
2021-03-08 18:07:37 | INFO | train_inner | {"epoch": 1, "update": 0.006, "loss": "23.682", "ppl": "1.34537e+07", "wps": "1176.6", "ups": "0.01", "wpb": "131072", "bsz": "64", "num_updates": "3", "lr": "6e-05", "gnorm": "119.682", "loss_scale": "1", "train_wall": "111", "gb_free": "9.3", "wall": "208"}
2021-03-08 18:08:18 | INFO | train_inner | {"epoch": 1, "update": 0.007, "loss": "18.988", "ppl": "519921", "wps": "3189.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "4", "lr": "8e-05", "gnorm": "14.934", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "249"}
2021-03-08 18:08:59 | INFO | train_inner | {"epoch": 1, "update": 0.008, "loss": "20.08", "ppl": "1.10798e+06", "wps": "3223.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "5", "lr": "0.0001", "gnorm": "59.92", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "289"}
2021-03-08 18:09:39 | INFO | train_inner | {"epoch": 1, "update": 0.009, "loss": "18.323", "ppl": "327980", "wps": "3256.6", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "6", "lr": "8e-05", "gnorm": "37.425", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "330"}
2021-03-08 18:10:20 | INFO | train_inner | {"epoch": 1, "update": 0.01, "loss": "17.264", "ppl": "157354", "wps": "3188.7", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "7", "lr": "6e-05", "gnorm": "10.824", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "371"}
2021-03-08 18:11:01 | INFO | train_inner | {"epoch": 1, "update": 0.011, "loss": "16.794", "ppl": "113647", "wps": "3230", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "8", "lr": "4e-05", "gnorm": "5.616", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "411"}
2021-03-08 18:11:39 | INFO | train_inner | {"epoch": 1, "update": 0.012, "loss": "16.706", "ppl": "106938", "wps": "3384", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "9", "lr": "2e-05", "gnorm": "5.318", "loss_scale": "1", "train_wall": "39", "gb_free": "9.3", "wall": "450"}
2021-03-08 18:12:19 | INFO | train_inner | {"epoch": 1, "update": 0.013, "loss": "16.548", "ppl": "95796.2", "wps": "3274.4", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "10", "lr": "0", "gnorm": "5.22", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "490"}
2021-03-08 18:12:19 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10
2021-03-08 18:12:19 | INFO | fairseq_cli.train | begin validation on "valid" subset
2021-03-08 18:12:45 | INFO | valid | {"epoch": 1, "valid_loss": "16.624", "valid_ppl": "101000", "valid_wps": "10855.9", "valid_wpb": "123202", "valid_bsz": "60.5", "valid_num_updates": "10"}
2021-03-08 18:12:45 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below)
2021-03-08 18:12:45 | INFO | train | {"epoch": 1, "train_loss": "18.114", "train_ppl": "283776", "train_wps": "2567.8", "train_ups": "0.02", "train_wpb": "131072", "train_bsz": "64", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "29.562", "train_loss_scale": "1", "train_train_wall": "480", "train_gb_free": "9.3", "train_wall": "516"}
2021-03-08 18:12:45 | INFO | fairseq_cli.train | done training in 509.9 seconds
```
</p></details>
# GottBERT: a pure German language model
## Introduction
[GottBERT](http://arxiv.org/abs/2012.02110) is a pretrained language model trained on 145GB of German text based on RoBERTa.
## Example usage
### fairseq
##### Load GottBERT from torch.hub (PyTorch >= 1.1):
```python
import torch
gottbert = torch.hub.load('pytorch/fairseq', 'gottbert-base')
gottbert.eval() # disable dropout (or leave in train mode to finetune)
```
##### Load GottBERT (for PyTorch 1.0 or custom models):
```python
# Download gottbert model
wget https://dl.gottbert.de/fairseq/models/gottbert-base.tar.gz
tar -xzvf gottbert-base.tar.gz
# Load the model in fairseq
from fairseq.models.roberta import GottbertModel
gottbert = GottbertModel.from_pretrained('/path/to/gottbert')
gottbert.eval() # disable dropout (or leave in train mode to finetune)
```
##### Filling masks:
```python
masked_line = 'Gott ist <mask> ! :)'
gottbert.fill_mask(masked_line, topk=3)
# [('Gott ist gut ! :)', 0.3642110526561737, ' gut'),
# ('Gott ist überall ! :)', 0.06009674072265625, ' überall'),
# ('Gott ist großartig ! :)', 0.0370681993663311, ' großartig')]
```
##### Extract features from GottBERT
```python
# Extract the last layer's features
line = "Der erste Schluck aus dem Becher der Naturwissenschaft macht atheistisch , aber auf dem Grunde des Bechers wartet Gott !"
tokens = gottbert.encode(line)
last_layer_features = gottbert.extract_features(tokens)
assert last_layer_features.size() == torch.Size([1, 27, 768])
# Extract all layer's features (layer 0 is the embedding layer)
all_layers = gottbert.extract_features(tokens, return_all_hiddens=True)
assert len(all_layers) == 13
assert torch.all(all_layers[-1] == last_layer_features)
```
## Citation
If you use our work, please cite:
```bibtex
@misc{scheible2020gottbert,
title={GottBERT: a pure German Language Model},
author={Raphael Scheible and Fabian Thomczyk and Patric Tippmann and Victor Jaravine and Martin Boeker},
year={2020},
eprint={2012.02110},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
# HuBERT
## Pre-trained and fine-tuned (ASR) models
Model | Pretraining Data | Finetuning Dataset | Download
|---|---|---|---
HuBERT Base (~95M params) | [Librispeech](http://www.openslr.org/12) 960 hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt)
HuBERT Large (~316M params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt)
HuBERT Extra Large (~1B params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k.pt)
HuBERT Large | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k_finetune_ls960.pt)
HuBERT Extra Large | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt)
## Load a pretrained model
```
ckpt_path = "/path/to/the/checkpoint.pt"
models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path], strict=False)
model = models[0]
```
**Note:** We will follow up with a patch so that you no longer need to pass `strict=False` when loading the checkpoint.
## Train a new model
### Data preparation
Follow the steps in `./simple_kmeans` to create:
- `{train,valid}.tsv` waveform list files
- `{train,valid}.km` frame-aligned pseudo label files.
The `label_rate` is the same as the feature frame rate used for clustering,
which is 100Hz for MFCC features and 50Hz for HuBERT features by default.
### Pre-train a HuBERT model
Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km`
are saved at `/path/to/labels`, and the label rate is 100Hz.
To train a base model (12 layer transformer), run:
```sh
$ python fairseq_cli/hydra_train.py \
--config-dir /path/to/fairseq-py/examples/hubert/config/pretrain \
--config-name hubert_base_librispeech \
task.data=/path/to/data task.label_dir=/path/to/labels model.label_rate=100
```
### Fine-tune a HuBERT model with a CTC loss
Suppose `{train,valid}.tsv` are saved at `/path/to/data`, and their
corresponding character transcripts `{train,valid}.ltr` are saved at
`/path/to/trans`.
To fine-tune a pre-trained HuBERT model at `/path/to/checkpoint`, run:
```sh
$ python fairseq_cli/hydra_train.py \
--config-dir /path/to/fairseq-py/examples/hubert/config/finetune \
--config-name base_10h \
task.data=/path/to/data task.label_dir=/path/to/trans \
model.w2v_path=/path/to/checkpoint
```
### Decode a HuBERT model
Suppose the `test.tsv` and `test.ltr` are the waveform list and transcripts of
the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is
saved at `/path/to/checkpoint`. We support three decoding modes:
- Viterbi decoding: greedy decoding without a language model
- KenLM decoding: decoding with an arpa-format KenLM n-gram language model
- Fairseq-LM decoding: decoding with a Fairseq neural language model
#### Viterbi decoding
`task.normalize` needs to be consistent with the value used during fine-tuning.
Decoding results will be saved at
`/path/to/experiment/directory/decode/viterbi/test`.
```sh
$ python examples/speech_recognition/new/infer.py \
--config-dir /path/to/fairseq-py/examples/hubert/config/decode \
--config-name infer_viterbi \
task.data=/path/to/data \
task.normalize=[true|false] \
decoding.exp_dir=/path/to/experiment/directory \
common_eval.path=/path/to/checkpoint \
dataset.gen_subset=test
```
#### KenLM / Fairseq-LM decoding
Suppose the pronunciation lexicon and the n-gram LM are saved at
`/path/to/lexicon` and `/path/to/arpa`, respectively. Decoding results will be
saved at `/path/to/experiment/directory/decode/kenlm/test`.
```sh
$ python examples/speech_recognition/new/infer.py \
--config-dir /path/to/fairseq-py/examples/hubert/config/decode \
--config-name infer_kenlm \
task.data=/path/to/data \
task.normalize=[true|false] \
decoding.exp_dir=/path/to/experiment/directory \
common_eval.path=/path/to/checkpoint \
dataset.gen_subset=test \
decoding.decoder.lexicon=/path/to/lexicon \
decoding.decoder.lmpath=/path/to/arpa
```
The command above uses the default decoding hyperparameters, which can be found
in `examples/speech_recognition/hydra/decoder.py`. These parameters can be
configured from the command line. For example, to search with a beam size of
500, append `decoding.decoder.beam=500` to the command above.
Important parameters include:
- decoding.decoder.beam
- decoding.decoder.beamthreshold
- decoding.decoder.lmweight
- decoding.decoder.wordscore
- decoding.decoder.silweight
To decode with a Fairseq LM, use `--config-name infer_fsqlm` instead, and
change the path of lexicon and LM accordingly.
# @package _global_
common_eval:
results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset}
hydra:
sweeper:
ax_config:
max_trials: 60
early_stop:
minimize: true
max_epochs_without_improvement: 10
epsilon: 0.025
experiment:
name: ${dataset.gen_subset}
objective_name: wer
minimize: true
parameter_constraints: null
outcome_constraints: null
status_quo: null
client:
verbose_logging: false
random_seed: null
params:
decoding.decoder.lmweight:
type: range
bounds: [0.0, 8.0]
decoding.decoder.wordscore:
type: range
bounds: [-5.0, 5.0]
decoding.decoder.silweight:
type: range
bounds: [-10.0, 0.0]
# @package _global_
common_eval:
results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset}
hydra:
sweeper:
ax_config:
max_trials: 60
early_stop:
minimize: true
max_epochs_without_improvement: 10
epsilon: 0.025
experiment:
name: ${dataset.gen_subset}
objective_name: wer
minimize: true
parameter_constraints: null
outcome_constraints: null
status_quo: null
client:
verbose_logging: false
random_seed: null
params:
decoding.decoder.lmweight:
type: range
bounds: [0.0, 4.0]
decoding.decoder.wordscore:
type: range
bounds: [-5.0, 5.0]
decoding.decoder.silweight:
type: range
bounds: [-8.0, 0.0]
# @package _group_
defaults:
- model: null
hydra:
run:
dir: ${common_eval.results_path}/beam${decoding.decoder.beam}_lmw${decoding.decoder.lmweight}_wrd${decoding.decoder.wordscore}_sil${decoding.decoder.silweight}
sweep:
dir: ${common_eval.results_path}
subdir: beam${decoding.decoder.beam}_th${decoding.decoder.beamthreshold}_lmw${decoding.decoder.lmweight}_wrd${decoding.decoder.wordscore}_sil${decoding.decoder.silweight}
task:
_name: hubert_pretraining
single_target: true
data: ???
normalize: ???
decoding:
type: fairseqlm
lexicon: ???
lmpath: ???
beamthreshold: 25 # 100
beam: 500
lmweight: 2
wordscore: -1
silweight: 0
unique_wer_file: true
beam: 500
common_eval:
results_path: ???
path: ???
post_process: letter
dataset:
max_tokens: 1100000
gen_subset: ???
# @package _group_
defaults:
- model: null
hydra:
run:
dir: ${common_eval.results_path}/beam${decoding.decoder.beam}_lmw${decoding.decoder.lmweight}_wrd${decoding.decoder.wordscore}_sil${decoding.decoder.silweight}
sweep:
dir: ${common_eval.results_path}
subdir: beam${decoding.decoder.beam}_th${decoding.decoder.beamthreshold}_lmw${decoding.decoder.lmweight}_wrd${decoding.decoder.wordscore}_sil${decoding.decoder.silweight}
task:
_name: hubert_pretraining
single_target: true
data: ???
normalize: ???
decoding:
type: kenlm
lexicon: ???
lmpath: ???
beamthreshold: 100
beam: 500
lmweight: 2
wordscore: -1
silweight: 0
unique_wer_file: true
beam: 500
common_eval:
results_path: ???
path: ???
post_process: letter
dataset:
max_tokens: 1100000
gen_subset: ???
# @package _group_
defaults:
- model: null
hydra:
run:
dir: ${common_eval.results_path}/beam${decoding.decoder.beam}_lmw${decoding.decoder.lmweight}_wrd${decoding.decoder.wordscore}_sil${decoding.decoder.silweight}
sweep:
dir: ${common_eval.results_path}
subdir: beam${decoding.decoder.beam}_th${decoding.decoder.beamthreshold}_lmw${decoding.decoder.lmweight}_wrd${decoding.decoder.wordscore}_sil${decoding.decoder.silweight}
task:
_name: hubert_pretraining
single_target: true
data: ???
normalize: ???
decoding:
type: viterbi
unique_wer_file: true
common_eval:
results_path: ???
path: ???
post_process: letter
generation:
nbest: 1
beam: 500
dataset:
max_tokens: 1100000
gen_subset: ???
# @package _global_
hydra:
launcher:
cpus_per_task: ${distributed_training.distributed_world_size}
gpus_per_node: ${distributed_training.distributed_world_size}
tasks_per_node: ${hydra.launcher.gpus_per_node}
nodes: 1
mem_gb: 200
timeout_min: 4320
max_num_timeout: 50
name: ${hydra.job.config_name}
submitit_folder: ${hydra.sweep.dir}/submitit
distributed_training:
distributed_world_size: 1
distributed_no_spawn: true
distributed_port: 29761
# @package _global_
hydra:
launcher:
cpus_per_task: ${distributed_training.distributed_world_size}
gpus_per_node: ${distributed_training.distributed_world_size}
tasks_per_node: ${hydra.launcher.gpus_per_node}
nodes: 1
mem_gb: 200
timeout_min: 4320
max_num_timeout: 50
name: ${hydra.job.config_name}
submitit_folder: ${hydra.sweep.dir}/submitit
distributed_training:
distributed_world_size: 8
distributed_no_spawn: true
distributed_port: 29761
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tblog
seed: 1337
checkpoint:
save_interval: 5
keep_interval_updates: 1
no_epoch_checkpoints: true
best_checkpoint_metric: wer
distributed_training:
ddp_backend: c10d
find_unused_parameters: true
distributed_world_size: 1
distributed_port: 29671
nprocs_per_node: 8
task:
_name: hubert_pretraining
data: ???
fine_tuning: true
label_dir: ???
normalize: false # must be consistent with pre-training
labels: ["ltr"]
single_target: true
dataset:
num_workers: 0
max_tokens: 3200000
validate_after_updates: ${model.freeze_finetune_updates}
validate_interval: 5
train_subset: train
valid_subset: valid
criterion:
_name: ctc
zero_infinity: true
optimization:
max_update: 25000
lr: [2e-5]
sentence_avg: true
update_freq: [1]
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-08
lr_scheduler:
_name: tri_stage
warmup_steps: 8000
hold_steps: 0
decay_steps: 72000
final_lr_scale: 0.05
model:
_name: hubert_ctc
w2v_path: ???
apply_mask: true
mask_selection: static
mask_length: 10
mask_other: 0
mask_prob: 0.75
mask_channel_selection: static
mask_channel_length: 64
mask_channel_other: 0
mask_channel_prob: 0.5
layerdrop: 0.1
dropout: 0.0
activation_dropout: 0.1
attention_dropout: 0.0
feature_grad_mult: 0.0
freeze_finetune_updates: 10000
hydra:
job:
config:
override_dirname:
kv_sep: '-'
item_sep: '__'
exclude_keys:
- run
- task.data
- task.label_dir
- model.w2v_path
- dataset.train_subset
- dataset.valid_subset
- criterion.wer_kenlm_model
- criterion.wer_lexicon
run:
dir: ???
sweep:
dir: ???
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _global_
task:
normalize: false
model:
w2v_path: /checkpoint/wnhsu/w2v/hubert_final/iter1/hubert.km.randcrop.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU400k.s1337.ngpu32/checkpoint_last.pt
# @package _global_
criterion:
wer_kenlm_model: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/4-gram.bin
wer_lexicon: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/10h/raw/lexicon_ltr.lst
wer_lm_weight: 2.0
wer_word_score: -1.0
# @package _global_
hydra:
launcher:
cpus_per_task: 8
gpus_per_node: 8
tasks_per_node: ${hydra.launcher.gpus_per_node}
nodes: 1
comment: null
mem_gb: 384
timeout_min: 4320
max_num_timeout: 100
constraint: volta32gb
name: ${hydra.job.config_name}/${hydra.job.override_dirname}
submitit_folder: ${hydra.sweep.dir}/submitit/%j
distributed_training:
distributed_world_size: 8
distributed_port: 29671
nprocs_per_node: 8
# @package _global_
task:
label_dir: ???
labels: ["km"]
model:
label_rate: 100
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment