Commit 799a38c5 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
Pipeline #616 failed with stages
in 0 seconds
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
import math
from dataclasses import dataclass, field
from typing import Optional
import torch
import torch.nn.functional as F
import numpy as np
from fairseq import metrics, utils
from fairseq.criterions import FairseqCriterion, register_criterion
from fairseq.dataclass import FairseqDataclass
from omegaconf import II
@dataclass
class AdjustLabelSmoothedCrossEntropyCriterionConfig(FairseqDataclass):
label_smoothing: float = field(
default=0.0,
metadata={"help": "epsilon for label smoothing, 0 means no label smoothing"},
)
report_accuracy: bool = field(
default=False,
metadata={"help": "report accuracy metric"},
)
ignore_prefix_size: int = field(
default=0,
metadata={"help": "Ignore first N tokens"},
)
ignore_eos: bool = field(
default=False,
metadata={"help": "Ignore eos token"},
)
sentence_avg: bool = II("optimization.sentence_avg")
drop_worst_ratio: float = field(
default=0.0,
metadata={"help": "ratio for discarding bad samples"},
)
drop_worst_after: int = field(
default=0,
metadata={"help": "steps for discarding bad samples"},
)
use_rdrop: bool = field(
default=False, metadata={"help": "use R-Drop"}
)
reg_alpha: float = field(
default=1.0, metadata={"help": "weight for R-Drop"}
)
sample_patch_num: int = field(
default=196, metadata={"help": "sample patches for v1"}
)
constraint_range: Optional[str] = field(
default=None,
metadata={"help": "constraint range"}
)
def construct_rdrop_sample(x):
if isinstance(x, dict):
for key in x:
x[key] = construct_rdrop_sample(x[key])
return x
elif isinstance(x, torch.Tensor):
return x.repeat(2, *([1] * (x.dim()-1)))
elif isinstance(x, int):
return x * 2
elif isinstance(x, np.ndarray):
return x.repeat(2)
else:
raise NotImplementedError
def kl_loss(p, q):
p_loss = F.kl_div(p, torch.exp(q), reduction='sum')
q_loss = F.kl_div(q, torch.exp(p), reduction='sum')
loss = (p_loss + q_loss) / 2
return loss
def label_smoothed_nll_loss(
lprobs, target, epsilon, update_num, reduce=True,
drop_worst_ratio=0.0, drop_worst_after=0, use_rdrop=False, reg_alpha=1.0,
constraint_masks=None, constraint_start=None, constraint_end=None
):
if target.dim() == lprobs.dim() - 1:
target = target.unsqueeze(-1)
nll_loss = -lprobs.gather(dim=-1, index=target).squeeze(-1)
if constraint_masks is not None:
smooth_loss = -lprobs.masked_fill(~constraint_masks, 0).sum(dim=-1, keepdim=True).squeeze(-1)
eps_i = epsilon / (constraint_masks.sum(1) - 1 + 1e-6)
elif constraint_start is not None and constraint_end is not None:
constraint_range = [0, 1, 2, 3] + list(range(constraint_start, constraint_end))
smooth_loss = -lprobs[:, constraint_range].sum(dim=-1, keepdim=True).squeeze(-1)
eps_i = epsilon / (len(constraint_range) - 1 + 1e-6)
else:
smooth_loss = -lprobs.sum(dim=-1, keepdim=True).squeeze(-1)
eps_i = epsilon / (lprobs.size(-1) - 1)
loss = (1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss
if drop_worst_ratio > 0 and update_num > drop_worst_after:
if use_rdrop:
true_batch_size = loss.size(0) // 2
_, indices = torch.topk(loss[:true_batch_size], k=int(true_batch_size * (1 - drop_worst_ratio)), largest=False)
loss = torch.cat([loss[indices], loss[indices+true_batch_size]])
nll_loss = torch.cat([nll_loss[indices], nll_loss[indices+true_batch_size]])
lprobs = torch.cat([lprobs[indices], lprobs[indices+true_batch_size]])
else:
loss, indices = torch.topk(loss, k=int(loss.shape[0] * (1 - drop_worst_ratio)), largest=False)
nll_loss = nll_loss[indices]
lprobs = lprobs[indices]
ntokens = loss.numel()
nll_loss = nll_loss.sum()
loss = loss.sum()
if use_rdrop:
true_batch_size = lprobs.size(0) // 2
p = lprobs[:true_batch_size]
q = lprobs[true_batch_size:]
if constraint_start is not None and constraint_end is not None:
constraint_range = [0, 1, 2, 3] + list(range(constraint_start, constraint_end))
p = p[:, constraint_range]
q = q[:, constraint_range]
loss += kl_loss(p, q) * reg_alpha
return loss, nll_loss, ntokens
@register_criterion(
"adjust_label_smoothed_cross_entropy", dataclass=AdjustLabelSmoothedCrossEntropyCriterionConfig
)
class AdjustLabelSmoothedCrossEntropyCriterion(FairseqCriterion):
def __init__(
self,
task,
sentence_avg,
label_smoothing,
ignore_prefix_size=0,
ignore_eos=False,
report_accuracy=False,
drop_worst_ratio=0,
drop_worst_after=0,
use_rdrop=False,
reg_alpha=1.0,
sample_patch_num=196,
constraint_range=None
):
super().__init__(task)
self.sentence_avg = sentence_avg
self.eps = label_smoothing
self.ignore_prefix_size = ignore_prefix_size
self.ignore_eos = ignore_eos
self.report_accuracy = report_accuracy
self.drop_worst_ratio = drop_worst_ratio
self.drop_worst_after = drop_worst_after
self.use_rdrop = use_rdrop
self.reg_alpha = reg_alpha
self.sample_patch_num = sample_patch_num
self.constraint_start = None
self.constraint_end = None
if constraint_range is not None:
constraint_start, constraint_end = constraint_range.split(',')
self.constraint_start = int(constraint_start)
self.constraint_end = int(constraint_end)
def forward(self, model, sample, update_num=0, reduce=True):
"""Compute the loss for the given sample.
Returns a tuple with three elements:
1) the loss
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
"""
if isinstance(sample, list):
if self.sample_patch_num > 0:
sample[0]['net_input']['sample_patch_num'] = self.sample_patch_num
loss_v1, sample_size_v1, logging_output_v1 = self.forward(model, sample[0], update_num, reduce)
loss_v2, sample_size_v2, logging_output_v2 = self.forward(model, sample[1], update_num, reduce)
loss = loss_v1 / sample_size_v1 + loss_v2 / sample_size_v2
sample_size = 1
logging_output = {
"loss": loss.data,
"loss_v1": loss_v1.data,
"loss_v2": loss_v2.data,
"nll_loss": logging_output_v1["nll_loss"].data / sample_size_v1 + logging_output_v2["nll_loss"].data / sample_size_v2,
"ntokens": logging_output_v1["ntokens"] + logging_output_v2["ntokens"],
"nsentences": logging_output_v1["nsentences"] + logging_output_v2["nsentences"],
"sample_size": 1,
"sample_size_v1": sample_size_v1,
"sample_size_v2": sample_size_v2,
}
return loss, sample_size, logging_output
if self.use_rdrop:
construct_rdrop_sample(sample)
net_output = model(**sample["net_input"])
loss, nll_loss, ntokens = self.compute_loss(model, net_output, sample, update_num, reduce=reduce)
sample_size = (
sample["target"].size(0) if self.sentence_avg else ntokens
)
logging_output = {
"loss": loss.data,
"nll_loss": nll_loss.data,
"ntokens": sample["ntokens"],
"nsentences": sample["nsentences"],
"sample_size": sample_size,
}
if self.report_accuracy:
n_correct, total = self.compute_accuracy(model, net_output, sample)
logging_output["n_correct"] = utils.item(n_correct.data)
logging_output["total"] = utils.item(total.data)
return loss, sample_size, logging_output
def get_lprobs_and_target(self, model, net_output, sample):
conf = sample['conf'][:, None, None] if 'conf' in sample and sample['conf'] is not None else 1
constraint_masks = None
if "constraint_masks" in sample and sample["constraint_masks"] is not None:
constraint_masks = sample["constraint_masks"]
net_output[0] = net_output[0].masked_fill(~constraint_masks, -math.inf)
if self.constraint_start is not None and self.constraint_end is not None:
net_output[0][:, :, 4:self.constraint_start] = -math.inf
net_output[0][:, :, self.constraint_end:] = -math.inf
lprobs = model.get_normalized_probs(net_output, log_probs=True) * conf
target = model.get_targets(sample, net_output)
if self.ignore_prefix_size > 0:
lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous()
target = target[:, self.ignore_prefix_size :].contiguous()
if constraint_masks is not None:
constraint_masks = constraint_masks[:, self.ignore_prefix_size :, :].contiguous()
if self.ignore_eos:
bsz, seq_len, embed_dim = lprobs.size()
eos_indices = target.eq(self.task.tgt_dict.eos())
lprobs = lprobs[~eos_indices].reshape(bsz, seq_len-1, embed_dim)
target = target[~eos_indices].reshape(bsz, seq_len-1)
if constraint_masks is not None:
constraint_masks = constraint_masks[~eos_indices].reshape(bsz, seq_len-1, embed_dim)
if constraint_masks is not None:
constraint_masks = constraint_masks.view(-1, constraint_masks.size(-1))
return lprobs.view(-1, lprobs.size(-1)), target.view(-1), constraint_masks
def compute_loss(self, model, net_output, sample, update_num, reduce=True):
lprobs, target, constraint_masks = self.get_lprobs_and_target(model, net_output, sample)
if constraint_masks is not None:
constraint_masks = constraint_masks[target != self.padding_idx]
lprobs = lprobs[target != self.padding_idx]
target = target[target != self.padding_idx]
loss, nll_loss, ntokens = label_smoothed_nll_loss(
lprobs,
target,
self.eps,
update_num,
reduce=reduce,
drop_worst_ratio=self.drop_worst_ratio,
drop_worst_after=self.drop_worst_after,
use_rdrop=self.use_rdrop,
reg_alpha=self.reg_alpha,
constraint_masks=constraint_masks,
constraint_start=self.constraint_start,
constraint_end=self.constraint_end
)
return loss, nll_loss, ntokens
def compute_accuracy(self, model, net_output, sample):
lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
mask = target.ne(self.padding_idx)
n_correct = torch.sum(
lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask))
)
total = torch.sum(mask)
return n_correct, total
@classmethod
def reduce_metrics(cls, logging_outputs) -> None:
"""Aggregate logging outputs from data parallel training."""
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
loss_sum_v1 = sum(log.get("loss_v1", 0) for log in logging_outputs)
loss_sum_v2 = sum(log.get("loss_v2", 0) for log in logging_outputs)
nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs)
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
sample_size_v1 = sum(log.get("sample_size_v1", 0) for log in logging_outputs)
sample_size_v2 = sum(log.get("sample_size_v2", 0) for log in logging_outputs)
metrics.log_scalar(
"loss", loss_sum / sample_size, sample_size, round=3
)
metrics.log_scalar(
"loss_v1", loss_sum_v1 / max(sample_size_v1, 1), max(sample_size_v1, 1), round=3
)
metrics.log_scalar(
"loss_v2", loss_sum_v2 / max(sample_size_v2, 1), max(sample_size_v2, 1), round=3
)
metrics.log_scalar(
"nll_loss", nll_loss_sum / sample_size, ntokens, round=3
)
metrics.log_derived(
"ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
)
metrics.log_scalar(
"ntokens", ntokens, 1, round=3
)
metrics.log_scalar(
"nsentences", nsentences, 1, round=3
)
metrics.log_scalar(
"sample_size", sample_size, 1, round=3
)
metrics.log_scalar(
"sample_size_v1", sample_size_v1, 1, round=3
)
metrics.log_scalar(
"sample_size_v2", sample_size_v2, 1, round=3
)
total = utils.item(sum(log.get("total", 0) for log in logging_outputs))
if total > 0:
metrics.log_scalar("total", total)
n_correct = utils.item(
sum(log.get("n_correct", 0) for log in logging_outputs)
)
metrics.log_scalar("n_correct", n_correct)
metrics.log_derived(
"accuracy",
lambda meters: round(
meters["n_correct"].sum * 100.0 / meters["total"].sum, 3
)
if meters["total"].sum > 0
else float("nan"),
)
@staticmethod
def logging_outputs_can_be_summed() -> bool:
"""
Whether the logging outputs returned by `forward` can be summed
across workers prior to calling `reduce_metrics`. Setting this
to True will improves distributed training speed.
"""
return True
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import math
from dataclasses import dataclass, field
from typing import Optional
import torch
import torch.nn.functional as F
import numpy as np
from fairseq import metrics, utils
from fairseq.criterions import FairseqCriterion, register_criterion
from fairseq.dataclass import FairseqDataclass
from omegaconf import II
@dataclass
class AdjustLabelSmoothedEncouragingLossConfig(FairseqDataclass):
label_smoothing: float = field(
default=0.0,
metadata={"help": "epsilon for label smoothing, 0 means no label smoothing"},
)
report_accuracy: bool = field(
default=False,
metadata={"help": "report accuracy metric"},
)
ignore_prefix_size: int = field(
default=0,
metadata={"help": "Ignore first N tokens"},
)
ignore_eos: bool = field(
default=False,
metadata={"help": "Ignore eos token"},
)
sentence_avg: bool = II("optimization.sentence_avg")
drop_worst_ratio: float = field(
default=0.0,
metadata={"help": "ratio for discarding bad samples"},
)
drop_worst_after: int = field(
default=0,
metadata={"help": "steps for discarding bad samples"},
)
use_rdrop: bool = field(
default=False, metadata={"help": "use R-Drop"}
)
reg_alpha: float = field(
default=1.0, metadata={"help": "weight for R-Drop"}
)
sample_patch_num: int = field(
default=196, metadata={"help": "sample patchs for v1"}
)
constraint_range: Optional[str] = field(
default=None,
metadata={"help": "constraint range"}
)
log_end: float = field(
default=0.75,
metadata={"help": "higher log_end is for cases with higher performance,"
" we recommend 0.75 or 0.5 as your first try."}
)
drop_best_ratio: float = field(
default=0.0,
metadata={"help": "ratio for discarding best samples"},
)
drop_best_after: int = field(
default=0,
metadata={"help": "steps for discarding best samples"},
)
def construct_rdrop_sample(x):
if isinstance(x, dict):
for key in x:
x[key] = construct_rdrop_sample(x[key])
return x
elif isinstance(x, torch.Tensor):
return x.repeat(2, *([1] * (x.dim()-1)))
elif isinstance(x, int):
return x * 2
elif isinstance(x, np.ndarray):
return x.repeat(2)
else:
raise NotImplementedError
def kl_loss(p, q):
p_loss = F.kl_div(p, torch.exp(q), reduction='sum')
q_loss = F.kl_div(q, torch.exp(p), reduction='sum')
loss = (p_loss + q_loss) / 2
return loss
def label_smoothed_nll_loss(
lprobs, target, epsilon, update_num, reduce=True,
drop_worst_ratio=0.0, drop_worst_after=0, use_rdrop=False, reg_alpha=1.0,
constraint_masks=None, constraint_start=None, constraint_end=None, drop_best_ratio=0.0,
drop_best_after=0,
):
if target.dim() == lprobs.dim() - 1:
target = target.unsqueeze(-1)
nll_loss = -lprobs.gather(dim=-1, index=target).squeeze(-1)
if constraint_masks is not None:
smooth_loss = -lprobs.masked_fill(~constraint_masks, 0).sum(dim=-1, keepdim=True).squeeze(-1)
eps_i = epsilon / (constraint_masks.sum(1) - 1 + 1e-6)
elif constraint_start is not None and constraint_end is not None:
constraint_range = [0, 1, 2, 3] + list(range(constraint_start, constraint_end))
smooth_loss = -lprobs[:, constraint_range].sum(dim=-1, keepdim=True).squeeze(-1)
eps_i = epsilon / (len(constraint_range) - 1 + 1e-6)
else:
smooth_loss = -lprobs.sum(dim=-1, keepdim=True).squeeze(-1)
eps_i = epsilon / (lprobs.size(-1) - 1)
loss = (1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss
if drop_worst_ratio > 0 and update_num > drop_worst_after:
if use_rdrop:
true_batch_size = loss.size(0) // 2
_, indices = torch.topk(loss[:true_batch_size], k=int(true_batch_size * (1 - drop_worst_ratio)), largest=False)
loss = torch.cat([loss[indices], loss[indices+true_batch_size]])
nll_loss = torch.cat([nll_loss[indices], nll_loss[indices+true_batch_size]])
lprobs = torch.cat([lprobs[indices], lprobs[indices+true_batch_size]])
else:
loss, indices = torch.topk(loss, k=int(loss.shape[0] * (1 - drop_worst_ratio)), largest=False)
nll_loss = nll_loss[indices]
lprobs = lprobs[indices]
target = target[indices]
if update_num > drop_best_after:
loss, indices = torch.topk(loss, k=int(loss.shape[0] * (1 - drop_best_ratio)), largest=True)
nll_loss = nll_loss[indices]
lprobs = lprobs[indices]
target = target[indices]
ntokens = loss.numel()
nll_loss = nll_loss.sum()
loss = loss.sum()
if use_rdrop:
true_batch_size = lprobs.size(0) // 2
p = lprobs[:true_batch_size]
q = lprobs[true_batch_size:]
if constraint_start is not None and constraint_end is not None:
constraint_range = [0, 1, 2, 3] + list(range(constraint_start, constraint_end))
p = p[:, constraint_range]
q = q[:, constraint_range]
loss += kl_loss(p, q) * reg_alpha
return loss, nll_loss, ntokens,lprobs,target
@register_criterion(
"adjust_label_smoothed_encouraging_loss", dataclass=AdjustLabelSmoothedEncouragingLossConfig
)
class AdjustLabelSmoothedEncouragingLossCriterion(FairseqCriterion):
def __init__(
self,
task,
sentence_avg,
label_smoothing,
ignore_prefix_size=0,
ignore_eos=False,
report_accuracy=False,
drop_worst_ratio=0,
drop_worst_after=0,
use_rdrop=False,
reg_alpha=1.0,
sample_patch_num=196,
constraint_range=None,
log_end=0.75,
drop_best_ratio=0.0,
drop_best_after=0,
):
super().__init__(task)
self.sentence_avg = sentence_avg
self.eps = label_smoothing
self.ignore_prefix_size = ignore_prefix_size
self.ignore_eos = ignore_eos
self.report_accuracy = report_accuracy
self.drop_worst_ratio = drop_worst_ratio
self.drop_worst_after = drop_worst_after
self.use_rdrop = use_rdrop
self.reg_alpha = reg_alpha
self.sample_patch_num = sample_patch_num
self.constraint_start = None
self.constraint_end = None
if constraint_range is not None:
constraint_start, constraint_end = constraint_range.split(',')
self.constraint_start = int(constraint_start)
self.constraint_end = int(constraint_end)
self.log_end = log_end
self.drop_best_ratio = drop_best_ratio
self.drop_best_after = drop_best_after
print('el, self.log_end=', self.log_end)
# @staticmethod
# def add_args(parser):
# """Add criterion-specific arguments to the parser."""
# # fmt: off
# parser.add_argument('--log_end', type=float, default=1.0)
def forward(self, model, sample, update_num=0, reduce=True):
"""Compute the loss for the given sample.
Returns a tuple with three elements:
1) the loss
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
"""
if isinstance(sample, list):
if self.sample_patch_num > 0:
sample[0]['net_input']['sample_patch_num'] = self.sample_patch_num
loss_v1, sample_size_v1, logging_output_v1 = self.forward(model, sample[0], update_num, reduce)
loss_v2, sample_size_v2, logging_output_v2 = self.forward(model, sample[1], update_num, reduce)
loss = loss_v1 / sample_size_v1 + loss_v2 / sample_size_v2
sample_size = 1
logging_output = {
"loss": loss.data,
"loss_v1": loss_v1.data,
"loss_v2": loss_v2.data,
"nll_loss": logging_output_v1["nll_loss"].data / sample_size_v1 + logging_output_v2["nll_loss"].data / sample_size_v2,
"ntokens": logging_output_v1["ntokens"] + logging_output_v2["ntokens"],
"nsentences": logging_output_v1["nsentences"] + logging_output_v2["nsentences"],
"sample_size": 1,
"sample_size_v1": sample_size_v1,
"sample_size_v2": sample_size_v2,
}
return loss, sample_size, logging_output
if self.use_rdrop:
construct_rdrop_sample(sample)
net_output = model(**sample["net_input"])
loss, nll_loss, ntokens = self.compute_loss(model, net_output, sample, update_num, reduce=reduce)
sample_size = (
sample["target"].size(0) if self.sentence_avg else ntokens
)
logging_output = {
"loss": loss.data,
"nll_loss": nll_loss.data,
"ntokens": sample["ntokens"],
"nsentences": sample["nsentences"],
"sample_size": sample_size,
}
if self.report_accuracy:
n_correct, total = self.compute_accuracy(model, net_output, sample)
logging_output["n_correct"] = utils.item(n_correct.data)
logging_output["total"] = utils.item(total.data)
return loss, sample_size, logging_output
def get_lprobs_and_target(self, model, net_output, sample):
conf = sample['conf'][:, None, None] if 'conf' in sample and sample['conf'] is not None else 1
constraint_masks = None
if "constraint_masks" in sample and sample["constraint_masks"] is not None:
constraint_masks = sample["constraint_masks"]
net_output[0].masked_fill_(~constraint_masks, -math.inf)
if self.constraint_start is not None and self.constraint_end is not None:
net_output[0][:, :, 4:self.constraint_start] = -math.inf
net_output[0][:, :, self.constraint_end:] = -math.inf
lprobs = model.get_normalized_probs(net_output, log_probs=True) * conf
target = model.get_targets(sample, net_output)
if self.ignore_prefix_size > 0:
lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous()
target = target[:, self.ignore_prefix_size :].contiguous()
if constraint_masks is not None:
constraint_masks = constraint_masks[:, self.ignore_prefix_size :, :].contiguous()
if self.ignore_eos:
bsz, seq_len, embed_dim = lprobs.size()
eos_indices = target.eq(self.task.tgt_dict.eos())
lprobs = lprobs[~eos_indices].reshape(bsz, seq_len-1, embed_dim)
target = target[~eos_indices].reshape(bsz, seq_len-1)
if constraint_masks is not None:
constraint_masks = constraint_masks[~eos_indices].reshape(bsz, seq_len-1, embed_dim)
if constraint_masks is not None:
constraint_masks = constraint_masks.view(-1, constraint_masks.size(-1))
return lprobs.view(-1, lprobs.size(-1)), target.view(-1), constraint_masks
def compute_loss(self, model, net_output, sample, update_num, reduce=True):
lprobs, target, constraint_masks = self.get_lprobs_and_target(model, net_output, sample)
if constraint_masks is not None:
constraint_masks = constraint_masks[target != self.padding_idx]
lprobs = lprobs[target != self.padding_idx]
target = target[target != self.padding_idx]
loss, nll_loss, ntokens, lprobs, target = label_smoothed_nll_loss(
lprobs,
target,
self.eps,
update_num,
reduce=reduce,
drop_worst_ratio=self.drop_worst_ratio,
drop_worst_after=self.drop_worst_after,
use_rdrop=self.use_rdrop,
reg_alpha=self.reg_alpha,
constraint_masks=constraint_masks,
constraint_start=self.constraint_start,
constraint_end=self.constraint_end
)
# for encouraging loss
probs = torch.exp(lprobs)
bonus = torch.log(torch.clamp((torch.ones_like(probs) - probs), min=1e-5)) # likelihood bonus
log_end = self.log_end
if log_end != 1.0: # e.g. 0.9
y_log_end = torch.log(torch.ones_like(probs) - log_end)
bonus_after_log_end = 1 / (log_end - torch.ones_like(probs)) * (probs - log_end) + y_log_end
# x:log_end, y torch.log(torch.clamp((torch.ones_like(probs) - probs), min=self.cl_eps))
bonus = torch.where(probs > log_end, bonus_after_log_end, bonus)
c_loss = F.nll_loss(
-bonus,
target.view(-1),
reduction='sum',
)
smoothing_c_loss = bonus.sum(dim=-1)
smoothing_c_loss = smoothing_c_loss.sum()
c_loss = c_loss * (1 - self.eps) + (self.eps / lprobs.size(-1)) * smoothing_c_loss
loss = loss + c_loss
# end for encouraging loss
return loss, nll_loss, ntokens
def compute_accuracy(self, model, net_output, sample):
lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
mask = target.ne(self.padding_idx)
n_correct = torch.sum(
lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask))
)
total = torch.sum(mask)
return n_correct, total
@classmethod
def reduce_metrics(cls, logging_outputs) -> None:
"""Aggregate logging outputs from data parallel training."""
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
loss_sum_v1 = sum(log.get("loss_v1", 0) for log in logging_outputs)
loss_sum_v2 = sum(log.get("loss_v2", 0) for log in logging_outputs)
nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs)
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
sample_size_v1 = sum(log.get("sample_size_v1", 0) for log in logging_outputs)
sample_size_v2 = sum(log.get("sample_size_v2", 0) for log in logging_outputs)
metrics.log_scalar(
"loss", loss_sum / sample_size, sample_size, round=3
)
metrics.log_scalar(
"loss_v1", loss_sum_v1 / max(sample_size_v1, 1), max(sample_size_v1, 1), round=3
)
metrics.log_scalar(
"loss_v2", loss_sum_v2 / max(sample_size_v2, 1), max(sample_size_v2, 1), round=3
)
metrics.log_scalar(
"nll_loss", nll_loss_sum / sample_size, ntokens, round=3
)
metrics.log_derived(
"ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
)
metrics.log_scalar(
"ntokens", ntokens, 1, round=3
)
metrics.log_scalar(
"nsentences", nsentences, 1, round=3
)
metrics.log_scalar(
"sample_size", sample_size, 1, round=3
)
metrics.log_scalar(
"sample_size_v1", sample_size_v1, 1, round=3
)
metrics.log_scalar(
"sample_size_v2", sample_size_v2, 1, round=3
)
total = utils.item(sum(log.get("total", 0) for log in logging_outputs))
if total > 0:
metrics.log_scalar("total", total)
n_correct = utils.item(
sum(log.get("n_correct", 0) for log in logging_outputs)
)
metrics.log_scalar("n_correct", n_correct)
metrics.log_derived(
"accuracy",
lambda meters: round(
meters["n_correct"].sum * 100.0 / meters["total"].sum, 3
)
if meters["total"].sum > 0
else float("nan"),
)
@staticmethod
def logging_outputs_can_be_summed() -> bool:
"""
Whether the logging outputs returned by `forward` can be summed
across workers prior to calling `reduce_metrics`. Setting this
to True will improves distributed training speed.
"""
return True
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
import math
import string
from dataclasses import dataclass, field
from collections import OrderedDict
from typing import Optional
import torch
from fairseq import metrics, utils
from fairseq.criterions import FairseqCriterion, register_criterion
from fairseq.dataclass import FairseqDataclass
from omegaconf import II
from data import data_utils
from utils.cider.pyciderevalcap.ciderD.ciderD import CiderD
def scst_loss(lprobs, target, reward, ignore_index=None, reduce=True):
loss = -lprobs.gather(dim=-1, index=target.unsqueeze(-1)).squeeze() * reward.unsqueeze(-1)
if ignore_index is not None:
pad_mask = target.eq(ignore_index)
loss.masked_fill_(pad_mask, 0.0)
ntokens = (~pad_mask).sum()
else:
loss = loss.squeeze(-1)
ntokens = target.numel()
if reduce:
loss = loss.sum()
return loss, ntokens
@dataclass
class ScstRewardCriterionConfig(FairseqDataclass):
scst_cider_cached_tokens: str = field(
default="coco-train-words.p",
metadata={"help": "path to cached cPickle file used to calculate CIDEr scores"},
)
ignore_prefix_size: int = field(
default=0,
metadata={"help": "Ignore first N tokens"},
)
sentence_avg: bool = II("optimization.sentence_avg")
constraint_range: Optional[str] = field(
default=None,
metadata={"help": "constraint range"}
)
@register_criterion(
"scst_reward_criterion", dataclass=ScstRewardCriterionConfig
)
class ScstRewardCriterion(FairseqCriterion):
CIDER_REWARD_WEIGHT = 1
def __init__(
self,
task,
scst_cider_cached_tokens,
sentence_avg,
ignore_prefix_size=0,
constraint_range=None
):
super().__init__(task)
self.scst_cider_scorer = CiderD(df=scst_cider_cached_tokens)
self.sentence_avg = sentence_avg
self.ignore_prefix_size = ignore_prefix_size
self.transtab = str.maketrans({key: None for key in string.punctuation})
self.constraint_start = None
self.constraint_end = None
if constraint_range is not None:
constraint_start, constraint_end = constraint_range.split(',')
self.constraint_start = int(constraint_start)
self.constraint_end = int(constraint_end)
def forward(self, model, sample, update_num=0, reduce=True):
"""Compute the loss for the given sample.
Returns a tuple with three elements:
1) the loss
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
"""
loss, score, ntokens, nsentences = self.compute_loss(model, sample, reduce=reduce)
sample_size = (
nsentences if self.sentence_avg else ntokens
)
logging_output = {
"loss": loss.data,
"score": score,
"ntokens": ntokens,
"nsentences": nsentences,
"sample_size": sample_size,
}
return loss, sample_size, logging_output
def _calculate_eval_scores(self, gen_res, gt_idx, gt_res):
'''
gen_res: generated captions, list of str
gt_idx: list of int, of the same length as gen_res
gt_res: ground truth captions, list of list of str.
gen_res[i] corresponds to gt_res[gt_idx[i]]
Each image can have multiple ground truth captions
'''
gen_res_size = len(gen_res)
res = OrderedDict()
for i in range(gen_res_size):
res[i] = [self._wrap_sentence(gen_res[i].strip().translate(self.transtab))]
gts = OrderedDict()
gt_res_ = [
[self._wrap_sentence(gt_res[i][j].strip().translate(self.transtab)) for j in range(len(gt_res[i]))]
for i in range(len(gt_res))
]
for i in range(gen_res_size):
gts[i] = gt_res_[gt_idx[i]]
res_ = [{'image_id':i, 'caption': res[i]} for i in range(len(res))]
_, batch_cider_scores = self.scst_cider_scorer.compute_score(gts, res_)
scores = self.CIDER_REWARD_WEIGHT * batch_cider_scores
return scores
@classmethod
def _wrap_sentence(self, s):
# ensure the sentence ends with <eos> token
# in order to keep consisitent with cider_cached_tokens
r = s.strip()
if r.endswith('.'):
r = r[:-1]
r += ' <eos>'
return r
def get_generator_out(self, model, sample):
def decode(toks):
hypo = toks.int().cpu()
hypo_str = self.task.tgt_dict.string(hypo)
hypo_str = self.task.bpe.decode(hypo_str).strip()
return hypo, hypo_str
model.eval()
with torch.no_grad():
self.task.scst_generator.model.eval()
gen_out = self.task.scst_generator.generate([model], sample)
gen_target = []
gen_res = []
gt_res = []
for i in range(len(gen_out)):
for j in range(len(gen_out[i])):
hypo, hypo_str = decode(gen_out[i][j]["tokens"])
gen_target.append(hypo)
gen_res.append(hypo_str)
gt_res.append(
decode(utils.strip_pad(sample["target"][i], self.padding_idx))[1].split('&&')
)
return gen_target, gen_res, gt_res
def get_reward_and_scores(self, gen_res, gt_res, device):
batch_size = len(gt_res)
gen_res_size = len(gen_res)
seq_per_img = gen_res_size // batch_size
gt_idx = [i // seq_per_img for i in range(gen_res_size)]
scores = self._calculate_eval_scores(gen_res, gt_idx, gt_res)
sc_ = scores.reshape(batch_size, seq_per_img)
baseline = (sc_.sum(1, keepdims=True) - sc_) / (sc_.shape[1] - 1)
# sample - baseline
reward = scores.reshape(batch_size, seq_per_img)
reward = reward - baseline
reward = reward.reshape(gen_res_size)
reward = torch.as_tensor(reward, device=device, dtype=torch.float64)
return reward, scores
def get_net_output(self, model, sample, gen_target):
def merge(sample_list, eos=self.task.tgt_dict.eos(), move_eos_to_beginning=False):
return data_utils.collate_tokens(
sample_list,
pad_idx=self.padding_idx,
eos_idx=eos,
left_pad=False,
move_eos_to_beginning=move_eos_to_beginning,
)
batch_size = len(sample["target"])
gen_target_size = len(gen_target)
seq_per_img = gen_target_size // batch_size
model.train()
sample_src_tokens = torch.repeat_interleave(
sample['net_input']['src_tokens'], seq_per_img, dim=0
)
sample_src_lengths = torch.repeat_interleave(
sample['net_input']['src_lengths'], seq_per_img, dim=0
)
sample_patch_images = torch.repeat_interleave(
sample['net_input']['patch_images'], seq_per_img, dim=0
)
sample_patch_masks = torch.repeat_interleave(
sample['net_input']['patch_masks'], seq_per_img, dim=0
)
gen_prev_output_tokens = torch.as_tensor(
merge(gen_target, eos=self.task.tgt_dict.bos(), move_eos_to_beginning=True),
device=sample["target"].device, dtype=torch.int64
)
gen_target_tokens = torch.as_tensor(
merge(gen_target), device=sample["target"].device, dtype=torch.int64
)
net_output = model(
src_tokens=sample_src_tokens, src_lengths=sample_src_lengths,
patch_images=sample_patch_images, patch_masks=sample_patch_masks,
prev_output_tokens=gen_prev_output_tokens
)
return net_output, gen_target_tokens
def get_lprobs_and_target(self, model, net_output, gen_target):
if self.constraint_start is not None and self.constraint_end is not None:
net_output[0][:, :, 4:self.constraint_start] = -math.inf
net_output[0][:, :, self.constraint_end:] = -math.inf
lprobs = model.get_normalized_probs(net_output, log_probs=True)
if self.ignore_prefix_size > 0:
if getattr(lprobs, "batch_first", False):
lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous()
gen_target = gen_target[:, self.ignore_prefix_size :].contiguous()
else:
lprobs = lprobs[self.ignore_prefix_size :, :, :].contiguous()
gen_target = gen_target[self.ignore_prefix_size :, :].contiguous()
return lprobs, gen_target
def compute_loss(self, model, sample, reduce=True):
gen_target, gen_res, gt_res = self.get_generator_out(model, sample)
reward, scores = self.get_reward_and_scores(gen_res, gt_res, device=sample["target"].device)
net_output, gen_target_tokens = self.get_net_output(model, sample, gen_target)
gen_lprobs, gen_target_tokens = self.get_lprobs_and_target(model, net_output, gen_target_tokens)
loss, ntokens = scst_loss(gen_lprobs, gen_target_tokens, reward, ignore_index=self.padding_idx, reduce=reduce)
nsentences = gen_target_tokens.size(0)
return loss, scores.sum(), ntokens, nsentences
@classmethod
def reduce_metrics(cls, logging_outputs) -> None:
"""Aggregate logging outputs from data parallel training."""
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
score_sum = sum(log.get("score", 0) for log in logging_outputs)
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
metrics.log_scalar(
"loss", loss_sum / sample_size, sample_size, round=3
)
metrics.log_scalar(
"score", score_sum / nsentences, nsentences, round=3
)
metrics.log_scalar(
"ntokens", ntokens, 1, round=3
)
metrics.log_scalar(
"nsentences", nsentences, 1, round=3
)
metrics.log_scalar(
"sample_size", sample_size, 1, round=3
)
@staticmethod
def logging_outputs_can_be_summed() -> bool:
"""
Whether the logging outputs returned by `forward` can be summed
across workers prior to calling `reduce_metrics`. Setting this
to True will improves distributed training speed.
"""
return True
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
import math
from dataclasses import dataclass, field
from typing import Optional
import torch
import torch.nn.functional as F
import numpy as np
from fairseq import metrics, utils
from fairseq.criterions import FairseqCriterion, register_criterion
from fairseq.dataclass import FairseqDataclass
from omegaconf import II
@dataclass
class SpeechPretrainLossConfig(FairseqDataclass):
label_smoothing: float = field(
default=0.0,
metadata={"help": "epsilon for label smoothing, 0 means no label smoothing"},
)
report_accuracy: bool = field(
default=False,
metadata={"help": "report accuracy metric"},
)
ignore_prefix_size: int = field(
default=0,
metadata={"help": "Ignore first N tokens"},
)
ignore_eos: bool = field(
default=False,
metadata={"help": "Ignore eos token"},
)
sentence_avg: bool = II("optimization.sentence_avg")
drop_worst_ratio: float = field(
default=0.0,
metadata={"help": "ratio for discarding bad samples"},
)
drop_worst_after: int = field(
default=0,
metadata={"help": "steps for discarding bad samples"},
)
use_rdrop: bool = field(
default=False, metadata={"help": "use R-Drop"}
)
reg_alpha: float = field(
default=1.0, metadata={"help": "weight for R-Drop"}
)
sample_patch_num: int = field(
default=196, metadata={"help": "sample patchs for v1"}
)
constraint_range: Optional[str] = field(
default=None,
metadata={"help": "constraint range"}
)
def construct_rdrop_sample(x):
if isinstance(x, dict):
for key in x:
x[key] = construct_rdrop_sample(x[key])
return x
elif isinstance(x, torch.Tensor):
return x.repeat(2, *([1] * (x.dim()-1)))
elif isinstance(x, int):
return x * 2
elif isinstance(x, np.ndarray):
return x.repeat(2)
else:
raise NotImplementedError
def kl_loss(p, q):
p_loss = F.kl_div(p, torch.exp(q), reduction='sum')
q_loss = F.kl_div(q, torch.exp(p), reduction='sum')
loss = (p_loss + q_loss) / 2
return loss
def label_smoothed_nll_loss(
lprobs, target, epsilon, update_num, reduce=True,
drop_worst_ratio=0.0, drop_worst_after=0, use_rdrop=False, reg_alpha=1.0,
constraint_masks=None, constraint_start=None, constraint_end=None
):
if target.dim() == lprobs.dim() - 1:
target = target.unsqueeze(-1)
nll_loss = -lprobs.gather(dim=-1, index=target).squeeze(-1)
if constraint_masks is not None:
smooth_loss = -lprobs.masked_fill(~constraint_masks, 0).sum(dim=-1, keepdim=True).squeeze(-1)
eps_i = epsilon / (constraint_masks.sum(1) - 1 + 1e-6)
elif constraint_start is not None and constraint_end is not None:
constraint_range = [0, 1, 2, 3, 4] + list(range(constraint_start, constraint_end))
smooth_loss = -lprobs[:, constraint_range].sum(dim=-1, keepdim=True).squeeze(-1)
eps_i = epsilon / (len(constraint_range) - 1 + 1e-6)
else:
smooth_loss = -lprobs.sum(dim=-1, keepdim=True).squeeze(-1)
eps_i = epsilon / (lprobs.size(-1) - 1)
loss = (1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss
if drop_worst_ratio > 0 and update_num > drop_worst_after:
if use_rdrop:
true_batch_size = loss.size(0) // 2
_, indices = torch.topk(loss[:true_batch_size], k=int(true_batch_size * (1 - drop_worst_ratio)), largest=False)
loss = torch.cat([loss[indices], loss[indices+true_batch_size]])
nll_loss = torch.cat([nll_loss[indices], nll_loss[indices+true_batch_size]])
lprobs = torch.cat([lprobs[indices], lprobs[indices+true_batch_size]])
else:
loss, indices = torch.topk(loss, k=int(loss.shape[0] * (1 - drop_worst_ratio)), largest=False)
nll_loss = nll_loss[indices]
lprobs = lprobs[indices]
ntokens = loss.numel()
nll_loss = nll_loss.sum()
loss = loss.sum()
if use_rdrop:
true_batch_size = lprobs.size(0) // 2
p = lprobs[:true_batch_size]
q = lprobs[true_batch_size:]
if constraint_start is not None and constraint_end is not None:
constraint_range = [0, 1, 2, 3, 4] + list(range(constraint_start, constraint_end))
p = p[:, constraint_range]
q = q[:, constraint_range]
loss += kl_loss(p, q) * reg_alpha
return loss, nll_loss, ntokens
# lens: torch.LongTensor
# returns: torch.BoolTensor
def lengths_to_padding_mask(lens):
bsz, max_lens = lens.size(0), torch.max(lens).item()
mask = torch.arange(max_lens).to(lens.device).view(1, max_lens)
mask = mask.expand(bsz, -1) >= lens.view(bsz, 1).expand(-1, max_lens)
return mask
# lens: torch.LongTensor
# returns: torch.BoolTensor
def lengths_to_mask(lens):
return ~lengths_to_padding_mask(lens)
@register_criterion(
"speech_pretrain_loss", dataclass=SpeechPretrainLossConfig
)
class SpeechPretrainLoss(FairseqCriterion):
def __init__(
self,
task,
sentence_avg,
label_smoothing,
ignore_prefix_size=0,
ignore_eos=False,
report_accuracy=False,
drop_worst_ratio=0,
drop_worst_after=0,
use_rdrop=False,
reg_alpha=1.0,
sample_patch_num=196,
constraint_range=None
):
super().__init__(task)
self.train_stage = task.train_stage
self.sentence_avg = sentence_avg
self.eps = label_smoothing
self.ignore_prefix_size = ignore_prefix_size
self.ignore_eos = ignore_eos
self.report_accuracy = report_accuracy
self.drop_worst_ratio = drop_worst_ratio
self.drop_worst_after = drop_worst_after
self.use_rdrop = use_rdrop
self.reg_alpha = reg_alpha
self.sample_patch_num = sample_patch_num
self.constraint_start = None
self.constraint_end = None
if constraint_range is not None:
constraint_start, constraint_end = constraint_range.split(',')
self.constraint_start = int(constraint_start)
self.constraint_end = int(constraint_end)
self.blank_idx = (
task.phone_dictionary.index("<blank>")-3
)
self.pad_idx = task.tgt_dict.pad()
self.eos_idx = task.tgt_dict.eos()
self.bce_pos_weight = 5.0
def forward(self, model, sample, update_num=0, reduce=True, loss_type="nll"):
"""Compute the loss for the given sample.
Returns a tuple with three elements:
1) the loss
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
"""
if isinstance(sample, list) or isinstance(sample, tuple):
if self.train_stage == 1:
# P2T
loss_v1, sample_size_v1, logging_output_v1 = self.forward(model, sample[0], update_num, reduce, loss_type="nll")
else:
# S2T, PP
loss_v1, sample_size_v1, logging_output_v1 = self.forward(model, sample[0], update_num, reduce, loss_type="nll,ctc")
if len(sample[1]) > 0:
# P2T
loss_v2, sample_size_v2, logging_output_v2 = self.forward(model, sample[1], update_num, reduce, loss_type="nll")
if len(sample) == 3:
# MSP, S2C
loss_v3, sample_size_v3, logging_output_v3 = self.forward(model, sample[2], update_num, reduce,
loss_type="kl,nll") # final
loss = (loss_v1 + loss_v2 + loss_v3) / (sample_size_v1 + sample_size_v2 + sample_size_v3) # final
sample_size = 1
logging_output = {
"loss": loss.data,
"loss_v1": loss_v1.data,
"loss_v2": loss_v2.data,
"loss_v3": loss_v3.data,
"nll_loss": logging_output_v1["nll_loss"].data / sample_size_v1 + logging_output_v2[
"nll_loss"].data / sample_size_v2,
"ctc_loss": logging_output_v1["ctc_loss"].data / sample_size_v1,
"kl_loss": logging_output_v3["kl_loss"].data / sample_size_v3,
"ntokens": logging_output_v1["ntokens"] + logging_output_v2["ntokens"] + logging_output_v3['ntokens'],
"nsentences": logging_output_v1["nsentences"] + logging_output_v2["nsentences"] + logging_output_v3['nsentences'],
"sample_size": 1,
"sample_size_v1": sample_size_v1,
"sample_size_v2": sample_size_v2,
"sample_size_v3": sample_size_v3
}
else:
loss = (loss_v1 + loss_v2) / (sample_size_v1 + sample_size_v2)
sample_size = 1
logging_output = {
"loss": loss.data,
"loss_v1": loss_v1.data,
"loss_v2": loss_v2.data,
"nll_loss": logging_output_v1["nll_loss"].data / sample_size_v1 + logging_output_v2["nll_loss"].data / sample_size_v2,
"ctc_loss": logging_output_v1["ctc_loss"].data / sample_size_v1,
"ntokens": logging_output_v1["ntokens"] + logging_output_v2["ntokens"],
"nsentences": logging_output_v1["nsentences"] + logging_output_v2["nsentences"],
"sample_size": 1,
"sample_size_v1": sample_size_v1,
"sample_size_v2": sample_size_v2,
}
else:
loss = loss_v1 / sample_size_v1
sample_size = 1
logging_output = {
"loss": loss.data,
"loss_v1": loss_v1.data,
"nll_loss": logging_output_v1["nll_loss"].data / sample_size_v1,
"ctc_loss": logging_output_v1["ctc_loss"].data / sample_size_v1,
"ntokens": logging_output_v1["ntokens"],
"nsentences": logging_output_v1["nsentences"],
"sample_size": 1,
"sample_size_v1": sample_size_v1
}
return loss, sample_size, logging_output
if self.use_rdrop:
construct_rdrop_sample(sample)
net_output = model(**sample["net_input"])
loss, nll_loss, ctc_loss, kl_loss_val, ntokens = self.compute_loss(model, net_output, sample, update_num, reduce=reduce, loss_type=loss_type)
sample_size = (
sample["target"].size(0) if self.sentence_avg else ntokens
)
logging_output = {
"loss": loss.data,
"ctc_loss": ctc_loss.data,
"nll_loss": nll_loss.data,
"kl_loss": kl_loss_val.data,
"ntokens": sample["ntokens"],
"nsentences": sample["nsentences"],
"sample_size": sample_size,
}
if self.report_accuracy:
n_correct, total = self.compute_accuracy(model, net_output, sample)
logging_output["n_correct"] = utils.item(n_correct.data)
logging_output["total"] = utils.item(total.data)
return loss, sample_size, logging_output
def get_lprobs_and_target(self, model, net_output, sample):
conf = sample['conf'][:, None, None] if 'conf' in sample and sample['conf'] is not None else 1
constraint_masks = None
if "constraint_masks" in sample and sample["constraint_masks"] is not None:
constraint_masks = sample["constraint_masks"]
net_output[0].masked_fill_(~constraint_masks, -math.inf)
if self.constraint_start is not None and self.constraint_end is not None:
net_output[0][:, :, 4:self.constraint_start] = -math.inf
net_output[0][:, :, self.constraint_end:] = -math.inf
lprobs = model.get_normalized_probs(net_output, log_probs=True) * conf
target = model.get_targets(sample, net_output)
if self.ignore_prefix_size > 0:
lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous()
target = target[:, self.ignore_prefix_size :].contiguous()
if constraint_masks is not None:
constraint_masks = constraint_masks[:, self.ignore_prefix_size :, :].contiguous()
if self.ignore_eos:
bsz, seq_len, embed_dim = lprobs.size()
eos_indices = target.eq(self.task.tgt_dict.eos())
lprobs = lprobs[~eos_indices].reshape(bsz, seq_len-1, embed_dim)
target = target[~eos_indices].reshape(bsz, seq_len-1)
if constraint_masks is not None:
constraint_masks = constraint_masks[~eos_indices].reshape(bsz, seq_len-1, embed_dim)
if constraint_masks is not None:
constraint_masks = constraint_masks.view(-1, constraint_masks.size(-1))
return lprobs.view(-1, lprobs.size(-1)), target.view(-1), constraint_masks
def compute_loss(self, model, net_output, sample, update_num, reduce=True, loss_type="nll"):
nll_loss = torch.tensor(0.0)
ctc_loss = torch.tensor(0.0)
kl_loss_val = torch.tensor(0.0)
ntokens = sample["ntokens"]
loss_type = loss_type.split(",")
if "nll" in loss_type:
lprobs, target, constraint_masks = self.get_lprobs_and_target(model, net_output, sample)
if constraint_masks is not None:
constraint_masks = constraint_masks[target != self.padding_idx]
lprobs = lprobs[target != self.padding_idx]
target = target[target != self.padding_idx]
_, nll_loss, ntokens = label_smoothed_nll_loss(
lprobs,
target,
self.eps,
update_num,
reduce=reduce,
drop_worst_ratio=self.drop_worst_ratio,
drop_worst_after=self.drop_worst_after,
use_rdrop=self.use_rdrop,
reg_alpha=self.reg_alpha,
constraint_masks=constraint_masks,
constraint_start=self.constraint_start,
constraint_end=self.constraint_end
)
if "ctc" in loss_type:
ctc_loss = self.compute_ctc_loss(model, net_output, sample)
if "kl" in loss_type:
kl_loss_val = self.compute_kl_loss(net_output)
loss = nll_loss + ctc_loss + kl_loss_val
return loss, nll_loss, ctc_loss, kl_loss_val, ntokens
def compute_ctc_loss(self, model, net_output, sample):
lprobs = model.get_encoder_normalized_probs(
net_output[1], log_probs=True
).contiguous() # (T, B, C) from the encoder
non_padding_mask = ~net_output[1]["encoder_padding_mask"]
input_lengths = non_padding_mask.long().sum(-1)
# pad_mask = (sample["phone_items"] != self.pad_idx) & (
# sample["phone_items"] != self.eos_idx
# )
target_lengths = sample["ctc_output_lengths"]
pad_mask = torch.arange(target_lengths.max()).expand([target_lengths.shape[0], -1]).to(target_lengths) < target_lengths.unsqueeze(1)
targets_flat = sample["ctc_outputs"].masked_select(pad_mask)
# target_lengths = pad_mask.sum(-1)
with torch.backends.cudnn.flags(enabled=False):
loss = F.ctc_loss(
lprobs,
targets_flat,
input_lengths,
target_lengths,
blank=self.blank_idx,
reduction="sum",
zero_infinity=True,
)
return loss
def compute_kl_loss(self, net_output):
losses = net_output[1]["kl_loss"]
return losses
def compute_accuracy(self, model, net_output, sample):
lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
mask = target.ne(self.padding_idx)
n_correct = torch.sum(
lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask))
)
total = torch.sum(mask)
return n_correct, total
@classmethod
def reduce_metrics(cls, logging_outputs) -> None:
"""Aggregate logging outputs from data parallel training."""
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
loss_sum_v1 = sum(log.get("loss_v1", 0) for log in logging_outputs)
loss_sum_v2 = sum(log.get("loss_v2", 0) for log in logging_outputs)
loss_sum_v3 = sum(log.get("loss_v3", 0) for log in logging_outputs)
nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs)
ctc_loss_sum = sum(log.get("ctc_loss", 0) for log in logging_outputs)
kl_loss_sum = sum(log.get("kl_loss", 0) for log in logging_outputs)
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
sample_size_v1 = sum(log.get("sample_size_v1", 0) for log in logging_outputs)
sample_size_v2 = sum(log.get("sample_size_v2", 0) for log in logging_outputs)
sample_size_v3 = sum(log.get("sample_size_v3", 0) for log in logging_outputs)
metrics.log_scalar(
"loss", loss_sum / sample_size, sample_size, round=3
)
metrics.log_scalar(
"loss_v1", loss_sum_v1 / max(sample_size_v1, 1), max(sample_size_v1, 1), round=3
)
metrics.log_scalar(
"loss_v2", loss_sum_v2 / max(sample_size_v2, 1), max(sample_size_v2, 1), round=3
)
metrics.log_scalar(
"loss_v3", loss_sum_v3 / max(sample_size_v3, 1), max(sample_size_v3, 1), round=3
)
metrics.log_scalar(
"nll_loss", nll_loss_sum / sample_size, ntokens, round=3
)
metrics.log_scalar(
"ctc_loss", ctc_loss_sum / sample_size, ntokens, round=3
)
metrics.log_scalar(
"kl_loss", kl_loss_sum / sample_size, ntokens, round=3
)
metrics.log_derived(
"ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
)
metrics.log_scalar(
"ntokens", ntokens, 1, round=3
)
metrics.log_scalar(
"nsentences", nsentences, 1, round=3
)
metrics.log_scalar(
"sample_size", sample_size, 1, round=3
)
metrics.log_scalar(
"sample_size_v1", sample_size_v1, 1, round=3
)
metrics.log_scalar(
"sample_size_v2", sample_size_v2, 1, round=3
)
metrics.log_scalar(
"sample_size_v3", sample_size_v3, 1, round=3
)
total = utils.item(sum(log.get("total", 0) for log in logging_outputs))
if total > 0:
metrics.log_scalar("total", total)
n_correct = utils.item(
sum(log.get("n_correct", 0) for log in logging_outputs)
)
metrics.log_scalar("n_correct", n_correct)
metrics.log_derived(
"accuracy",
lambda meters: round(
meters["n_correct"].sum * 100.0 / meters["total"].sum, 3
)
if meters["total"].sum > 0
else float("nan"),
)
@staticmethod
def logging_outputs_can_be_summed() -> bool:
"""
Whether the logging outputs returned by `forward` can be summed
across workers prior to calling `reduce_metrics`. Setting this
to True will improves distributed training speed.
"""
return True
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import functools
import numpy as np
import torch
import base64
from torchvision import transforms
from timm.data import create_transform
from utils.vision_helper import RandomAugment
from PIL import Image, ImageFile
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
conf = None
if samples[0].get("conf", None) is not None:
conf = torch.cat([s['conf'] for s in samples], dim=0)
ref_dict = None
if samples[0].get("ref_dict", None) is not None:
ref_dict = np.array([s['ref_dict'] for s in samples])
constraint_masks = None
if samples[0].get("constraint_mask", None) is not None:
constraint_masks = merge("constraint_mask")
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"conf": conf,
"ref_dict": ref_dict,
"constraint_masks": constraint_masks,
"target": target,
}
return batch
class ImageClassifyDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=128,
max_tgt_length=30,
patch_image_size=224,
constraint_trie=None,
imagenet_default_mean_and_std=False
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
self.constraint_trie = constraint_trie
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
if self.split != 'train':
self.patch_resize_transform = transforms.Compose([
lambda image: image.convert("RGB"),
transforms.Resize([patch_image_size, patch_image_size], interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
logger.info("val split, do not use random augmentation.")
else:
self.patch_resize_transform = create_transform(
input_size=patch_image_size,
is_training=True,
color_jitter=0.4,
auto_augment='rand-m9-mstd0.5-inc1',
interpolation='bicubic',
re_prob=0.25,
re_mode='pixel',
re_count=1,
mean=mean,
std=std,
)
self.patch_resize_transform = transforms.Compose(functools.reduce(lambda x, y:x + y, [
[lambda image: image.convert("RGB"),],
self.patch_resize_transform.transforms[:2],
[self.patch_resize_transform.transforms[2]],
[RandomAugment(2, 7, isPIL=True, augs=['Identity', 'AutoContrast', 'Equalize', 'Brightness', 'Sharpness', 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']), ],
self.patch_resize_transform.transforms[3:],
]))
logger.info("train split, use random augmentation.")
def __getitem__(self, index):
image, label_name = self.dataset[index]
image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
patch_image = self.patch_resize_transform(image)
patch_mask = torch.tensor([True])
src_item = self.encode_text(' what does the image describe?')
tgt_item = self.encode_text(" {}".format(label_name))
ref_dict = {label_name: 1.0}
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, tgt_item])
example = {
"id": index,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item,
"ref_dict": ref_dict,
}
if self.constraint_trie is not None:
constraint_mask = torch.zeros((len(prev_output_item), len(self.tgt_dict))).bool()
for i in range(len(prev_output_item)):
constraint_prefix_token = prev_output_item[:i+1].tolist()
constraint_nodes = self.constraint_trie.get_next_layer(constraint_prefix_token)
constraint_mask[i][constraint_nodes] = True
example["constraint_mask"] = constraint_mask
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
try:
from collections.abc import Iterable
except ImportError:
from collections import Iterable
import contextlib
import itertools
import logging
import re
import warnings
from typing import Optional, Tuple
import numpy as np
import torch
from fairseq.file_io import PathManager
from fairseq import utils
import os
logger = logging.getLogger(__name__)
def infer_language_pair(path):
"""Infer language pair from filename: <split>.<lang1>-<lang2>.(...).idx"""
src, dst = None, None
for filename in PathManager.ls(path):
parts = filename.split(".")
if len(parts) >= 3 and len(parts[1].split("-")) == 2:
return parts[1].split("-")
return src, dst
def collate_tokens(
values,
pad_idx,
eos_idx=None,
left_pad=False,
move_eos_to_beginning=False,
pad_to_length=None,
pad_to_multiple=1,
pad_to_bsz=None,
):
"""Convert a list of 1d tensors into a padded 2d tensor."""
size = max(v.size(0) for v in values)
size = size if pad_to_length is None else max(size, pad_to_length)
if pad_to_multiple != 1 and size % pad_to_multiple != 0:
size = int(((size - 0.1) // pad_to_multiple + 1) * pad_to_multiple)
def copy_tensor(src, dst):
assert dst.numel() == src.numel()
if move_eos_to_beginning:
if eos_idx is None:
# if no eos_idx is specified, then use the last token in src
dst[0] = src[-1]
else:
dst[0] = eos_idx
dst[1:] = src[:-1]
else:
dst.copy_(src)
if values[0].dim() == 1:
res = values[0].new(len(values), size).fill_(pad_idx)
elif values[0].dim() == 2:
assert move_eos_to_beginning is False
res = values[0].new(len(values), size, values[0].size(1)).fill_(pad_idx)
else:
raise NotImplementedError
for i, v in enumerate(values):
copy_tensor(v, res[i][size - len(v) :] if left_pad else res[i][: len(v)])
return res
def collater_audio(audios, pad_audio, max_sample_size, random_crop=False):
audio_sizes = [len(s) for s in audios]
if pad_audio:
audio_size = min(max(audio_sizes), max_sample_size)
else:
audio_size = min(min(audio_sizes), max_sample_size)
collated_audios = audios[0].new_zeros(len(audios), audio_size)
padding_mask = (
torch.BoolTensor(collated_audios.shape).fill_(False)
# if self.pad_audio else None
)
audio_starts = [0 for _ in audios]
for i, audio in enumerate(audios):
diff = len(audio) - audio_size
if diff == 0:
collated_audios[i] = audio
elif diff < 0:
collated_audios[i] = torch.cat([audio, audio.new_full((-diff,), 0.0)])
padding_mask[i, diff:] = True
else:
collated_audios[i], audio_starts[i] = crop_to_max_size(
audio, audio_size, random_crop
)
return collated_audios, padding_mask, audio_starts
def crop_to_max_size(wav, target_size, random_crop):
size = len(wav)
diff = size - target_size
if diff <= 0:
return wav, 0
start, end = 0, target_size
if random_crop:
start = np.random.randint(0, diff + 1)
end = size - diff + start
return wav[start:end], start
def load_indexed_dataset(
path, dictionary=None, dataset_impl=None, combine=False, default="cached"
):
"""A helper function for loading indexed datasets.
Args:
path (str): path to indexed dataset (e.g., 'data-bin/train')
dictionary (~fairseq.data.Dictionary): data dictionary
dataset_impl (str, optional): which dataset implementation to use. If
not provided, it will be inferred automatically. For legacy indexed
data we use the 'cached' implementation by default.
combine (bool, optional): automatically load and combine multiple
datasets. For example, if *path* is 'data-bin/train', then we will
combine 'data-bin/train', 'data-bin/train1', ... and return a
single ConcatDataset instance.
"""
import fairseq.data.indexed_dataset as indexed_dataset
from fairseq.data.concat_dataset import ConcatDataset
datasets = []
for k in itertools.count():
path_k = path + (str(k) if k > 0 else "")
try:
path_k = indexed_dataset.get_indexed_dataset_to_local(path_k)
except Exception as e:
if "StorageException: [404] Path not found" in str(e):
logger.warning(f"path_k: {e} not found")
else:
raise e
dataset_impl_k = dataset_impl
if dataset_impl_k is None:
dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k)
dataset = indexed_dataset.make_dataset(
path_k,
impl=dataset_impl_k or default,
fix_lua_indexing=True,
dictionary=dictionary,
)
if dataset is None:
break
logger.info("loaded {:,} examples from: {}".format(len(dataset), path_k))
datasets.append(dataset)
if not combine:
break
if len(datasets) == 0:
return None
elif len(datasets) == 1:
return datasets[0]
else:
return ConcatDataset(datasets)
@contextlib.contextmanager
def numpy_seed(seed, *addl_seeds):
"""Context manager which seeds the NumPy PRNG with the specified seed and
restores the state afterward"""
if seed is None:
yield
return
if len(addl_seeds) > 0:
seed = int(hash((seed, *addl_seeds)) % 1e6)
state = np.random.get_state()
np.random.seed(seed)
try:
yield
finally:
np.random.set_state(state)
def collect_filtered(function, iterable, filtered):
"""
Similar to :func:`filter` but collects filtered elements in ``filtered``.
Args:
function (callable): function that returns ``False`` for elements that
should be filtered
iterable (iterable): iterable to filter
filtered (list): list to store filtered elements
"""
for el in iterable:
if function(el):
yield el
else:
filtered.append(el)
def _filter_by_size_dynamic(indices, size_fn, max_positions, raise_exception=False):
def compare_leq(a, b):
return a <= b if not isinstance(a, tuple) else max(a) <= b
def check_size(idx):
if isinstance(max_positions, float) or isinstance(max_positions, int):
return size_fn(idx) <= max_positions
elif isinstance(max_positions, dict):
idx_size = size_fn(idx)
assert isinstance(idx_size, dict)
intersect_keys = set(max_positions.keys()) & set(idx_size.keys())
return all(
all(
a is None or b is None or a <= b
for a, b in zip(idx_size[key], max_positions[key])
)
for key in intersect_keys
)
else:
# For MultiCorpusSampledDataset, will generalize it later
if not isinstance(size_fn(idx), Iterable):
return all(size_fn(idx) <= b for b in max_positions)
return all(
a is None or b is None or a <= b
for a, b in zip(size_fn(idx), max_positions)
)
ignored = []
itr = collect_filtered(check_size, indices, ignored)
indices = np.fromiter(itr, dtype=np.int64, count=-1)
return indices, ignored
def filter_by_size(indices, dataset, max_positions, raise_exception=False):
"""
[deprecated] Filter indices based on their size.
Use `FairseqDataset::filter_indices_by_size` instead.
Args:
indices (List[int]): ordered list of dataset indices
dataset (FairseqDataset): fairseq dataset instance
max_positions (tuple): filter elements larger than this size.
Comparisons are done component-wise.
raise_exception (bool, optional): if ``True``, raise an exception if
any elements are filtered (default: False).
"""
warnings.warn(
"data_utils.filter_by_size is deprecated. "
"Use `FairseqDataset::filter_indices_by_size` instead.",
stacklevel=2,
)
if isinstance(max_positions, float) or isinstance(max_positions, int):
if hasattr(dataset, "sizes") and isinstance(dataset.sizes, np.ndarray):
ignored = indices[dataset.sizes[indices] > max_positions].tolist()
indices = indices[dataset.sizes[indices] <= max_positions]
elif (
hasattr(dataset, "sizes")
and isinstance(dataset.sizes, list)
and len(dataset.sizes) == 1
):
ignored = indices[dataset.sizes[0][indices] > max_positions].tolist()
indices = indices[dataset.sizes[0][indices] <= max_positions]
else:
indices, ignored = _filter_by_size_dynamic(
indices, dataset.size, max_positions
)
else:
indices, ignored = _filter_by_size_dynamic(indices, dataset.size, max_positions)
if len(ignored) > 0 and raise_exception:
raise Exception(
(
"Size of sample #{} is invalid (={}) since max_positions={}, "
"skip this example with --skip-invalid-size-inputs-valid-test"
).format(ignored[0], dataset.size(ignored[0]), max_positions)
)
if len(ignored) > 0:
logger.warning(
(
"{} samples have invalid sizes and will be skipped, "
"max_positions={}, first few sample ids={}"
).format(len(ignored), max_positions, ignored[:10])
)
return indices
def filter_paired_dataset_indices_by_size(src_sizes, tgt_sizes, indices, max_sizes):
"""Filter a list of sample indices. Remove those that are longer
than specified in max_sizes.
Args:
indices (np.array): original array of sample indices
max_sizes (int or list[int] or tuple[int]): max sample size,
can be defined separately for src and tgt (then list or tuple)
Returns:
np.array: filtered sample array
list: list of removed indices
"""
if max_sizes is None:
return indices, []
if type(max_sizes) in (int, float):
max_src_size, max_tgt_size = max_sizes, max_sizes
else:
max_src_size, max_tgt_size = max_sizes
if tgt_sizes is None:
ignored = indices[src_sizes[indices] > max_src_size]
else:
ignored = indices[
(src_sizes[indices] > max_src_size) | (tgt_sizes[indices] > max_tgt_size)
]
if len(ignored) > 0:
if tgt_sizes is None:
indices = indices[src_sizes[indices] <= max_src_size]
else:
indices = indices[
(src_sizes[indices] <= max_src_size)
& (tgt_sizes[indices] <= max_tgt_size)
]
return indices, ignored.tolist()
def batch_by_size(
indices,
num_tokens_fn,
num_tokens_vec=None,
max_tokens=None,
max_sentences=None,
required_batch_size_multiple=1,
fixed_shapes=None,
):
"""
Yield mini-batches of indices bucketed by size. Batches may contain
sequences of different lengths.
Args:
indices (List[int]): ordered list of dataset indices
num_tokens_fn (callable): function that returns the number of tokens at
a given index
num_tokens_vec (List[int], optional): precomputed vector of the number
of tokens for each index in indices (to enable faster batch generation)
max_tokens (int, optional): max number of tokens in each batch
(default: None).
max_sentences (int, optional): max number of sentences in each
batch (default: None).
required_batch_size_multiple (int, optional): require batch size to
be less than N or a multiple of N (default: 1).
fixed_shapes (List[Tuple[int, int]], optional): if given, batches will
only be created with the given shapes. *max_sentences* and
*required_batch_size_multiple* will be ignored (default: None).
"""
try:
from fairseq.data.data_utils_fast import (
batch_by_size_fn,
batch_by_size_vec,
batch_fixed_shapes_fast,
)
except ImportError:
raise ImportError(
"Please build Cython components with: "
"`python setup.py build_ext --inplace`"
)
except ValueError:
raise ValueError(
"Please build (or rebuild) Cython components with `python setup.py build_ext --inplace`."
)
# added int() to avoid TypeError: an integer is required
max_tokens = (
int(max_tokens) if max_tokens is not None else -1
)
max_sentences = max_sentences if max_sentences is not None else -1
bsz_mult = required_batch_size_multiple
if not isinstance(indices, np.ndarray):
indices = np.fromiter(indices, dtype=np.int64, count=-1)
if num_tokens_vec is not None and not isinstance(num_tokens_vec, np.ndarray):
num_tokens_vec = np.fromiter(num_tokens_vec, dtype=np.int64, count=-1)
if fixed_shapes is None:
if num_tokens_vec is None:
return batch_by_size_fn(
indices,
num_tokens_fn,
max_tokens,
max_sentences,
bsz_mult,
)
else:
return batch_by_size_vec(
indices,
num_tokens_vec,
max_tokens,
max_sentences,
bsz_mult,
)
else:
fixed_shapes = np.array(fixed_shapes, dtype=np.int64)
sort_order = np.lexsort(
[
fixed_shapes[:, 1].argsort(), # length
fixed_shapes[:, 0].argsort(), # bsz
]
)
fixed_shapes_sorted = fixed_shapes[sort_order]
return batch_fixed_shapes_fast(indices, num_tokens_fn, fixed_shapes_sorted)
def post_process(sentence: str, symbol: str):
if symbol == "sentencepiece":
sentence = sentence.replace(" ", "").replace("\u2581", " ").strip()
elif symbol == "wordpiece":
sentence = sentence.replace(" ", "").replace("_", " ").strip()
elif symbol == "letter":
sentence = sentence.replace(" ", "").replace("|", " ").strip()
elif symbol == "silence":
import re
sentence = sentence.replace("<SIL>", "")
sentence = re.sub(' +', ' ', sentence).strip()
elif symbol == "_EOW":
sentence = sentence.replace(" ", "").replace("_EOW", " ").strip()
elif symbol in {"subword_nmt", "@@ ", "@@"}:
if symbol == "subword_nmt":
symbol = "@@ "
sentence = (sentence + " ").replace(symbol, "").rstrip()
elif symbol == "none":
pass
elif symbol is not None:
raise NotImplementedError(f"Unknown post_process option: {symbol}")
return sentence
def compute_mask_indices(
shape: Tuple[int, int],
padding_mask: Optional[torch.Tensor],
mask_prob: float,
mask_length: int,
mask_type: str = "static",
mask_other: float = 0.0,
min_masks: int = 0,
no_overlap: bool = False,
min_space: int = 0,
) -> np.ndarray:
"""
Computes random mask spans for a given shape
Args:
shape: the the shape for which to compute masks.
should be of size 2 where first element is batch size and 2nd is timesteps
padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
mask_type: how to compute mask lengths
static = fixed size
uniform = sample from uniform distribution [mask_other, mask_length*2]
normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
poisson = sample from possion distribution with lambda = mask length
min_masks: minimum number of masked spans
no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping
min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
"""
bsz, all_sz = shape
mask = np.full((bsz, all_sz), False)
all_num_mask = int(
# add a random number for probabilistic rounding
mask_prob * all_sz / float(mask_length)
+ np.random.rand()
)
all_num_mask = max(min_masks, all_num_mask)
mask_idcs = []
for i in range(bsz):
if padding_mask is not None:
sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int(
# add a random number for probabilistic rounding
mask_prob * sz / float(mask_length)
+ np.random.rand()
)
num_mask = max(min_masks, num_mask)
else:
sz = all_sz
num_mask = all_num_mask
if mask_type == "static":
lengths = np.full(num_mask, mask_length)
elif mask_type == "uniform":
lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
elif mask_type == "normal":
lengths = np.random.normal(mask_length, mask_other, size=num_mask)
lengths = [max(1, int(round(x))) for x in lengths]
elif mask_type == "poisson":
lengths = np.random.poisson(mask_length, size=num_mask)
lengths = [int(round(x)) for x in lengths]
else:
raise Exception("unknown mask selection " + mask_type)
if sum(lengths) == 0:
lengths[0] = min(mask_length, sz - 1)
if no_overlap:
mask_idc = []
def arrange(s, e, length, keep_length):
span_start = np.random.randint(s, e - length)
mask_idc.extend(span_start + i for i in range(length))
new_parts = []
if span_start - s - min_space >= keep_length:
new_parts.append((s, span_start - min_space + 1))
if e - span_start - keep_length - min_space > keep_length:
new_parts.append((span_start + length + min_space, e))
return new_parts
parts = [(0, sz)]
min_length = min(lengths)
for length in sorted(lengths, reverse=True):
lens = np.fromiter(
(e - s if e - s >= length + min_space else 0 for s, e in parts),
np.int,
)
l_sum = np.sum(lens)
if l_sum == 0:
break
probs = lens / np.sum(lens)
c = np.random.choice(len(parts), p=probs)
s, e = parts.pop(c)
parts.extend(arrange(s, e, length, min_length))
mask_idc = np.asarray(mask_idc)
else:
min_len = min(lengths)
if sz - min_len <= num_mask:
min_len = sz - num_mask - 1
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
mask_idc = np.asarray(
[
mask_idc[j] + offset
for j in range(len(mask_idc))
for offset in range(lengths[j])
]
)
mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
min_len = min([len(m) for m in mask_idcs])
for i, mask_idc in enumerate(mask_idcs):
if len(mask_idc) > min_len:
mask_idc = np.random.choice(mask_idc, min_len, replace=False)
mask[i, mask_idc] = True
return mask
def get_mem_usage():
try:
import psutil
mb = 1024 * 1024
return f"used={psutil.virtual_memory().used / mb}Mb; avail={psutil.virtual_memory().available / mb}Mb"
except ImportError:
return "N/A"
# lens: torch.LongTensor
# returns: torch.BoolTensor
def lengths_to_padding_mask(lens):
bsz, max_lens = lens.size(0), torch.max(lens).item()
mask = torch.arange(max_lens).to(lens.device).view(1, max_lens)
mask = mask.expand(bsz, -1) >= lens.view(bsz, 1).expand(-1, max_lens)
return mask
# lens: torch.LongTensor
# returns: torch.BoolTensor
def lengths_to_mask(lens):
return ~lengths_to_padding_mask(lens)
def get_buckets(sizes, num_buckets):
buckets = np.unique(
np.percentile(
sizes,
np.linspace(0, 100, num_buckets + 1),
interpolation='lower',
)[1:]
)
return buckets
def get_bucketed_sizes(orig_sizes, buckets):
sizes = np.copy(orig_sizes)
assert np.min(sizes) >= 0
start_val = -1
for end_val in buckets:
mask = (sizes > start_val) & (sizes <= end_val)
sizes[mask] = end_val
start_val = end_val
return sizes
def _find_extra_valid_paths(dataset_path: str) -> set:
paths = utils.split_paths(dataset_path)
all_valid_paths = set()
for sub_dir in paths:
contents = PathManager.ls(sub_dir)
valid_paths = [c for c in contents if re.match("valid*[0-9].*", c) is not None]
all_valid_paths |= {os.path.basename(p) for p in valid_paths}
# Remove .bin, .idx etc
roots = {os.path.splitext(p)[0] for p in all_valid_paths}
return roots
def raise_if_valid_subsets_unintentionally_ignored(train_cfg) -> None:
"""Raises if there are paths matching 'valid*[0-9].*' which are not combined or ignored."""
if (
train_cfg.dataset.ignore_unused_valid_subsets
or train_cfg.dataset.combine_valid_subsets
or train_cfg.dataset.disable_validation
or not hasattr(train_cfg.task, "data")
):
return
other_paths = _find_extra_valid_paths(train_cfg.task.data)
specified_subsets = train_cfg.dataset.valid_subset.split(",")
ignored_paths = [p for p in other_paths if p not in specified_subsets]
if ignored_paths:
advice = "Set --combine-val to combine them or --ignore-unused-valid-subsets to ignore them."
msg = f"Valid paths {ignored_paths} will be ignored. {advice}"
raise ValueError(msg)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
import os
import torch
import pickle
class FileDataset:
def __init__(self, file_path, selected_col_ids=None, dtypes=None, separator="\t", cached_index=False):
self.file_path = file_path
assert os.path.exists(self.file_path), "Error: The local datafile {} not exists!".format(self.file_path)
self.separator = separator
if selected_col_ids is None:
# default to all fields
self.selected_col_ids = list(
range(len(open(self.file_path).readline().rstrip("\n").split(self.separator))))
else:
self.selected_col_ids = [int(col_id) for col_id in selected_col_ids.split(",")]
if dtypes is None:
# default to str
self.dtypes = [str for col_id in self.selected_col_ids]
else:
self.dtypes = [eval(col_dtype) for col_dtype in dtypes.split(",")]
assert len(self.dtypes) == len(self.selected_col_ids)
self.data_cnt = 0
try:
self.slice_id = torch.distributed.get_rank()
self.slice_count = torch.distributed.get_world_size()
except Exception:
self.slice_id = 0
self.slice_count = 1
self.cached_index = cached_index
self._init_seek_index()
self._reader = self._get_reader()
print("file {} slice_id {} row count {} total row count {}".format(
self.file_path, self.slice_id, self.row_count, self.total_row_count)
)
def _init_seek_index(self):
if self.cached_index:
cache_path = "{}.index".format(self.file_path)
assert os.path.exists(cache_path), "cache file {} not exists!".format(cache_path)
self.total_row_count, self.lineid_to_offset = pickle.load(open(cache_path, "rb"))
print("local datafile {} slice_id {} use cached row_count and line_idx-to-offset mapping".format(
self.file_path, self.slice_id))
else:
# make an iteration over the file to get row_count and line_idx-to-offset mapping
fp = open(self.file_path, "r")
print("local datafile {} slice_id {} begin to initialize row_count and line_idx-to-offset mapping".format(
self.file_path, self.slice_id))
self.total_row_count = 0
offset = 0
self.lineid_to_offset = []
for line in fp:
self.lineid_to_offset.append(offset)
self.total_row_count += 1
offset += len(line.encode('utf-8'))
self._compute_start_pos_and_row_count()
print("local datafile {} slice_id {} finished initializing row_count and line_idx-to-offset mapping".format(
self.file_path, self.slice_id))
def _compute_start_pos_and_row_count(self):
self.row_count = self.total_row_count // self.slice_count
if self.slice_id < self.total_row_count - self.row_count * self.slice_count:
self.row_count += 1
self.start_pos = self.row_count * self.slice_id
else:
self.start_pos = self.row_count * self.slice_id + (self.total_row_count - self.row_count * self.slice_count)
def _get_reader(self):
fp = open(self.file_path, "r")
fp.seek(self.lineid_to_offset[self.start_pos])
return fp
def _seek(self, offset=0):
try:
print("slice_id {} seek offset {}".format(self.slice_id, self.start_pos + offset))
self._reader.seek(self.lineid_to_offset[self.start_pos + offset])
self.data_cnt = offset
except Exception:
print("slice_id {} seek offset {}".format(self.slice_id, offset))
self._reader.seek(self.lineid_to_offset[offset])
self.data_cnt = offset
def __del__(self):
self._reader.close()
def __len__(self):
return self.row_count
def get_total_row_count(self):
return self.total_row_count
def __getitem__(self, index):
if self.data_cnt == self.row_count:
print("reach the end of datafile, start a new reader")
self.data_cnt = 0
self._reader = self._get_reader()
column_l = self._reader.readline().rstrip("\n").split(self.separator)
self.data_cnt += 1
column_l = [dtype(column_l[col_id]) for col_id, dtype in zip(self.selected_col_ids, self.dtypes)]
return column_l
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment