Commit b3864b28 authored by freewym, committed by Facebook Github Bot

avoid "divided by zero error" in logging_outputs when --use-bmuf is e… (#812)

Summary:
… enabled.

When doing multi-GPU training with --use-bmuf turned on and --global-sync-iter > 1, each replica may not sync with the other replicas at every iteration, so logging_outputs only contains each replica's own stats. Moreover, logging_outputs may be empty at the end of an epoch after a "dummy iteration" when the number of replicas does not evenly divide the number of batches in the training data. When that happens, sample_size and ntokens are 0 on some replicas, causing a "division by zero" error. This fix sets each *loss value to 0 whenever the corresponding sample_size/ntokens is 0 (see the sketch below).
Pull Request resolved: https://github.com/pytorch/fairseq/pull/812

Reviewed By: myleott, yqwangustc

Differential Revision: D15908614

Pulled By: nayansinghal

fbshipit-source-id: c92e8e095f012bdb4ef753a3c627fd215afa215d
parent d9c79133
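
A minimal standalone sketch (not part of this patch) of the failure mode and the guard; the variable names mirror the criterions below, but the snippet is illustrative only:

import math

# A replica that ended the epoch on a dummy iteration has no stats to report.
logging_outputs = []

loss_sum = sum(log.get('loss', 0) for log in logging_outputs)
sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)

# Before this fix: ZeroDivisionError, because sample_size == 0 here.
# loss = loss_sum / sample_size / math.log(2)

# After this fix: fall back to 0. when there is nothing to average over.
loss = loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.
print(loss)  # 0.0
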
@@ -83,12 +83,12 @@ class AdaptiveLoss(FairseqCriterion):
         nsentences = sum(log.get('nsentences', 0) for log in logging_outputs)
         sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
         agg_output = {
-            'loss': loss_sum / sample_size / math.log(2),
-            'nll_loss': loss_sum / sample_size / math.log(2),
+            'loss': loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.,
+            'nll_loss': loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.,
             'ntokens': ntokens,
             'nsentences': nsentences,
             'sample_size': sample_size,
         }
         if sample_size != ntokens:
-            agg_output['nll_loss'] = loss_sum / ntokens / math.log(2)
+            agg_output['nll_loss'] = loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.
         return agg_output
@@ -58,7 +58,7 @@ class CrossEntropyCriterion(FairseqCriterion):
         nsentences = sum(log.get('nsentences', 0) for log in logging_outputs)
         sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
         agg_output = {
-            'loss': loss_sum / sample_size / math.log(2),
+            'loss': loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.,
             'ntokens': ntokens,
             'nsentences': nsentences,
             'sample_size': sample_size,
@@ -68,8 +68,8 @@ class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
         nsentences = sum(log.get('nsentences', 0) for log in logging_outputs)
         sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
         return {
-            'loss': sum(log.get('loss', 0) for log in logging_outputs) / sample_size / math.log(2),
-            'nll_loss': sum(log.get('nll_loss', 0) for log in logging_outputs) / ntokens / math.log(2),
+            'loss': sum(log.get('loss', 0) for log in logging_outputs) / sample_size / math.log(2) if sample_size > 0 else 0.,
+            'nll_loss': sum(log.get('nll_loss', 0) for log in logging_outputs) / ntokens / math.log(2) if ntokens > 0 else 0.,
             'ntokens': ntokens,
             'nsentences': nsentences,
             'sample_size': sample_size,
@@ -138,10 +138,10 @@ class MaskedLmLoss(FairseqCriterion):
         agg_loss = sum(log.get('loss', 0) for log in logging_outputs)
         agg_output = {
-            'loss': agg_loss / sample_size / math.log(2),
-            'lm_loss': lm_loss_sum / ntokens / math.log(2),
-            'sentence_loss': sentence_loss_sum / nsentences / math.log(2),
-            'nll_loss': lm_loss_sum / ntokens / math.log(2),
+            'loss': agg_loss / sample_size / math.log(2) if sample_size > 0 else 0.,
+            'lm_loss': lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.,
+            'sentence_loss': sentence_loss_sum / nsentences / math.log(2) if nsentences > 0 else 0.,
+            'nll_loss': lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.,
             'ntokens': ntokens,
             'nsentences': nsentences,
             'sample_size': sample_size,
         }
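
Every hunk above repeats the same guard; a hypothetical helper (not introduced by this patch, and not part of fairseq) could express it once:

import math

def safe_bits(total, count):
    # Hypothetical helper: average `total` over `count` in base 2 (bits),
    # returning 0. when `count` is zero, e.g. after a dummy iteration
    # under --use-bmuf.
    return total / count / math.log(2) if count > 0 else 0.

# Usage sketch: 'loss': safe_bits(loss_sum, sample_size)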