Commit b3864b28 authored by freewym, committed by Facebook Github Bot

avoid "divided by zero error" in logging_outputs when --use-bmuf is e… (#812)

Summary:
… enabled.

When doing multi-GPU training with --use-bmuf turned on and --global-sync-iter > 1, each replica may not sync with the other replicas at every iteration, so logging_outputs only contains each replica's own stats. Moreover, logging_outputs may be empty at the end of an epoch after a "dummy iteration" when the number of replicas does not evenly divide the number of batches in the training data. When that happens, sample_size and ntokens are 0 on some replicas, causing a "division by zero" error. This fix sets each *loss value to 0 whenever the corresponding sample_size/ntokens is 0 (see the sketch below).
Pull Request resolved: https://github.com/pytorch/fairseq/pull/812

Reviewed By: myleott, yqwangustc

Differential Revision: D15908614

Pulled By: nayansinghal

fbshipit-source-id: c92e8e095f012bdb4ef753a3c627fd215afa215d
parent d9c79133
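
A minimal standalone sketch (not part of this patch) of the failure mode and the guard; the variable names mirror the criterions below, but the snippet is illustrative only:

import math

# A replica that ended the epoch on a dummy iteration has no stats to report.
logging_outputs = []

loss_sum = sum(log.get('loss', 0) for log in logging_outputs)
sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)

# Before this fix: ZeroDivisionError, because sample_size == 0 here.
# loss = loss_sum / sample_size / math.log(2)

# After this fix: fall back to 0. when there is nothing to average over.
loss = loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.
print(loss)  # 0.0
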
@@ -83,12 +83,12 @@ class AdaptiveLoss(FairseqCriterion):
         nsentences = sum(log.get('nsentences', 0) for log in logging_outputs)
         sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
         agg_output = {
-            'loss': loss_sum / sample_size / math.log(2),
-            'nll_loss': loss_sum / sample_size / math.log(2),
+            'loss': loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.,
+            'nll_loss': loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.,
             'ntokens': ntokens,
             'nsentences': nsentences,
             'sample_size': sample_size,
         }
         if sample_size != ntokens:
-            agg_output['nll_loss'] = loss_sum / ntokens / math.log(2)
+            agg_output['nll_loss'] = loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.
         return agg_output
@@ -58,7 +58,7 @@ class CrossEntropyCriterion(FairseqCriterion):
         nsentences = sum(log.get('nsentences', 0) for log in logging_outputs)
         sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
         agg_output = {
-            'loss': loss_sum / sample_size / math.log(2),
+            'loss': loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.,
             'ntokens': ntokens,
             'nsentences': nsentences,
             'sample_size': sample_size,
@@ -68,8 +68,8 @@ class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
         nsentences = sum(log.get('nsentences', 0) for log in logging_outputs)
         sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
         return {
-            'loss': sum(log.get('loss', 0) for log in logging_outputs) / sample_size / math.log(2),
-            'nll_loss': sum(log.get('nll_loss', 0) for log in logging_outputs) / ntokens / math.log(2),
+            'loss': sum(log.get('loss', 0) for log in logging_outputs) / sample_size / math.log(2) if sample_size > 0 else 0.,
+            'nll_loss': sum(log.get('nll_loss', 0) for log in logging_outputs) / ntokens / math.log(2) if ntokens > 0 else 0.,
             'ntokens': ntokens,
             'nsentences': nsentences,
             'sample_size': sample_size,
@@ -138,10 +138,10 @@ class MaskedLmLoss(FairseqCriterion):
         agg_loss = sum(log.get('loss', 0) for log in logging_outputs)
         agg_output = {
-            'loss': agg_loss / sample_size / math.log(2),
-            'lm_loss': lm_loss_sum / ntokens / math.log(2),
-            'sentence_loss': sentence_loss_sum / nsentences / math.log(2),
-            'nll_loss': lm_loss_sum / ntokens / math.log(2),
+            'loss': agg_loss / sample_size / math.log(2) if sample_size > 0 else 0.,
+            'lm_loss': lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.,
+            'sentence_loss': sentence_loss_sum / nsentences / math.log(2) if nsentences > 0 else 0.,
+            'nll_loss': lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.,
             'ntokens': ntokens,
             'nsentences': nsentences,
             'sample_size': sample_size,
         }
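
Every hunk above repeats the same guard; a hypothetical helper (not introduced by this patch, and not part of fairseq) could express it once:

import math

def safe_bits(total, count):
    # Hypothetical helper: average `total` over `count` in base 2 (bits),
    # returning 0. when `count` is zero, e.g. after a dummy iteration
    # under --use-bmuf.
    return total / count / math.log(2) if count > 0 else 0.

# Usage sketch: 'loss': safe_bits(loss_sum, sample_size)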