Commit 4db6579a authored by Myle Ott's avatar Myle Ott
Browse files

Move normalization of model output (e.g., via LSM) into model definition

parent c21a6e29
...@@ -26,9 +26,9 @@ class CrossEntropyCriterion(FairseqCriterion): ...@@ -26,9 +26,9 @@ class CrossEntropyCriterion(FairseqCriterion):
3) logging outputs to display while training 3) logging outputs to display while training
""" """
net_output = model(**sample['net_input']) net_output = model(**sample['net_input'])
input = net_output.view(-1, net_output.size(-1)) lprobs = model.get_normalized_probs(net_output, log_probs=True)
target = sample['target'].view(-1) target = sample['target'].view(-1)
loss = F.cross_entropy(input, target, size_average=False, ignore_index=self.padding_idx, loss = F.nll_loss(lprobs, target, size_average=False, ignore_index=self.padding_idx,
reduce=reduce) reduce=reduce)
sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
logging_output = { logging_output = {
......
...@@ -62,9 +62,9 @@ class LabelSmoothedCrossEntropyCriterion(FairseqCriterion): ...@@ -62,9 +62,9 @@ class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
3) logging outputs to display while training 3) logging outputs to display while training
""" """
net_output = model(**sample['net_input']) net_output = model(**sample['net_input'])
input = F.log_softmax(net_output.view(-1, net_output.size(-1)), dim=1) lprobs = model.get_normalized_probs(net_output, log_probs=True)
target = sample['target'].view(-1) target = sample['target'].view(-1)
loss = LabelSmoothedNLLLoss.apply(input, target, self.eps, self.padding_idx, self.weights, reduce) loss = LabelSmoothedNLLLoss.apply(lprobs, target, self.eps, self.padding_idx, self.weights, reduce)
sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
logging_output = { logging_output = {
'loss': loss.data[0] if reduce else loss.data, 'loss': loss.data[0] if reduce else loss.data,
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
# #
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F
class FairseqDecoder(nn.Module): class FairseqDecoder(nn.Module):
...@@ -15,6 +16,15 @@ class FairseqDecoder(nn.Module): ...@@ -15,6 +16,15 @@ class FairseqDecoder(nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
def get_normalized_probs(self, net_output, log_probs):
    """Get normalized probabilities (or log probs) from a net's output.

    Flattens the output to ``(N, vocab)``, normalizes over the vocabulary
    (last) dimension, then restores the original tensor shape.
    """
    vocab_size = net_output.size(-1)
    flat_logits = net_output.view(-1, vocab_size)
    # Pick log-space or probability-space normalization as requested.
    normalize = F.log_softmax if log_probs else F.softmax
    return normalize(flat_logits, dim=1).view_as(net_output)
def max_positions(self): def max_positions(self):
"""Maximum input length supported by the decoder.""" """Maximum input length supported by the decoder."""
raise NotImplementedError raise NotImplementedError
......
...@@ -37,7 +37,7 @@ class FairseqIncrementalDecoder(FairseqDecoder): ...@@ -37,7 +37,7 @@ class FairseqIncrementalDecoder(FairseqDecoder):
with model.decoder.incremental_inference(): with model.decoder.incremental_inference():
for step in range(maxlen): for step in range(maxlen):
out, _ = model.decoder(tokens[:, :step], encoder_out) out, _ = model.decoder(tokens[:, :step], encoder_out)
probs = torch.nn.functional.log_softmax(out[:, -1, :]) probs = model.get_normalized_probs(out[:, -1, :], log_probs=False)
``` ```
""" """
class IncrementalInference(object): class IncrementalInference(object):
......
...@@ -35,6 +35,10 @@ class FairseqModel(nn.Module): ...@@ -35,6 +35,10 @@ class FairseqModel(nn.Module):
decoder_out, _ = self.decoder(input_tokens, encoder_out) decoder_out, _ = self.decoder(input_tokens, encoder_out)
return decoder_out.view(-1, decoder_out.size(-1)) return decoder_out.view(-1, decoder_out.size(-1))
def get_normalized_probs(self, net_output, log_probs):
"""Get normalized probabilities (or log probs) from a net's output."""
return self.decoder.get_normalized_probs(net_output, log_probs)
def max_encoder_positions(self): def max_encoder_positions(self):
"""Maximum input length supported by the encoder.""" """Maximum input length supported by the encoder."""
return self.encoder.max_positions() return self.encoder.max_positions()
......
...@@ -328,7 +328,7 @@ class SequenceGenerator(object): ...@@ -328,7 +328,7 @@ class SequenceGenerator(object):
avg_attn = None avg_attn = None
for model, encoder_out in zip(self.models, encoder_outs): for model, encoder_out in zip(self.models, encoder_outs):
decoder_out, attn = model.decoder(tokens, encoder_out) decoder_out, attn = model.decoder(tokens, encoder_out)
probs = F.softmax(decoder_out[:, -1, :], dim=1).data probs = model.get_normalized_probs(decoder_out[:, -1, :], log_probs=False).data
if avg_probs is None: if avg_probs is None:
avg_probs = probs avg_probs = probs
else: else:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment