update adaptive softmax head

438db43d · thomwolf · c306869e · 438db43d
Commit 438db43d authored Feb 07, 2019 by thomwolf
Show whitespace changes
Inline Side-by-side

Showing with 54 additions and 32 deletions

pytorch_pretrained_bert/modeling_transfo_xl_utilities.py pytorch_pretrained_bert/modeling_transfo_xl_utilities.py +54 -32

No files found.
--- a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
@@ -89,15 +89,23 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):

        return logit

-    def forward(self, hidden, target, keep_order=False):
+    def forward(self, hidden, target=None, keep_order=False):
        '''
+            Params:
                hidden :: [len*bsz x d_proj]
                target :: [len*bsz]
+            Return:
+                if target is None:
+                    out :: [len*bsz] Negative log likelihood
+                else:
+                    out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
            We could replace this implementation by the native PyTorch one
-            if their was an option to set bias on all clusters in the native one.
-            line https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
+            if their's had an option to set bias on all clusters in the native one.
+            here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
        '''

+        if target is not None:
+            target = target.view(-1)
            if hidden.size(0) != target.size(0):
                raise RuntimeError('Input and target should have the same size '
                                'in the batch dimension.')
@@ -105,8 +113,11 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
        if self.n_clusters == 0:
            logit = self._compute_logit(hidden, self.out_layers[0].weight,
                                        self.out_layers[0].bias, self.out_projs[0])
-            nll = -F.log_softmax(logit, dim=-1) \
+            if target is not None:
+                output = -F.log_softmax(logit, dim=-1) \
                        .gather(1, target.unsqueeze(1)).squeeze(1)
+            else:
+                output = F.log_softmax(logit, dim=-1)
        else:
            # construct weights and biases
            weights, biases = [], []
@@ -133,14 +144,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
            head_logprob = F.log_softmax(head_logit, dim=1)

-            nll = torch.zeros_like(target,
-                    dtype=hidden.dtype, device=hidden.device)
+            if target is None:
+                out = hidden.new_empty((head_logit.size(0), self.n_token))
+            else:
+                out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device)

            offset = 0
            cutoff_values = [0] + self.cutoffs
            for i in range(len(cutoff_values) - 1):
                l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]

+                if target is not None:
                    mask_i = (target >= l_idx) & (target < r_idx)
                    indices_i = mask_i.nonzero().squeeze()

@@ -149,28 +163,36 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):

                    target_i = target.index_select(0, indices_i) - l_idx
                    head_logprob_i = head_logprob.index_select(0, indices_i)
+                    hidden_i = hidden.index_select(0, indices_i)
+                else:
+                    hidden_i = hidden

                if i == 0:
+                    if target is not None:
                        logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1)
+                    else:
+                        out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
                else:
                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]

-                    hidden_i = hidden.index_select(0, indices_i)
-
                    tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
+                    if target is not None:
                        logprob_i = head_logprob_i[:, cluster_prob_idx] \
                                + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
+                    else:
+                        logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
+                        out[:, l_idx:r_idx] = logprob_i

+                if target is not None:
                    if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
-                    nll.index_copy_(0, indices_i, -logprob_i)
+                        out.index_copy_(0, indices_i, -logprob_i)
                    else:
-                    nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
-
+                        out[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
                    offset += logprob_i.size(0)

-        return nll
+        return out


    def log_prob(self, hidden):