Commit 18a6d85c authored by Myle Ott

Add explicit dimension to softmax calls

parent 7da4e062
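Newer PyTorch releases warn when `F.softmax` or `F.log_softmax` is called on a multi-dimensional input without an explicit `dim`, since the implicitly chosen axis is ambiguous. A minimal sketch of the pattern this commit applies throughout (shapes are illustrative):

```python
import torch
import torch.nn.functional as F

scores = torch.randn(4, 10)          # e.g. (batch, vocab), illustrative shape
# probs = F.softmax(scores)          # implicit dim: ambiguous, warns on newer PyTorch
probs = F.softmax(scores, dim=1)     # explicit dim: normalize over the vocabulary axis
assert torch.allclose(probs.sum(dim=1), torch.ones(4))
```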
@@ -24,8 +24,8 @@ If you use the code in your paper, then please cite it as:
* Python version 3.6
* A [PyTorch installation](http://pytorch.org/)
-Currently fairseq-py requires installing PyTorch from source.
-Please follow the instructions here: https://github.com/pytorch/pytorch#from-source.
+Currently fairseq-py requires PyTorch version >= 0.3.0.
+Please follow the instructions here: https://github.com/pytorch/pytorch#installation.
If you use Docker make sure to increase the shared memory size either with `--ipc=host` or `--shm-size` as command line
options to `nvidia-docker run`.
@@ -57,7 +57,7 @@ class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
3) logging outputs to display while training
"""
net_output = model(**sample['net_input'])
-input = F.log_softmax(net_output.view(-1, net_output.size(-1)))
+input = F.log_softmax(net_output.view(-1, net_output.size(-1)), dim=1)
target = sample['target'].view(-1)
loss = LabelSmoothedNLLLoss.apply(input, target, self.eps, self.padding_idx, self.weights)
sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
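Here the decoder output is flattened to two dimensions before the log-softmax, so `dim=1` is the vocabulary axis. A small sketch with illustrative shapes:

```python
import torch
import torch.nn.functional as F

net_output = torch.randn(2, 5, 100)   # (bsz, tgt_len, vocab), illustrative shape
lprobs = F.log_softmax(net_output.view(-1, net_output.size(-1)), dim=1)
assert lprobs.size() == (10, 100)     # one row of log-probs per target token
assert torch.allclose(lprobs.exp().sum(dim=1), torch.ones(10))
```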
@@ -87,7 +87,7 @@ class FConvEncoder(FairseqEncoder):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
-x = F.glu(x, dim=-1)
+x = F.glu(x, dim=2)
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
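`F.glu` splits the given dimension in half and gates one half with the sigmoid of the other, so `dim=2` halves the channel dimension that the convolution doubled. A quick check of that equivalence, with an illustrative shape:

```python
import torch
import torch.nn.functional as F

x = torch.randn(7, 2, 16)             # (time, batch, 2 * channels), illustrative shape
a, b = x.chunk(2, dim=2)              # split the channel dimension in half
assert torch.allclose(a * torch.sigmoid(b), F.glu(x, dim=2))
```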
@@ -128,7 +128,7 @@ class AttentionLayer(nn.Module):
# softmax over last dim
sz = x.size()
-x = F.softmax(x.view(sz[0] * sz[1], sz[2]))
+x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
x = x.view(sz)
attn_scores = x
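Because the view only merges the two leading dimensions, softmax over `dim=1` of the flattened tensor matches softmax over the last dimension of the original 3-D tensor. A quick check, with illustrative shapes:

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 4, 6)              # e.g. (bsz, tgt_len, src_len), illustrative shape
sz = x.size()
flat = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1).view(sz)
assert torch.allclose(flat, F.softmax(x, dim=2))
```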
@@ -234,7 +234,7 @@ class FConvDecoder(FairseqIncrementalDecoder):
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
x = conv.remove_future_timesteps(x)
-x = F.glu(x)
+x = F.glu(x, dim=2)
# attention
if attention is not None:
@@ -94,7 +94,7 @@ class AttentionLayer(nn.Module):
# compute attention
attn_scores = (source_hids * x.unsqueeze(0)).sum(dim=2)
-attn_scores = F.softmax(attn_scores.t()).t()  # srclen x bsz
+attn_scores = F.softmax(attn_scores.t(), dim=1).t()  # srclen x bsz
# sum weighted sources
x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
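Here `attn_scores` is srclen x bsz, so the transpose-softmax-transpose normalizes over source positions for each batch element; passing `dim=0` directly would give the same result. A quick check, assuming an illustrative shape:

```python
import torch
import torch.nn.functional as F

scores = torch.randn(5, 3)                      # (srclen, bsz), illustrative shape
weights = F.softmax(scores.t(), dim=1).t()      # pattern used above
assert torch.allclose(weights, F.softmax(scores, dim=0))
assert torch.allclose(weights.sum(dim=0), torch.ones(3))
```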
@@ -326,7 +326,7 @@ class SequenceGenerator(object):
avg_attn = None
for model, encoder_out in zip(self.models, encoder_outs):
decoder_out, attn = model.decoder(tokens, encoder_out)
-probs = F.softmax(decoder_out[:, -1, :]).data
+probs = F.softmax(decoder_out[:, -1, :], dim=1).data
attn = attn[:, -1, :].data
if avg_probs is None or avg_attn is None:
avg_probs = probs
@@ -54,7 +54,7 @@ class build_py_hook(build_py):
setup(
name='fairseq',
-version='0.2.0',
+version='0.3.0',
description='Facebook AI Research Sequence-to-Sequence Toolkit',
long_description=readme,
license=license,