Commit 18a6d85c authored by Myle Ott

Add explicit dimension to softmax calls

parent 7da4e062
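Newer PyTorch releases warn when `F.softmax` or `F.log_softmax` is called on a multi-dimensional input without an explicit `dim`, since the implicitly chosen axis is ambiguous. A minimal sketch of the pattern this commit applies throughout (shapes are illustrative):

```python
import torch
import torch.nn.functional as F

scores = torch.randn(4, 10)          # e.g. (batch, vocab), illustrative shape
# probs = F.softmax(scores)          # implicit dim: ambiguous, warns on newer PyTorch
probs = F.softmax(scores, dim=1)     # explicit dim: normalize over the vocabulary axis
assert torch.allclose(probs.sum(dim=1), torch.ones(4))
```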
@@ -24,8 +24,8 @@ If you use the code in your paper, then please cite it as:
* Python version 3.6
* A [PyTorch installation](http://pytorch.org/)
-Currently fairseq-py requires installing PyTorch from source.
-Please follow the instructions here: https://github.com/pytorch/pytorch#from-source.
+Currently fairseq-py requires PyTorch version >= 0.3.0.
+Please follow the instructions here: https://github.com/pytorch/pytorch#installation.
If you use Docker make sure to increase the shared memory size either with `--ipc=host` or `--shm-size` as command line
options to `nvidia-docker run`.
@@ -57,7 +57,7 @@ class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
3) logging outputs to display while training
"""
net_output = model(**sample['net_input'])
-input = F.log_softmax(net_output.view(-1, net_output.size(-1)))
+input = F.log_softmax(net_output.view(-1, net_output.size(-1)), dim=1)
target = sample['target'].view(-1)
loss = LabelSmoothedNLLLoss.apply(input, target, self.eps, self.padding_idx, self.weights)
sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
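Here the decoder output is flattened to two dimensions before the log-softmax, so `dim=1` is the vocabulary axis. A small sketch with illustrative shapes:

```python
import torch
import torch.nn.functional as F

net_output = torch.randn(2, 5, 100)   # (bsz, tgt_len, vocab), illustrative shape
lprobs = F.log_softmax(net_output.view(-1, net_output.size(-1)), dim=1)
assert lprobs.size() == (10, 100)     # one row of log-probs per target token
assert torch.allclose(lprobs.exp().sum(dim=1), torch.ones(10))
```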
@@ -87,7 +87,7 @@ class FConvEncoder(FairseqEncoder):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
-x = F.glu(x, dim=-1)
+x = F.glu(x, dim=2)
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
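`F.glu` splits the given dimension in half and gates one half with the sigmoid of the other, so `dim=2` halves the channel dimension that the convolution doubled. A quick check of that equivalence, with an illustrative shape:

```python
import torch
import torch.nn.functional as F

x = torch.randn(7, 2, 16)             # (time, batch, 2 * channels), illustrative shape
a, b = x.chunk(2, dim=2)              # split the channel dimension in half
assert torch.allclose(a * torch.sigmoid(b), F.glu(x, dim=2))
```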
@@ -128,7 +128,7 @@ class AttentionLayer(nn.Module):
# softmax over last dim
sz = x.size()
-x = F.softmax(x.view(sz[0] * sz[1], sz[2]))
+x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
x = x.view(sz)
attn_scores = x
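Because the view only merges the two leading dimensions, softmax over `dim=1` of the flattened tensor matches softmax over the last dimension of the original 3-D tensor. A quick check, with illustrative shapes:

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 4, 6)              # e.g. (bsz, tgt_len, src_len), illustrative shape
sz = x.size()
flat = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1).view(sz)
assert torch.allclose(flat, F.softmax(x, dim=2))
```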
@@ -234,7 +234,7 @@ class FConvDecoder(FairseqIncrementalDecoder):
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
x = conv.remove_future_timesteps(x)
-x = F.glu(x)
+x = F.glu(x, dim=2)
# attention
if attention is not None:
@@ -94,7 +94,7 @@ class AttentionLayer(nn.Module):
# compute attention
attn_scores = (source_hids * x.unsqueeze(0)).sum(dim=2)
-attn_scores = F.softmax(attn_scores.t()).t()  # srclen x bsz
+attn_scores = F.softmax(attn_scores.t(), dim=1).t()  # srclen x bsz
# sum weighted sources
x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
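Here `attn_scores` is srclen x bsz, so the transpose-softmax-transpose normalizes over source positions for each batch element; passing `dim=0` directly would give the same result. A quick check, assuming an illustrative shape:

```python
import torch
import torch.nn.functional as F

scores = torch.randn(5, 3)                      # (srclen, bsz), illustrative shape
weights = F.softmax(scores.t(), dim=1).t()      # pattern used above
assert torch.allclose(weights, F.softmax(scores, dim=0))
assert torch.allclose(weights.sum(dim=0), torch.ones(3))
```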
@@ -326,7 +326,7 @@ class SequenceGenerator(object):
avg_attn = None
for model, encoder_out in zip(self.models, encoder_outs):
decoder_out, attn = model.decoder(tokens, encoder_out)
-probs = F.softmax(decoder_out[:, -1, :]).data
+probs = F.softmax(decoder_out[:, -1, :], dim=1).data
attn = attn[:, -1, :].data
if avg_probs is None or avg_attn is None:
avg_probs = probs
@@ -54,7 +54,7 @@ class build_py_hook(build_py):
setup(
name='fairseq',
-version='0.2.0',
+version='0.3.0',
description='Facebook AI Research Sequence-to-Sequence Toolkit',
long_description=readme,
license=license,