Commit 18a6d85c authored by Myle Ott

Add explicit dimension to softmax calls

parent 7da4e062
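Background for the change: PyTorch's `softmax`/`log_softmax` historically inferred which dimension to normalize over, which is ambiguous for tensors with more than two dimensions (implicit-dim calls were deprecated in later PyTorch releases). A minimal sketch of the explicit form, with made-up shapes:

```python
import torch
import torch.nn.functional as F

# Hypothetical input: 4 rows of 10 class scores each.
x = torch.randn(4, 10)

# Passing dim explicitly makes the normalized axis unambiguous.
probs = F.softmax(x, dim=1)   # normalize over the class dimension

# Each row is now a probability distribution.
assert torch.allclose(probs.sum(dim=1), torch.ones(4))
```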
@@ -24,8 +24,8 @@ If you use the code in your paper, then please cite it as:
 * Python version 3.6
 * A [PyTorch installation](http://pytorch.org/)
-Currently fairseq-py requires installing PyTorch from source.
-Please follow the instructions here: https://github.com/pytorch/pytorch#from-source.
+Currently fairseq-py requires PyTorch version >= 0.3.0.
+Please follow the instructions here: https://github.com/pytorch/pytorch#installation.

 If you use Docker make sure to increase the shared memory size either with `--ipc=host` or `--shm-size` as command line
 options to `nvidia-docker run`.
...
@@ -57,7 +57,7 @@ class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
         3) logging outputs to display while training
         """
         net_output = model(**sample['net_input'])
-        input = F.log_softmax(net_output.view(-1, net_output.size(-1)))
+        input = F.log_softmax(net_output.view(-1, net_output.size(-1)), dim=1)
         target = sample['target'].view(-1)
         loss = LabelSmoothedNLLLoss.apply(input, target, self.eps, self.padding_idx, self.weights)
         sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
...
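For reference, a shape sketch of the `log_softmax` call in the hunk above; the tensor sizes here are assumed for illustration, not taken from the repo:

```python
import torch
import torch.nn.functional as F

# Assumed shapes: decoder output of (bsz, tgt_len, vocab).
net_output = torch.randn(2, 5, 100)
flat = net_output.view(-1, net_output.size(-1))   # (bsz * tgt_len, vocab)
lprobs = F.log_softmax(flat, dim=1)               # dim=1 is the vocab axis

# Each row sums to 1 in probability space.
assert torch.allclose(lprobs.exp().sum(dim=1), torch.ones(flat.size(0)))
```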
@@ -87,7 +87,7 @@ class FConvEncoder(FairseqEncoder):
             residual = x if proj is None else proj(x)
             x = F.dropout(x, p=self.dropout, training=self.training)
             x = conv(x)
-            x = F.glu(x, dim=-1)
+            x = F.glu(x, dim=2)
             x = (x + residual) * math.sqrt(0.5)

         # T x B x C -> B x T x C
@@ -128,7 +128,7 @@ class AttentionLayer(nn.Module):
         # softmax over last dim
         sz = x.size()
-        x = F.softmax(x.view(sz[0] * sz[1], sz[2]))
+        x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
         x = x.view(sz)
         attn_scores = x
@@ -234,7 +234,7 @@ class FConvDecoder(FairseqIncrementalDecoder):
             x = F.dropout(x, p=self.dropout, training=self.training)
             x = conv(x)
             x = conv.remove_future_timesteps(x)
-            x = F.glu(x)
+            x = F.glu(x, dim=2)

             # attention
             if attention is not None:
...
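Both fconv hunks pin the GLU gate to `dim=2`, the channel axis of a 3-D activation. A small sketch of how `F.glu` halves the chosen dimension, with assumed sizes:

```python
import torch
import torch.nn.functional as F

# Assumed layout: (time, batch, 2 * channels). GLU splits dim=2 in half,
# using one half as values and the sigmoid of the other half as gates,
# so the output size along that dimension is halved.
x = torch.randn(20, 16, 512)
y = F.glu(x, dim=2)
print(y.size())   # torch.Size([20, 16, 256])
```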
@@ -94,7 +94,7 @@ class AttentionLayer(nn.Module):
         # compute attention
         attn_scores = (source_hids * x.unsqueeze(0)).sum(dim=2)
-        attn_scores = F.softmax(attn_scores.t()).t()  # srclen x bsz
+        attn_scores = F.softmax(attn_scores.t(), dim=1).t()  # srclen x bsz

         # sum weighted sources
         x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
...
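The attention hunk normalizes over source positions: since the scores are stored as srclen x bsz, the code transposes, applies softmax over `dim=1`, and transposes back. A sketch with assumed sizes:

```python
import torch
import torch.nn.functional as F

# Assumed: attn_scores is (srclen, bsz), as the comment in the hunk notes.
attn_scores = torch.randn(7, 3)
attn = F.softmax(attn_scores.t(), dim=1).t()   # normalize over srclen

# Each column (one sentence) now sums to 1 across source positions.
assert torch.allclose(attn.sum(dim=0), torch.ones(3))
```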
@@ -326,7 +326,7 @@ class SequenceGenerator(object):
         avg_attn = None
         for model, encoder_out in zip(self.models, encoder_outs):
             decoder_out, attn = model.decoder(tokens, encoder_out)
-            probs = F.softmax(decoder_out[:, -1, :]).data
+            probs = F.softmax(decoder_out[:, -1, :], dim=1).data
             attn = attn[:, -1, :].data
             if avg_probs is None or avg_attn is None:
                 avg_probs = probs
...
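This hunk sits inside an ensembling loop: each model's last-step logits become probabilities with an explicit `dim=1`, then get averaged across models. A self-contained sketch of that averaging, with hypothetical tensors in place of real decoder outputs:

```python
import torch
import torch.nn.functional as F

# Hypothetical stand-ins for per-model decoder outputs: (bsz, tgt_len, vocab).
decoder_outs = [torch.randn(2, 4, 50) for _ in range(3)]

avg_probs = None
for decoder_out in decoder_outs:
    probs = F.softmax(decoder_out[:, -1, :], dim=1)  # last step, vocab axis
    avg_probs = probs if avg_probs is None else avg_probs + probs
avg_probs.div_(len(decoder_outs))

# The averaged distribution still sums to 1 per sentence.
assert torch.allclose(avg_probs.sum(dim=1), torch.ones(2))
```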
@@ -54,7 +54,7 @@ class build_py_hook(build_py):

 setup(
     name='fairseq',
-    version='0.2.0',
+    version='0.3.0',
     description='Facebook AI Research Sequence-to-Sequence Toolkit',
     long_description=readme,
     license=license,
...