Commit 2f976aae authored by Sergey Edunov, committed by Sergey Edunov

Making our code compatible with the latest pytorch (#223)

* Making our code compatible with the latest pytorch

* revert

* torch.nn.utils.clip_grad_norm now returns a tensor
parent 9438019f
@@ -9,7 +9,7 @@ import math
 import torch.nn.functional as F
 from . import FairseqCriterion, register_criterion
+from fairseq import utils

 @register_criterion('cross_entropy')
 class CrossEntropyCriterion(FairseqCriterion):
@@ -33,7 +33,7 @@ class CrossEntropyCriterion(FairseqCriterion):
             reduce=reduce)
         sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
         logging_output = {
-            'loss': loss.data[0] if reduce else loss.data,
+            'loss': utils.item(loss.data) if reduce else loss.data,
             'ntokens': sample['ntokens'],
             'sample_size': sample_size,
         }
...
@@ -79,8 +79,8 @@ class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
         nll_loss = F.nll_loss(lprobs, target, size_average=False, ignore_index=self.padding_idx, reduce=reduce)
         sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
         logging_output = {
-            'loss': loss.data[0] if reduce else loss.data,
-            'nll_loss': nll_loss.data[0] if reduce else loss.data,
+            'loss': utils.item(loss.data) if reduce else loss.data,
+            'nll_loss': utils.item(nll_loss.data) if reduce else loss.data,
             'ntokens': sample['ntokens'],
             'sample_size': sample_size,
         }
...
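For context on the two criterion hunks above: with reduction enabled, recent PyTorch (0.4+) returns the loss as a 0-dimensional tensor, and indexing it with [0] is no longer allowed, so the scalar is now pulled out through the utils.item helper added at the bottom of this diff. A minimal sketch of the difference, assuming PyTorch >= 0.4 (variable names are illustrative):

    import torch
    import torch.nn.functional as F

    logits = torch.randn(3, 5, requires_grad=True)
    target = torch.tensor([1, 0, 4])

    loss = F.cross_entropy(logits, target)   # reduced loss is a 0-dim tensor
    print(loss.dim())    # 0
    print(loss.item())   # plain Python float
    # loss.data[0] on 0.4+ raises an IndexError telling you to use tensor.item()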
@@ -116,7 +116,7 @@ def all_gather_list(data, max_size=4096):
     if len(enc) >= max_size:
         raise ValueError('encoded data exceeds max_size: {}'.format(len(enc)))
     in_buffer[0] = len(enc)
-    in_buffer[1:len(enc)+1] = torch.ByteTensor(enc)
+    in_buffer[1:len(enc)+1] = torch.ByteTensor(list(enc))
     torch.distributed.all_gather(out_buffers, in_buffer.cuda())
...
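The all_gather_list hunk above works around a constructor difference: enc holds pickled bytes, and passing a raw bytes object straight to torch.ByteTensor apparently stopped working on newer PyTorch, whereas a list of ints is accepted by every version. A small round-trip sketch of the same pattern (variable names are illustrative, not from fairseq):

    import pickle
    import torch

    payload = {'ntokens': 42}          # example object to gather
    enc = pickle.dumps(payload)        # bytes, as in all_gather_list

    # list(enc) gives a list of ints in 0..255, which ByteTensor accepts
    buf = torch.ByteTensor(list(enc))

    # receiving side: back to bytes, then unpickle
    decoded = pickle.loads(bytes(buf.tolist()))
    print(decoded == payload)          # True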
@@ -190,7 +190,7 @@ class Trainer(object):
         # clip grads
         if self.args.clip_norm > 0:
-            grad_norm = torch.nn.utils.clip_grad_norm(self.model.parameters(), self.args.clip_norm)
+            grad_norm = utils.item(torch.nn.utils.clip_grad_norm(self.model.parameters(), self.args.clip_norm))
         else:
             grad_norm = math.sqrt(sum(p.grad.data.norm()**2 for p in self.model.parameters()))
...
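The trainer hunk matches the commit-message note that torch.nn.utils.clip_grad_norm now returns a tensor rather than a Python float, so the result is wrapped in utils.item before it is logged. A minimal sketch of the same idea, written against the renamed clip_grad_norm_ available in current PyTorch (the commit itself still calls the older name):

    import torch

    model = torch.nn.Linear(4, 2)
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()

    # Returns the total gradient norm; on recent PyTorch this is a 0-dim tensor.
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
    print(grad_norm.item())   # convert to a plain float for logging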
@@ -304,3 +304,10 @@ def convert_padding_direction(
     else:
         index = torch.remainder(range + num_pads, max_len)
     return src_tokens.gather(1, index)
+
+
+def item(tensor):
+    if hasattr(tensor, 'item'):
+        return tensor.item()
+    if hasattr(tensor, '__getitem__'):
+        return tensor[0]
+    return tensor
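The item helper added above is what the earlier hunks call as utils.item(...): it prefers .item() when the argument supports it (PyTorch 0.4+ tensors), falls back to [0] indexing for older tensors/Variables, and passes plain Python numbers through untouched. A short usage sketch, with the helper re-declared locally so the snippet is self-contained:

    import torch

    def item(tensor):
        # mirrors the helper added in this commit
        if hasattr(tensor, 'item'):
            return tensor.item()
        if hasattr(tensor, '__getitem__'):
            return tensor[0]
        return tensor

    print(item(torch.tensor(3.5)))    # 0-dim tensor  -> 3.5
    print(item(torch.tensor([2.0])))  # 1-elem tensor -> 2.0
    print(item(7))                    # plain number  -> 7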