"vscode:/vscode.git/clone" did not exist on "17485202632393af4967ce727acc320e017ece44"
Commit 9e8a8c05 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import ctypes
import math
import torch
try:
    from fairseq import libbleu
except ImportError as e:
    import sys
    sys.stderr.write('ERROR: missing libbleu.so. run `python setup.py install`\n')
    raise e

C = ctypes.cdll.LoadLibrary(libbleu.__file__)


class BleuStat(ctypes.Structure):
    # mirrors the bleu_stat struct defined in the C++ library
    _fields_ = [
        ('reflen', ctypes.c_size_t),
        ('predlen', ctypes.c_size_t),
        ('match1', ctypes.c_size_t),
        ('count1', ctypes.c_size_t),
        ('match2', ctypes.c_size_t),
        ('count2', ctypes.c_size_t),
        ('match3', ctypes.c_size_t),
        ('count3', ctypes.c_size_t),
        ('match4', ctypes.c_size_t),
        ('count4', ctypes.c_size_t),
    ]


class Scorer(object):
    def __init__(self, pad, eos):
        self.stat = BleuStat()
        self.pad = pad
        self.eos = eos
        self.reset()

    def reset(self, one_init=False):
        if one_init:
            C.bleu_one_init(ctypes.byref(self.stat))
        else:
            C.bleu_zero_init(ctypes.byref(self.stat))

    def add(self, ref, pred):
        if not isinstance(ref, torch.IntTensor):
            raise TypeError('ref must be a torch.IntTensor (got {})'
                            .format(type(ref)))
        if not isinstance(pred, torch.IntTensor):
            raise TypeError('pred must be a torch.IntTensor (got {})'
                            .format(type(pred)))

        # don't match unknown words
        rref = ref.clone()
        assert not rref.lt(0).any()
        #rref[rref.eq(self.unk)] = -999

        rref = rref.contiguous().view(-1)
        pred = pred.contiguous().view(-1)

        C.bleu_add(
            ctypes.byref(self.stat),
            ctypes.c_size_t(rref.size(0)),
            ctypes.c_void_p(rref.data_ptr()),
            ctypes.c_size_t(pred.size(0)),
            ctypes.c_void_p(pred.data_ptr()),
            ctypes.c_int(self.pad),
            ctypes.c_int(self.eos))

    def score(self, order=4):
        psum = sum(
            math.log(p) if p > 0 else float('-Inf')
            for p in self.precision()[:order]
        )
        return self.brevity() * math.exp(psum / order) * 100

    def precision(self):
        def ratio(a, b):
            return a / b if b > 0 else 0

        return [
            ratio(self.stat.match1, self.stat.count1),
            ratio(self.stat.match2, self.stat.count2),
            ratio(self.stat.match3, self.stat.count3),
            ratio(self.stat.match4, self.stat.count4),
        ]

    def brevity(self):
        # brevity penalty: min(1, exp(1 - reflen / predlen))
        r = self.stat.reflen / self.stat.predlen
        return min(1, math.exp(1 - r))

    def result_string(self, order=4):
        assert order <= 4, "BLEU scores for order > 4 aren't supported"
        fmt = 'BLEU{} = {:2.2f}, {:2.1f}'
        for _ in range(1, order):
            fmt += '/{:2.1f}'
        fmt += ' (BP={:.3f}, ratio={:.3f}, syslen={}, reflen={})'
        bleup = [p * 100 for p in self.precision()[:order]]
        return fmt.format(order, self.score(order=order), *bleup,
                          self.brevity(), self.stat.predlen / self.stat.reflen,
                          self.stat.predlen, self.stat.reflen)
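

# A minimal usage sketch of the Scorer above, assuming libbleu has been built
# via `python setup.py install`. The pad/eos indices and token ids below are
# made-up illustrative values; real ids come from the fairseq Dictionary.
if __name__ == '__main__':
    scorer = Scorer(pad=1, eos=2)
    ref = torch.IntTensor([7, 8, 9, 10, 11, 2])   # reference ending in eos
    hyp = torch.IntTensor([7, 8, 9, 10, 12, 2])   # system hypothesis
    scorer.add(ref, hyp)
    print(scorer.result_string(order=4))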
/**
* Copyright 2017-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <map>
#include <array>
#include <cstring>
#include <cstdio>

typedef struct
{
  size_t reflen;
  size_t predlen;
  size_t match1;
  size_t count1;
  size_t match2;
  size_t count2;
  size_t match3;
  size_t count3;
  size_t match4;
  size_t count4;
} bleu_stat;

// left trim (remove pad)
void bleu_ltrim(size_t* len, int** sent, int pad) {
  size_t start = 0;
  while (start < *len) {
    if (*(*sent + start) != pad) { break; }
    start++;
  }
  *sent += start;
  *len -= start;
}

// right trim (remove eos and pad)
void bleu_rtrim(size_t* len, int** sent, int pad, int eos) {
  if (*len == 0) { return; }  // guard against an all-pad sentence
  size_t end = *len - 1;
  while (end > 0) {
    if (*(*sent + end) != eos && *(*sent + end) != pad) { break; }
    end--;
  }
  *len = end + 1;
}

// left and right trim
void bleu_trim(size_t* len, int** sent, int pad, int eos) {
  bleu_ltrim(len, sent, pad);
  bleu_rtrim(len, sent, pad, eos);
}

// FNV-1a hash over the raw bytes of an n-gram of `len` ints
size_t bleu_hash(int len, int* data) {
  size_t h = 14695981039346656037ul;
  size_t prime = 0x100000001b3;
  char* b = (char*) data;
  size_t blen = sizeof(int) * len;

  while (blen-- > 0) {
    h ^= *b++;
    h *= prime;
  }

  return h;
}

// accumulate the n-gram total (ntotal) and clipped matches (nmatch) of order n
void bleu_addngram(
    size_t* ntotal, size_t* nmatch, size_t n,
    size_t reflen, int* ref, size_t predlen, int* pred) {

  if (predlen < n) { return; }

  predlen = predlen - n + 1;
  (*ntotal) += predlen;

  if (reflen < n) { return; }

  reflen = reflen - n + 1;

  std::map<size_t, size_t> count;
  while (predlen > 0) {
    size_t w = bleu_hash(n, pred++);
    count[w]++;
    predlen--;
  }

  while (reflen > 0) {
    size_t w = bleu_hash(n, ref++);
    if (count[w] > 0) {
      (*nmatch)++;
      count[w] -= 1;
    }
    reflen--;
  }
}
extern "C" {
void bleu_zero_init(bleu_stat* stat) {
std::memset(stat, 0, sizeof(bleu_stat));
}
void bleu_one_init(bleu_stat* stat) {
bleu_zero_init(stat);
stat->count1 = 0;
stat->count2 = 1;
stat->count3 = 1;
stat->count4 = 1;
stat->match1 = 0;
stat->match2 = 1;
stat->match3 = 1;
stat->match4 = 1;
}
void bleu_add(
bleu_stat* stat,
size_t reflen, int* ref, size_t predlen, int* pred, int pad, int eos) {
bleu_trim(&reflen, &ref, pad, eos);
bleu_trim(&predlen, &pred, pad, eos);
stat->reflen += reflen;
stat->predlen += predlen;
bleu_addngram(&stat->count1, &stat->match1, 1, reflen, ref, predlen, pred);
bleu_addngram(&stat->count2, &stat->match2, 2, reflen, ref, predlen, pred);
bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred);
bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred);
}
}
/**
* Copyright 2017-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <Python.h>

static PyMethodDef method_def[] = {
  {NULL, NULL, 0, NULL}
};

static struct PyModuleDef module_def = {
  PyModuleDef_HEAD_INIT,
  "libbleu",  /* name of module */
  NULL,       /* module documentation, may be NULL */
  -1,         /* size of per-interpreter state of the module,
                 or -1 if the module keeps state in global variables. */
  method_def
};

#if PY_MAJOR_VERSION == 2
PyMODINIT_FUNC init_libbleu()
#else
PyMODINIT_FUNC PyInit_libbleu()
#endif
{
  PyObject *m = PyModule_Create(&module_def);
  if (!m) {
    return NULL;
  }
  return m;
}
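

# A build sketch for the extension above: one way the C/C++ sources could be
# compiled into the `fairseq.libbleu` module that bleu.py loads through ctypes.
# The source paths and compile flags are assumptions made for illustration,
# not taken from this commit's setup.py.
from setuptools import setup, Extension

libbleu = Extension(
    'fairseq.libbleu',
    sources=[
        'fairseq/clib/libbleu/libbleu.cpp',   # assumed location of the BLEU kernels
        'fairseq/clib/libbleu/module.cpp',    # assumed location of the module init
    ],
    extra_compile_args=['-std=c++11', '-O3'],
)

setup(name='fairseq', ext_modules=[libbleu])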
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import importlib
import os
from .fairseq_criterion import FairseqCriterion
CRITERION_REGISTRY = {}
CRITERION_CLASS_NAMES = set()


def build_criterion(args, task):
    return CRITERION_REGISTRY[args.criterion](args, task)


def register_criterion(name):
    """Decorator to register a new criterion."""

    def register_criterion_cls(cls):
        if name in CRITERION_REGISTRY:
            raise ValueError('Cannot register duplicate criterion ({})'.format(name))
        if not issubclass(cls, FairseqCriterion):
            raise ValueError('Criterion ({}: {}) must extend FairseqCriterion'.format(name, cls.__name__))
        if cls.__name__ in CRITERION_CLASS_NAMES:
            # We use the criterion class name as a unique identifier in
            # checkpoints, so all criterions must have unique class names.
            raise ValueError('Cannot register criterion with duplicate class name ({})'.format(cls.__name__))
        CRITERION_REGISTRY[name] = cls
        CRITERION_CLASS_NAMES.add(cls.__name__)
        return cls

    return register_criterion_cls


# automatically import any Python files in the criterions/ directory
for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith('.py') and not file.startswith('_'):
        module = file[:file.find('.py')]
        importlib.import_module('fairseq.criterions.' + module)
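

# A usage sketch of the registry above: a minimal, hypothetical criterion
# registered under an illustrative name. The real criterions live in the
# modules picked up by the import loop; this only shows the decorator pattern.
import torch.nn.functional as F


@register_criterion('example_token_nll')
class ExampleTokenNllCriterion(FairseqCriterion):
    """Token-level negative log-likelihood, summed over the batch."""

    def forward(self, model, sample, reduce=True):
        net_output = model(**sample['net_input'])
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        lprobs = lprobs.view(-1, lprobs.size(-1))
        target = model.get_targets(sample, net_output).view(-1)
        loss = F.nll_loss(lprobs, target, size_average=False,
                          ignore_index=self.padding_idx, reduce=reduce)
        sample_size = sample['ntokens']
        logging_output = {
            'loss': loss.item() if reduce else loss,
            'ntokens': sample['ntokens'],
            'sample_size': sample_size,
        }
        return loss, sample_size, logging_output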
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import math
import torch.nn.functional as F
from fairseq import utils
from . import FairseqCriterion, register_criterion
@register_criterion('adaptive_loss')
class AdaptiveLoss(FairseqCriterion):
"""This is an implementation of the loss function accompanying the adaptive softmax approximation for
graphical processing units (GPU), described in the paper "Efficient softmax approximation for GPUs"
(http://arxiv.org/abs/1609.04309)."""
def __init__(self, args, task):
super().__init__(args, task)
def forward(self, model, sample, reduce=True):
"""Compute the loss for the given sample.
Returns a tuple with three elements:
1) the loss
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
"""
assert hasattr(model.decoder, 'adaptive_softmax') and model.decoder.adaptive_softmax is not None
adaptive_softmax = model.decoder.adaptive_softmax
net_output = model(**sample['net_input'])
target = model.get_targets(sample, net_output).view(-1)
bsz = target.size(0)
logits, target = adaptive_softmax(net_output[0], target)
assert len(target) == len(logits)
loss = net_output[0].new(1 if reduce else bsz).zero_()
for i in range(len(target)):
if target[i] is not None:
assert (target[i].min() >= 0 and target[i].max() <= logits[i].size(1))
loss += F.cross_entropy(logits[i], target[i], size_average=False, ignore_index=self.padding_idx,
reduce=reduce)
sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
logging_output = {
'loss': utils.item(loss.data) if reduce else loss.data,
'ntokens': sample['ntokens'],
'sample_size': sample_size,
}
return loss, sample_size, logging_output
@staticmethod
def aggregate_logging_outputs(logging_outputs):
"""Aggregate logging outputs from data parallel training."""
loss_sum = sum(log.get('loss', 0) for log in logging_outputs)
ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
agg_output = {
'loss': loss_sum / sample_size / math.log(2),
'sample_size': sample_size,
}
if sample_size != ntokens:
agg_output['nll_loss'] = loss_sum / ntokens / math.log(2)
return agg_output
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import math
import torch.nn.functional as F
from fairseq import utils
from . import FairseqCriterion, register_criterion
@register_criterion('cross_entropy')
class CrossEntropyCriterion(FairseqCriterion):
    def __init__(self, args, task):
        super().__init__(args, task)

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output = model(**sample['net_input'])
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        lprobs = lprobs.view(-1, lprobs.size(-1))
        target = model.get_targets(sample, net_output).view(-1)
        loss = F.nll_loss(lprobs, target, size_average=False,
                          ignore_index=self.padding_idx, reduce=reduce)
        sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
        logging_output = {
            'loss': utils.item(loss.data) if reduce else loss.data,
            'ntokens': sample['ntokens'],
            'sample_size': sample_size,
        }
        return loss, sample_size, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        loss_sum = sum(log.get('loss', 0) for log in logging_outputs)
        ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
        sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
        agg_output = {
            'loss': loss_sum / sample_size / math.log(2),
            'sample_size': sample_size,
        }
        if sample_size != ntokens:
            agg_output['nll_loss'] = loss_sum / ntokens / math.log(2)
        return agg_output
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from torch.nn.modules.loss import _Loss
class FairseqCriterion(_Loss):
    def __init__(self, args, task):
        super().__init__()
        self.args = args
        self.padding_idx = task.target_dictionary.pad()

    @staticmethod
    def add_args(parser):
        """Add criterion-specific arguments to the parser."""
        pass

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        raise NotImplementedError

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        raise NotImplementedError

    @staticmethod
    def grad_denom(sample_sizes):
        """Compute the gradient denominator for a set of sample sizes."""
        return sum(sample_sizes)
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch
import math
from fairseq import utils
from . import FairseqCriterion, register_criterion
@register_criterion('label_smoothed_cross_entropy')
class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
    def __init__(self, args, task):
        super().__init__(args, task)
        self.eps = args.label_smoothing
        # pinned CPU buffer so the loss can be copied off the GPU asynchronously
        self.cpu_loss = torch.empty(1, dtype=torch.float32, device=torch.device('cpu'))
        self.cpu_loss = self.cpu_loss.pin_memory()
        if args.fast_xentropy:
            from apex.contrib.xentropy import SoftmaxCrossEntropyLoss
            self.xentropy_func = SoftmaxCrossEntropyLoss.apply
        else:
            self.xentropy_func = None

    @staticmethod
    def add_args(parser):
        """Add criterion-specific arguments to the parser."""
        parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
                            help='epsilon for label smoothing, 0 means no label smoothing')
        parser.add_argument('--fast-xentropy', action='store_true',
                            help='use the fused log-softmax and cross-entropy kernel from apex')

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output = model(**sample['net_input'])
        target = model.get_targets(sample, net_output).view(-1, 1)

        if self.xentropy_func is not None:
            assert net_output[0].dtype in (torch.float16, torch.float32), "Unsupported data types"
            output = net_output[0].view(net_output[0].size(0) * net_output[0].size(1), net_output[0].size(2))
            labels = target.view(target.size(0) * target.size(1))
            losses = self.xentropy_func(output, labels, self.eps, self.padding_idx,
                                        net_output[0].dtype == torch.float16)
            loss = losses.sum()
        else:
            lprobs = model.get_normalized_probs(net_output, log_probs=True)
            lprobs = lprobs.view(-1, lprobs.size(-1))
            non_pad_mask = target.ne(self.padding_idx)
            nll_loss = -lprobs.gather(dim=-1, index=target)[non_pad_mask]
            smooth_loss = -lprobs.sum(dim=-1, keepdim=True)[non_pad_mask]
            if reduce:
                nll_loss = nll_loss.sum()
                smooth_loss = smooth_loss.sum()
            eps_i = self.eps / lprobs.size(-1)
            loss = (1. - self.eps) * nll_loss + eps_i * smooth_loss

        sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
        # copy the loss to the CPU without forcing a synchronization
        self.cpu_loss.copy_(loss.detach(), non_blocking=True)
        logging_output = {
            'loss': utils.item(self.cpu_loss) if reduce else self.cpu_loss.data,
            #'nll_loss': utils.item(nll_loss.data) if reduce else nll_loss.data,
            'ntokens': sample['ntokens'],
            'sample_size': sample_size,
        }
        return loss, sample_size, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
        sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
        return {
            'loss': sum(log.get('loss', 0) for log in logging_outputs) / sample_size / math.log(2),
            #'nll_loss': sum(log.get('nll_loss', 0) for log in logging_outputs) / ntokens / math.log(2),
            'sample_size': sample_size,
        }
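

# A standalone reference sketch of the label-smoothing arithmetic used above,
# written out for a single target position without padding or masking. The
# vocabulary size, epsilon and random scores are illustrative values only.
def _label_smoothing_reference():
    eps, vocab = 0.1, 5
    scores = torch.randn(1, vocab)
    target = torch.tensor([2])

    lprobs = torch.log_softmax(scores, dim=-1)
    nll_loss = -lprobs[0, target]              # standard NLL term
    smooth_loss = -lprobs.sum(dim=-1)          # sum over the whole vocabulary
    eps_i = eps / vocab
    loss = (1. - eps) * nll_loss + eps_i * smooth_loss

    # equivalently: cross-entropy against the smoothed target distribution
    smoothed = torch.full((1, vocab), eps_i)
    smoothed[0, target] += 1. - eps
    assert torch.allclose(loss, -(smoothed * lprobs).sum(dim=-1))
    return loss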
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from .dictionary import Dictionary
from .fairseq_dataset import FairseqDataset
from .indexed_dataset import IndexedDataset, IndexedInMemoryDataset, IndexedRawTextDataset # noqa: F401
from .language_pair_dataset import LanguagePairDataset
from .monolingual_dataset import MonolingualDataset
from .token_block_dataset import TokenBlockDataset
from .data_utils import EpochBatchIterator