# test_noising.py

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import torch
import unittest

from fairseq.data import data_utils, Dictionary, noising


class TestDataNoising(unittest.TestCase):
    """Tests for the word dropout, word blanking and word shuffle noising."""

    def _get_test_data(self):
        """Build a small two-sentence batch of BPE-style tokens.

        Returns:
            A tuple ``(vocab, x, lengths)`` where ``vocab`` is the dictionary
            holding every token used, ``x`` is a LongTensor with one column
            per sentence (time-major, i.e. ``max_len + 1`` rows) in which
            each sentence is terminated by EOS and right-padded, and
            ``lengths`` is a LongTensor with each sentence length including
            the EOS token.
        """
        vocab = Dictionary()
        # Insertion order determines the token indices, so keep it stable.
        for symbol in (
            "he@@", "llo", "how", "are", "y@@", "ou", "n@@", "ew", "or@@", "k",
        ):
            vocab.add_symbol(symbol)

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(tokens) for tokens in src_tokens]
        # One extra column leaves room for the EOS of the longest sentence.
        x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad())
        for i, tokens in enumerate(src_tokens):
            for j, token in enumerate(tokens):
                x[i][j] = vocab.index(token)
            # Place EOS right after the last token; indexing by the sentence
            # length avoids depending on the inner loop variable after the
            # loop has finished.
            x[i][len(tokens)] = vocab.eos()

        # Switch from (batch, time) to (time, batch).
        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor([length + 1 for length in src_len])

    def test_word_dropout(self):
        """Dropping words removes the first word of the first example."""
        vocab, x, x_len = self._get_test_data()

        with data_utils.numpy_seed(1234):
            noising_gen = noising.WordDropout(vocab)
            x_noised, l_noised = noising_gen.noising(x, x_len, 0.2)
            # Expect only the first word (2 bpe tokens) of the first example
            # was dropped out
            self.assertEqual(x_len[0] - 2, l_noised[0])
            # The remaining tokens are the original ones shifted up by two.
            for i in range(l_noised[0]):
                self.assertEqual(x_noised[i][0], x[i + 2][0])

    def test_word_blank(self):
        """Blanking words replaces the first word of the first example by unk."""
        vocab, x, x_len = self._get_test_data()

        with data_utils.numpy_seed(1234):
            noising_gen = noising.WordDropout(vocab)
            # Passing a replacement index keeps the positions and overwrites
            # the selected tokens instead of removing them.
            x_noised, l_noised = noising_gen.noising(x, x_len, 0.2, vocab.unk())
            # Expect only the first word (2 bpe tokens) of the first example
            # was blanked out
            self.assertEqual(x_len[0], l_noised[0])
            for i in range(l_noised[0]):
                if i < 2:
                    self.assertEqual(x_noised[i][0], vocab.unk())
                else:
                    self.assertEqual(x_noised[i][0], x[i][0])

    def test_word_shuffle(self):
        """Shuffling with argument 0 is a no-op; with 3 it reorders example two."""
        vocab, x, x_len = self._get_test_data()

        with data_utils.numpy_seed(1234):
            word_shuffle = noising.WordShuffle(vocab)

            # With 0 as the third argument nothing may move: every token and
            # every length must be unchanged, for each example in the batch.
            x_noised, l_noised = word_shuffle.noising(x, x_len, 0)
            for i in range(len(x_len)):
                for j in range(x_len[i]):
                    self.assertEqual(x[j][i], x_noised[j][i])
                self.assertEqual(x_len[i], l_noised[i])

            x_noised, l_noised = word_shuffle.noising(x, x_len, 3)
            # Expect the second example has the last three tokens shuffled
            # 6, 7, 8, 9 => 6, 8, 9, 7, where (8, 9) is a word
            for i in range(x_len[0]):
                self.assertEqual(x[i][0], x_noised[i][0])
            # Maps original position -> expected position after shuffling.
            shuffle_map = {0: 0, 1: 3, 2: 1, 3: 2}
            for k, v in shuffle_map.items():
                self.assertEqual(x[k][1], x_noised[v][1])
            self.assertEqual(x_len[0], l_noised[0])
            self.assertEqual(x_len[1], l_noised[1])


# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()