Refactor fairseq/test_noising with a word shuffle helper function (#340)

Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/340 This allows us to do a lot less copy paste when adding new word shuffle function tests Reviewed By: xianxl Differential Revision: D12810304 fbshipit-source-id: a56b5df093d17be2b73837897c526978cab92b70

Refactor fairseq/test_noising with a word shuffle helper function (#340)
Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/340 This allows us to do a lot less copy paste when adding new word shuffle function tests Reviewed By: xianxl Differential Revision: D12810304 fbshipit-source-id: a56b5df093d17be2b73837897c526978cab92b70
b1521f96 · Liezl Puzon · Facebook Github Bot · 0b05467d · b1521f96
Commit b1521f96 authored Nov 01, 2018 by Liezl Puzon Committed by Facebook Github Bot Nov 01, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 126 additions and 81 deletions

tests/test_noising.py tests/test_noising.py +126 -81

No files found.
--- a/tests/test_noising.py
+++ b/tests/test_noising.py
@@ -6,6 +6,7 @@
 # can be found in the PATENTS file in the same directory.
 import unittest
+from typing import Dict, List
 import tests.utils as test_utils
 import torch
@@ -116,82 +117,145 @@ class TestDataNoising(unittest.TestCase):
            )
            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
-    def assert_no_shuffle_with_0_distance(self, x, x_noised, x_len, l_noised):
+    def generate_unchanged_shuffle_map(self, length):
-        """
+        return {i: i for i in range(length)}
-        Applies word shuffle with 0 max_shuffle_distance and asserts that no
-        shuffling happened
+    def assert_word_shuffle_matches_expected(
-        """
+        self,
-        for i in range(len(x_len)):
+        x,
-            for j in range(x_len[i]):
+        x_len,
-                self.assertEqual(x[j][i], x_noised[j][i])
+        max_shuffle_distance: int,
-        self.assertEqual(x_len[0], l_noised[0])
+        vocab: Dictionary,
+        expected_shufle_maps: List[Dict[int, int]],
-    def assert_word_shuffle_with_distance_3(self, x, x_noised, x_len, l_noised):
+        expect_eos_at_end: bool,
+    ):
        """
-        Applies word shuffle with max_shuffle_distance = 3 and asserts that the
+        This verifies that with a given x, x_len, max_shuffle_distance, and
-        shuffling result is as expected. If test data changes, update this func
+        vocab, we get the expected shuffle result.
+        Args:
+            x: Tensor of shape (T x B) = (sequence_length, batch_size)
+            x_len: Tensor of length B = batch_size
+            max_shuffle_distance: arg to pass to noising
+            expected_shuffle_maps: List[mapping] where mapping is a
+                Dict[old_index, new_index], mapping x's elements from their
+                old positions in x to their new positions in x.
+            expect_eos_at_end: if True, check the output to make sure there is
+                an EOS at the end.
        """
-        # Expect the second example has the last three tokens shuffled
+        with data_utils.numpy_seed(1234):
-        # 6, 7, 8, 9 => 6, 8, 9, 7, where (8, 9) is a word
+            word_shuffle = noising.WordShuffle(vocab)
-        for i in range(x_len[0]):
+            x_noised, l_noised = word_shuffle.noising(
-            self.assertEqual(x[i][0], x_noised[i][0])
+                x, x_len, max_shuffle_distance=max_shuffle_distance
-        shuffle_map = {0: 0, 1: 3, 2: 1, 3: 2}
+            )
-        for k, v in shuffle_map.items():
-            self.assertEqual(x[k][1], x_noised[v][1])
-        self.assertEqual(x_len[0], l_noised[0])
-        self.assertEqual(x_len[1], l_noised[1])
-    def assert_nonbpe_shuffle_with_distance_3(self, x, x_noised, x_len, l_noised):
+        # For every example, we have a different expected shuffle map. We check
-        """
+        # that each example is shuffled as expected according to each
-        Applies word shuffle with max_shuffle_distance = 3 and asserts that the
+        # corresponding shuffle map.
-        shuffling result is as expected. If test data changes, update this func
+        for i in range(len(expected_shufle_maps)):
-        """
+            shuffle_map = expected_shufle_maps[i]
-        # Expect the first example has the last two tokens shuffled
+            for k, v in shuffle_map.items():
-        # Expect the secon example has the second and third tokens shuffled
+                self.assertEqual(x[k][i], x_noised[v][i])
-        shuffle_map = {0: 0, 1: 1, 2: 3, 3: 2}
-        for k, v in shuffle_map.items():
+        # Shuffling should not affect the length of each example
-            self.assertEqual(x[k][0], x_noised[v][0])
+        for pre_shuffle_length, post_shuffle_length in zip(x_len, l_noised):
-        shuffle_map = {0: 0, 1: 2, 2: 1, 3: 3, 4: 4}
+            self.assertEqual(pre_shuffle_length, post_shuffle_length)
-        for k, v in shuffle_map.items():
+        if expect_eos_at_end:
-            self.assertEqual(x[k][1], x_noised[v][1])
+            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
-        self.assertEqual(x_len[0], l_noised[0])
-        self.assertEqual(x_len[1], l_noised[1])
    def test_word_shuffle_with_eos(self):
        vocab, x, x_len = self._get_test_data(append_eos=True)
-        with data_utils.numpy_seed(1234):
+        # Assert word shuffle with max shuffle distance 0 causes input to be
-            word_shuffle = noising.WordShuffle(vocab)
+        # unchanged
+        self.assert_word_shuffle_matches_expected(
-            x_noised, l_noised = word_shuffle.noising(x, x_len, 0)
+            x=x,
-            self.assert_no_shuffle_with_0_distance(
+            x_len=x_len,
-                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
+            max_shuffle_distance=0,
-            )
+            vocab=vocab,
-            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
+            expected_shufle_maps=[
+                self.generate_unchanged_shuffle_map(example_len)
+                for example_len in x_len
+            ],
+            expect_eos_at_end=True,
+        )
-            x_noised, l_noised = word_shuffle.noising(x, x_len, 3)
+        # Assert word shuffle with max shuffle distance 3 matches our expected
-            self.assert_word_shuffle_with_distance_3(
+        # shuffle order
-                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
+        self.assert_word_shuffle_matches_expected(
-            )
+            x=x,
-            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
+            x_len=x_len,
+            vocab=vocab,
+            max_shuffle_distance=3,
+            expected_shufle_maps=[
+                self.generate_unchanged_shuffle_map(x_len[0]),
+                {0: 0, 1: 3, 2: 1, 3: 2},
+            ],
+            expect_eos_at_end=True,
+        )
    def test_word_shuffle_with_eos_nonbpe(self):
        vocab, x, x_len = self._get_test_data(append_eos=True, bpe=False)
-        with data_utils.numpy_seed(1234):
+        # Assert word shuffle with max shuffle distance 0 causes input to be
-            word_shuffle = noising.WordShuffle(vocab, bpe_cont_marker=None)
+        # unchanged
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            max_shuffle_distance=0,
+            vocab=vocab,
+            expected_shufle_maps=[
+                self.generate_unchanged_shuffle_map(example_len)
+                for example_len in x_len
+            ],
+            expect_eos_at_end=True,
+        )
-            x_noised, l_noised = word_shuffle.noising(x, x_len, 0)
+        # Assert word shuffle with max shuffle distance 3 matches our expected
-            self.assert_no_shuffle_with_0_distance(
+        # shuffle order
-                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
+        self.assert_word_shuffle_matches_expected(
-            )
+            x=x,
-            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
+            x_len=x_len,
+            vocab=vocab,
+            max_shuffle_distance=3,
+            expected_shufle_maps=[
+                {0: 0, 1: 1, 2: 3, 3: 2},
+                {0: 0, 1: 2, 2: 1, 3: 3, 4: 4},
+            ],
+            expect_eos_at_end=True,
+        )
-            x_noised, l_noised = word_shuffle.noising(x, x_len, 3)
+    def test_word_shuffle_without_eos(self):
-            self.assert_nonbpe_shuffle_with_distance_3(
+        """Same result as word shuffle with eos except no EOS at end"""
-                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
+        vocab, x, x_len = self._get_test_data(append_eos=False)
-            )
-            self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
+        # Assert word shuffle with max shuffle distance 0 causes input to be
+        # unchanged
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            max_shuffle_distance=0,
+            vocab=vocab,
+            expected_shufle_maps=[
+                self.generate_unchanged_shuffle_map(example_len)
+                for example_len in x_len
+            ],
+            expect_eos_at_end=False,
+        )
+        # Assert word shuffle with max shuffle distance 3 matches our expected
+        # shuffle order
+        self.assert_word_shuffle_matches_expected(
+            x=x,
+            x_len=x_len,
+            vocab=vocab,
+            max_shuffle_distance=3,
+            expected_shufle_maps=[
+                self.generate_unchanged_shuffle_map(x_len[0]),
+                {0: 0, 1: 3, 2: 1, 3: 2},
+            ],
+            expect_eos_at_end=False,
+        )
    def assert_no_eos_at_end(self, x, x_len, eos):
        """Asserts that the last token of each sentence in x is not EOS """
@@ -228,25 +292,6 @@ class TestDataNoising(unittest.TestCase):
            )
            self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
-    def test_word_shuffle_without_eos(self):
-        """Same result as word shuffle with eos except no EOS at end"""
-        vocab, x, x_len = self._get_test_data(append_eos=False)
-        with data_utils.numpy_seed(1234):
-            word_shuffle = noising.WordShuffle(vocab)
-            x_noised, l_noised = word_shuffle.noising(x, x_len, 0)
-            self.assert_no_shuffle_with_0_distance(
-                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
-            )
-            self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
-            x_noised, l_noised = word_shuffle.noising(x, x_len, 3)
-            self.assert_word_shuffle_with_distance_3(
-                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
-            )
-            self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
    def _get_noising_dataset_batch(
        self, src_tokens_no_pad, src_dict, use_append_eos_dataset=False
    ):