"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "5dc34713800306f498f8ecff6158fcd3668032be"
Commit b1521f96 authored by Liezl Puzon's avatar Liezl Puzon Committed by Facebook Github Bot
Browse files

Refactor fairseq/test_noising with a word shuffle helper function (#340)

Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/340

This allows us to do a lot less copy paste when adding new word shuffle function tests

Reviewed By: xianxl

Differential Revision: D12810304

fbshipit-source-id: a56b5df093d17be2b73837897c526978cab92b70
parent 0b05467d
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
# can be found in the PATENTS file in the same directory. # can be found in the PATENTS file in the same directory.
import unittest import unittest
from typing import Dict, List
import tests.utils as test_utils import tests.utils as test_utils
import torch import torch
...@@ -116,82 +117,145 @@ class TestDataNoising(unittest.TestCase): ...@@ -116,82 +117,145 @@ class TestDataNoising(unittest.TestCase):
) )
self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
def assert_no_shuffle_with_0_distance(self, x, x_noised, x_len, l_noised): def generate_unchanged_shuffle_map(self, length):
""" return {i: i for i in range(length)}
Applies word shuffle with 0 max_shuffle_distance and asserts that no
shuffling happened def assert_word_shuffle_matches_expected(
""" self,
for i in range(len(x_len)): x,
for j in range(x_len[i]): x_len,
self.assertEqual(x[j][i], x_noised[j][i]) max_shuffle_distance: int,
self.assertEqual(x_len[0], l_noised[0]) vocab: Dictionary,
expected_shufle_maps: List[Dict[int, int]],
def assert_word_shuffle_with_distance_3(self, x, x_noised, x_len, l_noised): expect_eos_at_end: bool,
):
""" """
Applies word shuffle with max_shuffle_distance = 3 and asserts that the This verifies that with a given x, x_len, max_shuffle_distance, and
shuffling result is as expected. If test data changes, update this func vocab, we get the expected shuffle result.
Args:
x: Tensor of shape (T x B) = (sequence_length, batch_size)
x_len: Tensor of length B = batch_size
max_shuffle_distance: arg to pass to noising
expected_shuffle_maps: List[mapping] where mapping is a
Dict[old_index, new_index], mapping x's elements from their
old positions in x to their new positions in x.
expect_eos_at_end: if True, check the output to make sure there is
an EOS at the end.
""" """
# Expect the second example has the last three tokens shuffled with data_utils.numpy_seed(1234):
# 6, 7, 8, 9 => 6, 8, 9, 7, where (8, 9) is a word word_shuffle = noising.WordShuffle(vocab)
for i in range(x_len[0]): x_noised, l_noised = word_shuffle.noising(
self.assertEqual(x[i][0], x_noised[i][0]) x, x_len, max_shuffle_distance=max_shuffle_distance
shuffle_map = {0: 0, 1: 3, 2: 1, 3: 2} )
for k, v in shuffle_map.items():
self.assertEqual(x[k][1], x_noised[v][1])
self.assertEqual(x_len[0], l_noised[0])
self.assertEqual(x_len[1], l_noised[1])
def assert_nonbpe_shuffle_with_distance_3(self, x, x_noised, x_len, l_noised): # For every example, we have a different expected shuffle map. We check
""" # that each example is shuffled as expected according to each
Applies word shuffle with max_shuffle_distance = 3 and asserts that the # corresponding shuffle map.
shuffling result is as expected. If test data changes, update this func for i in range(len(expected_shufle_maps)):
""" shuffle_map = expected_shufle_maps[i]
# Expect the first example has the last two tokens shuffled for k, v in shuffle_map.items():
# Expect the secon example has the second and third tokens shuffled self.assertEqual(x[k][i], x_noised[v][i])
shuffle_map = {0: 0, 1: 1, 2: 3, 3: 2}
for k, v in shuffle_map.items(): # Shuffling should not affect the length of each example
self.assertEqual(x[k][0], x_noised[v][0]) for pre_shuffle_length, post_shuffle_length in zip(x_len, l_noised):
shuffle_map = {0: 0, 1: 2, 2: 1, 3: 3, 4: 4} self.assertEqual(pre_shuffle_length, post_shuffle_length)
for k, v in shuffle_map.items(): if expect_eos_at_end:
self.assertEqual(x[k][1], x_noised[v][1]) self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
self.assertEqual(x_len[0], l_noised[0])
self.assertEqual(x_len[1], l_noised[1])
def test_word_shuffle_with_eos(self): def test_word_shuffle_with_eos(self):
vocab, x, x_len = self._get_test_data(append_eos=True) vocab, x, x_len = self._get_test_data(append_eos=True)
with data_utils.numpy_seed(1234): # Assert word shuffle with max shuffle distance 0 causes input to be
word_shuffle = noising.WordShuffle(vocab) # unchanged
self.assert_word_shuffle_matches_expected(
x_noised, l_noised = word_shuffle.noising(x, x_len, 0) x=x,
self.assert_no_shuffle_with_0_distance( x_len=x_len,
x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised max_shuffle_distance=0,
) vocab=vocab,
self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) expected_shufle_maps=[
self.generate_unchanged_shuffle_map(example_len)
for example_len in x_len
],
expect_eos_at_end=True,
)
x_noised, l_noised = word_shuffle.noising(x, x_len, 3) # Assert word shuffle with max shuffle distance 3 matches our expected
self.assert_word_shuffle_with_distance_3( # shuffle order
x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised self.assert_word_shuffle_matches_expected(
) x=x,
self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) x_len=x_len,
vocab=vocab,
max_shuffle_distance=3,
expected_shufle_maps=[
self.generate_unchanged_shuffle_map(x_len[0]),
{0: 0, 1: 3, 2: 1, 3: 2},
],
expect_eos_at_end=True,
)
def test_word_shuffle_with_eos_nonbpe(self): def test_word_shuffle_with_eos_nonbpe(self):
vocab, x, x_len = self._get_test_data(append_eos=True, bpe=False) vocab, x, x_len = self._get_test_data(append_eos=True, bpe=False)
with data_utils.numpy_seed(1234): # Assert word shuffle with max shuffle distance 0 causes input to be
word_shuffle = noising.WordShuffle(vocab, bpe_cont_marker=None) # unchanged
self.assert_word_shuffle_matches_expected(
x=x,
x_len=x_len,
max_shuffle_distance=0,
vocab=vocab,
expected_shufle_maps=[
self.generate_unchanged_shuffle_map(example_len)
for example_len in x_len
],
expect_eos_at_end=True,
)
x_noised, l_noised = word_shuffle.noising(x, x_len, 0) # Assert word shuffle with max shuffle distance 3 matches our expected
self.assert_no_shuffle_with_0_distance( # shuffle order
x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised self.assert_word_shuffle_matches_expected(
) x=x,
self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) x_len=x_len,
vocab=vocab,
max_shuffle_distance=3,
expected_shufle_maps=[
{0: 0, 1: 1, 2: 3, 3: 2},
{0: 0, 1: 2, 2: 1, 3: 3, 4: 4},
],
expect_eos_at_end=True,
)
x_noised, l_noised = word_shuffle.noising(x, x_len, 3) def test_word_shuffle_without_eos(self):
self.assert_nonbpe_shuffle_with_distance_3( """Same result as word shuffle with eos except no EOS at end"""
x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised vocab, x, x_len = self._get_test_data(append_eos=False)
)
self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) # Assert word shuffle with max shuffle distance 0 causes input to be
# unchanged
self.assert_word_shuffle_matches_expected(
x=x,
x_len=x_len,
max_shuffle_distance=0,
vocab=vocab,
expected_shufle_maps=[
self.generate_unchanged_shuffle_map(example_len)
for example_len in x_len
],
expect_eos_at_end=False,
)
# Assert word shuffle with max shuffle distance 3 matches our expected
# shuffle order
self.assert_word_shuffle_matches_expected(
x=x,
x_len=x_len,
vocab=vocab,
max_shuffle_distance=3,
expected_shufle_maps=[
self.generate_unchanged_shuffle_map(x_len[0]),
{0: 0, 1: 3, 2: 1, 3: 2},
],
expect_eos_at_end=False,
)
def assert_no_eos_at_end(self, x, x_len, eos): def assert_no_eos_at_end(self, x, x_len, eos):
"""Asserts that the last token of each sentence in x is not EOS """ """Asserts that the last token of each sentence in x is not EOS """
...@@ -228,25 +292,6 @@ class TestDataNoising(unittest.TestCase): ...@@ -228,25 +292,6 @@ class TestDataNoising(unittest.TestCase):
) )
self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
def test_word_shuffle_without_eos(self):
"""Same result as word shuffle with eos except no EOS at end"""
vocab, x, x_len = self._get_test_data(append_eos=False)
with data_utils.numpy_seed(1234):
word_shuffle = noising.WordShuffle(vocab)
x_noised, l_noised = word_shuffle.noising(x, x_len, 0)
self.assert_no_shuffle_with_0_distance(
x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
)
self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
x_noised, l_noised = word_shuffle.noising(x, x_len, 3)
self.assert_word_shuffle_with_distance_3(
x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
)
self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
def _get_noising_dataset_batch( def _get_noising_dataset_batch(
self, src_tokens_no_pad, src_dict, use_append_eos_dataset=False self, src_tokens_no_pad, src_dict, use_append_eos_dataset=False
): ):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment