Commit 2b13f3c0 authored by Liezl Puzon's avatar Liezl Puzon Committed by Facebook Github Bot
Browse files

Support BPE end of word marker suffix in fairseq noising module

Summary:
There are 2 ways to implement BPE:
1. use a continuation marker suffix to indicate that there is at least one more subtoken left in the word
2. use an end-of-word marker suffix to indicate that there are no more subtokens left in the word

This adds some logic to account for either kind of BPE marker suffix. This diff adds a corresponding test. I also refactored the test setup to reduce the number of boolean args when setting up test data.

Reviewed By: xianxl

Differential Revision: D12919428

fbshipit-source-id: 405e9f346dce6e736c1305288721dfc7b63e872a
parent b1521f96
...@@ -13,15 +13,24 @@ from fairseq.data import data_utils ...@@ -13,15 +13,24 @@ from fairseq.data import data_utils
class WordNoising(object):
    """Generate a noisy version of a sentence, without changing words themselves."""

    def __init__(self, dictionary, bpe_cont_marker="@@", bpe_end_marker=None):
        # dictionary: fairseq Dictionary mapping token indices to (sub)word
        #   strings — assumed to support len() and integer indexing.
        # bpe_cont_marker: suffix marking "word continues after this token"
        #   (fairseq's standard BPE format, e.g. "he@@ llo"). Takes
        #   precedence over bpe_end_marker when both are given.
        # bpe_end_marker: suffix marking "this token ends a word"
        #   (alternative BPE format, e.g. "he llo_EOW"); only consulted when
        #   bpe_cont_marker is falsy.
        self.dictionary = dictionary
        # bpe_end[i] is True iff dictionary[i] can be the last subtoken of a
        # word. None means no BPE marker was configured, so every token is
        # treated as a whole word.
        self.bpe_end = None
        if bpe_cont_marker:
            # Continuation-marker format: a token ends a word exactly when it
            # does NOT carry the continuation suffix.
            self.bpe_end = np.array([
                not self.dictionary[i].endswith(bpe_cont_marker)
                for i in range(len(self.dictionary))
            ])
        elif bpe_end_marker:
            # End-of-word-marker format: a token ends a word exactly when it
            # DOES carry the end-of-word suffix.
            self.bpe_end = np.array([
                self.dictionary[i].endswith(bpe_end_marker)
                for i in range(len(self.dictionary))
            ])

        # With BPE markers we must group subtokens into words before noising;
        # otherwise each token is its own word.
        self.get_word_idx = (
            self._get_bpe_word_idx
            if self.bpe_end is not None
            else self._get_token_idx
        )
def noising(self, x, lengths, noising_prob=0.0): def noising(self, x, lengths, noising_prob=0.0):
...@@ -63,8 +72,8 @@ class WordDropout(WordNoising): ...@@ -63,8 +72,8 @@ class WordDropout(WordNoising):
then dropped words will be removed. Otherwise, it will be replaced by the then dropped words will be removed. Otherwise, it will be replaced by the
blank_idx.""" blank_idx."""
def __init__(self, dictionary, bpe_cont_marker="@@", bpe_end_marker=None):
    # Method of WordDropout(WordNoising). Forwards the BPE marker
    # configuration to WordNoising so word boundaries are computed the same
    # way regardless of which BPE suffix convention the vocab uses.
    super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
def noising(self, x, lengths, dropout_prob=0.1, blank_idx=None): def noising(self, x, lengths, dropout_prob=0.1, blank_idx=None):
# x: (T x B), lengths: B # x: (T x B), lengths: B
...@@ -134,8 +143,8 @@ class WordDropout(WordNoising): ...@@ -134,8 +143,8 @@ class WordDropout(WordNoising):
class WordShuffle(WordNoising):
    """Shuffle words by no more than k positions."""

    def __init__(self, dictionary, bpe_cont_marker="@@", bpe_end_marker=None):
        # Pass both marker kinds through so shuffling keeps whole words
        # together under either BPE suffix convention.
        super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
def noising(self, x, lengths, max_shuffle_distance=3): def noising(self, x, lengths, max_shuffle_distance=3):
# x: (T x B), lengths: B # x: (T x B), lengths: B
...@@ -152,7 +161,6 @@ class WordShuffle(WordNoising): ...@@ -152,7 +161,6 @@ class WordShuffle(WordNoising):
size=(x.size(0), x.size(1)), size=(x.size(0), x.size(1)),
) )
noise[0] = -1 # do not move start sentence symbol noise[0] = -1 # do not move start sentence symbol
# be sure to shuffle entire words # be sure to shuffle entire words
word_idx = self.get_word_idx(x) word_idx = self.get_word_idx(x)
x2 = x.clone() x2 = x.clone()
...@@ -182,15 +190,25 @@ class UnsupervisedMTNoising(WordNoising): ...@@ -182,15 +190,25 @@ class UnsupervisedMTNoising(WordNoising):
dictionary, dictionary,
max_word_shuffle_distance, max_word_shuffle_distance,
word_dropout_prob, word_dropout_prob,
word_blanking_prob word_blanking_prob,
bpe_cont_marker="@@",
bpe_end_marker=None,
): ):
super().__init__(dictionary) super().__init__(dictionary)
self.max_word_shuffle_distance = max_word_shuffle_distance self.max_word_shuffle_distance = max_word_shuffle_distance
self.word_dropout_prob = word_dropout_prob self.word_dropout_prob = word_dropout_prob
self.word_blanking_prob = word_blanking_prob self.word_blanking_prob = word_blanking_prob
self.word_dropout = WordDropout(dictionary=dictionary) self.word_dropout = WordDropout(
self.word_shuffle = WordShuffle(dictionary=dictionary) dictionary=dictionary,
bpe_cont_marker=bpe_cont_marker,
bpe_end_marker=bpe_end_marker,
)
self.word_shuffle = WordShuffle(
dictionary=dictionary,
bpe_cont_marker=bpe_cont_marker,
bpe_end_marker=bpe_end_marker,
)
def noising(self, x, lengths): def noising(self, x, lengths):
# 1. Word Shuffle # 1. Word Shuffle
......
...@@ -21,36 +21,109 @@ from fairseq.data import ( ...@@ -21,36 +21,109 @@ from fairseq.data import (
class TestDataNoising(unittest.TestCase): class TestDataNoising(unittest.TestCase):
def _get_test_data_with_bpe_cont_marker(self, append_eos=True):
    """Build a small BPE test corpus using continuation-marker suffixes.

    Args:
        append_eos: if True, each input sentence in the source tokens tensor
            will have an EOS appended to the end.

    Returns:
        vocab: BPE vocab with continuation markers ("@@") as suffixes to
            denote non-end-of-word tokens. This is the standard BPE format
            used in fairseq's preprocessing.
        x: input tensor containing numberized source tokens, with EOS at the
            end if append_eos is True.
        src_lengths: source lengths (LongTensor).
    """
    vocab = Dictionary()
    vocab.add_symbol("he@@")
    vocab.add_symbol("llo")
    vocab.add_symbol("how")
    vocab.add_symbol("are")
    vocab.add_symbol("y@@")
    vocab.add_symbol("ou")
    vocab.add_symbol("n@@")
    vocab.add_symbol("ew")
    vocab.add_symbol("or@@")
    vocab.add_symbol("k")

    src_tokens = [
        ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
        ["how", "are", "y@@", "ou"],
    ]
    x, src_lengths = self._convert_src_tokens_to_tensor(
        vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
    )
    return vocab, x, src_lengths
def _get_test_data_with_bpe_end_marker(self, append_eos=True):
    """Build a small BPE test corpus using end-of-word-marker suffixes.

    Args:
        append_eos: if True, each input sentence in the source tokens tensor
            will have an EOS appended to the end.

    Returns:
        vocab: BPE vocab with end-of-word markers ("_EOW") as suffixes to
            denote tokens at the end of a word. This is an alternative to
            fairseq's standard preprocessing framework and is not generally
            supported within fairseq.
        x: input tensor containing numberized source tokens, with EOS at the
            end if append_eos is True.
        src_lengths: source lengths (LongTensor).
    """
    vocab = Dictionary()
    vocab.add_symbol("he")
    vocab.add_symbol("llo_EOW")
    vocab.add_symbol("how_EOW")
    vocab.add_symbol("are_EOW")
    vocab.add_symbol("y")
    vocab.add_symbol("ou_EOW")
    vocab.add_symbol("n")
    vocab.add_symbol("ew_EOW")
    vocab.add_symbol("or")
    vocab.add_symbol("k_EOW")

    src_tokens = [
        ["he", "llo_EOW", "n", "ew_EOW", "y", "or", "k_EOW"],
        ["how_EOW", "are_EOW", "y", "ou_EOW"],
    ]
    x, src_lengths = self._convert_src_tokens_to_tensor(
        vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
    )
    return vocab, x, src_lengths
def _get_test_data_with_word_vocab(self, append_eos=True):
    """Build a small test corpus over a plain word-level vocab (no BPE).

    Args:
        append_eos: if True, each input sentence in the source tokens tensor
            will have an EOS appended to the end.

    Returns:
        vocab: word vocab (no subword markers of any kind).
        x: input tensor containing numberized source tokens, with EOS at the
            end if append_eos is True.
        src_lengths: source lengths (LongTensor).
    """
    vocab = Dictionary()
    vocab.add_symbol("hello")
    vocab.add_symbol("how")
    vocab.add_symbol("are")
    vocab.add_symbol("you")
    vocab.add_symbol("new")
    vocab.add_symbol("york")

    src_tokens = [
        ["hello", "new", "york", "you"],
        ["how", "are", "you", "new", "york"],
    ]
    x, src_lengths = self._convert_src_tokens_to_tensor(
        vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
    )
    return vocab, x, src_lengths
def _convert_src_tokens_to_tensor(
self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool
):
src_len = [len(x) for x in src_tokens] src_len = [len(x) for x in src_tokens]
# If we have to append EOS, we include EOS in counting src length # If we have to append EOS, we include EOS in counting src length
if append_eos: if append_eos:
...@@ -64,7 +137,7 @@ class TestDataNoising(unittest.TestCase): ...@@ -64,7 +137,7 @@ class TestDataNoising(unittest.TestCase):
x[i][j + 1] = vocab.eos() x[i][j + 1] = vocab.eos()
x = x.transpose(1, 0) x = x.transpose(1, 0)
return vocab, x, torch.LongTensor(src_len) return x, torch.LongTensor(src_len)
def assert_eos_at_end(self, x, x_len, eos): def assert_eos_at_end(self, x, x_len, eos):
"""Asserts last token of every sentence in x is EOS """ """Asserts last token of every sentence in x is EOS """
...@@ -86,7 +159,7 @@ class TestDataNoising(unittest.TestCase): ...@@ -86,7 +159,7 @@ class TestDataNoising(unittest.TestCase):
self.assertEqual(x_noised[i][0], x[i + 2][0]) self.assertEqual(x_noised[i][0], x[i + 2][0])
def test_word_dropout_with_eos(self): def test_word_dropout_with_eos(self):
vocab, x, x_len = self._get_test_data(append_eos=True) vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)
with data_utils.numpy_seed(1234): with data_utils.numpy_seed(1234):
noising_gen = noising.WordDropout(vocab) noising_gen = noising.WordDropout(vocab)
...@@ -107,7 +180,7 @@ class TestDataNoising(unittest.TestCase): ...@@ -107,7 +180,7 @@ class TestDataNoising(unittest.TestCase):
self.assertEqual(x_noised[i][0], x[i][0]) self.assertEqual(x_noised[i][0], x[i][0])
def test_word_blank_with_eos(self): def test_word_blank_with_eos(self):
vocab, x, x_len = self._get_test_data(append_eos=True) vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)
with data_utils.numpy_seed(1234): with data_utils.numpy_seed(1234):
noising_gen = noising.WordDropout(vocab) noising_gen = noising.WordDropout(vocab)
...@@ -128,6 +201,7 @@ class TestDataNoising(unittest.TestCase): ...@@ -128,6 +201,7 @@ class TestDataNoising(unittest.TestCase):
vocab: Dictionary, vocab: Dictionary,
expected_shufle_maps: List[Dict[int, int]], expected_shufle_maps: List[Dict[int, int]],
expect_eos_at_end: bool, expect_eos_at_end: bool,
bpe_end_marker=None,
): ):
""" """
This verifies that with a given x, x_len, max_shuffle_distance, and This verifies that with a given x, x_len, max_shuffle_distance, and
...@@ -142,9 +216,17 @@ class TestDataNoising(unittest.TestCase): ...@@ -142,9 +216,17 @@ class TestDataNoising(unittest.TestCase):
old positions in x to their new positions in x. old positions in x to their new positions in x.
expect_eos_at_end: if True, check the output to make sure there is expect_eos_at_end: if True, check the output to make sure there is
an EOS at the end. an EOS at the end.
bpe_end_marker: str denoting the BPE end token. If this is not None, we
set the BPE cont token to None in the noising classes.
""" """
bpe_cont_marker = None
if bpe_end_marker is None:
bpe_cont_marker = "@@"
with data_utils.numpy_seed(1234): with data_utils.numpy_seed(1234):
word_shuffle = noising.WordShuffle(vocab) word_shuffle = noising.WordShuffle(
vocab, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker
)
x_noised, l_noised = word_shuffle.noising( x_noised, l_noised = word_shuffle.noising(
x, x_len, max_shuffle_distance=max_shuffle_distance x, x_len, max_shuffle_distance=max_shuffle_distance
) )
...@@ -164,7 +246,7 @@ class TestDataNoising(unittest.TestCase): ...@@ -164,7 +246,7 @@ class TestDataNoising(unittest.TestCase):
self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
def test_word_shuffle_with_eos(self): def test_word_shuffle_with_eos(self):
vocab, x, x_len = self._get_test_data(append_eos=True) vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)
# Assert word shuffle with max shuffle distance 0 causes input to be # Assert word shuffle with max shuffle distance 0 causes input to be
# unchanged # unchanged
...@@ -195,7 +277,8 @@ class TestDataNoising(unittest.TestCase): ...@@ -195,7 +277,8 @@ class TestDataNoising(unittest.TestCase):
) )
def test_word_shuffle_with_eos_nonbpe(self): def test_word_shuffle_with_eos_nonbpe(self):
vocab, x, x_len = self._get_test_data(append_eos=True, bpe=False) """The purpose of this is to test shuffling logic with word vocabs"""
vocab, x, x_len = self._get_test_data_with_word_vocab(append_eos=True)
# Assert word shuffle with max shuffle distance 0 causes input to be # Assert word shuffle with max shuffle distance 0 causes input to be
# unchanged # unchanged
...@@ -227,7 +310,7 @@ class TestDataNoising(unittest.TestCase): ...@@ -227,7 +310,7 @@ class TestDataNoising(unittest.TestCase):
def test_word_shuffle_without_eos(self): def test_word_shuffle_without_eos(self):
"""Same result as word shuffle with eos except no EOS at end""" """Same result as word shuffle with eos except no EOS at end"""
vocab, x, x_len = self._get_test_data(append_eos=False) vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)
# Assert word shuffle with max shuffle distance 0 causes input to be # Assert word shuffle with max shuffle distance 0 causes input to be
# unchanged # unchanged
...@@ -257,6 +340,40 @@ class TestDataNoising(unittest.TestCase): ...@@ -257,6 +340,40 @@ class TestDataNoising(unittest.TestCase):
expect_eos_at_end=False, expect_eos_at_end=False,
) )
def test_word_shuffle_without_eos_with_bpe_end_marker(self):
    """Same result as word shuffle without eos except using BPE end token."""
    vocab, x, x_len = self._get_test_data_with_bpe_end_marker(append_eos=False)

    # Assert word shuffle with max shuffle distance 0 causes input to be
    # unchanged.
    self.assert_word_shuffle_matches_expected(
        x=x,
        x_len=x_len,
        max_shuffle_distance=0,
        vocab=vocab,
        expected_shufle_maps=[
            self.generate_unchanged_shuffle_map(example_len)
            for example_len in x_len
        ],
        expect_eos_at_end=False,
        bpe_end_marker="_EOW",
    )

    # Assert word shuffle with max shuffle distance 3 matches our expected
    # shuffle order.
    self.assert_word_shuffle_matches_expected(
        x=x,
        x_len=x_len,
        vocab=vocab,
        max_shuffle_distance=3,
        expected_shufle_maps=[
            self.generate_unchanged_shuffle_map(x_len[0]),
            {0: 0, 1: 3, 2: 1, 3: 2},
        ],
        expect_eos_at_end=False,
        bpe_end_marker="_EOW",
    )
def assert_no_eos_at_end(self, x, x_len, eos): def assert_no_eos_at_end(self, x, x_len, eos):
"""Asserts that the last token of each sentence in x is not EOS """ """Asserts that the last token of each sentence in x is not EOS """
for i in range(len(x_len)): for i in range(len(x_len)):
...@@ -270,7 +387,7 @@ class TestDataNoising(unittest.TestCase): ...@@ -270,7 +387,7 @@ class TestDataNoising(unittest.TestCase):
def test_word_dropout_without_eos(self): def test_word_dropout_without_eos(self):
"""Same result as word dropout with eos except no EOS at end""" """Same result as word dropout with eos except no EOS at end"""
vocab, x, x_len = self._get_test_data(append_eos=False) vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)
with data_utils.numpy_seed(1234): with data_utils.numpy_seed(1234):
noising_gen = noising.WordDropout(vocab) noising_gen = noising.WordDropout(vocab)
...@@ -282,7 +399,7 @@ class TestDataNoising(unittest.TestCase): ...@@ -282,7 +399,7 @@ class TestDataNoising(unittest.TestCase):
def test_word_blank_without_eos(self): def test_word_blank_without_eos(self):
"""Same result as word blank with eos except no EOS at end""" """Same result as word blank with eos except no EOS at end"""
vocab, x, x_len = self._get_test_data(append_eos=False) vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)
with data_utils.numpy_seed(1234): with data_utils.numpy_seed(1234):
noising_gen = noising.WordDropout(vocab) noising_gen = noising.WordDropout(vocab)
...@@ -330,7 +447,9 @@ class TestDataNoising(unittest.TestCase): ...@@ -330,7 +447,9 @@ class TestDataNoising(unittest.TestCase):
return denoising_batch_result return denoising_batch_result
def test_noising_dataset_with_eos(self): def test_noising_dataset_with_eos(self):
src_dict, src_tokens, _ = self._get_test_data(append_eos=True) src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
append_eos=True
)
# Format data for src_dataset # Format data for src_dataset
src_tokens = torch.t(src_tokens) src_tokens = torch.t(src_tokens)
...@@ -366,7 +485,9 @@ class TestDataNoising(unittest.TestCase): ...@@ -366,7 +485,9 @@ class TestDataNoising(unittest.TestCase):
AppendEosDataset when using it as the target in LanguagePairDataset. AppendEosDataset when using it as the target in LanguagePairDataset.
""" """
src_dict, src_tokens, _ = self._get_test_data(append_eos=False) src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
append_eos=False
)
# Format data for src_dataset # Format data for src_dataset
src_tokens = torch.t(src_tokens) src_tokens = torch.t(src_tokens)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment