"git@developer.sourcefind.cn:OpenDAS/torch-cluster.git" did not exist on "b5ae2c0fa3467b6fa002458fe0345b0a3e317139"
Commit c9c660c0 authored by Liezl Puzon's avatar Liezl Puzon Committed by Facebook Github Bot
Browse files

Denoising autoencoder task (#251)

Summary:
Pull Request resolved: https://github.com/pytorch/translate/pull/251

We should use shared encoder and separate decoders as in:

https://fb.facebook.com/groups/2156114531381111/permalink/2169028113423086/

Generation is a hack; ideally, the net input should carry the language-pair info so that when we pass the sample to the model, it can select the correct encoder/decoder pair.

diff [2/2] will be for flow integration for basic experimentation

TODO in a future diff: figure out how to generalize this so that export will work.

This works with vocab reduction, but we only support vocab reduction for src-tgt, not src-src model. A future (lowpri) task could be to add word prediction vocab reduction for src-src model to speed up training.

Reviewed By: xianxl

Differential Revision: D10512576

fbshipit-source-id: 545d96cad8e814b9da7be102a48cc5cac358b758
parent 5bbd148e
...@@ -32,10 +32,17 @@ class WordNoising(object): ...@@ -32,10 +32,17 @@ class WordNoising(object):
Given a list of BPE tokens, for every index in the tokens list, Given a list of BPE tokens, for every index in the tokens list,
return the index of the word grouping that it belongs to. return the index of the word grouping that it belongs to.
For example, for input x corresponding to ["how", "are", "y@@", "ou"], For example, for input x corresponding to ["how", "are", "y@@", "ou"],
return [0, 1, 2, 2]. return [[0], [1], [2], [2]].
""" """
# x: (T x B) # x: (T x B)
bpe_end = self.bpe_end[x] bpe_end = self.bpe_end[x]
if (x.size(0) == 1 and x.size(1) == 1):
# Special case when we only have one word in x. If x = [[N]],
# bpe_end is a scalar (bool) instead of a 2-dim array of bools,
# which makes the sum operation below fail.
return np.array([[0]])
# do a reduce front sum to generate word ids # do a reduce front sum to generate word ids
word_idx = bpe_end[::-1].cumsum(0)[::-1] word_idx = bpe_end[::-1].cumsum(0)[::-1]
word_idx = word_idx.max(0)[None, :] - word_idx word_idx = word_idx.max(0)[None, :] - word_idx
...@@ -142,7 +149,7 @@ class WordShuffle(WordNoising): ...@@ -142,7 +149,7 @@ class WordShuffle(WordNoising):
noise = np.random.uniform( noise = np.random.uniform(
0, 0,
max_shuffle_distance, max_shuffle_distance,
size=(x.size(0) - 1, x.size(1)), size=(x.size(0), x.size(1)),
) )
noise[0] = -1 # do not move start sentence symbol noise[0] = -1 # do not move start sentence symbol
...@@ -153,7 +160,6 @@ class WordShuffle(WordNoising): ...@@ -153,7 +160,6 @@ class WordShuffle(WordNoising):
length_no_eos = lengths[i] length_no_eos = lengths[i]
if x[lengths[i] - 1, i] == self.dictionary.eos(): if x[lengths[i] - 1, i] == self.dictionary.eos():
length_no_eos = lengths[i] - 1 length_no_eos = lengths[i] - 1
# generate a random permutation # generate a random permutation
scores = word_idx[:length_no_eos, i] + noise[word_idx[:length_no_eos, i], i] scores = word_idx[:length_no_eos, i] + noise[word_idx[:length_no_eos, i], i]
# ensure no reordering inside a word # ensure no reordering inside a word
...@@ -216,6 +222,7 @@ class NoisingDataset(torch.utils.data.Dataset): ...@@ -216,6 +222,7 @@ class NoisingDataset(torch.utils.data.Dataset):
src_dataset, src_dataset,
src_dict, src_dict,
seed, seed,
noiser=None,
noising_class=UnsupervisedMTNoising, noising_class=UnsupervisedMTNoising,
**kwargs, **kwargs,
): ):
...@@ -235,6 +242,8 @@ class NoisingDataset(torch.utils.data.Dataset): ...@@ -235,6 +242,8 @@ class NoisingDataset(torch.utils.data.Dataset):
src_dict: src dict src_dict: src dict
src_dict: src dictionary src_dict: src dictionary
seed: seed to use when generating random noise seed: seed to use when generating random noise
noiser: a pre-initialized noiser. If this is None, a noiser will
be created using noising_class and kwargs.
noising_class: class to use when initializing noiser noising_class: class to use when initializing noiser
kwargs: noising args for configuring noising to apply kwargs: noising args for configuring noising to apply
Note that there is no equivalent argparse code for these args Note that there is no equivalent argparse code for these args
...@@ -246,7 +255,7 @@ class NoisingDataset(torch.utils.data.Dataset): ...@@ -246,7 +255,7 @@ class NoisingDataset(torch.utils.data.Dataset):
self.src_dataset = src_dataset self.src_dataset = src_dataset
self.src_dict = src_dict self.src_dict = src_dict
self.noiser = noising_class( self.noiser = noiser if noiser is not None else noising_class(
dictionary=src_dict, **kwargs, dictionary=src_dict, **kwargs,
) )
self.seed = seed self.seed = seed
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment