OpenDAS / Megatron-LM

Commit 9238c57a, authored Apr 03, 2020 by Neel Kant

Remove commented out test code

Parent: ca6b6687
Showing 1 changed file with 1 addition and 23 deletions.
megatron/data_utils/datasets.py @ 9238c57a
...
@@ -977,13 +977,10 @@ class InverseClozeDataset(data.Dataset):
         # 10% of the time, the input sentence is left in the context.
         # The other 90% of the time, remove it.
         if rng.random() < 0.1:
-        # if True:
             context_tokens = input_tokens.copy()
             context_token_types = input_token_types.copy()

-        # parameters for examining sentences to remove from the context
-        # TODO: test detokenized stuff, make sure it's the same doc in the same order.
-        # change preceding rng condition to always true
+        # parameters for examining sentences to add to the context
         view_preceding = True
         view_radius = 1
         while len(context_tokens) < padless_max_len:
...
@@ -1005,25 +1002,6 @@ class InverseClozeDataset(data.Dataset):
             if view_radius > num_sentences:
                 break

-        # detokenized_input = self.tokenizer.DecodeIds(input_tokens)
-        # detokenized_context = self.tokenizer.DecodeIds(context_tokens)
-        # encoded_sentences = [self.tokenizer.EncodeAsIds(s).tokenization for s in doc]
-        # full_document_encoded = list(itertools.chain(*encoded_sentences))
-        # detokenized_doc = self.tokenizer.DecodeIds(full_document_encoded)
-        # b1 = detokenized_input in detokenized_doc
-        # b2 = detokenized_context in detokenized_doc
-        # print("-" * 100)
-        # print('> input idx: {}'.format(input_sentence_idx))
-        # print('> input in doc: {}'.format(b1))
-        # print('> context in doc: {}'.format(b2))
-        # print('> input: {}'.format(detokenized_input))
-        # print('> context: {}'.format(detokenized_context))
-        # print('\n> doc: {}'.format(detokenized_doc))
-        # if not (b1 and b2):
-        #     raise ValueError("you dun goofed")
-
         # assemble the tokens and token types of the context
         context_tokens = context_tokens[:padless_max_len]
         context_token_types = context_token_types[:padless_max_len]
...
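
For orientation, the logic the hunks above touch can be summarized in a short standalone sketch. This is not the repository's implementation; it only mirrors the behavior visible in the diff: the input sentence is kept in the context 10% of the time and dropped otherwise, neighboring sentences are then pulled in (alternating preceding/following, tracked by view_preceding and view_radius) until the context reaches padless_max_len tokens, and the result is truncated. The function name build_ict_context, the sentences list, and input_idx are hypothetical; rng, view_preceding, view_radius, and padless_max_len are taken from the diff.

import random
from typing import List

def build_ict_context(sentences: List[List[int]], input_idx: int,
                      padless_max_len: int, rng: random.Random) -> List[int]:
    # Hypothetical sketch of Inverse Cloze Task context construction,
    # mirroring the behavior visible in the diff above (not the actual
    # Megatron-LM code).
    input_tokens = sentences[input_idx]

    # 10% of the time, the input sentence is left in the context.
    # The other 90% of the time, remove it.
    if rng.random() < 0.1:
        context_tokens = input_tokens.copy()
    else:
        context_tokens = []

    # Grow the context with neighboring sentences, alternating between the
    # preceding and the following side of the input sentence.
    view_preceding = True
    view_radius = 1
    num_sentences = len(sentences)
    while len(context_tokens) < padless_max_len:
        if view_preceding:
            idx = input_idx - view_radius
            if idx >= 0:
                context_tokens = sentences[idx] + context_tokens
        else:
            idx = input_idx + view_radius
            if idx < num_sentences:
                context_tokens = context_tokens + sentences[idx]
            view_radius += 1
        view_preceding = not view_preceding
        # Stop once the window has swept past every sentence in the document.
        if view_radius > num_sentences:
            break

    # Truncate the assembled context to the maximum unpadded length.
    return context_tokens[:padless_max_len]

Under these assumptions, a call such as build_ict_context(sentences, input_idx=3, padless_max_len=256, rng=random.Random(0)) returns at most 256 token ids drawn from the sentences surrounding index 3, matching the keep/drop and truncation behavior shown in the diff.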