OpenDAS / Megatron-LM

Commit 9238c57a, authored Apr 03, 2020 by Neel Kant

Remove commented out test code

Parent: ca6b6687
Showing 1 changed file with 1 addition and 23 deletions.
megatron/data_utils/datasets.py @ 9238c57a
...
@@ -977,13 +977,10 @@ class InverseClozeDataset(data.Dataset):
         # 10% of the time, the input sentence is left in the context.
         # The other 90% of the time, remove it.
         if rng.random() < 0.1:
-        # if True:
             context_tokens = input_tokens.copy()
             context_token_types = input_token_types.copy()

-        # parameters for examining sentences to remove from the context
-        # TODO: test detokenized stuff, make sure it's the same doc in the same order.
-        # change preceding rng condition to always true
+        # parameters for examining sentences to add to the context
         view_preceding = True
         view_radius = 1
         while len(context_tokens) < padless_max_len:
...
@@ -1005,25 +1002,6 @@ class InverseClozeDataset(data.Dataset):
             if view_radius > num_sentences:
                 break

-        # detokenized_input = self.tokenizer.DecodeIds(input_tokens)
-        # detokenized_context = self.tokenizer.DecodeIds(context_tokens)
-        # encoded_sentences = [self.tokenizer.EncodeAsIds(s).tokenization for s in doc]
-        # full_document_encoded = list(itertools.chain(*encoded_sentences))
-        # detokenized_doc = self.tokenizer.DecodeIds(full_document_encoded)
-        # b1 = detokenized_input in detokenized_doc
-        # b2 = detokenized_context in detokenized_doc
-        # print("-" * 100)
-        # print('> input idx: {}'.format(input_sentence_idx))
-        # print('> input in doc: {}'.format(b1))
-        # print('> context in doc: {}'.format(b2))
-        # print('> input: {}'.format(detokenized_input))
-        # print('> context: {}'.format(detokenized_context))
-        # print('\n> doc: {}'.format(detokenized_doc))
-        # if not (b1 and b2):
-        #     raise ValueError("you dun goofed")
-
         # assemble the tokens and token types of the context
         context_tokens = context_tokens[:padless_max_len]
         context_token_types = context_token_types[:padless_max_len]
...
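
For orientation, the logic the hunks above touch can be summarized in a short standalone sketch. This is not the repository's implementation; it only mirrors the behavior visible in the diff: the input sentence is kept in the context 10% of the time and dropped otherwise, neighboring sentences are then pulled in (alternating preceding/following, tracked by view_preceding and view_radius) until the context reaches padless_max_len tokens, and the result is truncated. The function name build_ict_context, the sentences list, and input_idx are hypothetical; rng, view_preceding, view_radius, and padless_max_len are taken from the diff.

import random
from typing import List

def build_ict_context(sentences: List[List[int]], input_idx: int,
                      padless_max_len: int, rng: random.Random) -> List[int]:
    # Hypothetical sketch of Inverse Cloze Task context construction,
    # mirroring the behavior visible in the diff above (not the actual
    # Megatron-LM code).
    input_tokens = sentences[input_idx]

    # 10% of the time, the input sentence is left in the context.
    # The other 90% of the time, remove it.
    if rng.random() < 0.1:
        context_tokens = input_tokens.copy()
    else:
        context_tokens = []

    # Grow the context with neighboring sentences, alternating between the
    # preceding and the following side of the input sentence.
    view_preceding = True
    view_radius = 1
    num_sentences = len(sentences)
    while len(context_tokens) < padless_max_len:
        if view_preceding:
            idx = input_idx - view_radius
            if idx >= 0:
                context_tokens = sentences[idx] + context_tokens
        else:
            idx = input_idx + view_radius
            if idx < num_sentences:
                context_tokens = context_tokens + sentences[idx]
            view_radius += 1
        view_preceding = not view_preceding
        # Stop once the window has swept past every sentence in the document.
        if view_radius > num_sentences:
            break

    # Truncate the assembled context to the maximum unpadded length.
    return context_tokens[:padless_max_len]

Under these assumptions, a call such as build_ict_context(sentences, input_idx=3, padless_max_len=256, rng=random.Random(0)) returns at most 256 token ids drawn from the sentences surrounding index 3, matching the keep/drop and truncation behavior shown in the diff.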