ModelZoo / ResNet50_tensorflow · Commit 6d6a78a2

Authored Nov 13, 2020 by Allen Wang; committed by A. Unique TensorFlower on Nov 13, 2020.

Create XLNet pretrain data loader.

PiperOrigin-RevId: 342283301

Parent: 42f8e96e
Showing 2 changed files with 615 additions and 12 deletions:

  official/nlp/data/pretrain_dataloader.py        +504  -0
  official/nlp/data/pretrain_dataloader_test.py   +111  -12
official/nlp/data/pretrain_dataloader.py  (+504 -0)

(New file; its diff is collapsed in the original page and not reproduced here.)
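Although the new loader module is collapsed above, its public surface can be inferred from the test changes below. A minimal usage sketch, assuming only the names and config fields the test exercises (the input path and the specific parameter values here are illustrative, not defaults):

from official.nlp.data import pretrain_dataloader

# Config fields below are the ones the new test passes; values are examples.
data_config = pretrain_dataloader.XLNetPretrainDataConfig(
    input_path="/tmp/train.tf_record",  # hypothetical TFRecord path
    seq_length=128,
    global_batch_size=5,
    is_training=True,
    max_predictions_per_seq=20,
    reuse_length=64,
    sample_strategy="token_span",  # also: "fixed", "single_token", "whole_word"
    min_num_tokens=1,
    max_num_tokens=2,
    permutation_size=64,
    leak_ratio=0.1)

dataset = pretrain_dataloader.XLNetPretrainDataLoader(data_config).load()
features = next(iter(dataset))
# Per the test's assertions, `features` contains "input_word_ids",
# "input_type_ids", "permutation_mask", "masked_tokens", "target",
# and "target_mask".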
official/nlp/data/pretrain_dataloader_test.py  (+111 -12)
@@ -24,19 +24,21 @@ import tensorflow as tf
 from official.nlp.data import pretrain_dataloader
 
 
-def _create_fake_dataset(output_path,
-                         seq_length,
-                         max_predictions_per_seq,
-                         use_position_id,
-                         use_next_sentence_label,
-                         use_v2_feature_names=False):
+def create_int_feature(values):
+  f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+  return f
+
+
+def _create_fake_bert_dataset(output_path,
+                              seq_length,
+                              max_predictions_per_seq,
+                              use_position_id,
+                              use_next_sentence_label,
+                              use_v2_feature_names=False):
   """Creates a fake dataset."""
   writer = tf.io.TFRecordWriter(output_path)
 
-  def create_int_feature(values):
-    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
-    return f
-
   def create_float_feature(values):
     f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
     return f
@@ -70,6 +72,34 @@ def _create_fake_dataset(output_path,
   writer.close()
 
 
+def _create_fake_xlnet_dataset(output_path, seq_length,
+                               max_predictions_per_seq):
+  """Creates a fake dataset."""
+  writer = tf.io.TFRecordWriter(output_path)
+  for _ in range(100):
+    features = {}
+    input_ids = np.random.randint(100, size=(seq_length))
+    num_boundary_indices = np.random.randint(1, seq_length)
+
+    if max_predictions_per_seq is not None:
+      input_mask = np.zeros_like(input_ids)
+      input_mask[:max_predictions_per_seq] = 1
+      np.random.shuffle(input_mask)
+    else:
+      input_mask = np.ones_like(input_ids)
+
+    features["input_mask"] = create_int_feature(input_mask)
+    features["input_word_ids"] = create_int_feature(input_ids)
+    features["input_type_ids"] = create_int_feature(np.ones_like(input_ids))
+    features["boundary_indices"] = create_int_feature(
+        sorted(np.random.randint(seq_length, size=(num_boundary_indices))))
+    features["target"] = create_int_feature(input_ids + 1)
+    features["label"] = create_int_feature([1])
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    writer.write(tf_example.SerializeToString())
+  writer.close()
+
+
 class BertPretrainDataTest(tf.test.TestCase, parameterized.TestCase):
 
   @parameterized.parameters(itertools.product(
@@ -80,7 +110,7 @@ class BertPretrainDataTest(tf.test.TestCase, parameterized.TestCase):
     train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
     seq_length = 128
     max_predictions_per_seq = 20
-    _create_fake_dataset(
+    _create_fake_bert_dataset(
         train_data_path,
         seq_length,
         max_predictions_per_seq,
@@ -114,7 +144,7 @@ class BertPretrainDataTest(tf.test.TestCase, parameterized.TestCase):
     train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
     seq_length = 128
     max_predictions_per_seq = 20
-    _create_fake_dataset(
+    _create_fake_bert_dataset(
        train_data_path,
        seq_length,
        max_predictions_per_seq,
@@ -141,5 +171,74 @@ class BertPretrainDataTest(tf.test.TestCase, parameterized.TestCase):
     self.assertIn("masked_lm_weights", features)
 
 
+class XLNetPretrainDataTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(itertools.product(
+      ("fixed", "single_token", "whole_word", "token_span"),
+      (0, 64),
+      (20, None),
+  ))
+  def test_load_data(self, sample_strategy, reuse_length,
+                     max_predictions_per_seq):
+    train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
+    seq_length = 128
+    batch_size = 5
+
+    _create_fake_xlnet_dataset(train_data_path, seq_length,
+                               max_predictions_per_seq)
+    data_config = pretrain_dataloader.XLNetPretrainDataConfig(
+        input_path=train_data_path,
+        max_predictions_per_seq=max_predictions_per_seq,
+        seq_length=seq_length,
+        global_batch_size=batch_size,
+        is_training=True,
+        reuse_length=reuse_length,
+        sample_strategy=sample_strategy,
+        min_num_tokens=1,
+        max_num_tokens=2,
+        permutation_size=seq_length // 2,
+        leak_ratio=0.1)
+
+    if (max_predictions_per_seq is None and sample_strategy != "fixed"):
+      with self.assertRaisesWithRegexpMatch(
+          ValueError, "`max_predictions_per_seq` must be set"):
+        dataset = pretrain_dataloader.XLNetPretrainDataLoader(
+            data_config).load()
+        features = next(iter(dataset))
+    else:
+      dataset = pretrain_dataloader.XLNetPretrainDataLoader(data_config).load()
+      features = next(iter(dataset))
+
+      self.assertIn("input_word_ids", features)
+      self.assertIn("input_type_ids", features)
+      self.assertIn("permutation_mask", features)
+      self.assertIn("masked_tokens", features)
+      self.assertIn("target", features)
+      self.assertIn("target_mask", features)
+
+      self.assertAllClose(features["input_word_ids"].shape,
+                          (batch_size, seq_length))
+      self.assertAllClose(features["input_type_ids"].shape,
+                          (batch_size, seq_length))
+      self.assertAllClose(features["permutation_mask"].shape,
+                          (batch_size, seq_length, seq_length))
+      self.assertAllClose(features["masked_tokens"].shape,
+                          (batch_size, seq_length,))
+
+      if max_predictions_per_seq is not None:
+        self.assertIn("target_mapping", features)
+        self.assertAllClose(features["target_mapping"].shape,
+                            (batch_size, max_predictions_per_seq, seq_length))
+        self.assertAllClose(features["target_mask"].shape,
+                            (batch_size, max_predictions_per_seq))
+        self.assertAllClose(features["target"].shape,
+                            (batch_size, max_predictions_per_seq))
+      else:
+        self.assertAllClose(features["target_mask"].shape,
+                            (batch_size, seq_length))
+        self.assertAllClose(features["target"].shape,
+                            (batch_size, seq_length))
+
+
 if __name__ == "__main__":
   tf.test.main()
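A note on the fixture format: _create_fake_xlnet_dataset writes plain tf.train.Example records, so a fake example can be inspected with standard TFRecord parsing. A minimal sketch, with the schema mirroring the features written above (the file path is hypothetical, and this inspection code is not part of the commit):

import tensorflow as tf

# Schema mirrors the features _create_fake_xlnet_dataset serializes.
seq_length = 128
name_to_features = {
    "input_word_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
    "input_type_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
    "input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
    "target": tf.io.FixedLenFeature([seq_length], tf.int64),
    "label": tf.io.FixedLenFeature([1], tf.int64),
    "boundary_indices": tf.io.VarLenFeature(tf.int64),  # variable-length
}

raw_dataset = tf.data.TFRecordDataset("/tmp/train.tf_record")  # hypothetical
example = tf.io.parse_single_example(next(iter(raw_dataset)), name_to_features)
print(example["input_word_ids"].shape)  # (128,)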