ModelZoo / ResNet50_tensorflow / Commits / f16a7b5b

Unverified commit f16a7b5b, authored May 04, 2021 by vedanshu, committed by GitHub on May 04, 2021.

Merge pull request #1 from tensorflow/master: new pull
Parents: 8e9296ff, 8f58f396

Changes: 298
Showing 20 changed files with 2933 additions and 366 deletions (+2933, -366).
official/nlp/data/create_xlnet_pretraining_data_test.py   +355  -0
official/nlp/data/data_loader.py                           +48   -0
official/nlp/data/data_loader_factory.py                   +3    -4
official/nlp/data/data_loader_factory_test.py              +45   -0
official/nlp/data/pretrain_dataloader.py                   +507  -14
official/nlp/data/pretrain_dataloader_test.py              +242  -0
official/nlp/data/pretrain_dynamic_dataloader.py           +211  -0
official/nlp/data/pretrain_dynamic_dataloader_test.py      +242  -0
official/nlp/data/question_answering_dataloader.py         +22   -7
official/nlp/data/question_answering_dataloader_test.py    +74   -0
official/nlp/data/sentence_prediction_dataloader.py        +169  -7
official/nlp/data/sentence_prediction_dataloader_test.py   +249  -0
official/nlp/data/sentence_retrieval_lib.py                +7    -9
official/nlp/data/squad_lib.py                             +194  -117
official/nlp/data/squad_lib_sp.py                          +185  -101
official/nlp/data/tagging_data_lib.py                      +105  -25
official/nlp/data/tagging_data_lib_test.py                 +108  -0
official/nlp/data/tagging_data_loader.py                   +0    -82
official/nlp/data/tagging_dataloader.py                    +85   -0
official/nlp/data/tagging_dataloader_test.py               +82   -0
Too many changes to show: to preserve performance, only 298 of 298+ changed files are displayed.
official/nlp/data/create_xlnet_pretraining_data_test.py  (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.create_xlnet_pretraining_data."""
import os
import tempfile
from typing import List

from absl import logging
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.nlp.data import create_xlnet_pretraining_data as cpd

_VOCAB_WORDS = ["vocab_1", "vocab_2"]


# pylint: disable=invalid-name
def _create_files(temp_dir: str,
                  file_contents: List[List[str]]) -> List[str]:
  """Writes arbitrary documents into files."""
  root_dir = tempfile.mkdtemp(dir=temp_dir)
  files = []

  for i, file_content in enumerate(file_contents):
    destination = os.path.join(root_dir, "%d.txt" % i)
    with open(destination, "wb") as f:
      for line in file_content:
        f.write(line.encode("utf-8"))
    files.append(destination)
  return files


def _get_mock_tokenizer():
  """Creates a mock tokenizer."""

  class MockSpieceModel:
    """Mock Spiece model for testing."""

    def __init__(self):
      self._special_piece_to_id = {
          "<unk>": 0,
      }
      for piece in set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~')):
        self._special_piece_to_id[piece] = 1

    def EncodeAsPieces(self, inputs: str) -> List[str]:
      return inputs

    def SampleEncodeAsPieces(self, inputs: str, nbest_size: int,
                             theta: float) -> List[str]:
      del nbest_size, theta
      return inputs

    def PieceToId(self, piece: str) -> int:
      return ord(piece[0])

    def IdToPiece(self, id_: int) -> str:
      return chr(id_) * 3

  class Tokenizer:
    """Mock Tokenizer for testing."""

    def __init__(self):
      self.sp_model = MockSpieceModel()

    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
      return [self.sp_model.IdToPiece(id_) for id_ in ids]

  return Tokenizer()


class PreprocessDataTest(tf.test.TestCase):

  def test_remove_extraneous_space(self):
    line = " abc "
    output = cpd._preprocess_line(line)
    self.assertEqual(output, "abc")

  def test_symbol_replacements(self):
    self.assertEqual(cpd._preprocess_line("``abc``"), "\"abc\"")
    self.assertEqual(cpd._preprocess_line("''abc''"), "\"abc\"")

  def test_accent_replacements(self):
    self.assertEqual(cpd._preprocess_line("åbc"), "abc")

  def test_lower_case(self):
    self.assertEqual(cpd._preprocess_line("ABC", do_lower_case=True), "abc")

  def test_end_to_end(self):
    self.assertEqual(
        cpd._preprocess_line("HelLo ``wórLd``", do_lower_case=True),
        "hello \"world\"")


class PreprocessAndTokenizeFilesTest(tf.test.TestCase):

  def test_basic_end_to_end(self):
    documents = [
        [
            "This is sentence 1.\n",
            "This is sentence 2.\n",
            "Sentence 3 is what this is.\n",
        ],
        [
            "This is the second document.\n",
            "This is the second line of the second document.\n",
        ],
    ]
    input_files = _create_files(temp_dir=self.get_temp_dir(),
                                file_contents=documents)
    all_data = cpd.preprocess_and_tokenize_input_files(
        input_files=input_files,
        tokenizer=_get_mock_tokenizer(),
        log_example_freq=1)
    self.assertEqual(len(all_data), len(documents))
    for token_ids, sentence_ids in all_data:
      self.assertEqual(len(token_ids), len(sentence_ids))

  def test_basic_correctness(self):
    documents = [["a\n", "b\n", "c\n"]]
    input_files = _create_files(temp_dir=self.get_temp_dir(),
                                file_contents=documents)
    all_data = cpd.preprocess_and_tokenize_input_files(
        input_files=input_files,
        tokenizer=_get_mock_tokenizer(),
        log_example_freq=1)

    token_ids, sentence_ids = all_data[0]

    self.assertAllClose(token_ids, [97, 98, 99])
    self.assertAllClose(sentence_ids, [True, False, True])

  def test_correctness_with_spaces_and_accents(self):
    documents = [[
        " å\n",
        "b\n",
        " c \n",
    ]]
    input_files = _create_files(temp_dir=self.get_temp_dir(),
                                file_contents=documents)
    all_data = cpd.preprocess_and_tokenize_input_files(
        input_files=input_files,
        tokenizer=_get_mock_tokenizer(),
        log_example_freq=1)

    token_ids, sentence_ids = all_data[0]

    self.assertAllClose(token_ids, [97, 98, 99])
    self.assertAllClose(sentence_ids, [True, False, True])


class BatchReshapeTests(tf.test.TestCase):

  def test_basic_functionality(self):
    per_host_batch_size = 3
    mock_shape = (20,)

    # Should truncate and reshape.
    expected_result_shape = (3, 6)

    tokens = np.zeros(mock_shape)
    sentence_ids = np.zeros(mock_shape)

    reshaped_data = cpd._reshape_to_batch_dimensions(
        tokens=tokens,
        sentence_ids=sentence_ids,
        per_host_batch_size=per_host_batch_size)
    for values in reshaped_data:
      self.assertEqual(len(values.flatten()) % per_host_batch_size, 0)
      self.assertAllClose(values.shape, expected_result_shape)


class CreateSegmentsTest(tf.test.TestCase):

  def test_basic_functionality(self):
    data_length = 10
    tokens = np.arange(data_length)
    sentence_ids = np.concatenate([
        np.zeros(data_length // 2),
        np.ones(data_length // 2)])
    begin_index = 0
    total_length = 8
    a_data, b_data, label = cpd._create_a_and_b_segments(
        tokens=tokens,
        sentence_ids=sentence_ids,
        begin_index=begin_index,
        total_length=total_length,
        no_cut_probability=0.)
    self.assertAllClose(a_data, [0, 1, 2, 3])
    self.assertAllClose(b_data, [5, 6, 7, 8])
    self.assertEqual(label, 1)

  def test_no_cut(self):
    data_length = 10
    tokens = np.arange(data_length)
    sentence_ids = np.zeros(data_length)

    begin_index = 0
    total_length = 8
    a_data, b_data, label = cpd._create_a_and_b_segments(
        tokens=tokens,
        sentence_ids=sentence_ids,
        begin_index=begin_index,
        total_length=total_length,
        no_cut_probability=0.)
    self.assertGreater(len(a_data), 0)
    self.assertGreater(len(b_data), 0)
    self.assertEqual(label, 0)

  def test_no_cut_with_probability(self):
    data_length = 10
    tokens = np.arange(data_length)
    sentence_ids = np.concatenate([
        np.zeros(data_length // 2),
        np.ones(data_length // 2)])
    begin_index = 0
    total_length = 8
    a_data, b_data, label = cpd._create_a_and_b_segments(
        tokens=tokens,
        sentence_ids=sentence_ids,
        begin_index=begin_index,
        total_length=total_length,
        no_cut_probability=1.)
    self.assertGreater(len(a_data), 0)
    self.assertGreater(len(b_data), 0)
    self.assertEqual(label, 0)


class CreateInstancesTest(tf.test.TestCase):
  """Tests conversions of Token/Sentence IDs to training instances."""

  def test_basic(self):
    data_length = 12
    tokens = np.arange(data_length)
    sentence_ids = np.zeros(data_length)
    seq_length = 8
    instances = cpd._convert_tokens_to_instances(
        tokens=tokens,
        sentence_ids=sentence_ids,
        per_host_batch_size=2,
        seq_length=seq_length,
        reuse_length=4,
        tokenizer=_get_mock_tokenizer(),
        bi_data=False,
        num_cores_per_host=1,
        logging_frequency=1)
    for instance in instances:
      self.assertEqual(len(instance.data), seq_length)
      self.assertEqual(len(instance.segment_ids), seq_length)
      self.assertIsInstance(instance.label, int)
      self.assertIsInstance(instance.boundary_indices, list)


class TFRecordPathTests(tf.test.TestCase):

  def test_basic(self):
    base_kwargs = dict(
        per_host_batch_size=1,
        num_cores_per_host=1,
        seq_length=2,
        reuse_length=1)

    config1 = dict(
        prefix="test",
        suffix="",
        bi_data=True,
        use_eod_token=False,
        do_lower_case=True)
    config1.update(base_kwargs)
    expectation1 = "test_seqlen-2_reuse-1_bs-1_cores-1_uncased_bi.tfrecord"
    self.assertEqual(cpd.get_tfrecord_name(**config1), expectation1)

    config2 = dict(
        prefix="",
        suffix="test",
        bi_data=False,
        use_eod_token=False,
        do_lower_case=False)
    config2.update(base_kwargs)
    expectation2 = "seqlen-2_reuse-1_bs-1_cores-1_cased_uni_test.tfrecord"
    self.assertEqual(cpd.get_tfrecord_name(**config2), expectation2)

    config3 = dict(
        prefix="",
        suffix="",
        use_eod_token=True,
        bi_data=False,
        do_lower_case=True)
    config3.update(base_kwargs)
    expectation3 = "seqlen-2_reuse-1_bs-1_cores-1_uncased_eod_uni.tfrecord"
    self.assertEqual(cpd.get_tfrecord_name(**config3), expectation3)


class TestCreateTFRecords(parameterized.TestCase, tf.test.TestCase):

  @parameterized.named_parameters(
      ("bi_data_only", True, False, False),
      ("eod_token_only", False, True, True),
      ("lower_case_only", False, False, True),
      ("all_enabled", True, True, True),
  )
  def test_end_to_end(self,
                      bi_data: bool,
                      use_eod_token: bool,
                      do_lower_case: bool):
    tokenizer = _get_mock_tokenizer()

    num_documents = 5
    sentences_per_document = 10
    document_length = 50

    documents = [
        ["a " * document_length for _ in range(sentences_per_document)]
        for _ in range(num_documents)]

    save_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    files = _create_files(temp_dir=self.get_temp_dir(),
                          file_contents=documents)

    cpd.create_tfrecords(
        tokenizer=tokenizer,
        input_file_or_files=",".join(files),
        use_eod_token=use_eod_token,
        do_lower_case=do_lower_case,
        per_host_batch_size=8,
        seq_length=8,
        reuse_length=4,
        bi_data=bi_data,
        num_cores_per_host=2,
        save_dir=save_dir)

    self.assertTrue(any(filter(lambda x: x.endswith(".json"),
                               os.listdir(save_dir))))
    self.assertTrue(any(filter(lambda x: x.endswith(".tfrecord"),
                               os.listdir(save_dir))))


if __name__ == "__main__":
  np.random.seed(0)
  logging.set_verbosity(logging.INFO)
  tf.test.main()
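For readers skimming the correctness tests above: the mock SentencePiece model maps a piece to the ordinal of its first character, which is why the expected token IDs are [97, 98, 99]. A one-line sanity check (not part of the test file):

# The mock PieceToId returns ord(piece[0]), so the documents ["a\n", "b\n", "c\n"]
# tokenize to [97, 98, 99] in the correctness tests.
assert [ord(c) for c in "abc"] == [97, 98, 99]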
official/nlp/data/data_loader.py  (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An abstraction that NLP models define input pipelines."""
import abc
from typing import Optional

import tensorflow as tf


class DataLoader(metaclass=abc.ABCMeta):
  """An abstract class defining the APIs for tf.data input pipeline."""

  @abc.abstractmethod
  def load(
      self,
      input_context: Optional[tf.distribute.InputContext] = None
  ) -> tf.data.Dataset:
    """Implements DataLoader load method.

    Builds the entire input pipeline inside the load method. Users can define
    states inside the DataLoader class and returns a tf.data dataset
    object.

    Args:
      input_context: This is a context class that is passed to the user's input
        function and contains information about the compute replicas and input
        pipelines. This object is used for multi-host inputs and passed by the
        distribution strategy.

    Returns:
      A per-host tf.data dataset. Note that, we usually create the distributed
      dataset through the load method, so we should not directly return a
      distributed dataset here.
    """
    pass
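For orientation, here is a minimal sketch of a concrete loader written against this abstract class; the `RangeDataLoader` name, its constructor arguments, and the single `input_word_ids` feature are illustrative only and not part of this commit.

from typing import Optional

import tensorflow as tf

from official.nlp.data import data_loader


class RangeDataLoader(data_loader.DataLoader):
  """Toy loader that emits integer features, illustrating the `load` contract."""

  def __init__(self, num_examples: int, batch_size: int):
    self._num_examples = num_examples
    self._batch_size = batch_size

  def load(
      self,
      input_context: Optional[tf.distribute.InputContext] = None
  ) -> tf.data.Dataset:
    dataset = tf.data.Dataset.range(self._num_examples)
    if input_context:
      # Shard per host so each input pipeline reads a distinct slice of data.
      dataset = dataset.shard(input_context.num_input_pipelines,
                              input_context.input_pipeline_id)
    dataset = dataset.map(lambda x: {'input_word_ids': x})
    # Return the per-host dataset; the distribution strategy wraps it later.
    return dataset.batch(self._batch_size)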
official/nlp/data/data_loader_factory.py  (modified)
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 ...
@@ -12,10 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """A global factory to access NLP registered data loaders."""

-from official.utils import registry
+from official.core import registry

 _REGISTERED_DATA_LOADER_CLS = {}
 ...
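The hunk above only shows the registry import moving from official.utils to official.core. As a rough sketch of the pattern this factory implements (written from the usage exercised in the test file below, not copied from the module), a data config class is the key under which its loader class is registered:

# Illustrative sketch of a config-keyed loader registry; the public names
# mirror data_loader_factory_test.py, but the internals here are assumptions.
_REGISTERED_DATA_LOADER_CLS = {}


def register_data_loader_cls(data_config_cls):
  """Decorator that associates a data config class with a data loader class."""
  def decorator(data_loader_cls):
    _REGISTERED_DATA_LOADER_CLS[data_config_cls] = data_loader_cls
    return data_loader_cls
  return decorator


def get_data_loader(data_config):
  """Instantiates the loader registered for this config's type."""
  return _REGISTERED_DATA_LOADER_CLS[type(data_config)](data_config)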
official/nlp/data/data_loader_factory_test.py  (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.data_loader_factory."""
import dataclasses
import tensorflow as tf

from official.core import config_definitions as cfg
from official.nlp.data import data_loader_factory


@dataclasses.dataclass
class MyDataConfig(cfg.DataConfig):
  is_training: bool = True


@data_loader_factory.register_data_loader_cls(MyDataConfig)
class MyDataLoader:

  def __init__(self, params):
    self.params = params


class DataLoaderFactoryTest(tf.test.TestCase):

  def test_register_and_load(self):
    train_config = MyDataConfig()
    train_loader = data_loader_factory.get_data_loader(train_config)
    self.assertTrue(train_loader.params.is_training)


if __name__ == "__main__":
  tf.test.main()
official/nlp/data/pretrain_dataloader.py  (modified)
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 ...
@@ -12,15 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Loads dataset for the BERT pretraining task."""
 from typing import Mapping, Optional

+from absl import logging
 import dataclasses
+import numpy as np
 import tensorflow as tf

+from official.core import config_definitions as cfg
 from official.core import input_reader
-from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.data import data_loader
 from official.nlp.data import data_loader_factory
 ...
@@ -34,10 +36,16 @@ class BertPretrainDataConfig(cfg.DataConfig):
   max_predictions_per_seq: int = 76
   use_next_sentence_label: bool = True
   use_position_id: bool = False
+  # Historically, BERT implementations take `input_ids` and `segment_ids` as
+  # feature names. Inside the TF Model Garden implementation, the Keras model
+  # inputs are set as `input_word_ids` and `input_type_ids`. When
+  # v2_feature_names is True, the data loader assumes the tf.Examples use
+  # `input_word_ids` and `input_type_ids` as keys.
+  use_v2_feature_names: bool = False


 @data_loader_factory.register_data_loader_cls(BertPretrainDataConfig)
-class BertPretrainDataLoader:
+class BertPretrainDataLoader(data_loader.DataLoader):
   """A class to load dataset for bert pretraining task."""

   def __init__(self, params):
@@ -52,15 +60,10 @@ class BertPretrainDataLoader:
     self._use_next_sentence_label = params.use_next_sentence_label
     self._use_position_id = params.use_position_id

-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
+  def _name_to_features(self):
     name_to_features = {
-        'input_ids':
-            tf.io.FixedLenFeature([self._seq_length], tf.int64),
         'input_mask':
             tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'segment_ids':
-            tf.io.FixedLenFeature([self._seq_length], tf.int64),
         'masked_lm_positions':
             tf.io.FixedLenFeature([self._max_predictions_per_seq], tf.int64),
         'masked_lm_ids':
 ...
@@ -68,13 +71,27 @@ class BertPretrainDataLoader:
         'masked_lm_weights':
             tf.io.FixedLenFeature([self._max_predictions_per_seq], tf.float32),
     }
+    if self._params.use_v2_feature_names:
+      name_to_features.update({
+          'input_word_ids':
+              tf.io.FixedLenFeature([self._seq_length], tf.int64),
+          'input_type_ids':
+              tf.io.FixedLenFeature([self._seq_length], tf.int64),
+      })
+    else:
+      name_to_features.update({
+          'input_ids':
+              tf.io.FixedLenFeature([self._seq_length], tf.int64),
+          'segment_ids':
+              tf.io.FixedLenFeature([self._seq_length], tf.int64),
+      })
     if self._use_next_sentence_label:
       name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature(
           [1], tf.int64)
     if self._use_position_id:
       name_to_features['position_ids'] = tf.io.FixedLenFeature(
           [self._seq_length], tf.int64)
+    return name_to_features
+
+  def _decode(self, record: tf.Tensor):
+    """Decodes a serialized tf.Example."""
+    name_to_features = self._name_to_features()
     example = tf.io.parse_single_example(record, name_to_features)

     # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
 ...
@@ -90,13 +107,17 @@ class BertPretrainDataLoader:
   def _parse(self, record: Mapping[str, tf.Tensor]):
     """Parses raw tensors into a dict of tensors to be consumed by the model."""
     x = {
-        'input_word_ids': record['input_ids'],
         'input_mask': record['input_mask'],
-        'input_type_ids': record['segment_ids'],
         'masked_lm_positions': record['masked_lm_positions'],
         'masked_lm_ids': record['masked_lm_ids'],
         'masked_lm_weights': record['masked_lm_weights'],
     }
+    if self._params.use_v2_feature_names:
+      x['input_word_ids'] = record['input_word_ids']
+      x['input_type_ids'] = record['input_type_ids']
+    else:
+      x['input_word_ids'] = record['input_ids']
+      x['input_type_ids'] = record['segment_ids']
     if self._use_next_sentence_label:
       x['next_sentence_labels'] = record['next_sentence_labels']
     if self._use_position_id:
 ...
@@ -109,3 +130,475 @@ class BertPretrainDataLoader:
    reader = input_reader.InputReader(
        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)


@dataclasses.dataclass
class XLNetPretrainDataConfig(cfg.DataConfig):
  """Data config for XLNet pretraining task.

  Attributes:
    input_path: See base class.
    global_batch_size: See base class.
    is_training: See base class.
    seq_length: The length of each sequence.
    max_predictions_per_seq: The number of predictions per sequence.
    reuse_length: The number of tokens in a previous segment to reuse. This
      should be the same value used during pretrain data creation.
    sample_strategy: The strategy used to sample factorization permutations.
      Possible values: 'single_token', 'whole_word', 'token_span', 'word_span'.
    min_num_tokens: The minimum number of tokens to sample in a span. This is
      used when `sample_strategy` is 'token_span'.
    max_num_tokens: The maximum number of tokens to sample in a span. This is
      used when `sample_strategy` is 'token_span'.
    min_num_words: The minimum number of words to sample in a span. This is
      used when `sample_strategy` is 'word_span'.
    max_num_words: The maximum number of words to sample in a span. This is
      used when `sample_strategy` is 'word_span'.
    permutation_size: The length of the longest permutation. This can be set
      to `reuse_length`. This should NOT be greater than `reuse_length`,
      otherwise this may introduce data leaks.
    leak_ratio: The percentage of masked tokens that are leaked.
    segment_sep_id: The ID of the SEP token used when preprocessing the
      dataset.
    segment_cls_id: The ID of the CLS token used when preprocessing the
      dataset.
  """
  input_path: str = ''
  global_batch_size: int = 512
  is_training: bool = True
  seq_length: int = 512
  max_predictions_per_seq: int = 76
  reuse_length: int = 256
  sample_strategy: str = 'word_span'
  min_num_tokens: int = 1
  max_num_tokens: int = 5
  min_num_words: int = 1
  max_num_words: int = 5
  permutation_size: int = 256
  leak_ratio: float = 0.1
  segment_sep_id: int = 4
  segment_cls_id: int = 3


@data_loader_factory.register_data_loader_cls(XLNetPretrainDataConfig)
class XLNetPretrainDataLoader(data_loader.DataLoader):
  """A class to load dataset for xlnet pretraining task."""

  def __init__(self, params: XLNetPretrainDataConfig):
    """Inits `XLNetPretrainDataLoader` class.

    Args:
      params: A `XLNetPretrainDataConfig` object.
    """
    self._params = params
    self._seq_length = params.seq_length
    self._max_predictions_per_seq = params.max_predictions_per_seq
    self._reuse_length = params.reuse_length
    self._num_replicas_in_sync = None
    self._permutation_size = params.permutation_size
    self._sep_id = params.segment_sep_id
    self._cls_id = params.segment_cls_id
    self._sample_strategy = params.sample_strategy
    self._leak_ratio = params.leak_ratio

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
    name_to_features = {
        'input_word_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'input_type_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'boundary_indices': tf.io.VarLenFeature(tf.int64),
    }
    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in list(example.keys()):
      t = example[name]
      if t.dtype == tf.int64:
        t = tf.cast(t, tf.int32)
      example[name] = t

    return example

  def _parse(self, record: Mapping[str, tf.Tensor]):
    """Parses raw tensors into a dict of tensors to be consumed by the model."""
    x = {}

    inputs = record['input_word_ids']
    x['input_type_ids'] = record['input_type_ids']

    if self._sample_strategy in ['whole_word', 'word_span']:
      boundary = tf.sparse.to_dense(record['boundary_indices'])
    else:
      boundary = None

    input_mask = self._online_sample_mask(inputs=inputs, boundary=boundary)

    if self._reuse_length > 0:
      if self._permutation_size > self._reuse_length:
        logging.warning(
            '`permutation_size` is greater than `reuse_length` (%d > %d).'
            'This may introduce data leakage.', self._permutation_size,
            self._reuse_length)

      # Enable the memory mechanism.
      # Permute the reuse and non-reuse segments separately.
      non_reuse_len = self._seq_length - self._reuse_length
      if not (self._reuse_length % self._permutation_size == 0 and
              non_reuse_len % self._permutation_size == 0):
        raise ValueError('`reuse_length` and `seq_length` should both be '
                         'a multiple of `permutation_size`.')

      # Creates permutation mask and target mask for the first reuse_len
      # tokens. The tokens in this part are reused from the last sequence.
      perm_mask_0, target_mask_0, tokens_0, masked_0 = self._get_factorization(
          inputs=inputs[:self._reuse_length],
          input_mask=input_mask[:self._reuse_length])

      # Creates permutation mask and target mask for the rest of tokens in
      # current example, which are concatenation of two new segments.
      perm_mask_1, target_mask_1, tokens_1, masked_1 = self._get_factorization(
          inputs[self._reuse_length:], input_mask[self._reuse_length:])

      perm_mask_0 = tf.concat(
          [perm_mask_0,
           tf.zeros([self._reuse_length, non_reuse_len], dtype=tf.int32)],
          axis=1)
      perm_mask_1 = tf.concat(
          [tf.ones([non_reuse_len, self._reuse_length], dtype=tf.int32),
           perm_mask_1],
          axis=1)
      perm_mask = tf.concat([perm_mask_0, perm_mask_1], axis=0)
      target_mask = tf.concat([target_mask_0, target_mask_1], axis=0)
      tokens = tf.concat([tokens_0, tokens_1], axis=0)
      masked_tokens = tf.concat([masked_0, masked_1], axis=0)
    else:
      # Disable the memory mechanism.
      if self._seq_length % self._permutation_size != 0:
        raise ValueError('`seq_length` should be a multiple of '
                         '`permutation_size`.')
      # Permute the entire sequence together
      perm_mask, target_mask, tokens, masked_tokens = self._get_factorization(
          inputs=inputs, input_mask=input_mask)

    x['permutation_mask'] = tf.reshape(perm_mask,
                                       [self._seq_length, self._seq_length])
    x['input_word_ids'] = tokens
    x['masked_tokens'] = masked_tokens

    target = tokens
    if self._max_predictions_per_seq is not None:
      indices = tf.range(self._seq_length, dtype=tf.int32)
      bool_target_mask = tf.cast(target_mask, tf.bool)
      indices = tf.boolean_mask(indices, bool_target_mask)

      # account for extra padding due to CLS/SEP.
      actual_num_predict = tf.shape(indices)[0]
      pad_len = self._max_predictions_per_seq - actual_num_predict

      target_mapping = tf.one_hot(indices, self._seq_length, dtype=tf.int32)
      paddings = tf.zeros([pad_len, self._seq_length],
                          dtype=target_mapping.dtype)
      target_mapping = tf.concat([target_mapping, paddings], axis=0)
      x['target_mapping'] = tf.reshape(
          target_mapping, [self._max_predictions_per_seq, self._seq_length])

      target = tf.boolean_mask(target, bool_target_mask)
      paddings = tf.zeros([pad_len], dtype=target.dtype)
      target = tf.concat([target, paddings], axis=0)
      x['target'] = tf.reshape(target, [self._max_predictions_per_seq])

      target_mask = tf.concat([
          tf.ones([actual_num_predict], dtype=tf.int32),
          tf.zeros([pad_len], dtype=tf.int32)
      ], axis=0)
      x['target_mask'] = tf.reshape(target_mask,
                                    [self._max_predictions_per_seq])
    else:
      x['target'] = tf.reshape(target, [self._seq_length])
      x['target_mask'] = tf.reshape(target_mask, [self._seq_length])
    return x

  def _index_pair_to_mask(self, begin_indices: tf.Tensor,
                          end_indices: tf.Tensor,
                          inputs: tf.Tensor) -> tf.Tensor:
    """Converts beginning and end indices into an actual mask."""
    non_func_mask = tf.logical_and(
        tf.not_equal(inputs, self._sep_id),
        tf.not_equal(inputs, self._cls_id))
    all_indices = tf.where(
        non_func_mask,
        tf.range(self._seq_length, dtype=tf.int32),
        tf.constant(-1, shape=[self._seq_length], dtype=tf.int32))
    candidate_matrix = tf.cast(
        tf.logical_and(all_indices[None, :] >= begin_indices[:, None],
                       all_indices[None, :] < end_indices[:, None]),
        tf.float32)
    cumsum_matrix = tf.reshape(
        tf.cumsum(tf.reshape(candidate_matrix, [-1])),
        [-1, self._seq_length])
    masked_matrix = tf.cast(cumsum_matrix <= self._max_predictions_per_seq,
                            tf.float32)
    target_mask = tf.reduce_sum(candidate_matrix * masked_matrix, axis=0)
    return tf.cast(target_mask, tf.bool)

  def _single_token_mask(self, inputs: tf.Tensor) -> tf.Tensor:
    """Samples individual tokens as prediction targets."""
    all_indices = tf.range(self._seq_length, dtype=tf.int32)
    non_func_mask = tf.logical_and(
        tf.not_equal(inputs, self._sep_id),
        tf.not_equal(inputs, self._cls_id))
    non_func_indices = tf.boolean_mask(all_indices, non_func_mask)

    masked_pos = tf.random.shuffle(non_func_indices)
    masked_pos = tf.sort(masked_pos[:self._max_predictions_per_seq])

    sparse_indices = tf.stack([tf.zeros_like(masked_pos), masked_pos], axis=-1)
    sparse_indices = tf.cast(sparse_indices, tf.int64)

    sparse_indices = tf.sparse.SparseTensor(
        sparse_indices,
        values=tf.ones_like(masked_pos),
        dense_shape=(1, self._seq_length))

    target_mask = tf.sparse.to_dense(sp_input=sparse_indices, default_value=0)

    return tf.squeeze(tf.cast(target_mask, tf.bool))

  def _whole_word_mask(self, inputs: tf.Tensor,
                       boundary: tf.Tensor) -> tf.Tensor:
    """Samples whole words as prediction targets."""
    pair_indices = tf.concat([boundary[:-1, None], boundary[1:, None]], axis=1)
    cand_pair_indices = tf.random.shuffle(
        pair_indices)[:self._max_predictions_per_seq]
    begin_indices = cand_pair_indices[:, 0]
    end_indices = cand_pair_indices[:, 1]

    return self._index_pair_to_mask(
        begin_indices=begin_indices, end_indices=end_indices, inputs=inputs)

  def _token_span_mask(self, inputs: tf.Tensor) -> tf.Tensor:
    """Samples token spans as prediction targets."""
    min_num_tokens = self._params.min_num_tokens
    max_num_tokens = self._params.max_num_tokens

    mask_alpha = self._seq_length / self._max_predictions_per_seq
    round_to_int = lambda x: tf.cast(tf.round(x), tf.int32)

    # Sample span lengths from a zipf distribution
    span_len_seq = np.arange(min_num_tokens, max_num_tokens + 1)
    probs = np.array([1.0 / (i + 1) for i in span_len_seq])

    probs /= np.sum(probs)
    logits = tf.constant(np.log(probs), dtype=tf.float32)
    span_lens = tf.random.categorical(
        logits=logits[None],
        num_samples=self._max_predictions_per_seq,
        dtype=tf.int32,
    )[0] + min_num_tokens

    # Sample the ratio [0.0, 1.0) of left context lengths
    span_lens_float = tf.cast(span_lens, tf.float32)
    left_ratio = tf.random.uniform(
        shape=[self._max_predictions_per_seq], minval=0.0, maxval=1.0)
    left_ctx_len = left_ratio * span_lens_float * (mask_alpha - 1)
    left_ctx_len = round_to_int(left_ctx_len)

    # Compute the offset from left start to the right end
    right_offset = round_to_int(span_lens_float * mask_alpha) - left_ctx_len

    # Get the actual begin and end indices
    begin_indices = (
        tf.cumsum(left_ctx_len) + tf.cumsum(right_offset, exclusive=True))
    end_indices = begin_indices + span_lens

    # Remove out of range indices
    valid_idx_mask = end_indices < self._seq_length
    begin_indices = tf.boolean_mask(begin_indices, valid_idx_mask)
    end_indices = tf.boolean_mask(end_indices, valid_idx_mask)

    # Shuffle valid indices
    num_valid = tf.cast(tf.shape(begin_indices)[0], tf.int32)
    order = tf.random.shuffle(tf.range(num_valid, dtype=tf.int32))
    begin_indices = tf.gather(begin_indices, order)
    end_indices = tf.gather(end_indices, order)

    return self._index_pair_to_mask(
        begin_indices=begin_indices, end_indices=end_indices, inputs=inputs)

  def _word_span_mask(self, inputs: tf.Tensor, boundary: tf.Tensor):
    """Sample whole word spans as prediction targets."""
    min_num_words = self._params.min_num_words
    max_num_words = self._params.max_num_words

    # Note: 1.2 is the token-to-word ratio
    mask_alpha = self._seq_length / self._max_predictions_per_seq / 1.2
    round_to_int = lambda x: tf.cast(tf.round(x), tf.int32)

    # Sample span lengths from a zipf distribution
    span_len_seq = np.arange(min_num_words, max_num_words + 1)
    probs = np.array([1.0 / (i + 1) for i in span_len_seq])
    probs /= np.sum(probs)
    logits = tf.constant(np.log(probs), dtype=tf.float32)

    # Sample `num_predict` words here: note that this is over sampling
    span_lens = tf.random.categorical(
        logits=logits[None],
        num_samples=self._max_predictions_per_seq,
        dtype=tf.int32,
    )[0] + min_num_words

    # Sample the ratio [0.0, 1.0) of left context lengths
    span_lens_float = tf.cast(span_lens, tf.float32)
    left_ratio = tf.random.uniform(
        shape=[self._max_predictions_per_seq], minval=0.0, maxval=1.0)
    left_ctx_len = left_ratio * span_lens_float * (mask_alpha - 1)

    left_ctx_len = round_to_int(left_ctx_len)
    right_offset = round_to_int(span_lens_float * mask_alpha) - left_ctx_len

    begin_indices = (
        tf.cumsum(left_ctx_len) + tf.cumsum(right_offset, exclusive=True))
    end_indices = begin_indices + span_lens

    # Remove out of range indices
    max_boundary_index = tf.cast(tf.shape(boundary)[0] - 1, tf.int32)
    valid_idx_mask = end_indices < max_boundary_index
    begin_indices = tf.boolean_mask(begin_indices, valid_idx_mask)
    end_indices = tf.boolean_mask(end_indices, valid_idx_mask)

    begin_indices = tf.gather(boundary, begin_indices)
    end_indices = tf.gather(boundary, end_indices)

    # Shuffle valid indices
    num_valid = tf.cast(tf.shape(begin_indices)[0], tf.int32)
    order = tf.random.shuffle(tf.range(num_valid, dtype=tf.int32))
    begin_indices = tf.gather(begin_indices, order)
    end_indices = tf.gather(end_indices, order)

    return self._index_pair_to_mask(
        begin_indices=begin_indices, end_indices=end_indices, inputs=inputs)

  def _online_sample_mask(self, inputs: tf.Tensor,
                          boundary: tf.Tensor) -> tf.Tensor:
    """Samples target positions for predictions.

    Descriptions of each strategy:
      - 'single_token': Samples individual tokens as prediction targets.
      - 'token_span': Samples spans of tokens as prediction targets.
      - 'whole_word': Samples individual words as prediction targets.
      - 'word_span': Samples spans of words as prediction targets.

    Args:
      inputs: The input tokens.
      boundary: The `int` Tensor of indices indicating whole word boundaries.
        This is used in 'whole_word' and 'word_span'

    Returns:
      The sampled `bool` input mask.

    Raises:
      `ValueError`: if `max_predictions_per_seq` is not set or if boundary is
        not provided for 'whole_word' and 'word_span' sample strategies.
    """
    if self._max_predictions_per_seq is None:
      raise ValueError('`max_predictions_per_seq` must be set.')

    if boundary is None and 'word' in self._sample_strategy:
      raise ValueError('`boundary` must be provided for {} strategy'.format(
          self._sample_strategy))

    if self._sample_strategy == 'single_token':
      return self._single_token_mask(inputs)
    elif self._sample_strategy == 'token_span':
      return self._token_span_mask(inputs)
    elif self._sample_strategy == 'whole_word':
      return self._whole_word_mask(inputs, boundary)
    elif self._sample_strategy == 'word_span':
      return self._word_span_mask(inputs, boundary)
    else:
      raise NotImplementedError('Invalid sample strategy.')

  def _get_factorization(self, inputs: tf.Tensor, input_mask: tf.Tensor):
    """Samples a permutation of the factorization order.

    Args:
      inputs: the input tokens.
      input_mask: the `bool` Tensor of the same shape as `inputs`. If `True`,
        then this means select for partial prediction.

    Returns:
      perm_mask: An `int32` Tensor of shape [seq_length, seq_length] consisting
        of 0s and 1s. If perm_mask[i][j] == 0, then this means that the i-th
        token (in original order) cannot attend to the jth attention token.
      target_mask: An `int32` Tensor of shape [seq_len] consisting of 0s and
        1s. If target_mask[i] == 1, then the i-th token needs to be predicted
        and the mask will be used as input. This token will be included in the
        loss. If target_mask[i] == 0, then the token (or [SEP], [CLS]) will be
        used as input. This token will not be included in the loss.
      tokens: int32 Tensor of shape [seq_length].
      masked_tokens: int32 Tensor of shape [seq_length].
    """
    factorization_length = tf.shape(inputs)[0]
    # Generate permutation indices
    index = tf.range(factorization_length, dtype=tf.int32)
    index = tf.transpose(tf.reshape(index, [-1, self._permutation_size]))
    index = tf.random.shuffle(index)
    index = tf.reshape(tf.transpose(index), [-1])

    input_mask = tf.cast(input_mask, tf.bool)

    # non-functional tokens
    non_func_tokens = tf.logical_not(
        tf.logical_or(
            tf.equal(inputs, self._sep_id), tf.equal(inputs, self._cls_id)))
    masked_tokens = tf.logical_and(input_mask, non_func_tokens)
    non_masked_or_func_tokens = tf.logical_not(masked_tokens)

    smallest_index = -2 * tf.ones([factorization_length], dtype=tf.int32)

    # Similar to BERT, randomly leak some masked tokens
    if self._leak_ratio > 0:
      leak_tokens = tf.logical_and(
          masked_tokens,
          tf.random.uniform([factorization_length], maxval=1.0) <
          self._leak_ratio)
      can_attend_self = tf.logical_or(non_masked_or_func_tokens, leak_tokens)
    else:
      can_attend_self = non_masked_or_func_tokens
    to_index = tf.where(can_attend_self, smallest_index, index)
    from_index = tf.where(can_attend_self, to_index + 1, to_index)

    # For masked tokens, can attend if i > j
    # For context tokens, always can attend each other
    can_attend = from_index[:, None] > to_index[None, :]

    perm_mask = tf.cast(can_attend, tf.int32)

    # Only masked tokens are included in the loss
    target_mask = tf.cast(masked_tokens, tf.int32)

    return perm_mask, target_mask, inputs, masked_tokens

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.dataset.Dataset."""
    if input_context:
      self._num_replicas_in_sync = input_context.num_replicas_in_sync
    reader = input_reader.InputReader(
        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)
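A short usage sketch (not part of the diff) showing how the registered XLNet config resolves to its loader through the factory; the input path is a placeholder, and the records are assumed to come from create_xlnet_pretraining_data:

from official.nlp.data import data_loader_factory
from official.nlp.data import pretrain_dataloader

config = pretrain_dataloader.XLNetPretrainDataConfig(
    input_path='/path/to/xlnet_pretrain*.tfrecord',  # placeholder path
    global_batch_size=64,
    seq_length=512,
    max_predictions_per_seq=76,
    reuse_length=256,
    permutation_size=256,
    sample_strategy='token_span')

# Because XLNetPretrainDataConfig is registered above, the generic factory
# returns an XLNetPretrainDataLoader; instantiating the loader directly is
# equivalent.
loader = data_loader_factory.get_data_loader(config)
dataset = loader.load()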
official/nlp/data/pretrain_dataloader_test.py  (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.pretrain_dataloader."""
import itertools
import os

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.nlp.data import pretrain_dataloader


def create_int_feature(values):
  f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
  return f


def _create_fake_bert_dataset(output_path,
                              seq_length,
                              max_predictions_per_seq,
                              use_position_id,
                              use_next_sentence_label,
                              use_v2_feature_names=False):
  """Creates a fake dataset."""
  writer = tf.io.TFRecordWriter(output_path)

  def create_float_feature(values):
    f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
    return f

  for _ in range(100):
    features = {}
    input_ids = np.random.randint(100, size=(seq_length))
    features["input_mask"] = create_int_feature(np.ones_like(input_ids))
    if use_v2_feature_names:
      features["input_word_ids"] = create_int_feature(input_ids)
      features["input_type_ids"] = create_int_feature(np.ones_like(input_ids))
    else:
      features["input_ids"] = create_int_feature(input_ids)
      features["segment_ids"] = create_int_feature(np.ones_like(input_ids))

    features["masked_lm_positions"] = create_int_feature(
        np.random.randint(100, size=(max_predictions_per_seq)))
    features["masked_lm_ids"] = create_int_feature(
        np.random.randint(100, size=(max_predictions_per_seq)))
    features["masked_lm_weights"] = create_float_feature(
        [1.0] * max_predictions_per_seq)

    if use_next_sentence_label:
      features["next_sentence_labels"] = create_int_feature([1])

    if use_position_id:
      features["position_ids"] = create_int_feature(range(0, seq_length))

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


def _create_fake_xlnet_dataset(output_path, seq_length,
                               max_predictions_per_seq):
  """Creates a fake dataset."""
  writer = tf.io.TFRecordWriter(output_path)
  for _ in range(100):
    features = {}
    input_ids = np.random.randint(100, size=(seq_length))
    num_boundary_indices = np.random.randint(1, seq_length)

    if max_predictions_per_seq is not None:
      input_mask = np.zeros_like(input_ids)
      input_mask[:max_predictions_per_seq] = 1
      np.random.shuffle(input_mask)
    else:
      input_mask = np.ones_like(input_ids)

    features["input_mask"] = create_int_feature(input_mask)
    features["input_word_ids"] = create_int_feature(input_ids)
    features["input_type_ids"] = create_int_feature(np.ones_like(input_ids))
    features["boundary_indices"] = create_int_feature(
        sorted(np.random.randint(seq_length, size=(num_boundary_indices))))
    features["target"] = create_int_feature(input_ids + 1)
    features["label"] = create_int_feature([1])
    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


class BertPretrainDataTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(itertools.product(
      (False, True),
      (False, True),
  ))
  def test_load_data(self, use_next_sentence_label, use_position_id):
    train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
    seq_length = 128
    max_predictions_per_seq = 20
    _create_fake_bert_dataset(
        train_data_path,
        seq_length,
        max_predictions_per_seq,
        use_next_sentence_label=use_next_sentence_label,
        use_position_id=use_position_id)
    data_config = pretrain_dataloader.BertPretrainDataConfig(
        input_path=train_data_path,
        max_predictions_per_seq=max_predictions_per_seq,
        seq_length=seq_length,
        global_batch_size=10,
        is_training=True,
        use_next_sentence_label=use_next_sentence_label,
        use_position_id=use_position_id)

    dataset = pretrain_dataloader.BertPretrainDataLoader(data_config).load()
    features = next(iter(dataset))
    self.assertLen(
        features, 6 + int(use_next_sentence_label) + int(use_position_id))
    self.assertIn("input_word_ids", features)
    self.assertIn("input_mask", features)
    self.assertIn("input_type_ids", features)
    self.assertIn("masked_lm_positions", features)
    self.assertIn("masked_lm_ids", features)
    self.assertIn("masked_lm_weights", features)

    self.assertEqual("next_sentence_labels" in features,
                     use_next_sentence_label)
    self.assertEqual("position_ids" in features, use_position_id)

  def test_v2_feature_names(self):
    train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
    seq_length = 128
    max_predictions_per_seq = 20
    _create_fake_bert_dataset(
        train_data_path,
        seq_length,
        max_predictions_per_seq,
        use_next_sentence_label=True,
        use_position_id=False,
        use_v2_feature_names=True)
    data_config = pretrain_dataloader.BertPretrainDataConfig(
        input_path=train_data_path,
        max_predictions_per_seq=max_predictions_per_seq,
        seq_length=seq_length,
        global_batch_size=10,
        is_training=True,
        use_next_sentence_label=True,
        use_position_id=False,
        use_v2_feature_names=True)

    dataset = pretrain_dataloader.BertPretrainDataLoader(data_config).load()
    features = next(iter(dataset))
    self.assertIn("input_word_ids", features)
    self.assertIn("input_mask", features)
    self.assertIn("input_type_ids", features)
    self.assertIn("masked_lm_positions", features)
    self.assertIn("masked_lm_ids", features)
    self.assertIn("masked_lm_weights", features)


class XLNetPretrainDataTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(itertools.product(
      ("single_token", "whole_word", "token_span"),
      (0, 64),
      (20, None),
  ))
  def test_load_data(self, sample_strategy, reuse_length,
                     max_predictions_per_seq):
    train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
    seq_length = 128
    batch_size = 5

    _create_fake_xlnet_dataset(train_data_path, seq_length,
                               max_predictions_per_seq)

    data_config = pretrain_dataloader.XLNetPretrainDataConfig(
        input_path=train_data_path,
        max_predictions_per_seq=max_predictions_per_seq,
        seq_length=seq_length,
        global_batch_size=batch_size,
        is_training=True,
        reuse_length=reuse_length,
        sample_strategy=sample_strategy,
        min_num_tokens=1,
        max_num_tokens=2,
        permutation_size=seq_length // 2,
        leak_ratio=0.1)

    if max_predictions_per_seq is None:
      with self.assertRaises(ValueError):
        dataset = pretrain_dataloader.XLNetPretrainDataLoader(
            data_config).load()
        features = next(iter(dataset))
    else:
      dataset = pretrain_dataloader.XLNetPretrainDataLoader(data_config).load()
      features = next(iter(dataset))

      self.assertIn("input_word_ids", features)
      self.assertIn("input_type_ids", features)
      self.assertIn("permutation_mask", features)
      self.assertIn("masked_tokens", features)
      self.assertIn("target", features)
      self.assertIn("target_mask", features)

      self.assertAllClose(features["input_word_ids"].shape,
                          (batch_size, seq_length))
      self.assertAllClose(features["input_type_ids"].shape,
                          (batch_size, seq_length))
      self.assertAllClose(features["permutation_mask"].shape,
                          (batch_size, seq_length, seq_length))
      self.assertAllClose(features["masked_tokens"].shape,
                          (batch_size, seq_length,))
      if max_predictions_per_seq is not None:
        self.assertIn("target_mapping", features)
        self.assertAllClose(features["target_mapping"].shape,
                            (batch_size, max_predictions_per_seq, seq_length))
        self.assertAllClose(features["target_mask"].shape,
                            (batch_size, max_predictions_per_seq))
        self.assertAllClose(features["target"].shape,
                            (batch_size, max_predictions_per_seq))
      else:
        self.assertAllClose(features["target_mask"].shape,
                            (batch_size, seq_length))
        self.assertAllClose(features["target"].shape,
                            (batch_size, seq_length))


if __name__ == "__main__":
  tf.test.main()
official/nlp/data/pretrain_dynamic_dataloader.py  (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset loader for the pre-training with dynamic sequence length."""
from
typing
import
Optional
,
Tuple
import
dataclasses
import
tensorflow
as
tf
from
official.core
import
config_definitions
as
cfg
from
official.core
import
input_reader
from
official.nlp.data
import
data_loader_factory
from
official.nlp.data
import
pretrain_dataloader
@
dataclasses
.
dataclass
class
BertPretrainDataConfig
(
cfg
.
DataConfig
):
"""Data config for BERT pretraining task (tasks/masked_lm)."""
input_path
:
str
=
''
global_batch_size
:
int
=
512
is_training
:
bool
=
True
seq_bucket_lengths
:
Tuple
[
int
,
...]
=
(
128
,
256
,
384
,
512
,)
# TODO(rxsang): `seq_bucket_window_scale` is only useful when round robin
# tf.data service is disabled. Deprecate this flag once we always enable round
# robin tf.data service.
seq_bucket_window_scale
:
int
=
8
use_next_sentence_label
:
bool
=
True
use_position_id
:
bool
=
False
deterministic
:
bool
=
False
enable_tf_data_service
:
bool
=
False
enable_round_robin_tf_data_service
:
bool
=
False
tf_data_service_job_name
:
str
=
'bert_pretrain'
use_v2_feature_names
:
bool
=
False
@
data_loader_factory
.
register_data_loader_cls
(
BertPretrainDataConfig
)
class
PretrainingDynamicDataLoader
(
pretrain_dataloader
.
BertPretrainDataLoader
):
"""Dataset loader for bert-style pretraining with dynamic sequenece length.
Bucketizes the input id features by the seq_bucket_lengths and features are
padded to the bucket boundaries. The mask features are usually short than
input id features and can also be dynamic. We require the mask feature lengths
within a bucket must be the same. For example, with [128, 256] buckets,
the mask features for bucket 128 should always have the length as X and
features for bucket 256 should always have the length as Y.
The dataloader does not filter out empty masks. Make sure to handle this
in the model.
"""
def
__init__
(
self
,
params
):
self
.
_params
=
params
if
len
(
params
.
seq_bucket_lengths
)
<
1
:
raise
ValueError
(
'The seq_bucket_lengths cannot be empty.'
)
self
.
_seq_bucket_lengths
=
params
.
seq_bucket_lengths
self
.
_seq_bucket_window_scale
=
params
.
seq_bucket_window_scale
self
.
_global_batch_size
=
params
.
global_batch_size
self
.
_use_next_sentence_label
=
params
.
use_next_sentence_label
self
.
_use_position_id
=
params
.
use_position_id
self
.
_drop_remainder
=
params
.
drop_remainder
self
.
_enable_tf_data_service
=
params
.
enable_tf_data_service
self
.
_enable_round_robin_tf_data_service
=
(
params
.
enable_round_robin_tf_data_service
)
self
.
_mask_keys
=
[
'masked_lm_positions'
,
'masked_lm_ids'
,
'masked_lm_weights'
]
def
_decode
(
self
,
record
:
tf
.
Tensor
):
"""Decodes a serialized tf.Example."""
name_to_features
=
{
'input_ids'
:
tf
.
io
.
VarLenFeature
(
tf
.
int64
),
'input_mask'
:
tf
.
io
.
VarLenFeature
(
tf
.
int64
),
'segment_ids'
:
tf
.
io
.
VarLenFeature
(
tf
.
int64
),
'masked_lm_positions'
:
tf
.
io
.
VarLenFeature
(
tf
.
int64
),
'masked_lm_ids'
:
tf
.
io
.
VarLenFeature
(
tf
.
int64
),
'masked_lm_weights'
:
tf
.
io
.
VarLenFeature
(
tf
.
float32
),
}
if
self
.
_use_next_sentence_label
:
name_to_features
[
'next_sentence_labels'
]
=
tf
.
io
.
FixedLenFeature
([
1
],
tf
.
int64
)
dynamic_keys
=
[
'input_ids'
,
'input_mask'
,
'segment_ids'
]
if
self
.
_use_position_id
:
name_to_features
[
'position_ids'
]
=
tf
.
io
.
VarLenFeature
(
tf
.
int64
)
dynamic_keys
.
append
(
'position_ids'
)
example
=
tf
.
io
.
parse_single_example
(
record
,
name_to_features
)
for
key
in
dynamic_keys
+
self
.
_mask_keys
:
example
[
key
]
=
tf
.
sparse
.
to_dense
(
example
[
key
])
# Truncate padded data after the first non pad in the
# sequence length dimension.
# Pad before the first non pad from the back should not be removed.
mask
=
tf
.
math
.
greater
(
tf
.
math
.
cumsum
(
example
[
'input_ids'
],
reverse
=
True
),
0
)
for
key
in
dynamic_keys
:
example
[
key
]
=
tf
.
boolean_mask
(
example
[
key
],
mask
)
# masked_lm_ids should be 0 padded.
# Change mask features to -1 padding so that we can differentiate
# padding from data or from bucketizing.
mask
=
tf
.
math
.
not_equal
(
example
[
'masked_lm_ids'
],
0
)
example
[
'masked_lm_ids'
]
=
tf
.
where
(
mask
,
example
[
'masked_lm_ids'
],
-
tf
.
ones
(
tf
.
shape
(
example
[
'masked_lm_ids'
]),
dtype
=
example
[
key
].
dtype
))
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
# tf.data service uses dataset graph fingerprint to distinguish input
# pipeline jobs, thus we sort the keys here to make sure they are generated
# in a deterministic order each time the dataset function is traced.
for
name
in
sorted
(
list
(
example
.
keys
())):
t
=
example
[
name
]
if
t
.
dtype
==
tf
.
int64
:
t
=
tf
.
cast
(
t
,
tf
.
int32
)
example
[
name
]
=
t
return
example
  def _bucketize_and_batch(
      self,
      dataset,
      input_context: Optional[tf.distribute.InputContext] = None):
    """Bucketize by sequence length and batch the datasets."""
    per_replica_batch_size = input_context.get_per_replica_batch_size(
        self._global_batch_size) if input_context else self._global_batch_size

    def element_length_func(example, seq_len_dim):
      return tf.shape(example['input_word_ids'])[seq_len_dim]

    bucket_boundaries = [length + 1 for length in self._seq_bucket_lengths]
    bucket_batch_sizes = [per_replica_batch_size] * (len(bucket_boundaries) + 1)

    # Bucketize and batch the dataset with per replica batch size first.
    dataset = dataset.apply(
        tf.data.experimental.bucket_by_sequence_length(
            lambda example: tf.cast(element_length_func(example, 0), tf.int32),
            bucket_boundaries,
            bucket_batch_sizes,
            pad_to_bucket_boundary=True,
            drop_remainder=self._drop_remainder))
    if input_context:
      window_size = input_context.num_replicas_in_sync
      if self._enable_tf_data_service and (
          not self._enable_round_robin_tf_data_service):
        # If tf.data service is enabled but round-robin behavior is not enabled,
        # different TPU workers may fetch data from one tf.data service worker
        # in different speed. We set the window size to be
        # `seq_bucket_window_scale` larger to leave buffer if some workers are
        # fetching data faster than others, so all the data within the same
        # global batch can still have more chances to be in the same bucket.
        window_size *= self._seq_bucket_window_scale
      # Group `num_replicas_in_sync` batches from same bucket together, so all
      # replicas can get the same sequence length for one global step.
      dataset = dataset.apply(
          tf.data.experimental.group_by_window(
              key_func=lambda example: tf.cast(  # pylint: disable=g-long-lambda
                  element_length_func(example, 1), tf.int64),
              reduce_func=lambda _, x: tf.data.Dataset.from_tensors(x),
              window_size=window_size))
      dataset = dataset.flat_map(lambda x: x)

    def _remove_pads_from_bucketize(features):
      # All mask features must have the same effective length.
      # The real masked ids padding token is -1 and 0 comes from
      # bucket_by_sequence_length.
      mask = tf.math.not_equal(features['masked_lm_ids'], 0)
      mask_per_example = tf.math.reduce_sum(tf.cast(mask, tf.int32), axis=1)
      normalized = tf.cast(
          mask_per_example / tf.math.reduce_max(mask_per_example), tf.int32)
      assert_op = tf.debugging.assert_equal(
          tf.math.reduce_sum(normalized), per_replica_batch_size,
          'Number of non padded mask tokens is not the same for each example '
          'in the same sequence length.')
      with tf.control_dependencies([assert_op]):
        for key in self._mask_keys:
          features[key] = tf.reshape(
              tf.boolean_mask(features[key], mask),
              [per_replica_batch_size, -1])
      # Revert masked_lm_ids to be 0-padded.
      mask = tf.math.not_equal(features['masked_lm_ids'], -1)
      features['masked_lm_ids'] = tf.where(
          mask, features['masked_lm_ids'],
          tf.zeros(
              tf.shape(features['masked_lm_ids']),
              dtype=features['masked_lm_ids'].dtype))
      return features

    dataset = dataset.map(_remove_pads_from_bucketize)
    return dataset

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.dataset.Dataset."""
    reader = input_reader.InputReader(
        params=self._params,
        decoder_fn=self._decode,
        parser_fn=self._parse,
        transform_and_batch_fn=self._bucketize_and_batch)
    return reader.read(input_context)
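Aside (not part of the commit): the loader above leans on `tf.data.experimental.bucket_by_sequence_length` with `pad_to_bucket_boundary=True`. A minimal standalone sketch of that primitive, with toy lengths and bucket boundaries chosen only for illustration:

import tensorflow as tf

# Variable-length integer sequences of lengths 3, 5, 9, 12 and 4.
ds = tf.data.Dataset.from_generator(
    lambda: ([1] * n for n in [3, 5, 9, 12, 4]),
    output_signature=tf.TensorSpec(shape=[None], dtype=tf.int32))

# Group elements into length buckets (<8, 8..15, >=16) and pad each batch to
# its bucket boundary, mirroring pad_to_bucket_boundary=True above.
ds = ds.apply(
    tf.data.experimental.bucket_by_sequence_length(
        element_length_func=lambda x: tf.shape(x)[0],
        bucket_boundaries=[8, 16],
        bucket_batch_sizes=[2, 2, 2],
        pad_to_bucket_boundary=True))

for batch in ds:
  print(batch.shape)  # e.g. (2, 7) for the short bucket, (2, 15) for the next

With `pad_to_bucket_boundary=True` each batch is padded to its bucket boundary minus one rather than to the longest element in the batch, which is what gives the dynamic loader its small, predictable set of padded sequence lengths.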
official/nlp/data/pretrain_dynamic_dataloader_test.py  0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for nlp.data.pretrain_dynamic_dataloader."""
import os

from absl import logging
from absl.testing import parameterized
import numpy as np
import orbit
import tensorflow as tf

from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.nlp.data import pretrain_dynamic_dataloader
from official.nlp.tasks import masked_lm


def _create_fake_dataset(output_path, seq_length, num_masked_tokens,
                         max_seq_length, num_examples):
  """Creates a fake dataset."""
  writer = tf.io.TFRecordWriter(output_path)

  def create_int_feature(values):
    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return f

  def create_float_feature(values):
    f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
    return f

  for _ in range(num_examples):
    features = {}
    padding = np.zeros(shape=(max_seq_length - seq_length), dtype=np.int32)
    input_ids = np.random.randint(low=1, high=100, size=(seq_length))
    features['input_ids'] = create_int_feature(
        np.concatenate((input_ids, padding)))
    features['input_mask'] = create_int_feature(
        np.concatenate((np.ones_like(input_ids), padding)))
    features['segment_ids'] = create_int_feature(
        np.concatenate((np.ones_like(input_ids), padding)))
    features['position_ids'] = create_int_feature(
        np.concatenate((np.ones_like(input_ids), padding)))
    features['masked_lm_positions'] = create_int_feature(
        np.random.randint(60, size=(num_masked_tokens), dtype=np.int64))
    features['masked_lm_ids'] = create_int_feature(
        np.random.randint(100, size=(num_masked_tokens), dtype=np.int64))
    features['masked_lm_weights'] = create_float_feature(
        np.ones((num_masked_tokens,), dtype=np.float32))
    features['next_sentence_labels'] = create_int_feature(np.array([0]))

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


class PretrainDynamicDataLoaderTest(tf.test.TestCase, parameterized.TestCase):

  @combinations.generate(
      combinations.combine(
          distribution_strategy=[
              strategy_combinations.cloud_tpu_strategy,
          ],
          mode='eager'))
  def test_distribution_strategy(self, distribution_strategy):
    max_seq_length = 128
    batch_size = 8
    input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    _create_fake_dataset(
        input_path,
        seq_length=60,
        num_masked_tokens=20,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
        is_training=False,
        input_path=input_path,
        seq_bucket_lengths=[64, 128],
        global_batch_size=batch_size)
    dataloader = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
        data_config)
    distributed_ds = orbit.utils.make_distributed_dataset(
        distribution_strategy, dataloader.load)
    train_iter = iter(distributed_ds)
    with distribution_strategy.scope():
      config = masked_lm.MaskedLMConfig(
          init_checkpoint=self.get_temp_dir(),
          model=bert.PretrainerConfig(
              encoders.EncoderConfig(
                  bert=encoders.BertEncoderConfig(
                      vocab_size=30522, num_layers=1)),
              cls_heads=[
                  bert.ClsHeadConfig(
                      inner_dim=10, num_classes=2, name='next_sentence')
              ]),
          train_data=data_config)
      task = masked_lm.MaskedLMTask(config)
      model = task.build_model()
      metrics = task.build_metrics()

    @tf.function
    def step_fn(features):
      return task.validation_step(features, model, metrics=metrics)

    distributed_outputs = distribution_strategy.run(
        step_fn, args=(next(train_iter),))
    local_results = tf.nest.map_structure(
        distribution_strategy.experimental_local_results, distributed_outputs)
    logging.info('Dynamic padding: local_results= %s', str(local_results))
    dynamic_metrics = {}
    for metric in metrics:
      dynamic_metrics[metric.name] = metric.result()

    data_config = pretrain_dataloader.BertPretrainDataConfig(
        is_training=False,
        input_path=input_path,
        seq_length=max_seq_length,
        max_predictions_per_seq=20,
        global_batch_size=batch_size)
    dataloader = pretrain_dataloader.BertPretrainDataLoader(data_config)
    distributed_ds = orbit.utils.make_distributed_dataset(
        distribution_strategy, dataloader.load)
    train_iter = iter(distributed_ds)
    with distribution_strategy.scope():
      metrics = task.build_metrics()

    @tf.function
    def step_fn_b(features):
      return task.validation_step(features, model, metrics=metrics)

    distributed_outputs = distribution_strategy.run(
        step_fn_b, args=(next(train_iter),))
    local_results = tf.nest.map_structure(
        distribution_strategy.experimental_local_results, distributed_outputs)
    logging.info('Static padding: local_results= %s', str(local_results))
    static_metrics = {}
    for metric in metrics:
      static_metrics[metric.name] = metric.result()
    for key in static_metrics:
      # We need to investigate the differences on losses.
      if key != 'next_sentence_loss':
        self.assertEqual(dynamic_metrics[key], static_metrics[key])

  def test_load_dataset(self):
    max_seq_length = 128
    batch_size = 2
    input_path_1 = os.path.join(self.get_temp_dir(), 'train_1.tf_record')
    _create_fake_dataset(
        input_path_1,
        seq_length=60,
        num_masked_tokens=20,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    input_path_2 = os.path.join(self.get_temp_dir(), 'train_2.tf_record')
    _create_fake_dataset(
        input_path_2,
        seq_length=100,
        num_masked_tokens=70,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    input_paths = ','.join([input_path_1, input_path_2])
    data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
        is_training=False,
        input_path=input_paths,
        seq_bucket_lengths=[64, 128],
        use_position_id=True,
        global_batch_size=batch_size)
    dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
        data_config).load()
    dataset_it = iter(dataset)
    features = next(dataset_it)
    self.assertCountEqual([
        'input_word_ids',
        'input_mask',
        'input_type_ids',
        'next_sentence_labels',
        'masked_lm_positions',
        'masked_lm_ids',
        'masked_lm_weights',
        'position_ids',
    ], features.keys())
    # Sequence length dimension should be bucketized and pad to 64.
    self.assertEqual(features['input_word_ids'].shape, (batch_size, 64))
    self.assertEqual(features['input_mask'].shape, (batch_size, 64))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, 64))
    self.assertEqual(features['position_ids'].shape, (batch_size, 64))
    self.assertEqual(features['masked_lm_positions'].shape, (batch_size, 20))
    features = next(dataset_it)
    self.assertEqual(features['input_word_ids'].shape, (batch_size, 128))
    self.assertEqual(features['input_mask'].shape, (batch_size, 128))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, 128))
    self.assertEqual(features['position_ids'].shape, (batch_size, 128))
    self.assertEqual(features['masked_lm_positions'].shape, (batch_size, 70))

  def test_load_dataset_not_same_masks(self):
    max_seq_length = 128
    batch_size = 2
    input_path_1 = os.path.join(self.get_temp_dir(), 'train_3.tf_record')
    _create_fake_dataset(
        input_path_1,
        seq_length=60,
        num_masked_tokens=20,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    input_path_2 = os.path.join(self.get_temp_dir(), 'train_4.tf_record')
    _create_fake_dataset(
        input_path_2,
        seq_length=60,
        num_masked_tokens=15,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    input_paths = ','.join([input_path_1, input_path_2])
    data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
        is_training=False,
        input_path=input_paths,
        seq_bucket_lengths=[64, 128],
        use_position_id=True,
        global_batch_size=batch_size * 2)
    dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
        data_config).load()
    dataset_it = iter(dataset)
    with self.assertRaisesRegex(
        tf.errors.InvalidArgumentError,
        '.*Number of non padded mask tokens.*'):
      next(dataset_it)


if __name__ == '__main__':
  tf.test.main()
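Aside (reviewer sketch, not part of the test file): one way to sanity-check the fake TFRecords written by `_create_fake_dataset` above is to parse the first record back with a matching feature spec. The helper name below is hypothetical:

import tensorflow as tf

def _peek_first_record(path, max_seq_length, num_masked_tokens):
  """Parses the first serialized example and prints its feature shapes."""
  raw = next(iter(tf.data.TFRecordDataset(path)))
  spec = {
      'input_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
      'input_mask': tf.io.FixedLenFeature([max_seq_length], tf.int64),
      'masked_lm_ids': tf.io.FixedLenFeature([num_masked_tokens], tf.int64),
      'masked_lm_weights': tf.io.FixedLenFeature([num_masked_tokens],
                                                 tf.float32),
  }
  example = tf.io.parse_single_example(raw, spec)
  for name, tensor in example.items():
    print(name, tensor.shape)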
official/nlp/data/question_answering_dataloader.py
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
@@ -12,20 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads dataset for the question answering (e.g, SQuAD) task."""
from typing import Mapping, Optional
import dataclasses
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import input_reader
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.data import data_loader
from official.nlp.data import data_loader_factory


@dataclasses.dataclass
class QADataConfig(cfg.DataConfig):
  """Data config for question answering task (tasks/question_answering)."""
  # For training, `input_path` is expected to be a pre-processed TFRecord file,
  # while for evaluation, it is expected to be a raw JSON file (b/173814590).
  input_path: str = ''
  global_batch_size: int = 48
  is_training: bool = True
...
@@ -36,19 +38,23 @@ class QADataConfig(cfg.DataConfig):
  input_preprocessed_data_path: str = ''
  doc_stride: int = 128
  query_length: int = 64
  # The path to the vocab file of word piece tokenizer or the
  # model of the sentence piece tokenizer.
  vocab_file: str = ''
  tokenization: str = 'WordPiece'  # WordPiece or SentencePiece
  do_lower_case: bool = True
  xlnet_format: bool = False


@data_loader_factory.register_data_loader_cls(QADataConfig)
class QuestionAnsweringDataLoader:
class QuestionAnsweringDataLoader(data_loader.DataLoader):
  """A class to load dataset for sentence prediction (classification) task."""

  def __init__(self, params):
    self._params = params
    self._seq_length = params.seq_length
    self._is_training = params.is_training
    self._xlnet_format = params.xlnet_format

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
...
@@ -57,6 +63,13 @@ class QuestionAnsweringDataLoader:
        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
    }
    if self._xlnet_format:
      name_to_features['class_index'] = tf.io.FixedLenFeature([], tf.int64)
      name_to_features['paragraph_mask'] = tf.io.FixedLenFeature(
          [self._seq_length], tf.int64)
      if self._is_training:
        name_to_features['is_impossible'] = tf.io.FixedLenFeature([], tf.int64)
    if self._is_training:
      name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64)
      name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64)
...
@@ -78,7 +91,7 @@ class QuestionAnsweringDataLoader:
    """Parses raw tensors into a dict of tensors to be consumed by the model."""
    x, y = {}, {}
    for name, tensor in record.items():
      if name in ('start_positions', 'end_positions'):
      if name in ('start_positions', 'end_positions', 'is_impossible'):
        y[name] = tensor
      elif name == 'input_ids':
        x['input_word_ids'] = tensor
...
@@ -86,6 +99,8 @@ class QuestionAnsweringDataLoader:
        x['input_type_ids'] = tensor
      else:
        x[name] = tensor
      if name == 'start_positions' and self._xlnet_format:
        x[name] = tensor
    return (x, y)

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
...
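Aside (hedged usage sketch, not part of the diff): with the new `xlnet_format` flag the decoder additionally expects `class_index`, `paragraph_mask` and, for training, `is_impossible` features in each tf.Example. The input path below is a placeholder:

from official.nlp.data import question_answering_dataloader

# xlnet_format=True switches on the extra XLNet-style features during decode.
config = question_answering_dataloader.QADataConfig(
    is_training=True,
    input_path='/tmp/squad_train.tf_record',  # placeholder path
    seq_length=384,
    global_batch_size=32,
    xlnet_format=True)
dataset = question_answering_dataloader.QuestionAnsweringDataLoader(
    config).load()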
official/nlp/data/question_answering_dataloader_test.py  0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.question_answering_dataloader."""
import os

import numpy as np
import tensorflow as tf

from official.nlp.data import question_answering_dataloader


def _create_fake_dataset(output_path, seq_length):
  """Creates a fake dataset."""
  writer = tf.io.TFRecordWriter(output_path)

  def create_int_feature(values):
    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return f

  for _ in range(100):
    features = {}
    input_ids = np.random.randint(100, size=(seq_length))
    features['input_ids'] = create_int_feature(input_ids)
    features['input_mask'] = create_int_feature(np.ones_like(input_ids))
    features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
    features['start_positions'] = create_int_feature(np.array([0]))
    features['end_positions'] = create_int_feature(np.array([10]))

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


class QuestionAnsweringDataTest(tf.test.TestCase):

  def test_load_dataset(self):
    seq_length = 128
    batch_size = 10
    input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    _create_fake_dataset(input_path, seq_length)
    data_config = question_answering_dataloader.QADataConfig(
        is_training=True,
        input_path=input_path,
        seq_length=seq_length,
        global_batch_size=batch_size)

    dataset = question_answering_dataloader.QuestionAnsweringDataLoader(
        data_config).load()
    features, labels = next(iter(dataset))

    self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'],
                          features.keys())
    self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))

    self.assertCountEqual(['start_positions', 'end_positions'], labels.keys())
    self.assertEqual(labels['start_positions'].shape, (batch_size,))
    self.assertEqual(labels['end_positions'].shape, (batch_size,))


if __name__ == '__main__':
  tf.test.main()
official/nlp/data/sentence_prediction_dataloader.py
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
@@ -12,16 +11,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads dataset for the sentence prediction (classification) task."""
from typing import Mapping, Optional
import functools
from typing import List, Mapping, Optional
import dataclasses
import tensorflow as tf
import tensorflow_hub as hub
from official.common import dataset_fn
from official.core import config_definitions as cfg
from official.core import input_reader
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp import modeling
from official.nlp.data import data_loader
from official.nlp.data import data_loader_factory

LABEL_TYPES_MAP = {'int': tf.int64, 'float': tf.float32}


@dataclasses.dataclass
class SentencePredictionDataConfig(cfg.DataConfig):
...
@@ -30,24 +37,32 @@ class SentencePredictionDataConfig(cfg.DataConfig):
  global_batch_size: int = 32
  is_training: bool = True
  seq_length: int = 128
  label_type: str = 'int'
  # Whether to include the example id number.
  include_example_id: bool = False


@data_loader_factory.register_data_loader_cls(SentencePredictionDataConfig)
class SentencePredictionDataLoader:
class SentencePredictionDataLoader(data_loader.DataLoader):
  """A class to load dataset for sentence prediction (classification) task."""

  def __init__(self, params):
    self._params = params
    self._seq_length = params.seq_length
    self._include_example_id = params.include_example_id

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
    label_type = LABEL_TYPES_MAP[self._params.label_type]
    name_to_features = {
        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'label_ids': tf.io.FixedLenFeature([], tf.int64),
        'label_ids': tf.io.FixedLenFeature([], label_type),
    }
    if self._include_example_id:
      name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)

    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
...
@@ -67,6 +82,9 @@ class SentencePredictionDataLoader:
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids']
    }
    if self._include_example_id:
      x['example_id'] = record['example_id']

    y = record['label_ids']
    return (x, y)
...
@@ -75,3 +93,147 @@ class SentencePredictionDataLoader:
    reader = input_reader.InputReader(
        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)


@dataclasses.dataclass
class SentencePredictionTextDataConfig(cfg.DataConfig):
  """Data config for sentence prediction task with raw text."""
  # Either set `input_path`...
  input_path: str = ''
  # Either `int` or `float`.
  label_type: str = 'int'
  # ...or `tfds_name` and `tfds_split` to specify input.
  tfds_name: str = ''
  tfds_split: str = ''
  # The name of the text feature fields. The text features will be
  # concatenated in order.
  text_fields: Optional[List[str]] = None
  label_field: str = 'label'
  global_batch_size: int = 32
  seq_length: int = 128
  is_training: bool = True
  # Either build preprocessing with Python code by specifying these values
  # for modeling.layers.BertTokenizer()/SentencepieceTokenizer()....
  tokenization: str = 'WordPiece'  # WordPiece or SentencePiece
  # Text vocab file if tokenization is WordPiece, or sentencepiece.ModelProto
  # file if tokenization is SentencePiece.
  vocab_file: str = ''
  lower_case: bool = True
  # ...or load preprocessing from a SavedModel at this location.
  preprocessing_hub_module_url: str = ''
  # Either tfrecord or sstsable or recordio.
  file_type: str = 'tfrecord'
  include_example_id: bool = False


class TextProcessor(tf.Module):
  """Text features processing for sentence prediction task."""

  def __init__(self,
               seq_length: int,
               vocab_file: Optional[str] = None,
               tokenization: Optional[str] = None,
               lower_case: Optional[bool] = True,
               preprocessing_hub_module_url: Optional[str] = None):
    if preprocessing_hub_module_url:
      self._preprocessing_hub_module = hub.load(preprocessing_hub_module_url)
      self._tokenizer = self._preprocessing_hub_module.tokenize
      self._pack_inputs = functools.partial(
          self._preprocessing_hub_module.bert_pack_inputs,
          seq_length=seq_length)
      return

    if tokenization == 'WordPiece':
      self._tokenizer = modeling.layers.BertTokenizer(
          vocab_file=vocab_file, lower_case=lower_case)
    elif tokenization == 'SentencePiece':
      self._tokenizer = modeling.layers.SentencepieceTokenizer(
          model_file_path=vocab_file,
          lower_case=lower_case,
          strip_diacritics=True)  # Strip diacritics to follow ALBERT model
    else:
      raise ValueError('Unsupported tokenization: %s' % tokenization)
    self._pack_inputs = modeling.layers.BertPackInputs(
        seq_length=seq_length,
        special_tokens_dict=self._tokenizer.get_special_tokens_dict())

  def __call__(self, segments):
    segments = [self._tokenizer(s) for s in segments]
    # BertTokenizer returns a RaggedTensor with shape [batch, word, subword],
    # and SentencepieceTokenizer returns a RaggedTensor with shape
    # [batch, sentencepiece],
    segments = [
        tf.cast(x.merge_dims(1, -1) if x.shape.rank > 2 else x, tf.int32)
        for x in segments
    ]
    return self._pack_inputs(segments)


@data_loader_factory.register_data_loader_cls(SentencePredictionTextDataConfig)
class SentencePredictionTextDataLoader(data_loader.DataLoader):
  """Loads dataset with raw text for sentence prediction task."""

  def __init__(self, params):
    if bool(params.tfds_name) != bool(params.tfds_split):
      raise ValueError('`tfds_name` and `tfds_split` should be specified or '
                       'unspecified at the same time.')
    if bool(params.tfds_name) == bool(params.input_path):
      raise ValueError('Must specify either `tfds_name` and `tfds_split` '
                       'or `input_path`.')
    if not params.text_fields:
      raise ValueError('Unexpected empty text fields.')
    if bool(params.vocab_file) == bool(params.preprocessing_hub_module_url):
      raise ValueError('Must specify exactly one of vocab_file (with matching '
                       'lower_case flag) or preprocessing_hub_module_url.')
    self._params = params
    self._text_fields = params.text_fields
    self._label_field = params.label_field
    self._label_type = params.label_type
    self._include_example_id = params.include_example_id
    self._text_processor = TextProcessor(
        seq_length=params.seq_length,
        vocab_file=params.vocab_file,
        tokenization=params.tokenization,
        lower_case=params.lower_case,
        preprocessing_hub_module_url=params.preprocessing_hub_module_url)

  def _bert_preprocess(self, record: Mapping[str, tf.Tensor]):
    """Berts preprocess."""
    segments = [record[x] for x in self._text_fields]
    model_inputs = self._text_processor(segments)
    if self._include_example_id:
      model_inputs['example_id'] = record['example_id']
    y = record[self._label_field]
    return model_inputs, y

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
    name_to_features = {}
    for text_field in self._text_fields:
      name_to_features[text_field] = tf.io.FixedLenFeature([], tf.string)
    label_type = LABEL_TYPES_MAP[self._label_type]
    name_to_features[self._label_field] = tf.io.FixedLenFeature([], label_type)
    if self._include_example_id:
      name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in example:
      t = example[name]
      if t.dtype == tf.int64:
        t = tf.cast(t, tf.int32)
      example[name] = t

    return example

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.dataset.Dataset."""
    reader = input_reader.InputReader(
        dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
        decoder_fn=self._decode if self._params.input_path else None,
        params=self._params,
        postprocess_fn=self._bert_preprocess)
    return reader.read(input_context)
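Aside (hedged sketch, not part of the diff): the new `TextProcessor` can also be exercised on its own, assuming a WordPiece vocab file exists at a placeholder path; the exact shapes of the inputs here are an assumption for illustration:

import tensorflow as tf
from official.nlp.data import sentence_prediction_dataloader as loader

processor = loader.TextProcessor(
    seq_length=128,
    vocab_file='/tmp/vocab.txt',  # placeholder vocab path
    tokenization='WordPiece',
    lower_case=True)

# Two text segments per example; the processor tokenizes and packs them into
# input_word_ids / input_mask / input_type_ids of length 128.
inputs = processor([tf.constant(['hello world']), tf.constant(['goodbye'])])
print(sorted(inputs.keys()))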
official/nlp/data/sentence_prediction_dataloader_test.py  0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.sentence_prediction_dataloader."""
import os

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from sentencepiece import SentencePieceTrainer
from official.nlp.data import sentence_prediction_dataloader as loader


def _create_fake_preprocessed_dataset(output_path, seq_length, label_type):
  """Creates a fake dataset."""
  writer = tf.io.TFRecordWriter(output_path)

  def create_int_feature(values):
    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return f

  def create_float_feature(values):
    f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
    return f

  for _ in range(100):
    features = {}
    input_ids = np.random.randint(100, size=(seq_length))
    features['input_ids'] = create_int_feature(input_ids)
    features['input_mask'] = create_int_feature(np.ones_like(input_ids))
    features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
    if label_type == 'int':
      features['label_ids'] = create_int_feature([1])
    elif label_type == 'float':
      features['label_ids'] = create_float_feature([0.5])
    else:
      raise ValueError('Unsupported label_type: %s' % label_type)

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


def _create_fake_raw_dataset(output_path, text_fields, label_type):
  """Creates a fake tf record file."""
  writer = tf.io.TFRecordWriter(output_path)

  def create_str_feature(value):
    f = tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
    return f

  def create_int_feature(values):
    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return f

  def create_float_feature(values):
    f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
    return f

  for _ in range(100):
    features = {}
    for text_field in text_fields:
      features[text_field] = create_str_feature([b'hello world'])

    if label_type == 'int':
      features['label'] = create_int_feature([0])
    elif label_type == 'float':
      features['label'] = create_float_feature([0.5])
    else:
      raise ValueError('Unexpected label_type: %s' % label_type)

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


def _create_fake_sentencepiece_model(output_dir):
  vocab = ['a', 'b', 'c', 'd', 'e', 'abc', 'def', 'ABC', 'DEF']
  model_prefix = os.path.join(output_dir, 'spm_model')
  input_text_file_path = os.path.join(output_dir, 'train_input.txt')
  with tf.io.gfile.GFile(input_text_file_path, 'w') as f:
    f.write(' '.join(vocab + ['\n']))
  # Add 7 more tokens: <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
  full_vocab_size = len(vocab) + 7
  flags = dict(
      model_prefix=model_prefix,
      model_type='word',
      input=input_text_file_path,
      pad_id=0,
      unk_id=1,
      control_symbols='[CLS],[SEP],[MASK]',
      vocab_size=full_vocab_size,
      bos_id=full_vocab_size - 2,
      eos_id=full_vocab_size - 1)
  SentencePieceTrainer.Train(' '.join(
      ['--{}={}'.format(k, v) for k, v in flags.items()]))
  return model_prefix + '.model'


def _create_fake_vocab_file(vocab_file_path):
  tokens = ['[PAD]']
  for i in range(1, 100):
    tokens.append('[unused%d]' % i)
  tokens.extend(['[UNK]', '[CLS]', '[SEP]', '[MASK]', 'hello', 'world'])
  with tf.io.gfile.GFile(vocab_file_path, 'w') as outfile:
    outfile.write('\n'.join(tokens))


class SentencePredictionDataTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(('int', tf.int32), ('float', tf.float32))
  def test_load_dataset(self, label_type, expected_label_type):
    input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    batch_size = 10
    seq_length = 128
    _create_fake_preprocessed_dataset(input_path, seq_length, label_type)
    data_config = loader.SentencePredictionDataConfig(
        input_path=input_path,
        seq_length=seq_length,
        global_batch_size=batch_size,
        label_type=label_type)
    dataset = loader.SentencePredictionDataLoader(data_config).load()
    features, labels = next(iter(dataset))
    self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'],
                          features.keys())
    self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
    self.assertEqual(labels.shape, (batch_size,))
    self.assertEqual(labels.dtype, expected_label_type)


class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
                                           parameterized.TestCase):

  @parameterized.parameters(True, False)
  def test_python_wordpiece_preprocessing(self, use_tfds):
    batch_size = 10
    seq_length = 256  # Non-default value.
    lower_case = True

    tf_record_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    text_fields = ['sentence1', 'sentence2']
    if not use_tfds:
      _create_fake_raw_dataset(tf_record_path, text_fields, label_type='int')

    vocab_file_path = os.path.join(self.get_temp_dir(), 'vocab.txt')
    _create_fake_vocab_file(vocab_file_path)
    data_config = loader.SentencePredictionTextDataConfig(
        input_path='' if use_tfds else tf_record_path,
        tfds_name='glue/mrpc' if use_tfds else '',
        tfds_split='train' if use_tfds else '',
        text_fields=text_fields,
        global_batch_size=batch_size,
        seq_length=seq_length,
        is_training=True,
        lower_case=lower_case,
        vocab_file=vocab_file_path)
    dataset = loader.SentencePredictionTextDataLoader(data_config).load()
    features, labels = next(iter(dataset))
    self.assertCountEqual(['input_word_ids', 'input_type_ids', 'input_mask'],
                          features.keys())
    self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
    self.assertEqual(labels.shape, (batch_size,))

  @parameterized.parameters(True, False)
  def test_python_sentencepiece_preprocessing(self, use_tfds):
    batch_size = 10
    seq_length = 256  # Non-default value.
    lower_case = True

    tf_record_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    text_fields = ['sentence1', 'sentence2']
    if not use_tfds:
      _create_fake_raw_dataset(tf_record_path, text_fields, label_type='int')

    sp_model_file_path = _create_fake_sentencepiece_model(self.get_temp_dir())
    data_config = loader.SentencePredictionTextDataConfig(
        input_path='' if use_tfds else tf_record_path,
        tfds_name='glue/mrpc' if use_tfds else '',
        tfds_split='train' if use_tfds else '',
        text_fields=text_fields,
        global_batch_size=batch_size,
        seq_length=seq_length,
        is_training=True,
        lower_case=lower_case,
        tokenization='SentencePiece',
        vocab_file=sp_model_file_path,
    )
    dataset = loader.SentencePredictionTextDataLoader(data_config).load()
    features, labels = next(iter(dataset))
    self.assertCountEqual(['input_word_ids', 'input_type_ids', 'input_mask'],
                          features.keys())
    self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
    self.assertEqual(labels.shape, (batch_size,))

  @parameterized.parameters(True, False)
  def test_saved_model_preprocessing(self, use_tfds):
    batch_size = 10
    seq_length = 256  # Non-default value.

    tf_record_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    text_fields = ['sentence1', 'sentence2']
    if not use_tfds:
      _create_fake_raw_dataset(tf_record_path, text_fields, label_type='float')

    vocab_file_path = os.path.join(self.get_temp_dir(), 'vocab.txt')
    _create_fake_vocab_file(vocab_file_path)
    data_config = loader.SentencePredictionTextDataConfig(
        input_path='' if use_tfds else tf_record_path,
        tfds_name='glue/mrpc' if use_tfds else '',
        tfds_split='train' if use_tfds else '',
        text_fields=text_fields,
        global_batch_size=batch_size,
        seq_length=seq_length,
        is_training=True,
        preprocessing_hub_module_url=(
            'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'),
        label_type='int' if use_tfds else 'float',
    )
    dataset = loader.SentencePredictionTextDataLoader(data_config).load()
    features, labels = next(iter(dataset))
    self.assertCountEqual(['input_word_ids', 'input_type_ids', 'input_mask'],
                          features.keys())
    self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
    self.assertEqual(labels.shape, (batch_size,))


if __name__ == '__main__':
  tf.test.main()
official/nlp/data/sentence_retrieval_lib.py
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BERT library to process data for cross lingual sentence retrieval task."""
import os
...
@@ -25,8 +25,7 @@ class BuccProcessor(classifier_data_lib.DataProcessor):
  """Procssor for Xtreme BUCC data set."""
  supported_languages = ["de", "fr", "ru", "zh"]

  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
    super(BuccProcessor, self).__init__(process_text_fn)
    self.languages = BuccProcessor.supported_languages
...
@@ -50,11 +49,11 @@ class BuccProcessor(classifier_data_lib.DataProcessor):
    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, i)
      int_iden = int(line[0].split("-")[1])
      example_id = int(line[0].split("-")[1])
      text_a = self.process_text_fn(line[1])
      examples.append(
          classifier_data_lib.InputExample(
              guid=guid, text_a=text_a, int_iden=int_iden))
              guid=guid, text_a=text_a, example_id=example_id))
    return examples
...
@@ -66,8 +65,7 @@ class TatoebaProcessor(classifier_data_lib.DataProcessor):
      "nl", "pt", "ru", "sw", "ta", "te", "th", "tl", "tr", "ur", "vi", "zh"
  ]

  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
    super(TatoebaProcessor, self).__init__(process_text_fn)
    self.languages = TatoebaProcessor.supported_languages
...
@@ -88,7 +86,7 @@ class TatoebaProcessor(classifier_data_lib.DataProcessor):
      text_a = self.process_text_fn(line[0])
      examples.append(
          classifier_data_lib.InputExample(
              guid=guid, text_a=text_a, int_iden=i))
              guid=guid, text_a=text_a, example_id=i))
    return examples
...
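Aside (not part of the diff): after this rename, retrieval examples carry `example_id` instead of `int_iden`. A minimal sketch with made-up values, using the constructor call exactly as it appears in the diff above:

from official.nlp.data import classifier_data_lib

# Hypothetical single example; guid and text are placeholders.
example = classifier_data_lib.InputExample(
    guid='test-0',
    text_a='Wie geht es dir?',
    example_id=0)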
official/nlp/data/squad_lib.py
# Copyright 201
9
The TensorFlow Authors. All Rights Reserved.
# Copyright 20
2
1 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -11,19 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library to process data for SQuAD 1.1 and SQuAD 2.0."""
"""Library to process data for SQuAD 1.1 and SQuAD 2.0."""
# pylint: disable=g-bad-import-order
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
import
copy
import
json
import
math
import
os
import
six
from
absl
import
logging
...
...
@@ -40,8 +36,8 @@ class SquadExample(object):
Attributes:
qas_id: ID of the question-answer pair.
question_text: Original text for the question.
doc_tokens: The list of tokens in the context obtained by splitting
on
whitespace only.
doc_tokens: The list of tokens in the context obtained by splitting
on
whitespace only.
orig_answer_text: Original text for the answer.
start_position: Starting index of the answer in `doc_tokens`.
end_position: Ending index of the answer in `doc_tokens`.
...
...
@@ -96,6 +92,8 @@ class InputFeatures(object):
input_ids
,
input_mask
,
segment_ids
,
paragraph_mask
=
None
,
class_index
=
None
,
start_position
=
None
,
end_position
=
None
,
is_impossible
=
None
):
...
...
@@ -111,6 +109,8 @@ class InputFeatures(object):
self
.
start_position
=
start_position
self
.
end_position
=
end_position
self
.
is_impossible
=
is_impossible
self
.
paragraph_mask
=
paragraph_mask
self
.
class_index
=
class_index
class
FeatureWriter
(
object
):
...
...
@@ -138,6 +138,11 @@ class FeatureWriter(object):
features
[
"input_mask"
]
=
create_int_feature
(
feature
.
input_mask
)
features
[
"segment_ids"
]
=
create_int_feature
(
feature
.
segment_ids
)
if
feature
.
paragraph_mask
is
not
None
:
features
[
"paragraph_mask"
]
=
create_int_feature
(
feature
.
paragraph_mask
)
if
feature
.
class_index
is
not
None
:
features
[
"class_index"
]
=
create_int_feature
([
feature
.
class_index
])
if
self
.
is_training
:
features
[
"start_positions"
]
=
create_int_feature
([
feature
.
start_position
])
features
[
"end_positions"
]
=
create_int_feature
([
feature
.
end_position
])
...
...
@@ -153,11 +158,20 @@ class FeatureWriter(object):
self
.
_writer
.
close
()
def
read_squad_examples
(
input_file
,
is_training
,
version_2_with_negative
):
def
read_squad_examples
(
input_file
,
is_training
,
version_2_with_negative
,
translated_input_folder
=
None
):
"""Read a SQuAD json file into a list of SquadExample."""
with
tf
.
io
.
gfile
.
GFile
(
input_file
,
"r"
)
as
reader
:
input_data
=
json
.
load
(
reader
)[
"data"
]
if
translated_input_folder
is
not
None
:
translated_files
=
tf
.
io
.
gfile
.
glob
(
os
.
path
.
join
(
translated_input_folder
,
"*.json"
))
for
file
in
translated_files
:
with
tf
.
io
.
gfile
.
GFile
(
file
,
"r"
)
as
reader
:
input_data
.
extend
(
json
.
load
(
reader
)[
"data"
])
def
is_whitespace
(
c
):
if
c
==
" "
or
c
==
"
\t
"
or
c
==
"
\r
"
or
c
==
"
\n
"
or
ord
(
c
)
==
0x202F
:
return
True
...
...
@@ -209,8 +223,8 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text
=
" "
.
join
(
doc_tokens
[
start_position
:(
end_position
+
1
)])
actual_text
=
" "
.
join
(
doc_tokens
[
start_position
:(
end_position
+
1
)])
cleaned_answer_text
=
" "
.
join
(
tokenization
.
whitespace_tokenize
(
orig_answer_text
))
if
actual_text
.
find
(
cleaned_answer_text
)
==
-
1
:
...
...
@@ -242,6 +256,7 @@ def convert_examples_to_features(examples,
max_query_length
,
is_training
,
output_fn
,
xlnet_format
=
False
,
batch_size
=
None
):
"""Loads a data file into a list of `InputBatch`s."""
...
...
@@ -303,25 +318,54 @@ def convert_examples_to_features(examples,
token_to_orig_map
=
{}
token_is_max_context
=
{}
segment_ids
=
[]
tokens
.
append
(
"[CLS]"
)
segment_ids
.
append
(
0
)
for
token
in
query_tokens
:
tokens
.
append
(
token
)
segment_ids
.
append
(
0
)
tokens
.
append
(
"[SEP]"
)
segment_ids
.
append
(
0
)
for
i
in
range
(
doc_span
.
length
):
split_token_index
=
doc_span
.
start
+
i
token_to_orig_map
[
len
(
tokens
)]
=
tok_to_orig_index
[
split_token_index
]
is_max_context
=
_check_is_max_context
(
doc_spans
,
doc_span_index
,
split_token_index
)
token_is_max_context
[
len
(
tokens
)]
=
is_max_context
tokens
.
append
(
all_doc_tokens
[
split_token_index
])
segment_ids
.
append
(
1
)
tokens
.
append
(
"[SEP]"
)
segment_ids
.
append
(
1
)
# Paragraph mask used in XLNet.
# 1 represents paragraph and class tokens.
# 0 represents query and other special tokens.
paragraph_mask
=
[]
# pylint: disable=cell-var-from-loop
def
process_query
(
seg_q
):
for
token
in
query_tokens
:
tokens
.
append
(
token
)
segment_ids
.
append
(
seg_q
)
paragraph_mask
.
append
(
0
)
tokens
.
append
(
"[SEP]"
)
segment_ids
.
append
(
seg_q
)
paragraph_mask
.
append
(
0
)
def
process_paragraph
(
seg_p
):
for
i
in
range
(
doc_span
.
length
):
split_token_index
=
doc_span
.
start
+
i
token_to_orig_map
[
len
(
tokens
)]
=
tok_to_orig_index
[
split_token_index
]
is_max_context
=
_check_is_max_context
(
doc_spans
,
doc_span_index
,
split_token_index
)
token_is_max_context
[
len
(
tokens
)]
=
is_max_context
tokens
.
append
(
all_doc_tokens
[
split_token_index
])
segment_ids
.
append
(
seg_p
)
paragraph_mask
.
append
(
1
)
tokens
.
append
(
"[SEP]"
)
segment_ids
.
append
(
seg_p
)
paragraph_mask
.
append
(
0
)
def
process_class
(
seg_class
):
class_index
=
len
(
segment_ids
)
tokens
.
append
(
"[CLS]"
)
segment_ids
.
append
(
seg_class
)
paragraph_mask
.
append
(
1
)
return
class_index
if
xlnet_format
:
seg_p
,
seg_q
,
seg_class
,
seg_pad
=
0
,
1
,
2
,
3
process_paragraph
(
seg_p
)
process_query
(
seg_q
)
class_index
=
process_class
(
seg_class
)
else
:
seg_p
,
seg_q
,
seg_class
,
seg_pad
=
1
,
0
,
0
,
0
class_index
=
process_class
(
seg_class
)
process_query
(
seg_q
)
process_paragraph
(
seg_p
)
input_ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
...
...
@@ -333,35 +377,30 @@ def convert_examples_to_features(examples,
while
len
(
input_ids
)
<
max_seq_length
:
input_ids
.
append
(
0
)
input_mask
.
append
(
0
)
segment_ids
.
append
(
0
)
segment_ids
.
append
(
seg_pad
)
paragraph_mask
.
append
(
0
)
assert
len
(
input_ids
)
==
max_seq_length
assert
len
(
input_mask
)
==
max_seq_length
assert
len
(
segment_ids
)
==
max_seq_length
assert
len
(
paragraph_mask
)
==
max_seq_length
start_position
=
0
end_position
=
0
span_contains_answer
=
False
start_position
=
None
end_position
=
None
if
is_training
and
not
example
.
is_impossible
:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start
=
doc_span
.
start
doc_end
=
doc_span
.
start
+
doc_span
.
length
-
1
out_of_span
=
False
if
not
(
tok_start_position
>=
doc_start
and
tok_end_position
<=
doc_end
):
out_of_span
=
True
if
out_of_span
:
start_position
=
0
end_position
=
0
else
:
doc_offset
=
len
(
query_tokens
)
+
2
span_contains_answer
=
(
tok_start_position
>=
doc_start
and
tok_end_position
<=
doc_end
)
if
span_contains_answer
:
doc_offset
=
0
if
xlnet_format
else
len
(
query_tokens
)
+
2
start_position
=
tok_start_position
-
doc_start
+
doc_offset
end_position
=
tok_end_position
-
doc_start
+
doc_offset
if
is_training
and
example
.
is_impossible
:
start_position
=
0
end_position
=
0
if
example_index
<
20
:
logging
.
info
(
"*** Example ***"
)
logging
.
info
(
"unique_id: %s"
,
(
unique_id
))
...
...
@@ -381,19 +420,25 @@ def convert_examples_to_features(examples,
logging
.
info
(
"input_ids: %s"
,
" "
.
join
([
str
(
x
)
for
x
in
input_ids
]))
logging
.
info
(
"input_mask: %s"
,
" "
.
join
([
str
(
x
)
for
x
in
input_mask
]))
logging
.
info
(
"segment_ids: %s"
,
" "
.
join
([
str
(
x
)
for
x
in
segment_ids
]))
if
is_training
and
example
.
is_impossible
:
logging
.
info
(
"impossible example"
)
if
is_training
and
not
example
.
is_impossible
:
answer_text
=
" "
.
join
(
tokens
[
start_position
:(
end_position
+
1
)])
logging
.
info
(
"start_position: %d"
,
(
start_position
))
logging
.
info
(
"end_position: %d"
,
(
end_position
))
logging
.
info
(
"answer: %s"
,
tokenization
.
printable_text
(
answer_text
))
logging
.
info
(
"paragraph_mask: %s"
,
" "
.
join
(
[
str
(
x
)
for
x
in
paragraph_mask
]))
logging
.
info
(
"class_index: %d"
,
class_index
)
if
is_training
:
if
span_contains_answer
:
answer_text
=
" "
.
join
(
tokens
[
start_position
:(
end_position
+
1
)])
logging
.
info
(
"start_position: %d"
,
(
start_position
))
logging
.
info
(
"end_position: %d"
,
(
end_position
))
logging
.
info
(
"answer: %s"
,
tokenization
.
printable_text
(
answer_text
))
else
:
logging
.
info
(
"document span doesn't contain answer"
)
feature
=
InputFeatures
(
unique_id
=
unique_id
,
example_index
=
example_index
,
doc_span_index
=
doc_span_index
,
tokens
=
tokens
,
paragraph_mask
=
paragraph_mask
,
class_index
=
class_index
,
token_to_orig_map
=
token_to_orig_map
,
token_is_max_context
=
token_is_max_context
,
input_ids
=
input_ids
,
...
...
@@ -401,7 +446,7 @@ def convert_examples_to_features(examples,
segment_ids
=
segment_ids
,
start_position
=
start_position
,
end_position
=
end_position
,
is_impossible
=
example
.
is_impossible
)
is_impossible
=
not
span_contains_answer
)
# Run callback
if
is_training
:
...
...
@@ -520,15 +565,16 @@ def write_predictions(all_examples,
logging
.
info
(
"Writing nbest to: %s"
,
(
output_nbest_file
))
all_predictions
,
all_nbest_json
,
scores_diff_json
=
(
postprocess_output
(
all_examples
=
all_examples
,
all_features
=
all_features
,
all_results
=
all_results
,
n_best_size
=
n_best_size
,
max_answer_length
=
max_answer_length
,
do_lower_case
=
do_lower_case
,
version_2_with_negative
=
version_2_with_negative
,
null_score_diff_threshold
=
null_score_diff_threshold
,
verbose
=
verbose
))
postprocess_output
(
all_examples
=
all_examples
,
all_features
=
all_features
,
all_results
=
all_results
,
n_best_size
=
n_best_size
,
max_answer_length
=
max_answer_length
,
do_lower_case
=
do_lower_case
,
version_2_with_negative
=
version_2_with_negative
,
null_score_diff_threshold
=
null_score_diff_threshold
,
verbose
=
verbose
))
write_to_json_files
(
all_predictions
,
output_prediction_file
)
write_to_json_files
(
all_nbest_json
,
output_nbest_file
)
...
...
@@ -544,6 +590,7 @@ def postprocess_output(all_examples,
do_lower_case
,
version_2_with_negative
=
False
,
null_score_diff_threshold
=
0.0
,
xlnet_format
=
False
,
verbose
=
False
):
"""Postprocess model output, to form predicton results."""
...
...
@@ -572,46 +619,54 @@ def postprocess_output(all_examples,
null_start_logit
=
0
# the start logit at the slice with min null score
null_end_logit
=
0
# the end logit at the slice with min null score
for
(
feature_index
,
feature
)
in
enumerate
(
features
):
if
feature
.
unique_id
not
in
unique_id_to_result
:
logging
.
info
(
"Skip eval example %s, not in pred."
,
feature
.
unique_id
)
continue
result
=
unique_id_to_result
[
feature
.
unique_id
]
start_indexes
=
_get_best_indexes
(
result
.
start_logits
,
n_best_size
)
end_indexes
=
_get_best_indexes
(
result
.
end_logits
,
n_best_size
)
# if we could have irrelevant answers, get the min score of irrelevant
if
version_2_with_negative
:
feature_null_score
=
result
.
start_logits
[
0
]
+
result
.
end_logits
[
0
]
if
xlnet_format
:
feature_null_score
=
result
.
class_logits
else
:
feature_null_score
=
result
.
start_logits
[
0
]
+
result
.
end_logits
[
0
]
if
feature_null_score
<
score_null
:
score_null
=
feature_null_score
min_null_feature_index
=
feature_index
null_start_logit
=
result
.
start_logits
[
0
]
null_end_logit
=
result
.
end_logits
[
0
]
for
start_index
in
start_indexes
:
for
end_index
in
end_indexes
:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if
start_index
>=
len
(
feature
.
tokens
):
continue
if
end_index
>=
len
(
feature
.
tokens
):
continue
if
start_index
not
in
feature
.
token_to_orig_map
:
continue
if
end_index
not
in
feature
.
token_to_orig_map
:
continue
if
not
feature
.
token_is_max_context
.
get
(
start_index
,
False
):
continue
if
end_index
<
start_index
:
continue
length
=
end_index
-
start_index
+
1
if
length
>
max_answer_length
:
continue
prelim_predictions
.
append
(
_PrelimPrediction
(
feature_index
=
feature_index
,
start_index
=
start_index
,
end_index
=
end_index
,
start_logit
=
result
.
start_logits
[
start_index
],
end_logit
=
result
.
end_logits
[
end_index
]))
if
version_2_with_negative
:
for
(
start_index
,
start_logit
,
end_index
,
end_logit
)
in
_get_best_indexes_and_logits
(
result
=
result
,
n_best_size
=
n_best_size
,
xlnet_format
=
xlnet_format
):
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if
start_index
>=
len
(
feature
.
tokens
):
continue
if
end_index
>=
len
(
feature
.
tokens
):
continue
if
start_index
not
in
feature
.
token_to_orig_map
:
continue
if
end_index
not
in
feature
.
token_to_orig_map
:
continue
if
not
feature
.
token_is_max_context
.
get
(
start_index
,
False
):
continue
if
end_index
<
start_index
:
continue
length
=
end_index
-
start_index
+
1
if
length
>
max_answer_length
:
continue
prelim_predictions
.
append
(
_PrelimPrediction
(
feature_index
=
feature_index
,
start_index
=
start_index
,
end_index
=
end_index
,
start_logit
=
start_logit
,
end_logit
=
end_logit
))
if
version_2_with_negative
and
not
xlnet_format
:
prelim_predictions
.
append
(
_PrelimPrediction
(
feature_index
=
min_null_feature_index
,
...
...
@@ -633,7 +688,7 @@ def postprocess_output(all_examples,
if
len
(
nbest
)
>=
n_best_size
:
break
feature
=
features
[
pred
.
feature_index
]
if
pred
.
start_index
>
0
:
# this is a non-null prediction
if
pred
.
start_index
>
0
or
xlnet_format
:
# this is a non-null prediction
tok_tokens
=
feature
.
tokens
[
pred
.
start_index
:(
pred
.
end_index
+
1
)]
orig_doc_start
=
feature
.
token_to_orig_map
[
pred
.
start_index
]
orig_doc_end
=
feature
.
token_to_orig_map
[
pred
.
end_index
]
...
...
@@ -666,7 +721,7 @@ def postprocess_output(all_examples,
end_logit
=
pred
.
end_logit
))
# if we didn't inlude the empty option in the n-best, inlcude it
if
version_2_with_negative
:
if
version_2_with_negative
and
not
xlnet_format
:
if
""
not
in
seen_predictions
:
nbest
.
append
(
_NbestPrediction
(
...
...
@@ -707,13 +762,18 @@ def postprocess_output(all_examples,
# pytype: disable=attribute-error
# predict "" iff the null score - the score of best non-null > threshold
if
best_non_null_entry
is
not
None
:
score_diff
=
score_null
-
best_non_null_entry
.
start_logit
-
(
best_non_null_entry
.
end_logit
)
scores_diff_json
[
example
.
qas_id
]
=
score_diff
if
score_diff
>
null_score_diff_threshold
:
all_predictions
[
example
.
qas_id
]
=
""
else
:
if
xlnet_format
:
score_diff
=
score_null
scores_diff_json
[
example
.
qas_id
]
=
score_diff
all_predictions
[
example
.
qas_id
]
=
best_non_null_entry
.
text
else
:
score_diff
=
score_null
-
best_non_null_entry
.
start_logit
-
(
best_non_null_entry
.
end_logit
)
scores_diff_json
[
example
.
qas_id
]
=
score_diff
if
score_diff
>
null_score_diff_threshold
:
all_predictions
[
example
.
qas_id
]
=
""
else
:
all_predictions
[
example
.
qas_id
]
=
best_non_null_entry
.
text
else
:
logging
.
warning
(
"best_non_null_entry is None"
)
scores_diff_json
[
example
.
qas_id
]
=
score_null
...
...
@@ -825,16 +885,29 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose=False):
return
output_text
def
_get_best_indexes
(
logits
,
n_best_size
):
"""Get the n-best logits from a list."""
index_and_score
=
sorted
(
enumerate
(
logits
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
best_indexes
=
[]
for
i
in
range
(
len
(
index_and_score
)):
# pylint: disable=consider-using-enumerate
if
i
>=
n_best_size
:
break
best_indexes
.
append
(
index_and_score
[
i
][
0
])
return
best_indexes
def
_get_best_indexes_and_logits
(
result
,
n_best_size
,
xlnet_format
=
False
):
"""Generates the n-best indexes and logits from a list."""
if
xlnet_format
:
for
i
in
range
(
n_best_size
):
for
j
in
range
(
n_best_size
):
j_index
=
i
*
n_best_size
+
j
yield
(
result
.
start_indexes
[
i
],
result
.
start_logits
[
i
],
result
.
end_indexes
[
j_index
],
result
.
end_logits
[
j_index
])
else
:
start_index_and_score
=
sorted
(
enumerate
(
result
.
start_logits
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
end_index_and_score
=
sorted
(
enumerate
(
result
.
end_logits
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
for
i
in
range
(
len
(
start_index_and_score
)):
if
i
>=
n_best_size
:
break
for
j
in
range
(
len
(
end_index_and_score
)):
if
j
>=
n_best_size
:
break
yield
(
start_index_and_score
[
i
][
0
],
start_index_and_score
[
i
][
1
],
end_index_and_score
[
j
][
0
],
end_index_and_score
[
j
][
1
])
def
_compute_softmax
(
scores
):
...
...
@@ -863,16 +936,19 @@ def _compute_softmax(scores):

def generate_tf_record_from_json_file(input_file_path,
                                      vocab_file_path,
                                      output_path,
+                                     translated_input_folder=None,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
-                                     version_2_with_negative=False):
+                                     version_2_with_negative=False,
+                                     xlnet_format=False):
  """Generates and saves training data into a tf record file."""
  train_examples = read_squad_examples(
      input_file=input_file_path,
      is_training=True,
-     version_2_with_negative=version_2_with_negative)
+     version_2_with_negative=version_2_with_negative,
+     translated_input_folder=translated_input_folder)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file_path, do_lower_case=do_lower_case)
  train_writer = FeatureWriter(filename=output_path, is_training=True)
...
@@ -883,7 +959,8 @@ def generate_tf_record_from_json_file(input_file_path,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=True,
-     output_fn=train_writer.process_feature)
+     output_fn=train_writer.process_feature,
+     xlnet_format=xlnet_format)
  train_writer.close()

  meta_data = {
...
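A hypothetical invocation of the updated WordPiece helper; all paths are placeholders, and the assignment assumes the function returns the meta_data dict it builds at the end of the hunk:

# Hypothetical call; every path below is a placeholder.
from official.nlp.data import squad_lib

input_meta_data = squad_lib.generate_tf_record_from_json_file(
    input_file_path="/tmp/train-v2.0.json",
    vocab_file_path="/tmp/vocab.txt",
    output_path="/tmp/squad_train.tf_record",
    translated_input_folder=None,   # or a folder of extra "*.json" SQuAD files
    max_seq_length=384,
    version_2_with_negative=True,
    xlnet_format=False)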
official/nlp/data/squad_lib_sp.py
View file @ f16a7b5b
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -11,22 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-# ==============================================================================
"""Run ALBERT on SQuAD 1.1 and SQuAD 2.0 using sentence piece tokenization.

The file is forked from:
https://github.com/google-research/ALBERT/blob/master/run_squad_sp.py
"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function

import collections
import copy
import json
import math
import os
from absl import logging
import numpy as np
import tensorflow as tf
...
@@ -89,6 +86,8 @@ class InputFeatures(object):
               input_mask,
               segment_ids,
               paragraph_len,
+              class_index=None,
+              paragraph_mask=None,
               start_position=None,
               end_position=None,
               is_impossible=None):
...
@@ -101,19 +100,31 @@ class InputFeatures(object):
    self.tokens = tokens
    self.input_ids = input_ids
    self.input_mask = input_mask
+   self.paragraph_mask = paragraph_mask
    self.segment_ids = segment_ids
    self.paragraph_len = paragraph_len
+   self.class_index = class_index
    self.start_position = start_position
    self.end_position = end_position
    self.is_impossible = is_impossible
-def read_squad_examples(input_file, is_training, version_2_with_negative):
+def read_squad_examples(input_file,
+                        is_training,
+                        version_2_with_negative,
+                        translated_input_folder=None):
  """Read a SQuAD json file into a list of SquadExample."""
  del version_2_with_negative
  with tf.io.gfile.GFile(input_file, "r") as reader:
    input_data = json.load(reader)["data"]

+ if translated_input_folder is not None:
+   translated_files = tf.io.gfile.glob(
+       os.path.join(translated_input_folder, "*.json"))
+   for file in translated_files:
+     with tf.io.gfile.GFile(file, "r") as reader:
+       input_data.extend(json.load(reader)["data"])

  examples = []
  for entry in input_data:
    for paragraph in entry["paragraphs"]:
...
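A short sketch of how the new argument is used: translated_input_folder simply globs "*.json" files and extends the main "data" list, so translated copies of SQuAD can be mixed into training. The paths below are placeholders:

# Sketch, assuming the module is importable as official.nlp.data.squad_lib_sp.
from official.nlp.data import squad_lib_sp

examples = squad_lib_sp.read_squad_examples(
    input_file="/tmp/train-v1.1.json",            # placeholder path
    is_training=True,
    version_2_with_negative=False,
    translated_input_folder="/tmp/translated/")   # placeholder folder of *.json files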
@@ -197,6 +208,7 @@ def convert_examples_to_features(examples,
                                 is_training,
                                 output_fn,
                                 do_lower_case,
+                                xlnet_format=False,
                                 batch_size=None):
  """Loads a data file into a list of `InputBatch`s."""
  cnt_pos, cnt_neg = 0, 0
...
@@ -246,6 +258,7 @@ def convert_examples_to_features(examples,
    f = np.zeros((max_n, max_m), dtype=np.float32)
    g = {}

+   # pylint: disable=cell-var-from-loop
    def _lcs_match(max_dist, n=n, m=m):
      """Longest-common-substring algorithm."""
...
@@ -277,6 +290,7 @@ def convert_examples_to_features(examples,
                remove_space=False) == tok_cat_text[j] and f_prev + 1 > f[i, j]):
            g[(i, j)] = 2
            f[i, j] = f_prev + 1

+   # pylint: enable=cell-var-from-loop
    max_dist = abs(n - m) + 5
...
@@ -354,6 +368,7 @@ def convert_examples_to_features(examples,
        "DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
      length = len(all_doc_tokens) - start_offset
      if length > max_tokens_for_doc:
...
@@ -368,34 +383,62 @@ def convert_examples_to_features(examples,
      token_is_max_context = {}
      segment_ids = []
+     # Paragraph mask used in XLNet.
+     # 1 represents paragraph and class tokens.
+     # 0 represents query and other special tokens.
+     paragraph_mask = []
      cur_tok_start_to_orig_index = []
      cur_tok_end_to_orig_index = []

-     tokens.append(tokenizer.sp_model.PieceToId("[CLS]"))
-     segment_ids.append(0)
-     for token in query_tokens:
-       tokens.append(token)
-       segment_ids.append(0)
-     tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
-     segment_ids.append(0)
-
-     for i in range(doc_span.length):
-       split_token_index = doc_span.start + i
-       cur_tok_start_to_orig_index.append(
-           tok_start_to_orig_index[split_token_index])
-       cur_tok_end_to_orig_index.append(
-           tok_end_to_orig_index[split_token_index])
-       is_max_context = _check_is_max_context(doc_spans, doc_span_index,
-                                              split_token_index)
-       token_is_max_context[len(tokens)] = is_max_context
-       tokens.append(all_doc_tokens[split_token_index])
-       segment_ids.append(1)
-     tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
-     segment_ids.append(1)
-     paragraph_len = len(tokens)
+     # pylint: disable=cell-var-from-loop
+     def process_query(seg_q):
+       for token in query_tokens:
+         tokens.append(token)
+         segment_ids.append(seg_q)
+         paragraph_mask.append(0)
+       tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
+       segment_ids.append(seg_q)
+       paragraph_mask.append(0)
+
+     def process_paragraph(seg_p):
+       for i in range(doc_span.length):
+         split_token_index = doc_span.start + i
+         cur_tok_start_to_orig_index.append(
+             tok_start_to_orig_index[split_token_index])
+         cur_tok_end_to_orig_index.append(
+             tok_end_to_orig_index[split_token_index])
+         is_max_context = _check_is_max_context(doc_spans, doc_span_index,
+                                                split_token_index)
+         token_is_max_context[len(tokens)] = is_max_context
+         tokens.append(all_doc_tokens[split_token_index])
+         segment_ids.append(seg_p)
+         paragraph_mask.append(1)
+       tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
+       segment_ids.append(seg_p)
+       paragraph_mask.append(0)
+       return len(tokens)
+
+     def process_class(seg_class):
+       class_index = len(segment_ids)
+       tokens.append(tokenizer.sp_model.PieceToId("[CLS]"))
+       segment_ids.append(seg_class)
+       paragraph_mask.append(1)
+       return class_index
+
+     if xlnet_format:
+       seg_p, seg_q, seg_class, seg_pad = 0, 1, 2, 3
+       paragraph_len = process_paragraph(seg_p)
+       process_query(seg_q)
+       class_index = process_class(seg_class)
+     else:
+       seg_p, seg_q, seg_class, seg_pad = 1, 0, 0, 0
+       class_index = process_class(seg_class)
+       process_query(seg_q)
+       paragraph_len = process_paragraph(seg_p)

      input_ids = tokens

      # The mask has 1 for real tokens and 0 for padding tokens. Only real
...
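For orientation, the two branches above produce the following token orders and segment-id assignments (an illustrative summary of the diff, not executed code paths):

# BERT/ALBERT ordering: [CLS] query [SEP] paragraph [SEP]; XLNet: paragraph [SEP] query [SEP] [CLS].
bert_segments  = {"seg_p": 1, "seg_q": 0, "seg_class": 0, "seg_pad": 0}
xlnet_segments = {"seg_p": 0, "seg_q": 1, "seg_class": 2, "seg_pad": 3}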
@@ -406,11 +449,13 @@ def convert_examples_to_features(examples,
      while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
-       segment_ids.append(0)
+       segment_ids.append(seg_pad)
+       paragraph_mask.append(0)

      assert len(input_ids) == max_seq_length
      assert len(input_mask) == max_seq_length
      assert len(segment_ids) == max_seq_length
+     assert len(paragraph_mask) == max_seq_length

      span_is_impossible = example.is_impossible
      start_position = None
...
@@ -430,13 +475,13 @@ def convert_examples_to_features(examples,
          end_position = 0
          span_is_impossible = True
        else:
-         doc_offset = len(query_tokens) + 2
+         doc_offset = 0 if xlnet_format else len(query_tokens) + 2
          start_position = tok_start_position - doc_start + doc_offset
          end_position = tok_end_position - doc_start + doc_offset

      if is_training and span_is_impossible:
-       start_position = 0
-       end_position = 0
+       start_position = class_index
+       end_position = class_index

      if example_index < 20:
        logging.info("*** Example ***")
...
@@ -456,6 +501,9 @@ def convert_examples_to_features(examples,
        logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
        logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
+       logging.info("paragraph_mask: %s", " ".join(
+           [str(x) for x in paragraph_mask]))
+       logging.info("class_index: %d", class_index)
        if is_training and span_is_impossible:
          logging.info("impossible example span")
...
@@ -489,8 +537,10 @@ def convert_examples_to_features(examples,
          tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens],
          input_ids=input_ids,
          input_mask=input_mask,
+         paragraph_mask=paragraph_mask,
          segment_ids=segment_ids,
          paragraph_len=paragraph_len,
+         class_index=class_index,
          start_position=start_position,
          end_position=end_position,
          is_impossible=span_is_impossible)
...
@@ -580,15 +630,16 @@ def write_predictions(all_examples,
  logging.info("Writing nbest to: %s", (output_nbest_file))

  all_predictions, all_nbest_json, scores_diff_json = (
      postprocess_output(
          all_examples=all_examples,
          all_features=all_features,
          all_results=all_results,
          n_best_size=n_best_size,
          max_answer_length=max_answer_length,
          do_lower_case=do_lower_case,
          version_2_with_negative=version_2_with_negative,
          null_score_diff_threshold=null_score_diff_threshold,
          verbose=verbose))

  write_to_json_files(all_predictions, output_prediction_file)
  write_to_json_files(all_nbest_json, output_nbest_file)
...
@@ -604,11 +655,11 @@ def postprocess_output(all_examples,
                       do_lower_case,
                       version_2_with_negative=False,
                       null_score_diff_threshold=0.0,
+                      xlnet_format=False,
                       verbose=False):
  """Postprocess model output, to form predicton results."""
  del do_lower_case, verbose
  example_index_to_features = collections.defaultdict(list)
  for feature in all_features:
    example_index_to_features[feature.example_index].append(feature)
...
@@ -635,47 +686,53 @@ def postprocess_output(all_examples,
    null_start_logit = 0  # the start logit at the slice with min null score
    null_end_logit = 0  # the end logit at the slice with min null score
    for (feature_index, feature) in enumerate(features):
      if feature.unique_id not in unique_id_to_result:
        logging.info("Skip eval example %s, not in pred.", feature.unique_id)
        continue
      result = unique_id_to_result[feature.unique_id]
-     start_indexes = _get_best_indexes(result.start_logits, n_best_size)
-     end_indexes = _get_best_indexes(result.end_logits, n_best_size)

      # if we could have irrelevant answers, get the min score of irrelevant
      if version_2_with_negative:
-       feature_null_score = result.start_logits[0] + result.end_logits[0]
+       if xlnet_format:
+         feature_null_score = result.class_logits
+       else:
+         feature_null_score = result.start_logits[0] + result.end_logits[0]
        if feature_null_score < score_null:
          score_null = feature_null_score
          min_null_feature_index = feature_index
          null_start_logit = result.start_logits[0]
          null_end_logit = result.end_logits[0]

-     for start_index in start_indexes:
-       for end_index in end_indexes:
-         doc_offset = feature.tokens.index("[SEP]") + 1
-         # We could hypothetically create invalid predictions, e.g., predict
-         # that the start of the span is in the question. We throw out all
-         # invalid predictions.
-         if start_index - doc_offset >= len(feature.tok_start_to_orig_index):
-           continue
-         if end_index - doc_offset >= len(feature.tok_end_to_orig_index):
-           continue
-         # if start_index not in feature.tok_start_to_orig_index:
-         #   continue
-         # if end_index not in feature.tok_end_to_orig_index:
-         #   continue
-         if not feature.token_is_max_context.get(start_index, False):
-           continue
-         if end_index < start_index:
-           continue
-         length = end_index - start_index + 1
-         if length > max_answer_length:
-           continue
-         prelim_predictions.append(
-             _PrelimPrediction(
-                 feature_index=feature_index,
-                 start_index=start_index - doc_offset,
-                 end_index=end_index - doc_offset,
-                 start_logit=result.start_logits[start_index],
-                 end_logit=result.end_logits[end_index]))
-     if version_2_with_negative:
+     doc_offset = 0 if xlnet_format else feature.tokens.index("[SEP]") + 1
+     for (start_index, start_logit,
+          end_index, end_logit) in _get_best_indexes_and_logits(
+              result=result, n_best_size=n_best_size, xlnet_format=xlnet_format):
+       # We could hypothetically create invalid predictions, e.g., predict
+       # that the start of the span is in the question. We throw out all
+       # invalid predictions.
+       if start_index - doc_offset >= len(feature.tok_start_to_orig_index):
+         continue
+       if end_index - doc_offset >= len(feature.tok_end_to_orig_index):
+         continue
+       if not feature.token_is_max_context.get(start_index, False):
+         continue
+       if end_index < start_index:
+         continue
+       length = end_index - start_index + 1
+       if length > max_answer_length:
+         continue
+       prelim_predictions.append(
+           _PrelimPrediction(
+               feature_index=feature_index,
+               start_index=start_index - doc_offset,
+               end_index=end_index - doc_offset,
+               start_logit=start_logit,
+               end_logit=end_logit))
+
+     if version_2_with_negative and not xlnet_format:
        prelim_predictions.append(
            _PrelimPrediction(
                feature_index=min_null_feature_index,
...
@@ -697,7 +754,7 @@ def postprocess_output(all_examples,
      if len(nbest) >= n_best_size:
        break
      feature = features[pred.feature_index]
-     if pred.start_index >= 0:  # this is a non-null prediction
+     if pred.start_index >= 0 or xlnet_format:  # this is a non-null prediction
        tok_start_to_orig_index = feature.tok_start_to_orig_index
        tok_end_to_orig_index = feature.tok_end_to_orig_index
        start_orig_pos = tok_start_to_orig_index[pred.start_index]
...
@@ -719,8 +776,8 @@ def postprocess_output(all_examples,
            start_logit=pred.start_logit,
            end_logit=pred.end_logit))

-   # if we didn't inlude the empty option in the n-best, inlcude it
-   if version_2_with_negative:
+   # if we didn't inlude the empty option in the n-best, include it
+   if version_2_with_negative and not xlnet_format:
      if "" not in seen_predictions:
        nbest.append(
            _NbestPrediction(
...
@@ -759,14 +816,19 @@ def postprocess_output(all_examples,
      all_predictions[example.qas_id] = nbest_json[0]["text"]
    else:
      assert best_non_null_entry is not None
-     # predict "" iff the null score - the score of best non-null > threshold
-     score_diff = score_null - best_non_null_entry.start_logit - (
-         best_non_null_entry.end_logit)
-     scores_diff_json[example.qas_id] = score_diff
-     if score_diff > null_score_diff_threshold:
-       all_predictions[example.qas_id] = ""
-     else:
-       all_predictions[example.qas_id] = best_non_null_entry.text
+     if xlnet_format:
+       score_diff = score_null
+       scores_diff_json[example.qas_id] = score_diff
+       all_predictions[example.qas_id] = best_non_null_entry.text
+     else:
+       # predict "" iff the null score - the score of best non-null > threshold
+       score_diff = score_null - best_non_null_entry.start_logit - (
+           best_non_null_entry.end_logit)
+       scores_diff_json[example.qas_id] = score_diff
+       if score_diff > null_score_diff_threshold:
+         all_predictions[example.qas_id] = ""
+       else:
+         all_predictions[example.qas_id] = best_non_null_entry.text

    all_nbest_json[example.qas_id] = nbest_json
...
@@ -778,16 +840,29 @@ def write_to_json_files(json_records, json_file):
    writer.write(json.dumps(json_records, indent=4) + "\n")


-def _get_best_indexes(logits, n_best_size):
-  """Get the n-best logits from a list."""
-  index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
-  best_indexes = []
-  for i in range(len(index_and_score)):
-    if i >= n_best_size:
-      break
-    best_indexes.append(index_and_score[i][0])
-  return best_indexes
+def _get_best_indexes_and_logits(result, n_best_size, xlnet_format=False):
+  """Generates the n-best indexes and logits from a list."""
+  if xlnet_format:
+    for i in range(n_best_size):
+      for j in range(n_best_size):
+        j_index = i * n_best_size + j
+        yield (result.start_indexes[i], result.start_logits[i],
+               result.end_indexes[j_index], result.end_logits[j_index])
+  else:
+    start_index_and_score = sorted(
+        enumerate(result.start_logits), key=lambda x: x[1], reverse=True)
+    end_index_and_score = sorted(
+        enumerate(result.end_logits), key=lambda x: x[1], reverse=True)
+    for i in range(len(start_index_and_score)):
+      if i >= n_best_size:
+        break
+      for j in range(len(end_index_and_score)):
+        if j >= n_best_size:
+          break
+        yield (start_index_and_score[i][0], start_index_and_score[i][1],
+               end_index_and_score[j][0], end_index_and_score[j][1])


def _compute_softmax(scores):
...
@@ -837,6 +912,10 @@ class FeatureWriter(object):
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
+   if feature.paragraph_mask is not None:
+     features["paragraph_mask"] = create_int_feature(feature.paragraph_mask)
+   if feature.class_index is not None:
+     features["class_index"] = create_int_feature([feature.class_index])

    if self.is_training:
      features["start_positions"] = create_int_feature([feature.start_position])
...
@@ -856,19 +935,23 @@ class FeatureWriter(object):

def generate_tf_record_from_json_file(input_file_path,
                                      sp_model_file,
                                      output_path,
+                                     translated_input_folder=None,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
+                                     xlnet_format=False,
                                      version_2_with_negative=False):
  """Generates and saves training data into a tf record file."""
  train_examples = read_squad_examples(
      input_file=input_file_path,
      is_training=True,
-     version_2_with_negative=version_2_with_negative)
+     version_2_with_negative=version_2_with_negative,
+     translated_input_folder=translated_input_folder)
  tokenizer = tokenization.FullSentencePieceTokenizer(sp_model_file=sp_model_file)
  train_writer = FeatureWriter(filename=output_path, is_training=True)
  number_of_examples = convert_examples_to_features(
      examples=train_examples,
      tokenizer=tokenizer,
...
@@ -877,6 +960,7 @@ def generate_tf_record_from_json_file(input_file_path,
      max_query_length=max_query_length,
      is_training=True,
      output_fn=train_writer.process_feature,
+     xlnet_format=xlnet_format,
      do_lower_case=do_lower_case)
  train_writer.close()
...
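A hypothetical call using the SentencePiece variant in XLNet mode; the paths are placeholders, not files in the repo:

# Hypothetical usage; paths are placeholders.
from official.nlp.data import squad_lib_sp

squad_lib_sp.generate_tf_record_from_json_file(
    input_file_path="/tmp/train-v2.0.json",
    sp_model_file="/tmp/30k-clean.model",
    output_path="/tmp/squad_train_sp.tf_record",
    xlnet_format=True,                 # emits paragraph_mask / class_index features
    version_2_with_negative=True)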
official/nlp/data/tagging_data_lib.py
View file @ f16a7b5b
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-# ==============================================================================
"""Library to process data for tagging task such as NER/POS."""
import collections
import os
...
@@ -19,6 +19,7 @@ import os
from absl import logging
import tensorflow as tf

from official.nlp.bert import tokenization
+from official.nlp.data import classifier_data_lib

# A negative label id for the padding label, which will not contribute
...
@@ -33,9 +34,14 @@ _UNK_TOKEN = "[UNK]"
class InputExample(object):
  """A single training/test example for token classification."""

- def __init__(self, sentence_id, words=None, label_ids=None):
+ def __init__(self,
+              sentence_id,
+              sub_sentence_id=0,
+              words=None,
+              label_ids=None):
    """Constructs an InputExample."""
    self.sentence_id = sentence_id
+   self.sub_sentence_id = sub_sentence_id
    self.words = words if words else []
    self.label_ids = label_ids if label_ids else []
...
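A brief sketch of the new field: two fragments of the same source sentence share sentence_id but carry distinct sub_sentence_id values (assuming tagging_data_lib is imported; the ids and labels below are invented):

from official.nlp.data import tagging_data_lib

# Two fragments of one original sentence.
first = tagging_data_lib.InputExample(sentence_id=7, sub_sentence_id=0,
                                      words=["Long", "sentence", "part", "one"],
                                      label_ids=[0, 0, 0, 0])
second = tagging_data_lib.InputExample(sentence_id=7, sub_sentence_id=1,
                                       words=["part", "two"],
                                       label_ids=[0, 0])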
@@ -84,13 +90,48 @@ class PanxProcessor(classifier_data_lib.DataProcessor):
      "tr", "et", "fi", "hu"
  ]

+ def __init__(self,
+              process_text_fn=tokenization.convert_to_unicode,
+              only_use_en_train=True,
+              only_use_en_dev=True):
+   """See base class.
+
+   Args:
+     process_text_fn: See base class.
+     only_use_en_train: If True, only use english training data. Otherwise, use
+       training data from all languages.
+     only_use_en_dev: If True, only use english dev data. Otherwise, use dev
+       data from all languages.
+   """
+   super(PanxProcessor, self).__init__(process_text_fn)
+   self.only_use_en_train = only_use_en_train
+   self.only_use_en_dev = only_use_en_dev
+
  def get_train_examples(self, data_dir):
-   return _read_one_file(
+   examples = _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())
+   if not self.only_use_en_train:
+     for language in self.supported_languages:
+       if language == "en":
+         continue
+       examples.extend(
+           _read_one_file(
+               os.path.join(data_dir, f"train-{language}.tsv"),
+               self.get_labels()))
+   return examples

  def get_dev_examples(self, data_dir):
-   return _read_one_file(
+   examples = _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())
+   if not self.only_use_en_dev:
+     for language in self.supported_languages:
+       if language == "en":
+         continue
+       examples.extend(
+           _read_one_file(
+               os.path.join(data_dir, f"dev-{language}.tsv"),
+               self.get_labels()))
+   return examples

  def get_test_examples(self, data_dir):
    examples_dict = {}
...
@@ -115,13 +156,49 @@ class UdposProcessor(classifier_data_lib.DataProcessor):
      "ta", "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"
  ]

+ def __init__(self,
+              process_text_fn=tokenization.convert_to_unicode,
+              only_use_en_train=True,
+              only_use_en_dev=True):
+   """See base class.
+
+   Args:
+     process_text_fn: See base class.
+     only_use_en_train: If True, only use english training data. Otherwise, use
+       training data from all languages.
+     only_use_en_dev: If True, only use english dev data. Otherwise, use dev
+       data from all languages.
+   """
+   super(UdposProcessor, self).__init__(process_text_fn)
+   self.only_use_en_train = only_use_en_train
+   self.only_use_en_dev = only_use_en_dev
+
  def get_train_examples(self, data_dir):
-   return _read_one_file(
-       os.path.join(data_dir, "train-en.tsv"), self.get_labels())
+   if self.only_use_en_train:
+     examples = _read_one_file(
+         os.path.join(data_dir, "train-en.tsv"), self.get_labels())
+   else:
+     examples = []
+     # Uses glob because some languages are missing in train.
+     for filepath in tf.io.gfile.glob(os.path.join(data_dir, "train-*.tsv")):
+       examples.extend(_read_one_file(filepath, self.get_labels()))
+   return examples

  def get_dev_examples(self, data_dir):
-   return _read_one_file(
-       os.path.join(data_dir, "dev-en.tsv"), self.get_labels())
+   if self.only_use_en_dev:
+     examples = _read_one_file(
+         os.path.join(data_dir, "dev-en.tsv"), self.get_labels())
+   else:
+     examples = []
+     for filepath in tf.io.gfile.glob(os.path.join(data_dir, "dev-*.tsv")):
+       examples.extend(_read_one_file(filepath, self.get_labels()))
+   return examples

  def get_test_examples(self, data_dir):
    examples_dict = {}
...
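A hypothetical use of the new constructor flags for both processors; the data directory is a placeholder that would hold the train-*.tsv / dev-*.tsv layout the processors expect:

from official.nlp.data import tagging_data_lib

# Use training and dev data from all languages instead of English only.
processor = tagging_data_lib.UdposProcessor(only_use_en_train=False,
                                            only_use_en_dev=False)
train_examples = processor.get_train_examples("/tmp/udpos")  # placeholder dir
dev_examples = processor.get_dev_examples("/tmp/udpos")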
@@ -146,11 +223,11 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
  # Needs additional [CLS] and [SEP] tokens.
  max_length = max_length - 2
  new_examples = []
- new_example = InputExample(sentence_id=example.sentence_id)
- for i, word in enumerate(example.words):
-   if any([x < 0 for x in example.label_ids]):
-     raise ValueError("Unexpected negative label_id: %s" % example.label_ids)
-
+ new_example = InputExample(sentence_id=example.sentence_id, sub_sentence_id=0)
+ if any([x < 0 for x in example.label_ids]):
+   raise ValueError("Unexpected negative label_id: %s" % example.label_ids)
+
+ for i, word in enumerate(example.words):
    if text_preprocessing:
      word = text_preprocessing(word)
    subwords = tokenizer.tokenize(word)
...
@@ -160,7 +237,10 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
    if len(subwords) + len(new_example.words) > max_length:
      # Start a new example.
      new_examples.append(new_example)
-     new_example = InputExample(sentence_id=example.sentence_id)
+     last_sub_sentence_id = new_example.sub_sentence_id
+     new_example = InputExample(
+         sentence_id=example.sentence_id,
+         sub_sentence_id=last_sub_sentence_id + 1)

    for j, subword in enumerate(subwords):
      # Use the real label for the first subword, and pad label for
...
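A pure-Python illustration (not the module's code) of the splitting behavior above: each fragment keeps the original sentence_id while sub_sentence_id increments by one per new fragment:

# Illustration only; fragment size and ids are invented.
words = ["w%d" % i for i in range(5)]
max_words_per_fragment = 2
fragments = [words[i:i + max_words_per_fragment]
             for i in range(0, len(words), max_words_per_fragment)]
for sub_id, fragment in enumerate(fragments):
  print({"sentence_id": 7, "sub_sentence_id": sub_id, "words": fragment})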
@@ -203,6 +283,7 @@ def _convert_single_example(example, max_seq_length, tokenizer):
  features["segment_ids"] = create_int_feature(segment_ids)
  features["label_ids"] = create_int_feature(label_ids)
  features["sentence_id"] = create_int_feature([example.sentence_id])
+ features["sub_sentence_id"] = create_int_feature([example.sub_sentence_id])

  tf_example = tf.train.Example(features=tf.train.Features(feature=features))
  return tf_example
...
@@ -267,12 +348,12 @@ def write_example_to_file(examples,
      logging.info("Writing example %d of %d to %s", ex_index, len(examples),
                   output_file)

    tokenized_examples = _tokenize_example(example, max_seq_length, tokenizer,
                                           text_preprocessing)
    num_tokenized_examples += len(tokenized_examples)
    for per_tokenized_example in tokenized_examples:
      tf_example = _convert_single_example(per_tokenized_example,
                                           max_seq_length, tokenizer)
      writer.write(tf_example.SerializeToString())

  writer.close()
...
@@ -307,17 +388,16 @@ def token_classification_meta_data(train_data_size,
  return meta_data


def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      tokenizer,
                                      max_seq_length,
                                      train_data_output_path,
                                      eval_data_output_path,
                                      test_data_output_path,
                                      text_preprocessing):
  """Generates tfrecord files from the raw data."""
  common_kwargs = dict(
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      text_preprocessing=text_preprocessing)
  train_examples = processor.get_train_examples(data_dir)
  train_data_size = write_example_to_file(
      train_examples, output_file=train_data_output_path, **common_kwargs)
...
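A hypothetical end-to-end call of this helper (the new test file below exercises the same API); all paths are placeholders:

from official.nlp.bert import tokenization
from official.nlp.data import tagging_data_lib

tokenizer = tokenization.FullTokenizer(vocab_file="/tmp/vocab.txt",  # placeholder
                                       do_lower_case=True)
metadata = tagging_data_lib.generate_tf_record_from_data_file(
    tagging_data_lib.PanxProcessor(),
    "/tmp/panx",                                   # placeholder data_dir
    tokenizer,
    max_seq_length=128,
    train_data_output_path="/tmp/out/train.tfrecord",
    eval_data_output_path="/tmp/out/eval.tfrecord",
    test_data_output_path="/tmp/out/test_{}.tfrecord",
    text_preprocessing=tokenization.convert_to_unicode)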
official/nlp/data/tagging_data_lib_test.py
0 → 100644
View file @ f16a7b5b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.tagging_data_lib."""
import os
import random

from absl.testing import parameterized
import tensorflow as tf

from official.nlp.bert import tokenization
from official.nlp.data import tagging_data_lib


def _create_fake_file(filename, labels, is_test):

  def write_one_sentence(writer, length):
    for _ in range(length):
      line = "hiworld"
      if not is_test:
        line += "\t%s" % (labels[random.randint(0, len(labels) - 1)])
      writer.write(line + "\n")

  # Writes two sentences with length of 3 and 12 respectively.
  with tf.io.gfile.GFile(filename, "w") as writer:
    write_one_sentence(writer, 3)
    writer.write("\n")
    write_one_sentence(writer, 12)


class TaggingDataLibTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
    super(TaggingDataLibTest, self).setUp()

    self.processors = {
        "panx": tagging_data_lib.PanxProcessor,
        "udpos": tagging_data_lib.UdposProcessor,
    }
    self.vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    with tf.io.gfile.GFile(self.vocab_file, "w") as writer:
      writer.write("\n".join(["[CLS]", "[SEP]", "hi", "##world", "[UNK]"]))

  @parameterized.parameters(
      {"task_type": "panx"},
      {"task_type": "udpos"},
  )
  def test_generate_tf_record(self, task_type):
    processor = self.processors[task_type]()
    input_data_dir = os.path.join(self.get_temp_dir(), task_type)
    tf.io.gfile.mkdir(input_data_dir)
    # Write fake train file.
    _create_fake_file(
        os.path.join(input_data_dir, "train-en.tsv"),
        processor.get_labels(),
        is_test=False)
    # Write fake dev file.
    _create_fake_file(
        os.path.join(input_data_dir, "dev-en.tsv"),
        processor.get_labels(),
        is_test=False)
    # Write fake test files.
    for lang in processor.supported_languages:
      _create_fake_file(
          os.path.join(input_data_dir, "test-%s.tsv" % lang),
          processor.get_labels(),
          is_test=True)

    output_path = os.path.join(self.get_temp_dir(), task_type, "output")
    tokenizer = tokenization.FullTokenizer(
        vocab_file=self.vocab_file, do_lower_case=True)
    metadata = tagging_data_lib.generate_tf_record_from_data_file(
        processor,
        input_data_dir,
        tokenizer,
        max_seq_length=8,
        train_data_output_path=os.path.join(output_path, "train.tfrecord"),
        eval_data_output_path=os.path.join(output_path, "eval.tfrecord"),
        test_data_output_path=os.path.join(output_path, "test_{}.tfrecord"),
        text_preprocessing=tokenization.convert_to_unicode)

    self.assertEqual(metadata["train_data_size"], 5)
    files = tf.io.gfile.glob(output_path + "/*")
    expected_files = []
    expected_files.append(os.path.join(output_path, "train.tfrecord"))
    expected_files.append(os.path.join(output_path, "eval.tfrecord"))
    for lang in processor.supported_languages:
      expected_files.append(
          os.path.join(output_path, "test_%s.tfrecord" % lang))
    self.assertCountEqual(files, expected_files)


if __name__ == "__main__":
  tf.test.main()
official/nlp/data/tagging_data_loader.py
deleted 100644 → 0
View file @ 8e9296ff
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads dataset for the tagging (e.g., NER/POS) task."""
from typing import Mapping, Optional

import dataclasses
import tensorflow as tf

from official.core import input_reader
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.data import data_loader_factory


@dataclasses.dataclass
class TaggingDataConfig(cfg.DataConfig):
  """Data config for tagging (tasks/tagging)."""
  is_training: bool = True
  seq_length: int = 128
  include_sentence_id: bool = False


@data_loader_factory.register_data_loader_cls(TaggingDataConfig)
class TaggingDataLoader:
  """A class to load dataset for tagging (e.g., NER and POS) task."""

  def __init__(self, params: TaggingDataConfig):
    self._params = params
    self._seq_length = params.seq_length
    self._include_sentence_id = params.include_sentence_id

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
    name_to_features = {
        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'label_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
    }
    if self._include_sentence_id:
      name_to_features['sentence_id'] = tf.io.FixedLenFeature([], tf.int64)

    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in example:
      t = example[name]
      if t.dtype == tf.int64:
        t = tf.cast(t, tf.int32)
      example[name] = t

    return example

  def _parse(self, record: Mapping[str, tf.Tensor]):
    """Parses raw tensors into a dict of tensors to be consumed by the model."""
    x = {
        'input_word_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids']
    }
    if self._include_sentence_id:
      x['sentence_id'] = record['sentence_id']

    y = record['label_ids']
    return (x, y)

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.dataset.Dataset."""
    reader = input_reader.InputReader(
        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)
official/nlp/data/tagging_dataloader.py
0 → 100644
View file @ f16a7b5b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Loads dataset for the tagging (e.g., NER/POS) task."""
from typing import Mapping, Optional

import dataclasses
import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import input_reader
from official.nlp.data import data_loader
from official.nlp.data import data_loader_factory


@dataclasses.dataclass
class TaggingDataConfig(cfg.DataConfig):
  """Data config for tagging (tasks/tagging)."""
  is_training: bool = True
  seq_length: int = 128
  include_sentence_id: bool = False


@data_loader_factory.register_data_loader_cls(TaggingDataConfig)
class TaggingDataLoader(data_loader.DataLoader):
  """A class to load dataset for tagging (e.g., NER and POS) task."""

  def __init__(self, params: TaggingDataConfig):
    self._params = params
    self._seq_length = params.seq_length
    self._include_sentence_id = params.include_sentence_id

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
    name_to_features = {
        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'label_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
    }
    if self._include_sentence_id:
      name_to_features['sentence_id'] = tf.io.FixedLenFeature([], tf.int64)
      name_to_features['sub_sentence_id'] = tf.io.FixedLenFeature([], tf.int64)

    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in example:
      t = example[name]
      if t.dtype == tf.int64:
        t = tf.cast(t, tf.int32)
      example[name] = t

    return example

  def _parse(self, record: Mapping[str, tf.Tensor]):
    """Parses raw tensors into a dict of tensors to be consumed by the model."""
    x = {
        'input_word_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids']
    }
    if self._include_sentence_id:
      x['sentence_id'] = record['sentence_id']
      x['sub_sentence_id'] = record['sub_sentence_id']

    y = record['label_ids']
    return (x, y)

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.dataset.Dataset."""
    reader = input_reader.InputReader(
        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)
official/nlp/data/tagging_dataloader_test.py
0 → 100644
View file @ f16a7b5b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.tagging_data_loader."""
import os

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.nlp.data import tagging_dataloader


def _create_fake_dataset(output_path, seq_length, include_sentence_id):
  """Creates a fake dataset."""
  writer = tf.io.TFRecordWriter(output_path)

  def create_int_feature(values):
    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return f

  for i in range(100):
    features = {}
    input_ids = np.random.randint(100, size=(seq_length))
    features['input_ids'] = create_int_feature(input_ids)
    features['input_mask'] = create_int_feature(np.ones_like(input_ids))
    features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
    features['label_ids'] = create_int_feature(
        np.random.randint(10, size=(seq_length)))
    if include_sentence_id:
      features['sentence_id'] = create_int_feature([i])
      features['sub_sentence_id'] = create_int_feature([0])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


class TaggingDataLoaderTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(True, False)
  def test_load_dataset(self, include_sentence_id):
    seq_length = 16
    batch_size = 10
    train_data_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    _create_fake_dataset(train_data_path, seq_length, include_sentence_id)
    data_config = tagging_dataloader.TaggingDataConfig(
        input_path=train_data_path,
        seq_length=seq_length,
        global_batch_size=batch_size,
        include_sentence_id=include_sentence_id)

    dataset = tagging_dataloader.TaggingDataLoader(data_config).load()
    features, labels = next(iter(dataset))

    expected_keys = ['input_word_ids', 'input_mask', 'input_type_ids']
    if include_sentence_id:
      expected_keys.extend(['sentence_id', 'sub_sentence_id'])
    self.assertCountEqual(expected_keys, features.keys())

    self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
    self.assertEqual(labels.shape, (batch_size, seq_length))
    if include_sentence_id:
      self.assertEqual(features['sentence_id'].shape, (batch_size,))
      self.assertEqual(features['sub_sentence_id'].shape, (batch_size,))


if __name__ == '__main__':
  tf.test.main()