Commit 3d99cc89
Authored Jun 02, 2020 by Maxim Neumann; committed by A. Unique TensorFlower, Jun 02, 2020

Internal change

PiperOrigin-RevId: 314412294
Parent: 2af9696b
Changes: 2 changed files, with 76 additions and 15 deletions

  official/nlp/data/classifier_data_lib.py    (+44, -14)
  official/nlp/data/create_finetuning_data.py (+32, -1)
official/nlp/data/classifier_data_lib.py
@@ -462,7 +462,7 @@ class QnliProcessor(DataProcessor):

 class TfdsProcessor(DataProcessor):
-  """Processor for generic text classification TFDS data set.
+  """Processor for generic text classification and regression TFDS data set.

   The TFDS parameters are expected to be provided in the tfds_params string, in
   a comma-separated list of parameter assignments.
@@ -473,6 +473,8 @@ class TfdsProcessor(DataProcessor):
     tfds_params="dataset=glue/sst2,text_key=sentence"
     tfds_params="dataset=glue/qnli,text_key=question,text_b_key=sentence"
     tfds_params="dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2"
+    tfds_params="dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2,"
+                "is_regression=true,label_type=float"

   Possible parameters (please refer to the documentation of Tensorflow Datasets
   (TFDS) for the meaning of individual parameters):
     dataset: Required dataset name (potentially with subset and version number).
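[Editor's note: the new STS-B parameter string follows the same comma-separated key=value scheme as the existing examples. A minimal standalone sketch, not part of the commit, of what _process_tfds_params_str extracts from it:

    params_str = ("dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2,"
                  "is_regression=true,label_type=float")
    tuples = [x.split("=") for x in params_str.split(",")]
    d = {k.strip(): v.strip() for k, v in tuples}
    assert d["dataset"] == "glue/stsb"
    assert d["is_regression"] == "true"  # still a string; coerced later
    assert d["label_type"] == "float"    # mapped to the float type via dtype_map
]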
@@ -487,6 +489,8 @@ class TfdsProcessor(DataProcessor):
     test_text_key: Key of the text feature to use in test set.
     test_text_b_key: Key of the second text feature to use in test set.
     test_label: String to be used as the label for all test examples.
+    label_type: Type of the label key (defaults to `int`).
+    is_regression: Whether the task is a regression problem (defaults to False).
   """

   def __init__(self, tfds_params,
@@ -498,10 +502,16 @@ class TfdsProcessor(DataProcessor):
     self.dataset, info = tfds.load(self.dataset_name, data_dir=self.data_dir,
                                    with_info=True)
-    self._labels = list(range(info.features[self.label_key].num_classes))
+    if self.is_regression:
+      self._labels = None
+    else:
+      self._labels = list(range(info.features[self.label_key].num_classes))

   def _process_tfds_params_str(self, params_str):
     """Extracts TFDS parameters from a comma-separated assignments string."""
+    dtype_map = {"int": int, "float": float}
+    cast_str_to_bool = lambda s: s.lower() not in ["false", "0"]
     tuples = [x.split("=") for x in params_str.split(",")]
     d = {k.strip(): v.strip() for k, v in tuples}
     self.dataset_name = d["dataset"]  # Required.
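[Editor's note: the two helpers introduced here do the type coercion for the new parameters. A quick illustrative check of their behavior, reproduced from the diff and runnable on its own:

    dtype_map = {"int": int, "float": float}
    cast_str_to_bool = lambda s: s.lower() not in ["false", "0"]

    assert dtype_map["float"]("4.2") == 4.2
    assert cast_str_to_bool("true") is True
    assert cast_str_to_bool("False") is False
    assert cast_str_to_bool("0") is False
    # Note: any string other than "false"/"0" (case-insensitive) counts as True.
]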
@@ -516,6 +526,8 @@ class TfdsProcessor(DataProcessor):
     self.test_text_key = d.get("test_text_key", self.text_key)
     self.test_text_b_key = d.get("test_text_b_key", self.text_b_key)
     self.test_label = d.get("test_label", "test_example")
+    self.label_type = dtype_map[d.get("label_type", "int")]
+    self.is_regression = cast_str_to_bool(d.get("is_regression", "False"))

   def get_train_examples(self, data_dir):
     assert data_dir is None
@@ -553,7 +565,7 @@ class TfdsProcessor(DataProcessor):
       text_a = self.process_text_fn(example[self.text_key])
       if self.text_b_key:
         text_b = self.process_text_fn(example[self.text_b_key])
-      label = int(example[self.label_key])
+      label = self.label_type(example[self.label_key])
       examples.append(
           InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples
@@ -563,8 +575,9 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
                            tokenizer):
   """Converts a single `InputExample` into a single `InputFeatures`."""
   label_map = {}
-  for (i, label) in enumerate(label_list):
-    label_map[label] = i
+  if label_list:
+    for (i, label) in enumerate(label_list):
+      label_map[label] = i

   tokens_a = tokenizer.tokenize(example.text_a)
   tokens_b = None
@@ -632,7 +645,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
   assert len(input_mask) == max_seq_length
   assert len(segment_ids) == max_seq_length

-  label_id = label_map[example.label]
+  label_id = label_map[example.label] if label_map else example.label
   if ex_index < 5:
     logging.info("*** Example ***")
     logging.info("guid: %s", (example.guid))
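[Editor's note: taken together, the two convert_single_example changes let a float label pass through untouched: for a regression task label_list is None, so label_map stays empty and example.label is used as-is. A minimal sketch of that path, with illustrative values:

    label_list = None   # what the processor provides when is_regression is set
    label_map = {}
    if label_list:
      for (i, label) in enumerate(label_list):
        label_map[label] = i

    example_label = 4.2  # already coerced by self.label_type(...)
    label_id = label_map[example_label] if label_map else example_label
    assert label_id == 4.2
]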
@@ -654,7 +667,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,

 def file_based_convert_examples_to_features(examples, label_list,
                                             max_seq_length, tokenizer,
-                                            output_file):
+                                            output_file, label_type=None):
   """Convert a set of `InputExample`s to a TFRecord file."""

   tf.io.gfile.makedirs(os.path.dirname(output_file))
@@ -670,12 +683,18 @@ def file_based_convert_examples_to_features(examples, label_list,
   def create_int_feature(values):
     f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
     return f

+  def create_float_feature(values):
+    f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+    return f
+
   features = collections.OrderedDict()
   features["input_ids"] = create_int_feature(feature.input_ids)
   features["input_mask"] = create_int_feature(feature.input_mask)
   features["segment_ids"] = create_int_feature(feature.segment_ids)
-  features["label_ids"] = create_int_feature([feature.label_id])
+  if label_type is not None and label_type == float:
+    features["label_ids"] = create_float_feature([feature.label_id])
+  else:
+    features["label_ids"] = create_int_feature([feature.label_id])
   features["is_real_example"] = create_int_feature(
       [int(feature.is_real_example)])
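[Editor's note: with create_float_feature in place, regression labels are serialized as a float_list instead of an int64_list, which also changes how downstream readers must parse the record. A round-trip sketch; the parsing side is not part of this commit:

    import tensorflow as tf

    def create_float_feature(values):
      return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))

    example = tf.train.Example(features=tf.train.Features(feature={
        "label_ids": create_float_feature([4.2]),
    }))

    # Consumers of regression records need tf.float32 here, not tf.int64.
    parsed = tf.io.parse_single_example(
        example.SerializeToString(),
        {"label_ids": tf.io.FixedLenFeature([], tf.float32)})
]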
@@ -731,18 +750,23 @@ def generate_tf_record_from_data_file(processor,
   assert train_data_output_path or eval_data_output_path

   label_list = processor.get_labels()
+  label_type = getattr(processor, "label_type", None)
+  is_regression = getattr(processor, "is_regression", False)
   assert train_data_output_path
   train_input_data_examples = processor.get_train_examples(data_dir)
   file_based_convert_examples_to_features(train_input_data_examples, label_list,
                                           max_seq_length, tokenizer,
-                                          train_data_output_path)
+                                          train_data_output_path, label_type)
   num_training_data = len(train_input_data_examples)

   if eval_data_output_path:
     eval_input_data_examples = processor.get_dev_examples(data_dir)
     file_based_convert_examples_to_features(eval_input_data_examples,
                                             label_list, max_seq_length,
-                                            tokenizer, eval_data_output_path)
+                                            tokenizer, eval_data_output_path,
+                                            label_type)

   if test_data_output_path:
     test_input_data_examples = processor.get_test_examples(data_dir)
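[Editor's note: using getattr with defaults keeps every pre-existing processor working unchanged; only TfdsProcessor defines the new attributes, and everything else silently falls back to classification behavior. An illustrative sketch with a hypothetical legacy processor:

    class LegacyProcessor:  # hypothetical stand-in for, e.g., QnliProcessor
      pass

    processor = LegacyProcessor()
    assert getattr(processor, "label_type", None) is None
    assert getattr(processor, "is_regression", False) is False
]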
@@ -751,19 +775,25 @@ def generate_tf_record_from_data_file(processor,
       file_based_convert_examples_to_features(
           examples,
           label_list, max_seq_length,
-          tokenizer, test_data_output_path.format(language))
+          tokenizer, test_data_output_path.format(language), label_type)
     else:
       file_based_convert_examples_to_features(test_input_data_examples,
                                               label_list, max_seq_length,
-                                              tokenizer, test_data_output_path)
+                                              tokenizer, test_data_output_path,
+                                              label_type)

   meta_data = {
-      "task_type": "bert_classification",
       "processor_type": processor.get_processor_name(),
-      "num_labels": len(processor.get_labels()),
       "train_data_size": num_training_data,
       "max_seq_length": max_seq_length,
   }
+  if is_regression:
+    meta_data["task_type"] = "bert_regression"
+    meta_data["label_type"] = {int: "int", float: "float"}[label_type]
+  else:
+    meta_data["task_type"] = "bert_classification"
+    meta_data["num_labels"] = len(processor.get_labels())
   if eval_data_output_path:
     meta_data["eval_data_size"] = len(eval_input_data_examples)
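[Editor's note: for a regression run, the emitted metadata swaps the "num_labels" entry for a "label_type" entry. An illustrative, hypothetical shape of the result for the STS-B example above; all values other than "task_type" and "label_type" are placeholders, not taken from an actual run:

    meta_data = {
        "processor_type": "...",   # whatever processor.get_processor_name() returns
        "train_data_size": 5749,   # hypothetical example count
        "max_seq_length": 128,     # hypothetical flag value
        "task_type": "bert_regression",
        "label_type": "float",
    }
]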
official/nlp/data/create_finetuning_data.py
@@ -35,7 +35,8 @@ from official.nlp.data import squad_lib_sp

 FLAGS = flags.FLAGS

 flags.DEFINE_enum(
-    "fine_tuning_task_type", "classification", ["classification", "squad"],
+    "fine_tuning_task_type", "classification",
+    ["classification", "regression", "squad"],
     "The name of the BERT fine tuning task for which data "
     "will be generated.")
@@ -181,6 +182,34 @@ def generate_classifier_dataset():
       max_seq_length=FLAGS.max_seq_length)


+def generate_regression_dataset():
+  """Generates regression dataset and returns input meta data."""
+  if FLAGS.tokenizer_impl == "word_piece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+    processor_text_fn = tokenization.convert_to_unicode
+  else:
+    assert FLAGS.tokenizer_impl == "sentence_piece"
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = functools.partial(
+        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
+
+  if FLAGS.tfds_params:
+    processor = classifier_data_lib.TfdsProcessor(
+        tfds_params=FLAGS.tfds_params,
+        process_text_fn=processor_text_fn)
+    return classifier_data_lib.generate_tf_record_from_data_file(
+        processor,
+        None,
+        tokenizer,
+        train_data_output_path=FLAGS.train_data_output_path,
+        eval_data_output_path=FLAGS.eval_data_output_path,
+        test_data_output_path=FLAGS.test_data_output_path,
+        max_seq_length=FLAGS.max_seq_length)
+  else:
+    raise ValueError(
+        "No data processor found for the given regression task.")
+
+
 def generate_squad_dataset():
   """Generates squad training dataset and returns input meta data."""
   assert FLAGS.squad_data_file
@@ -210,6 +239,8 @@ def main(_):
   if FLAGS.fine_tuning_task_type == "classification":
     input_meta_data = generate_classifier_dataset()
+  elif FLAGS.fine_tuning_task_type == "regression":
+    input_meta_data = generate_regression_dataset()
   else:
     input_meta_data = generate_squad_dataset()
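[Editor's note: end to end, the new task type is reachable from the command line. A hypothetical invocation; flag values and paths are placeholders, though the flags themselves are defined in this script:

    python official/nlp/data/create_finetuning_data.py \
      --fine_tuning_task_type=regression \
      --tfds_params="dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2,is_regression=true,label_type=float" \
      --tokenizer_impl=word_piece \
      --vocab_file=/path/to/vocab.txt \
      --train_data_output_path=/tmp/stsb_train.tf_record \
      --eval_data_output_path=/tmp/stsb_eval.tf_record \
      --max_seq_length=128
]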