ModelZoo / ResNet50_tensorflow, commit 2284f823

Authored Jun 29, 2020 by Chen Chen; committed by A. Unique TensorFlower on Jun 29, 2020.

Support create fine-tuning data for tagging task. (XTREME's udpos/panx)

PiperOrigin-RevId: 318829996
Parent: db39ef82
Showing 2 changed files with 391 additions and 6 deletions:

  official/nlp/data/create_finetuning_data.py   +44  -6
  official/nlp/data/tagging_data_lib.py         +347 -0
official/nlp/data/create_finetuning_data.py
@@ -32,14 +32,16 @@ from official.nlp.data import sentence_retrieval_lib
 from official.nlp.data import squad_lib as squad_lib_wp
 # sentence-piece tokenizer based squad_lib
 from official.nlp.data import squad_lib_sp
+from official.nlp.data import tagging_data_lib
 
 FLAGS = flags.FLAGS
 
+# TODO(chendouble): consider moving each task to its own binary.
 flags.DEFINE_enum(
     "fine_tuning_task_type", "classification",
-    ["classification", "regression", "squad", "retrieval"],
+    ["classification", "regression", "squad", "retrieval", "tagging"],
     "The name of the BERT fine tuning task for which data "
-    "will be generated..")
+    "will be generated.")
 
 # BERT classification specific flags.
 flags.DEFINE_string(
@@ -56,9 +58,6 @@ flags.DEFINE_enum("classification_task_name", "MNLI",
                   "only and for XNLI is all languages combined. Same for "
                   "PAWS-X.")
 
-flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
-                  "The name of sentence retrieval task for scoring")
-
 # XNLI task specific flag.
 flags.DEFINE_string(
     "xnli_language", "en",
@@ -71,6 +70,14 @@ flags.DEFINE_string(
     "Language of trainig data for PAWS-X task. If the value is 'all', the data "
     "of all languages will be used for training.")
 
+# Retrieva task specific flags
+flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
+                  "The name of sentence retrieval task for scoring")
+
+# Tagging task specific flags
+flags.DEFINE_enum("tagging_task_name", "panx", ["panx", "udpos"],
+                  "The name of BERT tagging (token classification) task.")
+
 # BERT Squad task specific flags.
 flags.DEFINE_string(
     "squad_data_file", None,
@@ -284,6 +291,34 @@ def generate_retrieval_dataset():
       FLAGS.max_seq_length)
 
 
+def generate_tagging_dataset():
+  """Generates tagging dataset."""
+  processors = {
+      "panx": tagging_data_lib.PanxProcessor,
+      "udpos": tagging_data_lib.UdposProcessor,
+  }
+  task_name = FLAGS.tagging_task_name.lower()
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % task_name)
+  if FLAGS.tokenizer_impl == "word_piece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+    processor_text_fn = tokenization.convert_to_unicode
+  elif FLAGS.tokenizer_impl == "sentence_piece":
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = functools.partial(
+        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
+  else:
+    raise ValueError("Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)
+
+  processor = processors[task_name]()
+  return tagging_data_lib.generate_tf_record_from_data_file(
+      processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
+      FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
+      FLAGS.test_data_output_path, processor_text_fn)
+
+
 def main(_):
   if FLAGS.tokenizer_impl == "word_piece":
     if not FLAGS.vocab_file:
@@ -304,8 +339,11 @@ def main(_):
     input_meta_data = generate_regression_dataset()
   elif FLAGS.fine_tuning_task_type == "retrieval":
     input_meta_data = generate_retrieval_dataset()
-  else:
+  elif FLAGS.fine_tuning_task_type == "squad":
     input_meta_data = generate_squad_dataset()
+  else:
+    assert FLAGS.fine_tuning_task_type == "tagging"
+    input_meta_data = generate_tagging_dataset()
 
   tf.io.gfile.makedirs(os.path.dirname(FLAGS.meta_data_file_path))
   with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer:
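For orientation, a minimal sketch (not part of the commit) of how the new flags are expected to be combined. It drives the same code path programmatically that the command line would; all file paths are hypothetical placeholders, and the test output path carries a "{}" slot because the tagging pipeline writes one test file per language.

# Illustrative only: parse hypothetical flag values and run the new tagging path.
import sys
from absl import flags
from official.nlp.data import create_finetuning_data

FLAGS = flags.FLAGS
FLAGS(sys.argv[:1] + [
    "--fine_tuning_task_type=tagging",
    "--tagging_task_name=panx",
    "--tokenizer_impl=word_piece",
    "--vocab_file=/path/to/vocab.txt",   # placeholder
    "--input_data_dir=/path/to/panx",    # placeholder
    "--max_seq_length=128",
    "--train_data_output_path=/tmp/panx_train.tf_record",
    "--eval_data_output_path=/tmp/panx_eval.tf_record",
    "--test_data_output_path=/tmp/panx_test_{}.tf_record",
])
input_meta_data = create_finetuning_data.generate_tagging_dataset()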
official/nlp/data/tagging_data_lib.py  0 → 100644 (new file)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library to process data for tagging task such as NER/POS."""
import collections
import os

from absl import logging
import tensorflow as tf

from official.nlp.data import classifier_data_lib

# A negative label id for the padding label, which will not contribute
# to loss/metrics in training.
_PADDING_LABEL_ID = -1

# The special unknown token, used to substitute a word which has too many
# subwords after tokenization.
_UNK_TOKEN = "[UNK]"
class InputExample(object):
  """A single training/test example for token classification."""

  def __init__(self, sentence_id, words=None, label_ids=None):
    """Constructs an InputExample."""
    self.sentence_id = sentence_id
    self.words = words if words else []
    self.label_ids = label_ids if label_ids else []

  def add_word_and_label_id(self, word, label_id):
    """Adds word and label_id pair in the example."""
    self.words.append(word)
    self.label_ids.append(label_id)
def _read_one_file(file_name, label_list):
  """Reads one file and returns a list of `InputExample` instances."""
  lines = tf.io.gfile.GFile(file_name, "r").readlines()
  examples = []
  label_id_map = {label: i for i, label in enumerate(label_list)}
  sentence_id = 0
  example = InputExample(sentence_id=0)
  for line in lines:
    line = line.strip("\n")
    if line:
      # The format is: <token>\t<label> for train/dev set and <token> for test.
      items = line.split("\t")
      assert len(items) == 2 or len(items) == 1
      token = items[0].strip()

      # Assign a dummy label_id for test set
      label_id = label_id_map[items[1].strip()] if len(items) == 2 else 0
      example.add_word_and_label_id(token, label_id)
    else:
      # Empty line indicates a new sentence.
      if example.words:
        examples.append(example)
        sentence_id += 1
        example = InputExample(sentence_id=sentence_id)

  if example.words:
    examples.append(example)
  return examples
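For reference, a minimal sketch (not part of the commit) of the file format `_read_one_file` above expects: one `<token>\t<label>` pair per line, with a blank line between sentences. The demo path is a placeholder, and the label list is the Panx label set defined further down in this file.

import tensorflow as tf

# Hypothetical two-sentence train-style file; tabs separate token and label.
_DEMO = "John\tB-PER\nSmith\tI-PER\nvisited\tO\nParis\tB-LOC\n\nHello\tO\n"
with tf.io.gfile.GFile("/tmp/demo-train-en.tsv", "w") as f:
  f.write(_DEMO)

panx_labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
examples = _read_one_file("/tmp/demo-train-en.tsv", panx_labels)
# examples[0].words     -> ["John", "Smith", "visited", "Paris"]
# examples[0].label_ids -> [1, 2, 0, 3]
# examples[1].words     -> ["Hello"], with sentence_id == 1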
class PanxProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Panx data set."""
  supported_languages = [
      "ar", "he", "vi", "id", "jv", "ms", "tl", "eu", "ml", "ta", "te", "af",
      "nl", "en", "de", "el", "bn", "hi", "mr", "ur", "fa", "fr", "it", "pt",
      "es", "bg", "ru", "ja", "ka", "ko", "th", "sw", "yo", "my", "zh", "kk",
      "tr", "et", "fi", "hu"
  ]

  def get_train_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    examples_dict = {}
    for language in self.supported_languages:
      examples_dict[language] = _read_one_file(
          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
    return examples_dict

  def get_labels(self):
    return ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]

  @staticmethod
  def get_processor_name():
    return "panx"
class UdposProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Udpos data set."""
  supported_languages = [
      "af", "ar", "bg", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr",
      "he", "hi", "hu", "id", "it", "ja", "kk", "ko", "mr", "nl", "pt", "ru",
      "ta", "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"
  ]

  def get_train_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    examples_dict = {}
    for language in self.supported_languages:
      examples_dict[language] = _read_one_file(
          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
    return examples_dict

  def get_labels(self):
    return [
        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
        "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
    ]

  @staticmethod
  def get_processor_name():
    return "udpos"
def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
  """Tokenizes words and breaks long example into short ones."""
  # Needs additional [CLS] and [SEP] tokens.
  max_length = max_length - 2
  new_examples = []
  new_example = InputExample(sentence_id=example.sentence_id)
  for i, word in enumerate(example.words):
    if text_preprocessing:
      word = text_preprocessing(word)
    subwords = tokenizer.tokenize(word)
    if (not subwords or len(subwords) > max_length) and word:
      subwords = [_UNK_TOKEN]
    if len(subwords) + len(new_example.words) > max_length:
      # Start a new example.
      new_examples.append(new_example)
      new_example = InputExample(sentence_id=example.sentence_id)
    for j, subword in enumerate(subwords):
      # Use the real label for the first subword, and pad label for
      # the remainings.
      subword_label = example.label_ids[i] if j == 0 else _PADDING_LABEL_ID
      new_example.add_word_and_label_id(subword, subword_label)

  if new_example.words:
    new_examples.append(new_example)
  return new_examples
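A small illustrative sketch (not part of the commit) of `_tokenize_example`: the toy `FakeTokenizer` below stands in for a real WordPiece tokenizer to show how continuation sub-words receive `_PADDING_LABEL_ID` and how an over-long sentence is split into several `InputExample`s sharing one `sentence_id`.

class FakeTokenizer(object):
  """Illustrative stand-in for a WordPiece tokenizer: splits words on '-'."""

  def tokenize(self, word):
    return word.split("-")

example = InputExample(sentence_id=0,
                       words=["New-York", "is", "big"],
                       label_ids=[3, 0, 0])

short = _tokenize_example(example, max_length=10, tokenizer=FakeTokenizer())
# short[0].words     -> ["New", "York", "is", "big"]
# short[0].label_ids -> [3, -1, 0, 0]   (continuation sub-word gets -1)

split = _tokenize_example(example, max_length=4, tokenizer=FakeTokenizer())
# max_length=4 leaves room for only 2 sub-words per example after [CLS]/[SEP],
# so the sentence is broken into two InputExamples with the same sentence_id:
# split[0].words -> ["New", "York"], split[1].words -> ["is", "big"]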
def _convert_single_example(example, max_seq_length, tokenizer):
  """Converts an `InputExample` instance to a `tf.train.Example` instance."""
  tokens = ["[CLS]"]
  tokens.extend(example.words)
  tokens.append("[SEP]")
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  label_ids = [_PADDING_LABEL_ID]
  if any([x < 0 for x in example.label_ids]):
    raise ValueError("Unexpected negative label_id: %s" % example.label_ids)

  label_ids.extend(example.label_ids)
  label_ids.append(_PADDING_LABEL_ID)

  segment_ids = [0] * len(input_ids)
  input_mask = [1] * len(input_ids)

  # Pad up to the sequence length.
  while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
    label_ids.append(_PADDING_LABEL_ID)

  def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

  features = collections.OrderedDict()
  features["input_ids"] = create_int_feature(input_ids)
  features["input_mask"] = create_int_feature(input_mask)
  features["segment_ids"] = create_int_feature(segment_ids)
  features["label_ids"] = create_int_feature(label_ids)
  features["sentence_id"] = create_int_feature([example.sentence_id])

  tf_example = tf.train.Example(features=tf.train.Features(feature=features))
  return tf_example
def write_example_to_file(examples,
                          tokenizer,
                          max_seq_length,
                          output_file,
                          text_preprocessing=None):
  """Writes `InputExample`s into a tfrecord file with `tf.train.Example` protos.

  Note that the words inside each example will be tokenized and be applied by
  `text_preprocessing` if available. Also, if the length of sentence (plus
  special [CLS] and [SEP] tokens) exceeds `max_seq_length`, the long sentence
  will be broken into multiple short examples. For example:

  Example (text_preprocessing=lowercase, max_seq_length=5)
    words:        ["What", "a", "great", "weekend"]
    labels:       [     7,   5,       9,        10]
    sentence_id:  0
    preprocessed: ["what", "a", "great", "weekend"]
    tokenized:    ["what", "a", "great", "week", "##end"]

  will result in two tf.example protos:

    tokens:      ["[CLS]", "what", "a", "great", "[SEP]"]
    label_ids:   [     -1,      7,   5,       9,      -1]
    input_mask:  [      1,      1,   1,       1,       1]
    segment_ids: [      0,      0,   0,       0,       0]
    input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
    sentence_id: 0

    tokens:      ["[CLS]", "week", "##end", "[SEP]", "[PAD]"]
    label_ids:   [     -1,     10,      -1,      -1,      -1]
    input_mask:  [      1,      1,       1,       1,       0]
    segment_ids: [      0,      0,       0,       0,       0]
    input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
    sentence_id: 0

  Note the use of -1 in `label_ids` to indicate that a token should not be
  considered for classification (e.g., trailing ## wordpieces or special
  token). Token classification models should accordingly ignore these when
  calculating loss, metrics, etc...

  Args:
    examples: A list of `InputExample` instances.
    tokenizer: The tokenizer to be applied on the data.
    max_seq_length: Maximum length of generated sequences.
    output_file: The name of the output tfrecord file.
    text_preprocessing: optional preprocessing run on each word prior to
      tokenization.

  Returns:
    The total number of tf.train.Example proto written to file.
  """
  tf.io.gfile.makedirs(os.path.dirname(output_file))
  writer = tf.io.TFRecordWriter(output_file)
  num_tokenized_examples = 0
  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      logging.info("Writing example %d of %d to %s", ex_index, len(examples),
                   output_file)

    tokenized_examples = _tokenize_example(example, max_seq_length, tokenizer,
                                           text_preprocessing)
    num_tokenized_examples += len(tokenized_examples)
    for per_tokenized_example in tokenized_examples:
      tf_example = _convert_single_example(per_tokenized_example,
                                           max_seq_length, tokenizer)
      writer.write(tf_example.SerializeToString())

  writer.close()
  return num_tokenized_examples
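For readers of the generated files, a hedged sketch (not part of the commit) of how the fixed-length records written by `write_example_to_file` could be parsed back; the feature names and lengths mirror `_convert_single_example`, and the record path is a placeholder.

import tensorflow as tf

max_seq_length = 128  # must match the value used when writing

# Feature spec mirroring the features built in _convert_single_example.
name_to_features = {
    "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "sentence_id": tf.io.FixedLenFeature([1], tf.int64),
}

def _decode(record):
  return tf.io.parse_single_example(record, name_to_features)

# "/tmp/panx_train.tf_record" is a hypothetical output path.
dataset = tf.data.TFRecordDataset("/tmp/panx_train.tf_record").map(_decode)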
def token_classification_meta_data(train_data_size,
                                   max_seq_length,
                                   num_labels,
                                   eval_data_size=None,
                                   test_data_size=None,
                                   label_list=None,
                                   processor_type=None):
  """Creates metadata for tagging (token classification) datasets."""
  meta_data = {
      "train_data_size": train_data_size,
      "max_seq_length": max_seq_length,
      "num_labels": num_labels,
      "task_type": "tagging",
      "label_type": "int",
      "label_shape": [max_seq_length],
  }
  if eval_data_size:
    meta_data["eval_data_size"] = eval_data_size
  if test_data_size:
    meta_data["test_data_size"] = test_data_size
  if label_list:
    meta_data["label_list"] = label_list
  if processor_type:
    meta_data["processor_type"] = processor_type
  return meta_data
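As an illustration (not part of the file), metadata for a hypothetical panx run could look like this; the dataset sizes are made-up placeholders, while `num_labels`, `label_list`, and `processor_type` follow from `PanxProcessor`.

meta_data = token_classification_meta_data(
    train_data_size=20000,         # placeholder count
    max_seq_length=128,
    num_labels=7,                  # len(PanxProcessor().get_labels())
    eval_data_size=10000,          # placeholder count
    test_data_size={"en": 10000},  # placeholder, keyed by language
    label_list=["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"],
    processor_type="panx")
# meta_data["task_type"]   -> "tagging"
# meta_data["label_shape"] -> [128]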
def generate_tf_record_from_data_file(processor, data_dir, tokenizer,
                                      max_seq_length, train_data_output_path,
                                      eval_data_output_path,
                                      test_data_output_path,
                                      text_preprocessing):
  """Generates tfrecord files from the raw data."""
  common_kwargs = dict(
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      text_preprocessing=text_preprocessing)
  train_examples = processor.get_train_examples(data_dir)
  train_data_size = write_example_to_file(
      train_examples, output_file=train_data_output_path, **common_kwargs)

  eval_examples = processor.get_dev_examples(data_dir)
  eval_data_size = write_example_to_file(
      eval_examples, output_file=eval_data_output_path, **common_kwargs)

  test_input_data_examples = processor.get_test_examples(data_dir)
  test_data_size = {}
  for language, examples in test_input_data_examples.items():
    test_data_size[language] = write_example_to_file(
        examples, output_file=test_data_output_path.format(language),
        **common_kwargs)

  labels = processor.get_labels()
  meta_data = token_classification_meta_data(
      train_data_size,
      max_seq_length,
      len(labels),
      eval_data_size,
      test_data_size,
      label_list=labels,
      processor_type=processor.get_processor_name())
  return meta_data
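Finally, a hedged end-to-end sketch (not part of the commit) of calling the library directly, which is essentially what the new `generate_tagging_dataset` in create_finetuning_data.py does with flag values. The paths are placeholders, and the tokenizer import assumes the BERT tokenization module that create_finetuning_data.py already uses.

from official.nlp.bert import tokenization
from official.nlp.data import tagging_data_lib

# Placeholder paths; test_data_output_path must contain a '{}' slot because
# generate_tf_record_from_data_file formats it with each test language.
tokenizer = tokenization.FullTokenizer(
    vocab_file="/path/to/vocab.txt", do_lower_case=True)
meta_data = tagging_data_lib.generate_tf_record_from_data_file(
    processor=tagging_data_lib.PanxProcessor(),
    data_dir="/path/to/panx",
    tokenizer=tokenizer,
    max_seq_length=128,
    train_data_output_path="/tmp/panx_train.tf_record",
    eval_data_output_path="/tmp/panx_eval.tf_record",
    test_data_output_path="/tmp/panx_test_{}.tf_record",
    text_preprocessing=tokenization.convert_to_unicode)
print(meta_data["train_data_size"], meta_data["num_labels"])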