Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
eb3ec508
Commit
eb3ec508
authored
Oct 26, 2020
by
Allen Wang
Committed by
A. Unique TensorFlower
Oct 26, 2020
Browse files
Add in IMDB dataset processor to TF-NLP.
PiperOrigin-RevId: 339134320
parent
764c18c5
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
54 additions
and
9 deletions
+54
-9
official/nlp/data/classifier_data_lib.py
official/nlp/data/classifier_data_lib.py
+49
-6
official/nlp/data/create_finetuning_data.py
official/nlp/data/create_finetuning_data.py
+5
-3
No files found.
official/nlp/data/classifier_data_lib.py
View file @
eb3ec508
...
@@ -214,6 +214,44 @@ class ColaProcessor(DataProcessor):
...
@@ -214,6 +214,44 @@ class ColaProcessor(DataProcessor):
return
examples
return
examples
class ImdbProcessor(DataProcessor):
  """Processor for the IMDb dataset.

  Each split directory is expected to contain one sub-directory per label
  ("neg" and "pos"), each holding one review per text file.
  """

  def get_labels(self):
    """Returns the list of class labels, which are also the sub-directory names."""
    return ["neg", "pos"]

  def get_train_examples(self, data_dir):
    """Returns the examples read from the "train" sub-directory of `data_dir`."""
    return self._create_examples(os.path.join(data_dir, "train"))

  def get_dev_examples(self, data_dir):
    """Returns the examples read from the "test" sub-directory of `data_dir`."""
    return self._create_examples(os.path.join(data_dir, "test"))

  @staticmethod
  def get_processor_name():
    """See base class."""
    return "IMDB"

  def _create_examples(self, data_dir):
    """Creates examples from the per-label sub-directories of `data_dir`.

    Args:
      data_dir: Directory of a single split; it must contain one
        sub-directory per label returned by `get_labels()`.

    Returns:
      A list of `InputExample`s, one per file whose name ends with "txt",
      with `text_a` set to the file content (surrounding whitespace stripped
      and "<br />" markers replaced by spaces) and `label` set to the name of
      the sub-directory the file was found in.
    """
    examples = []
    # Reuse get_labels() so the label set is defined in exactly one place.
    for label in self.get_labels():
      cur_dir = os.path.join(data_dir, label)
      # The listing order of the directory is not guaranteed, so sort it to
      # make the order of the produced examples reproducible across runs.
      for filename in sorted(tf.io.gfile.listdir(cur_dir)):
        if not filename.endswith("txt"):
          continue

        # Progress log; this method serves both the train and the dev split,
        # so the message names the directory instead of a fixed split.
        if len(examples) % 1000 == 0:
          logging.info("Loading example %d from %s", len(examples), cur_dir)

        path = os.path.join(cur_dir, filename)
        with tf.io.gfile.GFile(path, "r") as f:
          text = f.read().strip().replace("<br />", " ")
        examples.append(
            InputExample(
                guid="unused_id", text_a=text, text_b=None, label=label))
    return examples
class
MnliProcessor
(
DataProcessor
):
class
MnliProcessor
(
DataProcessor
):
"""Processor for the MultiNLI data set (GLUE version)."""
"""Processor for the MultiNLI data set (GLUE version)."""
...
@@ -1032,6 +1070,11 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
...
@@ -1032,6 +1070,11 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
if
len
(
tokens_a
)
>
max_seq_length
-
2
:
if
len
(
tokens_a
)
>
max_seq_length
-
2
:
tokens_a
=
tokens_a
[
0
:(
max_seq_length
-
2
)]
tokens_a
=
tokens_a
[
0
:(
max_seq_length
-
2
)]
seg_id_a
=
0
seg_id_b
=
1
seg_id_cls
=
0
seg_id_pad
=
0
# The convention in BERT is:
# The convention in BERT is:
# (a) For sequence pairs:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
...
@@ -1053,19 +1096,19 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
...
@@ -1053,19 +1096,19 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
tokens
=
[]
tokens
=
[]
segment_ids
=
[]
segment_ids
=
[]
tokens
.
append
(
"[CLS]"
)
tokens
.
append
(
"[CLS]"
)
segment_ids
.
append
(
0
)
segment_ids
.
append
(
seg_id_cls
)
for
token
in
tokens_a
:
for
token
in
tokens_a
:
tokens
.
append
(
token
)
tokens
.
append
(
token
)
segment_ids
.
append
(
0
)
segment_ids
.
append
(
seg_id_a
)
tokens
.
append
(
"[SEP]"
)
tokens
.
append
(
"[SEP]"
)
segment_ids
.
append
(
0
)
segment_ids
.
append
(
seg_id_a
)
if
tokens_b
:
if
tokens_b
:
for
token
in
tokens_b
:
for
token
in
tokens_b
:
tokens
.
append
(
token
)
tokens
.
append
(
token
)
segment_ids
.
append
(
1
)
segment_ids
.
append
(
seg_id_b
)
tokens
.
append
(
"[SEP]"
)
tokens
.
append
(
"[SEP]"
)
segment_ids
.
append
(
1
)
segment_ids
.
append
(
seg_id_b
)
input_ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
input_ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
...
@@ -1077,7 +1120,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
...
@@ -1077,7 +1120,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
while
len
(
input_ids
)
<
max_seq_length
:
while
len
(
input_ids
)
<
max_seq_length
:
input_ids
.
append
(
0
)
input_ids
.
append
(
0
)
input_mask
.
append
(
0
)
input_mask
.
append
(
0
)
segment_ids
.
append
(
0
)
segment_ids
.
append
(
seg_id_pad
)
assert
len
(
input_ids
)
==
max_seq_length
assert
len
(
input_ids
)
==
max_seq_length
assert
len
(
input_mask
)
==
max_seq_length
assert
len
(
input_mask
)
==
max_seq_length
...
...
official/nlp/data/create_finetuning_data.py
View file @
eb3ec508
...
@@ -47,9 +47,9 @@ flags.DEFINE_string(
...
@@ -47,9 +47,9 @@ flags.DEFINE_string(
"for the task."
)
"for the task."
)
flags
.
DEFINE_enum
(
"classification_task_name"
,
"MNLI"
,
flags
.
DEFINE_enum
(
"classification_task_name"
,
"MNLI"
,
[
"AX"
,
"COLA"
,
"MNLI"
,
"MRPC"
,
"PAWS-X"
,
"QNLI"
,
"QQP"
,
"RTE"
,
[
"AX"
,
"COLA"
,
"IMDB"
,
"MNLI"
,
"MRPC"
,
"PAWS-X"
,
"QNLI"
,
"SST-2"
,
"STS-B"
,
"WNLI"
,
"XNLI"
,
"XTREME-XNLI"
,
"QQP"
,
"RTE"
,
"SST-2"
,
"STS-B"
,
"WNLI"
,
"XNLI"
,
"XTREME-PAWS-X"
],
"XTREME-XNLI"
,
"XTREME-PAWS-X"
],
"The name of the task to train BERT classifier. The "
"The name of the task to train BERT classifier. The "
"difference between XTREME-XNLI and XNLI is: 1. the format "
"difference between XTREME-XNLI and XNLI is: 1. the format "
"of input tsv files; 2. the dev set for XTREME is english "
"of input tsv files; 2. the dev set for XTREME is english "
...
@@ -182,6 +182,8 @@ def generate_classifier_dataset():
...
@@ -182,6 +182,8 @@ def generate_classifier_dataset():
classifier_data_lib
.
AxProcessor
,
classifier_data_lib
.
AxProcessor
,
"cola"
:
"cola"
:
classifier_data_lib
.
ColaProcessor
,
classifier_data_lib
.
ColaProcessor
,
"imdb"
:
classifier_data_lib
.
ImdbProcessor
,
"mnli"
:
"mnli"
:
functools
.
partial
(
classifier_data_lib
.
MnliProcessor
,
functools
.
partial
(
classifier_data_lib
.
MnliProcessor
,
mnli_type
=
FLAGS
.
mnli_type
),
mnli_type
=
FLAGS
.
mnli_type
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment