ModelZoo / ResNet50_tensorflow / Commits / e0b6ce02

Commit e0b6ce02, authored Aug 21, 2020 by Chen Chen; committed by A. Unique TensorFlower on Aug 21, 2020.

Internal change

PiperOrigin-RevId: 327830072
Parent: fe30e189

Showing 5 changed files with 65 additions and 34 deletions (+65 / -34).
  official/nlp/data/tagging_data_lib.py        +12  -3
  official/nlp/data/tagging_dataloader.py       +3  -0
  official/nlp/data/tagging_dataloader_test.py  +21  -8
  official/nlp/tasks/tagging.py                 +25 -20
  official/nlp/tasks/tagging_test.py             +4  -3
official/nlp/data/tagging_data_lib.py (view file @ e0b6ce02)

@@ -33,9 +33,14 @@ _UNK_TOKEN = "[UNK]"

 class InputExample(object):
   """A single training/test example for token classification."""

-  def __init__(self, sentence_id, words=None, label_ids=None):
+  def __init__(self,
+               sentence_id,
+               sub_sentence_id=0,
+               words=None,
+               label_ids=None):
     """Constructs an InputExample."""
     self.sentence_id = sentence_id
+    self.sub_sentence_id = sub_sentence_id
     self.words = words if words else []
     self.label_ids = label_ids if label_ids else []

@@ -146,7 +151,7 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
   # Needs additional [CLS] and [SEP] tokens.
   max_length = max_length - 2
   new_examples = []
-  new_example = InputExample(sentence_id=example.sentence_id)
+  new_example = InputExample(sentence_id=example.sentence_id, sub_sentence_id=0)
   for i, word in enumerate(example.words):
     if any([x < 0 for x in example.label_ids]):
       raise ValueError("Unexpected negative label_id: %s" % example.label_ids)

@@ -160,7 +165,10 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
     if len(subwords) + len(new_example.words) > max_length:
       # Start a new example.
       new_examples.append(new_example)
-      new_example = InputExample(sentence_id=example.sentence_id)
+      last_sub_sentence_id = new_example.sub_sentence_id
+      new_example = InputExample(
+          sentence_id=example.sentence_id,
+          sub_sentence_id=last_sub_sentence_id + 1)
     for j, subword in enumerate(subwords):
       # Use the real label for the first subword, and pad label for

@@ -203,6 +211,7 @@ def _convert_single_example(example, max_seq_length, tokenizer):
   features["segment_ids"] = create_int_feature(segment_ids)
   features["label_ids"] = create_int_feature(label_ids)
   features["sentence_id"] = create_int_feature([example.sentence_id])
+  features["sub_sentence_id"] = create_int_feature([example.sub_sentence_id])
   tf_example = tf.train.Example(features=tf.train.Features(feature=features))
   return tf_example
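
Note (editor's illustration, not part of the commit): the new sub_sentence_id records the position of each chunk when a long sentence is split at the max_length boundary, so the chunks can be reassembled in order later. A minimal, self-contained sketch of the intended numbering, using a simplified word-level split in place of the real subword logic in _tokenize_example:

class InputExample(object):
  """Mirrors the InputExample above: one (sub-)sentence of tagged words."""

  def __init__(self, sentence_id, sub_sentence_id=0, words=None, label_ids=None):
    self.sentence_id = sentence_id
    self.sub_sentence_id = sub_sentence_id
    self.words = words if words else []
    self.label_ids = label_ids if label_ids else []


def split_example(example, max_length):
  """Hypothetical word-level splitter; the real code splits on subwords."""
  pieces = []
  current = InputExample(sentence_id=example.sentence_id, sub_sentence_id=0)
  for word, label_id in zip(example.words, example.label_ids):
    if len(current.words) + 1 > max_length:
      # Start a new piece and bump the sub-sentence counter, as in the diff.
      pieces.append(current)
      current = InputExample(
          sentence_id=example.sentence_id,
          sub_sentence_id=current.sub_sentence_id + 1)
    current.words.append(word)
    current.label_ids.append(label_id)
  pieces.append(current)
  return pieces


long_example = InputExample(
    sentence_id=7, words=['w0', 'w1', 'w2', 'w3', 'w4'], label_ids=[0, 1, 2, 3, 4])
for piece in split_example(long_example, max_length=2):
  print(piece.sentence_id, piece.sub_sentence_id, piece.words)
# 7 0 ['w0', 'w1']
# 7 1 ['w2', 'w3']
# 7 2 ['w4']

Serializing (sentence_id, sub_sentence_id) with every example is what later lets predict() sort and regroup the chunks that belong to the same sentence.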
official/nlp/data/tagging_dataloader.py (view file @ e0b6ce02)

@@ -52,6 +52,7 @@ class TaggingDataLoader(data_loader.DataLoader):
     }
     if self._include_sentence_id:
       name_to_features['sentence_id'] = tf.io.FixedLenFeature([], tf.int64)
+      name_to_features['sub_sentence_id'] = tf.io.FixedLenFeature([], tf.int64)

     example = tf.io.parse_single_example(record, name_to_features)

@@ -74,6 +75,8 @@ class TaggingDataLoader(data_loader.DataLoader):
     }
     if self._include_sentence_id:
       x['sentence_id'] = record['sentence_id']
+      x['sub_sentence_id'] = record['sub_sentence_id']

     y = record['label_ids']
     return (x, y)
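
Note (editor's illustration, not from the commit): with the extra FixedLenFeature registered, a serialized record that carries both ids decodes alongside the other features. A small self-contained sketch with a toy feature spec (the real loader also includes input_mask, segment_ids, and a configurable seq_length):

import tensorflow as tf

# Toy schema: fixed-length sequences of 4 plus the two scalar id fields.
name_to_features = {
    'input_ids': tf.io.FixedLenFeature([4], tf.int64),
    'label_ids': tf.io.FixedLenFeature([4], tf.int64),
    'sentence_id': tf.io.FixedLenFeature([], tf.int64),
    'sub_sentence_id': tf.io.FixedLenFeature([], tf.int64),  # new in this commit
}

def _int_feature(values):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

record = tf.train.Example(features=tf.train.Features(feature={
    'input_ids': _int_feature([101, 7, 8, 102]),
    'label_ids': _int_feature([-1, 2, 3, -1]),
    'sentence_id': _int_feature([5]),
    'sub_sentence_id': _int_feature([0]),
})).SerializeToString()

example = tf.io.parse_single_example(record, name_to_features)
print(int(example['sentence_id']), int(example['sub_sentence_id']))  # 5 0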
official/nlp/data/tagging_dataloader_test.py (view file @ e0b6ce02)

@@ -16,13 +16,14 @@
 """Tests for official.nlp.data.tagging_data_loader."""
 import os

+from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf

 from official.nlp.data import tagging_dataloader


-def _create_fake_dataset(output_path, seq_length):
+def _create_fake_dataset(output_path, seq_length, include_sentence_id):
   """Creates a fake dataset."""
   writer = tf.io.TFRecordWriter(output_path)

@@ -30,7 +31,7 @@ def _create_fake_dataset(output_path, seq_length):
     f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
     return f

-  for _ in range(100):
+  for i in range(100):
     features = {}
     input_ids = np.random.randint(100, size=(seq_length))
     features['input_ids'] = create_int_feature(input_ids)

@@ -38,32 +39,44 @@ def _create_fake_dataset(output_path, seq_length):
     features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
     features['label_ids'] = create_int_feature(
         np.random.randint(10, size=(seq_length)))
+    if include_sentence_id:
+      features['sentence_id'] = create_int_feature([i])
+      features['sub_sentence_id'] = create_int_feature([0])
     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())
   writer.close()


-class TaggingDataLoaderTest(tf.test.TestCase):
+class TaggingDataLoaderTest(tf.test.TestCase, parameterized.TestCase):

-  def test_load_dataset(self):
+  @parameterized.parameters(True, False)
+  def test_load_dataset(self, include_sentence_id):
     seq_length = 16
     batch_size = 10
     train_data_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
-    _create_fake_dataset(train_data_path, seq_length)
+    _create_fake_dataset(train_data_path, seq_length, include_sentence_id)
     data_config = tagging_dataloader.TaggingDataConfig(
         input_path=train_data_path,
         seq_length=seq_length,
-        global_batch_size=batch_size)
+        global_batch_size=batch_size,
+        include_sentence_id=include_sentence_id)

     dataset = tagging_dataloader.TaggingDataLoader(data_config).load()
     features, labels = next(iter(dataset))
-    self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'],
-                          features.keys())
+    expected_keys = ['input_word_ids', 'input_mask', 'input_type_ids']
+    if include_sentence_id:
+      expected_keys.extend(['sentence_id', 'sub_sentence_id'])
+    self.assertCountEqual(expected_keys, features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
     self.assertEqual(labels.shape, (batch_size, seq_length))
+    if include_sentence_id:
+      self.assertEqual(features['sentence_id'].shape, (batch_size,))
+      self.assertEqual(features['sub_sentence_id'].shape, (batch_size,))


 if __name__ == '__main__':
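
Aside (illustrative only): @parameterized.parameters(True, False) expands the single test method into two cases, one per value of include_sentence_id. A minimal standalone example of the pattern, with a hypothetical test class:

from absl.testing import parameterized
import tensorflow as tf


class ToyParameterizedTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(True, False)
  def test_flag(self, include_sentence_id):
    # Runs twice: once with include_sentence_id=True, once with False.
    self.assertIn(include_sentence_id, (True, False))


if __name__ == '__main__':
  tf.test.main()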
official/nlp/tasks/tagging.py (view file @ e0b6ce02)

@@ -214,8 +214,9 @@ class TaggingTask(base_task.Task):
     }


-def predict(task: TaggingTask, params: cfg.DataConfig,
-            model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
+def predict(task: TaggingTask,
+            params: cfg.DataConfig,
+            model: tf.keras.Model) -> List[Tuple[int, int, List[int]]]:
   """Predicts on the input data.

   Args:

@@ -224,46 +225,50 @@ def predict(task: TaggingTask, params: cfg.DataConfig,
     model: A keras.Model.

   Returns:
-    A tuple of `predict_ids` and `sentence_ids`, which are list with length
-    of `num_examples`. Each element in `predict_ids` is a sequence of
-    predicted per-word label id, and each element in `sentence_ids` is the
-    sentence id of the corresponding example.
+    A list of tuple. Each tuple contains `sentence_id`, `sub_sentence_id` and
+    a list of predicted ids.
   """

   def predict_step(inputs):
     """Replicated prediction calculation."""
     x, y = inputs
     sentence_ids = x.pop('sentence_id')
+    sub_sentence_ids = x.pop('sub_sentence_id')
     outputs = task.inference_step(x, model)
     predict_ids = outputs['predict_ids']
     label_mask = tf.greater_equal(y, 0)
     return dict(
         predict_ids=predict_ids,
         label_mask=label_mask,
-        sentence_ids=sentence_ids)
+        sentence_ids=sentence_ids,
+        sub_sentence_ids=sub_sentence_ids)

   def aggregate_fn(state, outputs):
     """Concatenates model's outputs."""
     if state is None:
-      state = {'predict_ids': [], 'sentence_ids': []}
+      state = []

-    cur_predict_ids = state['predict_ids']
-    cur_sentence_ids = state['sentence_ids']
-    for batch_predict_ids, batch_label_mask, batch_sentence_ids in zip(
-        outputs['predict_ids'], outputs['label_mask'],
-        outputs['sentence_ids']):
-      for tmp_predict_ids, tmp_label_mask, tmp_sentence_id in zip(
-          batch_predict_ids.numpy(), batch_label_mask.numpy(),
-          batch_sentence_ids.numpy()):
-        cur_sentence_ids.append(tmp_sentence_id)
-        cur_predict_ids.append([])
+    for (batch_predict_ids, batch_label_mask, batch_sentence_ids,
+         batch_sub_sentence_ids) in zip(outputs['predict_ids'],
+                                        outputs['label_mask'],
+                                        outputs['sentence_ids'],
+                                        outputs['sub_sentence_ids']):
+      for (tmp_predict_ids, tmp_label_mask, tmp_sentence_id,
+           tmp_sub_sentence_id) in zip(batch_predict_ids.numpy(),
+                                       batch_label_mask.numpy(),
+                                       batch_sentence_ids.numpy(),
+                                       batch_sub_sentence_ids.numpy()):
+        real_predict_ids = []
         assert len(tmp_predict_ids) == len(tmp_label_mask)
         for i in range(len(tmp_predict_ids)):
           # Skip the padding label.
           if tmp_label_mask[i]:
-            cur_predict_ids[-1].append(tmp_predict_ids[i])
+            real_predict_ids.append(tmp_predict_ids[i])
+        state.append((tmp_sentence_id, tmp_sub_sentence_id, real_predict_ids))
     return state

   dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                                  task.build_inputs, params)
   outputs = utils.predict(predict_step, aggregate_fn, dataset)
-  return outputs['predict_ids'], outputs['sentence_ids']
+  return sorted(outputs, key=lambda x: (x[0], x[1]))
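
Note (editor's illustration, not part of the commit): since predict() now returns a list of (sentence_id, sub_sentence_id, predict_ids) tuples sorted by the two ids, a caller can stitch the per-chunk predictions back into one label sequence per sentence. A hedged sketch of that regrouping step; the fake results list stands in for real model output:

import collections

# Fake output in the shape returned by the new predict(): sorted tuples of
# (sentence_id, sub_sentence_id, predicted label ids with padding removed).
results = [
    (0, 0, [1, 2, 3]),
    (0, 1, [4]),        # second chunk of sentence 0
    (1, 0, [5, 6]),
]

merged = collections.defaultdict(list)
for sentence_id, _sub_sentence_id, predict_ids in results:
  # Tuples arrive sorted by (sentence_id, sub_sentence_id), so appending in
  # order restores the original word order within each sentence.
  merged[sentence_id].extend(predict_ids)

print(dict(merged))  # {0: [1, 2, 3, 4], 1: [5, 6]}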
official/nlp/tasks/tagging_test.py (view file @ e0b6ce02)

@@ -44,6 +44,7 @@ def _create_fake_dataset(output_path, seq_length, num_labels, num_examples):
     features["label_ids"] = create_int_feature(
         np.random.random_integers(-1, num_labels - 1, size=(seq_length)))
     features["sentence_id"] = create_int_feature([i])
+    features["sub_sentence_id"] = create_int_feature([0])
     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())

@@ -189,9 +190,9 @@ class TaggingTest(tf.test.TestCase):
         drop_remainder=False,
         include_sentence_id=True)
-    predict_ids, sentence_ids = tagging.predict(task, test_data_config, model)
-    self.assertLen(predict_ids, num_examples)
-    self.assertLen(sentence_ids, num_examples)
+    results = tagging.predict(task, test_data_config, model)
+    self.assertLen(results, num_examples)
+    self.assertLen(results[0], 3)


 if __name__ == "__main__":