Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
f71758f7
Commit
f71758f7
authored
Sep 25, 2019
by
thomwolf
Browse files
update internal glue processors
parent
0f091062
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
108 additions
and
36 deletions
+108
-36
examples/run_glue.py
examples/run_glue.py
+13
-9
pytorch_transformers/data/processors/glue.py
pytorch_transformers/data/processors/glue.py
+66
-23
pytorch_transformers/data/processors/utils.py
pytorch_transformers/data/processors/utils.py
+29
-4
No files found.
examples/run_glue.py
View file @
f71758f7
...
@@ -278,10 +278,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
...
@@ -278,10 +278,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
# HACK(label indices are swapped in RoBERTa pretrained model)
# HACK(label indices are swapped in RoBERTa pretrained model)
label_list
[
1
],
label_list
[
2
]
=
label_list
[
2
],
label_list
[
1
]
label_list
[
1
],
label_list
[
2
]
=
label_list
[
2
],
label_list
[
1
]
examples
=
processor
.
get_dev_examples
(
args
.
data_dir
)
if
evaluate
else
processor
.
get_train_examples
(
args
.
data_dir
)
examples
=
processor
.
get_dev_examples
(
args
.
data_dir
)
if
evaluate
else
processor
.
get_train_examples
(
args
.
data_dir
)
features
=
convert_examples_to_features
(
examples
,
label_list
,
args
.
max_seq_length
,
tokenizer
,
output_mode
,
features
=
convert_examples_to_features
(
examples
,
pad_on_left
=
bool
(
args
.
model_type
in
[
'xlnet'
]),
# pad on the left for xlnet
label_list
,
pad_token
=
tokenizer
.
convert_tokens_to_ids
([
tokenizer
.
pad_token
])[
0
],
args
.
max_seq_length
,
pad_token_segment_id
=
4
if
args
.
model_type
in
[
'xlnet'
]
else
0
,
tokenizer
,
output_mode
,
pad_on_left
=
bool
(
args
.
model_type
in
[
'xlnet'
]),
# pad on the left for xlnet
pad_token
=
tokenizer
.
convert_tokens_to_ids
([
tokenizer
.
pad_token
])[
0
],
pad_token_segment_id
=
4
if
args
.
model_type
in
[
'xlnet'
]
else
0
,
)
)
if
args
.
local_rank
in
[
-
1
,
0
]:
if
args
.
local_rank
in
[
-
1
,
0
]:
logger
.
info
(
"Saving features into cached file %s"
,
cached_features_file
)
logger
.
info
(
"Saving features into cached file %s"
,
cached_features_file
)
...
@@ -292,14 +296,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
...
@@ -292,14 +296,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
# Convert to Tensors and build dataset
# Convert to Tensors and build dataset
all_input_ids
=
torch
.
tensor
([
f
.
input_ids
for
f
in
features
],
dtype
=
torch
.
long
)
all_input_ids
=
torch
.
tensor
([
f
.
input_ids
for
f
in
features
],
dtype
=
torch
.
long
)
all_
input
_mask
=
torch
.
tensor
([
f
.
input
_mask
for
f
in
features
],
dtype
=
torch
.
long
)
all_
attention
_mask
=
torch
.
tensor
([
f
.
attention
_mask
for
f
in
features
],
dtype
=
torch
.
long
)
all_
segment
_ids
=
torch
.
tensor
([
f
.
segment
_ids
for
f
in
features
],
dtype
=
torch
.
long
)
all_
token_type
_ids
=
torch
.
tensor
([
f
.
token_type
_ids
for
f
in
features
],
dtype
=
torch
.
long
)
if
output_mode
==
"classification"
:
if
output_mode
==
"classification"
:
all_label
_id
s
=
torch
.
tensor
([
f
.
label
_id
for
f
in
features
],
dtype
=
torch
.
long
)
all_labels
=
torch
.
tensor
([
f
.
label
for
f
in
features
],
dtype
=
torch
.
long
)
elif
output_mode
==
"regression"
:
elif
output_mode
==
"regression"
:
all_label
_id
s
=
torch
.
tensor
([
f
.
label
_id
for
f
in
features
],
dtype
=
torch
.
float
)
all_labels
=
torch
.
tensor
([
f
.
label
for
f
in
features
],
dtype
=
torch
.
float
)
dataset
=
TensorDataset
(
all_input_ids
,
all_
input
_mask
,
all_
segment
_ids
,
all_label
_id
s
)
dataset
=
TensorDataset
(
all_input_ids
,
all_
attention
_mask
,
all_
token_type
_ids
,
all_labels
)
return
dataset
return
dataset
...
...
pytorch_transformers/data/processors/glue.py
View file @
f71758f7
...
@@ -19,11 +19,18 @@ import logging
...
@@ -19,11 +19,18 @@ import logging
import
os
import
os
from
.utils
import
DataProcessor
,
InputExample
,
InputFeatures
from
.utils
import
DataProcessor
,
InputExample
,
InputFeatures
from
...file_utils
import
is_tf_available
if
is_tf_available
():
import
tensorflow
as
tf
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
def
glue_convert_examples_to_features
(
examples
,
label_list
,
max_seq_length
,
def
glue_convert_examples_to_features
(
examples
,
tokenizer
,
tokenizer
,
output_mode
,
max_length
=
512
,
task
=
None
,
label_list
=
None
,
output_mode
=
None
,
pad_on_left
=
False
,
pad_on_left
=
False
,
pad_token
=
0
,
pad_token
=
0
,
pad_token_segment_id
=
0
,
pad_token_segment_id
=
0
,
...
@@ -31,46 +38,63 @@ def glue_convert_examples_to_features(examples, label_list, max_seq_length,
...
@@ -31,46 +38,63 @@ def glue_convert_examples_to_features(examples, label_list, max_seq_length,
"""
"""
Loads a data file into a list of `InputBatch`s
Loads a data file into a list of `InputBatch`s
"""
"""
is_tf_dataset
=
False
if
is_tf_available
()
and
isinstance
(
examples
,
tf
.
data
.
Dataset
):
is_tf_dataset
=
True
if
task
is
not
None
:
processor
=
glue_processors
[
task
]()
if
label_list
is
None
:
label_list
=
processor
.
get_labels
()
logger
.
info
(
"Using label list %s for task %s"
%
(
label_list
,
task
))
if
output_mode
is
None
:
output_mode
=
glue_output_modes
[
task
]
logger
.
info
(
"Using output mode %s for task %s"
%
(
output_mode
,
task
))
label_map
=
{
label
:
i
for
i
,
label
in
enumerate
(
label_list
)}
label_map
=
{
label
:
i
for
i
,
label
in
enumerate
(
label_list
)}
features
=
[]
features
=
[]
for
(
ex_index
,
example
)
in
enumerate
(
examples
):
for
(
ex_index
,
example
)
in
enumerate
(
examples
):
if
ex_index
%
10000
==
0
:
if
ex_index
%
10000
==
0
:
logger
.
info
(
"Writing example %d of %d"
%
(
ex_index
,
len
(
examples
)))
logger
.
info
(
"Writing example %d"
%
(
ex_index
))
if
is_tf_dataset
:
example
=
InputExample
(
example
[
'idx'
].
numpy
(),
example
[
'sentence1'
].
numpy
().
decode
(
'utf-8'
),
example
[
'sentence2'
].
numpy
().
decode
(
'utf-8'
),
str
(
example
[
'label'
].
numpy
()))
inputs
=
tokenizer
.
encode_plus
(
inputs
=
tokenizer
.
encode_plus
(
example
.
text_a
,
example
.
text_a
,
example
.
text_b
,
example
.
text_b
,
add_special_tokens
=
True
,
add_special_tokens
=
True
,
max_length
=
max_
seq_
length
,
max_length
=
max_length
,
truncate_first_sequence
=
True
# We're truncating the first sequence
as a
priority
truncate_first_sequence
=
True
# We're truncating the first sequence
in
priority
)
)
input_ids
,
segment
_ids
=
inputs
[
"input_ids"
],
inputs
[
"token_type_ids"
]
input_ids
,
token_type
_ids
=
inputs
[
"input_ids"
],
inputs
[
"token_type_ids"
]
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
# tokens are attended to.
input
_mask
=
[
1
if
mask_padding_with_zero
else
0
]
*
len
(
input_ids
)
attention
_mask
=
[
1
if
mask_padding_with_zero
else
0
]
*
len
(
input_ids
)
# Zero-pad up to the sequence length.
# Zero-pad up to the sequence length.
padding_length
=
max_
seq_
length
-
len
(
input_ids
)
padding_length
=
max_length
-
len
(
input_ids
)
if
pad_on_left
:
if
pad_on_left
:
input_ids
=
([
pad_token
]
*
padding_length
)
+
input_ids
input_ids
=
([
pad_token
]
*
padding_length
)
+
input_ids
input
_mask
=
([
0
if
mask_padding_with_zero
else
1
]
*
padding_length
)
+
input
_mask
attention
_mask
=
([
0
if
mask_padding_with_zero
else
1
]
*
padding_length
)
+
attention
_mask
segment
_ids
=
([
pad_token_segment_id
]
*
padding_length
)
+
segment
_ids
token_type
_ids
=
([
pad_token_segment_id
]
*
padding_length
)
+
token_type
_ids
else
:
else
:
input_ids
=
input_ids
+
([
pad_token
]
*
padding_length
)
input_ids
=
input_ids
+
([
pad_token
]
*
padding_length
)
input_mask
=
input
_mask
+
([
0
if
mask_padding_with_zero
else
1
]
*
padding_length
)
attention_mask
=
attention
_mask
+
([
0
if
mask_padding_with_zero
else
1
]
*
padding_length
)
segment_ids
=
segment
_ids
+
([
pad_token_segment_id
]
*
padding_length
)
token_type_ids
=
token_type
_ids
+
([
pad_token_segment_id
]
*
padding_length
)
assert
len
(
input_ids
)
==
max_
seq
_length
assert
len
(
input_ids
)
==
max_
length
,
"Error with input length {} vs {}"
.
format
(
len
(
input_ids
),
max
_length
)
assert
len
(
input
_mask
)
==
max_
seq
_length
assert
len
(
attention
_mask
)
==
max_
length
,
"Error with input length {} vs {}"
.
format
(
len
(
attention_mask
),
max
_length
)
assert
len
(
segment
_ids
)
==
max_
seq
_length
assert
len
(
token_type
_ids
)
==
max_
length
,
"Error with input length {} vs {}"
.
format
(
len
(
token_type_ids
),
max
_length
)
if
output_mode
==
"classification"
:
if
output_mode
==
"classification"
:
label
_id
=
label_map
[
example
.
label
]
label
=
label_map
[
example
.
label
]
elif
output_mode
==
"regression"
:
elif
output_mode
==
"regression"
:
label
_id
=
float
(
example
.
label
)
label
=
float
(
example
.
label
)
else
:
else
:
raise
KeyError
(
output_mode
)
raise
KeyError
(
output_mode
)
...
@@ -78,15 +102,34 @@ def glue_convert_examples_to_features(examples, label_list, max_seq_length,
...
@@ -78,15 +102,34 @@ def glue_convert_examples_to_features(examples, label_list, max_seq_length,
logger
.
info
(
"*** Example ***"
)
logger
.
info
(
"*** Example ***"
)
logger
.
info
(
"guid: %s"
%
(
example
.
guid
))
logger
.
info
(
"guid: %s"
%
(
example
.
guid
))
logger
.
info
(
"input_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_ids
]))
logger
.
info
(
"input_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_ids
]))
logger
.
info
(
"
input
_mask: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input
_mask
]))
logger
.
info
(
"
attention
_mask: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
attention
_mask
]))
logger
.
info
(
"
segment
_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
segment
_ids
]))
logger
.
info
(
"
token_type
_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
token_type
_ids
]))
logger
.
info
(
"label: %s (id = %d)"
%
(
example
.
label
,
label
_id
))
logger
.
info
(
"label: %s (id = %d)"
%
(
example
.
label
,
label
))
features
.
append
(
features
.
append
(
InputFeatures
(
input_ids
=
input_ids
,
InputFeatures
(
input_ids
=
input_ids
,
input_mask
=
input_mask
,
attention_mask
=
attention_mask
,
segment_ids
=
segment_ids
,
token_type_ids
=
token_type_ids
,
label_id
=
label_id
))
label
=
label
))
if
is_tf_available
()
and
is_tf_dataset
:
def
gen
():
for
ex
in
features
:
yield
({
'input_ids'
:
ex
.
input_ids
,
'attention_mask'
:
ex
.
attention_mask
,
'token_type_ids'
:
ex
.
token_type_ids
},
ex
.
label
)
return
tf
.
data
.
Dataset
.
from_generator
(
gen
,
({
'input_ids'
:
tf
.
int32
,
'attention_mask'
:
tf
.
int32
,
'token_type_ids'
:
tf
.
int32
},
tf
.
int64
),
({
'input_ids'
:
tf
.
TensorShape
([
None
]),
'attention_mask'
:
tf
.
TensorShape
([
None
]),
'token_type_ids'
:
tf
.
TensorShape
([
None
])},
tf
.
TensorShape
([])))
return
features
return
features
...
...
pytorch_transformers/data/processors/utils.py
View file @
f71758f7
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
import
csv
import
csv
import
sys
import
sys
import
copy
class
InputExample
(
object
):
class
InputExample
(
object
):
"""A single training/test example for simple sequence classification."""
"""A single training/test example for simple sequence classification."""
...
@@ -36,15 +37,39 @@ class InputExample(object):
...
@@ -36,15 +37,39 @@ class InputExample(object):
self
.
text_b
=
text_b
self
.
text_b
=
text_b
self
.
label
=
label
self
.
label
=
label
def
__repr__
(
self
):
return
str
(
self
.
to_json_string
())
def
to_dict
(
self
):
"""Serializes this instance to a Python dictionary."""
output
=
copy
.
deepcopy
(
self
.
__dict__
)
return
output
def
to_json_string
(
self
):
"""Serializes this instance to a JSON string."""
return
json
.
dumps
(
self
.
to_dict
(),
indent
=
2
,
sort_keys
=
True
)
+
"
\n
"
class
InputFeatures
(
object
):
class
InputFeatures
(
object
):
"""A single set of features of data."""
"""A single set of features of data."""
def
__init__
(
self
,
input_ids
,
input_mask
,
segment
_ids
,
label
_id
):
def
__init__
(
self
,
input_ids
,
attention_mask
,
token_type
_ids
,
label
):
self
.
input_ids
=
input_ids
self
.
input_ids
=
input_ids
self
.
input_mask
=
input_mask
self
.
attention_mask
=
attention_mask
self
.
segment_ids
=
segment_ids
self
.
token_type_ids
=
token_type_ids
self
.
label_id
=
label_id
self
.
label
=
label
def
__repr__
(
self
):
return
str
(
self
.
to_json_string
())
def
to_dict
(
self
):
"""Serializes this instance to a Python dictionary."""
output
=
copy
.
deepcopy
(
self
.
__dict__
)
return
output
def
to_json_string
(
self
):
"""Serializes this instance to a JSON string."""
return
json
.
dumps
(
self
.
to_dict
(),
indent
=
2
,
sort_keys
=
True
)
+
"
\n
"
class
DataProcessor
(
object
):
class
DataProcessor
(
object
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment