chenpangpang / transformers / Commits

Commit f71758f7, authored Sep 25, 2019 by thomwolf

update internal glue processors

Parent: 0f091062
Showing 3 changed files, with 108 additions and 36 deletions:

examples/run_glue.py                            +13  -9
pytorch_transformers/data/processors/glue.py    +66  -23
pytorch_transformers/data/processors/utils.py   +29  -4
examples/run_glue.py

@@ -278,10 +278,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
         # HACK(label indices are swapped in RoBERTa pretrained model)
         label_list[1], label_list[2] = label_list[2], label_list[1]
     examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-    features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
-        pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+    features = convert_examples_to_features(examples,
+                                            label_list,
+                                            args.max_seq_length,
+                                            tokenizer,
+                                            output_mode,
+                                            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+                                            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+                                            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
     )
     if args.local_rank in [-1, 0]:
         logger.info("Saving features into cached file %s", cached_features_file)

@@ -292,14 +296,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     # Convert to Tensors and build dataset
     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
     if output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
     elif output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

-    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
     return dataset
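Note: with the rename, downstream code that indexes this TensorDataset must follow the new field order (input_ids, attention_mask, token_type_ids, labels). A minimal sketch of how a batch would typically be unpacked into model inputs (the DataLoader and model call are illustrative, not part of this commit):

    from torch.utils.data import DataLoader, SequentialSampler

    # Illustrative only: batch the dataset returned by load_and_cache_examples.
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=8)

    for batch in dataloader:
        # Tensor order matches TensorDataset(all_input_ids, all_attention_mask,
        # all_token_type_ids, all_labels) above.
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2],
                  'labels':         batch[3]}
        # outputs = model(**inputs)  # `model` is assumed to be a sequence classification model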
pytorch_transformers/data/processors/glue.py

@@ -19,11 +19,18 @@ import logging
 import os

 from .utils import DataProcessor, InputExample, InputFeatures
+from ...file_utils import is_tf_available
+
+if is_tf_available():
+    import tensorflow as tf

 logger = logging.getLogger(__name__)

-def glue_convert_examples_to_features(examples, label_list, max_seq_length,
-                                      tokenizer, output_mode,
+def glue_convert_examples_to_features(examples, tokenizer,
+                                      max_length=512,
+                                      task=None,
+                                      label_list=None,
+                                      output_mode=None,
                                       pad_on_left=False,
                                       pad_token=0,
                                       pad_token_segment_id=0,
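Note: with the reordered signature, only examples and tokenizer stay positional; sequence length, task, label list and output mode become keyword arguments with defaults, and label_list/output_mode can now be inferred from task. A hedged sketch of a call site against the new signature (the tokenizer and the GLUE data directory are assumptions, not part of this commit):

    from pytorch_transformers import BertTokenizer
    from pytorch_transformers.data.processors.glue import (
        glue_convert_examples_to_features, glue_processors)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    examples = glue_processors['mrpc']().get_dev_examples('./glue_data/MRPC')  # path is illustrative

    features = glue_convert_examples_to_features(examples, tokenizer,
                                                 max_length=128,
                                                 task='mrpc')  # label_list and output_mode inferred from the task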
@@ -31,46 +38,63 @@ def glue_convert_examples_to_features(examples, label_list, max_seq_length,
     """
     Loads a data file into a list of `InputBatch`s
     """
+    is_tf_dataset = False
+    if is_tf_available() and isinstance(examples, tf.data.Dataset):
+        is_tf_dataset = True
+
+    if task is not None:
+        processor = glue_processors[task]()
+        if label_list is None:
+            label_list = processor.get_labels()
+            logger.info("Using label list %s for task %s" % (label_list, task))
+        if output_mode is None:
+            output_mode = glue_output_modes[task]
+            logger.info("Using output mode %s for task %s" % (output_mode, task))
+
     label_map = {label: i for i, label in enumerate(label_list)}

     features = []
     for (ex_index, example) in enumerate(examples):
         if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+            logger.info("Writing example %d" % (ex_index))
+        if is_tf_dataset:
+            example = InputExample(example['idx'].numpy(),
+                                   example['sentence1'].numpy().decode('utf-8'),
+                                   example['sentence2'].numpy().decode('utf-8'),
+                                   str(example['label'].numpy()))

         inputs = tokenizer.encode_plus(
             example.text_a,
             example.text_b,
             add_special_tokens=True,
-            max_length=max_seq_length,
-            truncate_first_sequence=True  # We're truncating the first sequence as a priority
+            max_length=max_length,
+            truncate_first_sequence=True  # We're truncating the first sequence in priority
         )
-        input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]
+        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
-        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

         # Zero-pad up to the sequence length.
-        padding_length = max_seq_length - len(input_ids)
+        padding_length = max_length - len(input_ids)
         if pad_on_left:
             input_ids = ([pad_token] * padding_length) + input_ids
-            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
-            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
         else:
             input_ids = input_ids + ([pad_token] * padding_length)
-            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
+            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

-        assert len(input_ids) == max_seq_length
-        assert len(input_mask) == max_seq_length
-        assert len(segment_ids) == max_seq_length
+        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
+        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
+        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

         if output_mode == "classification":
-            label_id = label_map[example.label]
+            label = label_map[example.label]
         elif output_mode == "regression":
-            label_id = float(example.label)
+            label = float(example.label)
         else:
             raise KeyError(output_mode)
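Note: the padding rule itself is unchanged; only the list names were updated. A self-contained toy restatement of the pad_on_left branch above (illustrative only, not part of the diff):

    def pad_example(input_ids, attention_mask, token_type_ids, max_length=8,
                    pad_on_left=False, pad_token=0, pad_token_segment_id=0,
                    mask_padding_with_zero=True):
        """Toy restatement of the padding branch, for illustration only."""
        padding_length = max_length - len(input_ids)
        pad_ids = [pad_token] * padding_length
        pad_mask = [0 if mask_padding_with_zero else 1] * padding_length
        pad_segs = [pad_token_segment_id] * padding_length
        if pad_on_left:  # e.g. XLNet-style left padding
            return pad_ids + input_ids, pad_mask + attention_mask, pad_segs + token_type_ids
        return input_ids + pad_ids, attention_mask + pad_mask, token_type_ids + pad_segs

    # Padding [101, 7592, 102] on the right up to length 8 keeps the mask 1s at the front:
    print(pad_example([101, 7592, 102], [1, 1, 1], [0, 0, 0]))
    # ([101, 7592, 102, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0])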
@@ -78,15 +102,34 @@ def glue_convert_examples_to_features(examples, label_list, max_seq_length,
             logger.info("*** Example ***")
             logger.info("guid: %s" % (example.guid))
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-            logger.info("label: %s (id = %d)" % (example.label, label_id))
+            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
+            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
+            logger.info("label: %s (id = %d)" % (example.label, label))

         features.append(
                 InputFeatures(input_ids=input_ids,
-                              input_mask=input_mask,
-                              segment_ids=segment_ids,
-                              label_id=label_id))
+                              attention_mask=attention_mask,
+                              token_type_ids=token_type_ids,
+                              label=label))
+
+    if is_tf_available() and is_tf_dataset:
+        def gen():
+            for ex in features:
+                yield ({'input_ids': ex.input_ids,
+                        'attention_mask': ex.attention_mask,
+                        'token_type_ids': ex.token_type_ids},
+                        ex.label)
+
+        return tf.data.Dataset.from_generator(gen,
+            ({'input_ids': tf.int32,
+              'attention_mask': tf.int32,
+              'token_type_ids': tf.int32},
+             tf.int64),
+            ({'input_ids': tf.TensorShape([None]),
+              'attention_mask': tf.TensorShape([None]),
+              'token_type_ids': tf.TensorShape([None])},
+             tf.TensorShape([])))

     return features
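Note: when the input examples are a tf.data.Dataset, the function now returns a tf.data.Dataset of ({'input_ids', 'attention_mask', 'token_type_ids'}, label) pairs rather than a Python list. A hedged sketch of feeding such a dataset, assuming the raw examples come from tensorflow_datasets running eagerly (tfds itself is an assumption, not part of this commit):

    import tensorflow_datasets as tfds  # assumption: provides glue/mrpc with idx/sentence1/sentence2/label fields
    from pytorch_transformers import BertTokenizer
    from pytorch_transformers.data.processors.glue import glue_convert_examples_to_features

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    raw_examples = tfds.load('glue/mrpc', split='validation')

    dataset = glue_convert_examples_to_features(raw_examples, tokenizer,
                                                max_length=128, task='mrpc')
    dataset = dataset.shuffle(100).batch(32)

    for features, labels in dataset.take(1):
        print(features['input_ids'].shape, labels.shape)  # e.g. (32, 128) (32,)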
pytorch_transformers/data/processors/utils.py

@@ -16,6 +16,7 @@
 import csv
 import sys
+import copy

 class InputExample(object):
     """A single training/test example for simple sequence classification."""

@@ -36,15 +37,39 @@ class InputExample(object):
         self.text_b = text_b
         self.label = label

+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+

 class InputFeatures(object):
     """A single set of features of data."""

-    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+    def __init__(self, input_ids, attention_mask, token_type_ids, label):
         self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.label_id = label_id
+        self.attention_mask = attention_mask
+        self.token_type_ids = token_type_ids
+        self.label = label
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+

 class DataProcessor(object):
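Note: the new __repr__/to_dict/to_json_string helpers make examples and features print as JSON (only the copy import is visible in this hunk, so a json import elsewhere in the module is assumed). A small sketch with made-up values:

    from pytorch_transformers.data.processors.utils import InputFeatures

    # Made-up feature values, for illustration only.
    f = InputFeatures(input_ids=[101, 7592, 102],
                      attention_mask=[1, 1, 1],
                      token_type_ids=[0, 0, 0],
                      label=1)

    print(f.to_dict())
    # {'input_ids': [101, 7592, 102], 'attention_mask': [1, 1, 1],
    #  'token_type_ids': [0, 0, 0], 'label': 1}

    print(f)  # __repr__ renders the same dict as an indented JSON string via to_json_string()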