chenpangpang / transformers · Commits

Commit bac332fe, authored Sep 02, 2019 by LysandreJik

Updated the GLUE data processor. Corrections to RoBERTa and XLNet.

Parent: c3df2136
Showing 3 changed files with 4 additions and 56 deletions:

examples/utils_glue.py                        +1 -54
pytorch_transformers/tokenization_roberta.py  +1 -1
pytorch_transformers/tokenization_xlnet.py    +2 -1
examples/utils_glue.py (view file @ bac332fe)

@@ -415,58 +415,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-
-        tokens_a = tokenizer.tokenize(example.text_a)
-
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
-            special_tokens_count = 4 if sep_token_extra else 3
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
-        else:
-            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-            special_tokens_count = 3 if sep_token_extra else 2
-            if len(tokens_a) > max_seq_length - special_tokens_count:
-                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = tokens_a + [sep_token]
-        if sep_token_extra:
-            # roberta uses an extra separator b/w pairs of sentences
-            tokens += [sep_token]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-
-        if tokens_b:
-            tokens += tokens_b + [sep_token]
-            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
-
-        if cls_token_at_end:
-            tokens = tokens + [cls_token]
-            segment_ids = segment_ids + [cls_token_segment_id]
-        else:
-            tokens = [cls_token] + tokens
-            segment_ids = [cls_token_segment_id] + segment_ids
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+        input_ids, input_mask = tokenizer.encode(example.text_a, example.text_b, add_special_tokens=True, output_mask=True)
         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
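For reference, the removed branch budgeted special-token slots before truncating the pair with `_truncate_seq_pair`. A minimal sketch of that accounting, assuming the usual pop-the-longer-sequence helper; the values below are illustrative and not taken from the file:

```python
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    # Trim the longer list one token at a time until the pair fits the budget.
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

# [CLS] A [SEP] B [SEP] needs 3 slots; RoBERTa's extra separator needs 4.
max_seq_length = 16
sep_token_extra = True                 # True for RoBERTa-style inputs
tokens_a = list("abcdefghij")          # stand-ins for wordpiece tokens
tokens_b = list("klmnopqrst")

special_tokens_count = 4 if sep_token_extra else 3
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
assert len(tokens_a) + len(tokens_b) <= max_seq_length - special_tokens_count
```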
@@ -497,8 +446,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         if ex_index < 5:
             logger.info("*** Example ***")
             logger.info("guid: %s" % (example.guid))
-            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
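With this commit, the fifty-odd removed lines in the first hunk collapse into a single `tokenizer.encode(...)` call that returns the ids with special tokens already inserted plus a parallel 0/1 sequence mask. The sketch below only illustrates that contract; `ToyPairTokenizer` is a stand-in written for this note, not the pytorch_transformers API:

```python
class ToyPairTokenizer:
    """Illustrative stand-in for a pair-aware tokenizer's encode() contract."""
    def __init__(self):
        self.vocab = {}

    def tokenize(self, text):
        return text.lower().split()

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.setdefault(t, len(self.vocab)) for t in tokens]

    def encode(self, text_a, text_b=None, add_special_tokens=True, output_mask=False):
        cls_id, sep_id = -1, -2  # placeholders for real special-token ids
        ids_a = self.convert_tokens_to_ids(self.tokenize(text_a))
        ids_b = self.convert_tokens_to_ids(self.tokenize(text_b)) if text_b else []
        if add_special_tokens:
            ids = [cls_id] + ids_a + [sep_id] + (ids_b + [sep_id] if ids_b else [])
            mask = [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1 if ids_b else 0)
        else:
            ids = ids_a + ids_b
            mask = [0] * len(ids_a) + [1] * len(ids_b)
        return (ids, mask) if output_mask else ids

tok = ToyPairTokenizer()
input_ids, input_mask = tok.encode("is this jacksonville ?", "no it is not .",
                                   add_special_tokens=True, output_mask=True)
assert len(input_ids) == len(input_mask)
```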
pytorch_transformers/tokenization_roberta.py (view file @ bac332fe)

@@ -98,7 +98,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         if output_mask:
             return (cls + token_ids_0 + sep + sep + token_ids_1 + sep,
-                    [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep))
+                    [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep))
         else:
             return cls + token_ids_0 + sep + sep + token_ids_1 + sep
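The RoBERTa correction only moves the boundary of the 0/1 mask: the pair layout `<s> A </s></s> B </s>` is unchanged, but the doubled separator now counts toward the first segment. A quick check with placeholder id lists (only the lengths matter):

```python
cls, sep = [0], [2]          # placeholder special-token ids
token_ids_0 = [11, 12, 13]   # placeholder ids for sequence A
token_ids_1 = [21, 22]       # placeholder ids for sequence B

ids = cls + token_ids_0 + sep + sep + token_ids_1 + sep

old_mask = [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep)
new_mask = [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep)

assert len(old_mask) == len(new_mask) == len(ids)
print(new_mask)  # [0, 0, 0, 0, 0, 0, 1, 1, 1]; the second </s> now falls in segment 0
```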
pytorch_transformers/tokenization_xlnet.py (view file @ bac332fe)

@@ -198,10 +198,11 @@ class XLNetTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
+        cls_segment_ids = [2]
         if output_mask:
             return (token_ids_0 + sep + token_ids_1 + sep + cls,
-                    [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep + cls))
+                    [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids)
         else:
             return token_ids_0 + sep + token_ids_1 + sep + cls
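XLNet places `<cls>` at the end of the pair, and the fix gives that final position its own segment id (2) through the new `cls_segment_ids` list instead of folding it into segment 1. A toy check of the resulting shapes (placeholder ids):

```python
sep, cls = [4], [3]          # placeholder special-token ids
cls_segment_ids = [2]        # XLNet's dedicated segment id for <cls>
token_ids_0 = [11, 12, 13]   # placeholder ids for sequence A
token_ids_1 = [21, 22]       # placeholder ids for sequence B

ids = token_ids_0 + sep + token_ids_1 + sep + cls
segment_ids = [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids

assert len(ids) == len(segment_ids)
print(segment_ids)  # [0, 0, 0, 0, 1, 1, 1, 2]; the trailing <cls> gets segment id 2
```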