chenpangpang / transformers
Commit bac332fe authored Sep 02, 2019 by LysandreJik
Updated the GLUE data processor. Corrections to RoBERTa and XLNet.
parent c3df2136
Showing 3 changed files with 4 additions and 56 deletions (+4 -56)
examples/utils_glue.py (+1 -54)
pytorch_transformers/tokenization_roberta.py (+1 -1)
pytorch_transformers/tokenization_xlnet.py (+2 -1)
examples/utils_glue.py
@@ -415,58 +415,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         if ex_index % 10000 == 0:
             logger.info("Writing example %d of %d" % (ex_index, len(examples)))

-        tokens_a = tokenizer.tokenize(example.text_a)
-
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
-            special_tokens_count = 4 if sep_token_extra else 3
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
-        else:
-            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-            special_tokens_count = 3 if sep_token_extra else 2
-            if len(tokens_a) > max_seq_length - special_tokens_count:
-                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0      0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = tokens_a + [sep_token]
-        if sep_token_extra:
-            # roberta uses an extra separator b/w pairs of sentences
-            tokens += [sep_token]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-
-        if tokens_b:
-            tokens += tokens_b + [sep_token]
-            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
-
-        if cls_token_at_end:
-            tokens = tokens + [cls_token]
-            segment_ids = segment_ids + [cls_token_segment_id]
-        else:
-            tokens = [cls_token] + tokens
-            segment_ids = [cls_token_segment_id] + segment_ids
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+        input_ids, input_mask = tokenizer.encode(example.text_a, example.text_b, add_special_tokens=True, output_mask=True)

         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
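For readers who only skim the removed block, here is a minimal, library-free sketch of the BERT input convention that the deleted comment describes and that the new tokenizer.encode(..., add_special_tokens=True) call now builds internally. The word pieces are invented placeholders rather than real tokenizer output, and real runs would also truncate and pad.

# Made-up word pieces standing in for tokenizer.tokenize() output.
tokens_a = ["is", "this", "jack", "##son", "##ville", "?"]
tokens_b = ["no", "it", "is", "not", "."]
cls_token, sep_token = "[CLS]", "[SEP]"

# [CLS] A [SEP] B [SEP]; segment id 0 covers [CLS], sentence A and the first
# [SEP], segment id 1 covers sentence B and the final [SEP].
tokens = [cls_token] + tokens_a + [sep_token]
segment_ids = [0] * len(tokens)
tokens += tokens_b + [sep_token]
segment_ids += [1] * (len(tokens_b) + 1)

print(tokens)
# ['[CLS]', 'is', 'this', 'jack', '##son', '##ville', '?', '[SEP]',
#  'no', 'it', 'is', 'not', '.', '[SEP]']
print(segment_ids)   # [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]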
@@ -497,8 +446,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         if ex_index < 5:
             logger.info("*** Example ***")
             logger.info("guid: %s" % (example.guid))
-            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
pytorch_transformers/tokenization_roberta.py
@@ -98,7 +98,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         if output_mask:
             return (cls + token_ids_0 + sep + sep + token_ids_1 + sep,
-                    [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep))
+                    [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep))
         else:
             return cls + token_ids_0 + sep + sep + token_ids_1 + sep
pytorch_transformers/tokenization_xlnet.py
@@ -198,10 +198,11 @@ class XLNetTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
+        cls_segment_ids = [2]
         if output_mask:
             return (token_ids_0 + sep + token_ids_1 + sep + cls,
-                    [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep + cls))
+                    [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids)
         else:
             return token_ids_0 + sep + token_ids_1 + sep + cls
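The same kind of toy check for XLNet, whose classification token sits at the end of the sequence; the new cls_segment_ids list gives it its own segment id of 2. The ids are again made-up placeholders, not real XLNet vocabulary ids.

sep, cls = [4], [3]            # made-up <sep> and <cls> ids
cls_segment_ids = [2]
token_ids_0 = [10, 11, 12]
token_ids_1 = [20, 21]

ids = token_ids_0 + sep + token_ids_1 + sep + cls
segments = [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids

print(ids)        # [10, 11, 12, 4, 20, 21, 4, 3]
print(segments)   # [0, 0, 0, 0, 1, 1, 1, 2]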