chenpangpang / transformers

Commit aebd8323, authored Oct 02, 2019 by LysandreJik

Update naming + remove f string in run_lm_finetuning example

Parent: 651bfb7a

Showing 7 changed files with 22 additions and 21 deletions (+22 -21)
examples/run_lm_finetuning.py                        +2  -2
transformers/tests/tokenization_tests_commons.py     +11 -11
transformers/tokenization_bert.py                    +1  -1
transformers/tokenization_roberta.py                 +1  -1
transformers/tokenization_utils.py                   +5  -4
transformers/tokenization_xlm.py                     +1  -1
transformers/tokenization_xlnet.py                   +1  -1
examples/run_lm_finetuning.py

@@ -59,7 +59,7 @@ class TextDataset(Dataset):
     def __init__(self, tokenizer, file_path='train', block_size=512):
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
+        cached_features_file = os.path.join(directory, 'cached_lm_' + str(block_size) + '_' + filename)

         if os.path.exists(cached_features_file):
             logger.info("Loading features from cached file %s", cached_features_file)
@@ -110,7 +110,7 @@ def mask_tokens(inputs, tokenizer, args):
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
     probability_matrix = torch.full(labels.shape, args.mlm_probability)
-    probability_matrix *= torch.tensor([tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()], dtype=torch.float)
+    probability_matrix *= torch.tensor([tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()], dtype=torch.float)
     masked_indices = torch.bernoulli(probability_matrix).bool()
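For orientation, a minimal sketch (not part of the diff) of the masking step this hunk touches, assuming the convention documented elsewhere in this commit: the mask holds 0 at added special tokens and 1 at ordinary sequence tokens, so the multiplication zeroes out the chance of masking [CLS]/[SEP]. The label ids below are made up for illustration.

    import torch

    # Hypothetical batch of label ids with special tokens already added,
    # e.g. [CLS] w1 w2 w3 w4 [SEP].
    labels = torch.tensor([[101, 2023, 2003, 1037, 3231, 102]])
    mlm_probability = 0.15  # stand-in for args.mlm_probability

    probability_matrix = torch.full(labels.shape, mlm_probability)
    # Stand-in for [tokenizer.get_special_tokens_mask(val, special_tokens_present=True)
    #               for val in labels.tolist()]: 0 at special-token positions, 1 elsewhere.
    special_tokens_mask = [[0, 1, 1, 1, 1, 0]]
    probability_matrix *= torch.tensor(special_tokens_mask, dtype=torch.float)
    masked_indices = torch.bernoulli(probability_matrix).bool()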
transformers/tests/tokenization_tests_commons.py

@@ -276,7 +276,7 @@ class CommonTestCases:
             assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
             assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input

-        def test_sequence_ids(self):
+        def test_special_tokens_mask(self):
             tokenizer = self.get_tokenizer()

             sequence_0 = "Encode this."
@@ -286,10 +286,10 @@ class CommonTestCases:
             encoded_sequence = tokenizer.encode(sequence_0)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-            sequence_ids = encoded_sequence_dict["sequence_ids"]
-            assert len(sequence_ids) == len(encoded_sequence_w_special)
+            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+            assert len(special_tokens_mask) == len(encoded_sequence_w_special)

-            filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
             filtered_sequence = [x for x in filtered_sequence if x is not None]
             assert encoded_sequence == filtered_sequence
@@ -297,10 +297,10 @@ class CommonTestCases:
             encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-            sequence_ids = encoded_sequence_dict["sequence_ids"]
-            assert len(sequence_ids) == len(encoded_sequence_w_special)
+            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+            assert len(special_tokens_mask) == len(encoded_sequence_w_special)

-            filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
             filtered_sequence = [x for x in filtered_sequence if x is not None]
             assert encoded_sequence == filtered_sequence
@@ -309,10 +309,10 @@ class CommonTestCases:
             tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-            sequence_ids_orig = encoded_sequence_dict["sequence_ids"]
-            sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True)
-            assert len(sequence_ids) == len(encoded_sequence_w_special)
-            assert sequence_ids_orig == sequence_ids
+            special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
+            special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True)
+            assert len(special_tokens_mask) == len(encoded_sequence_w_special)
+            assert special_tokens_mask_orig == special_tokens_mask
transformers/tokenization_bert.py

@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
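A rough, hedged illustration of what the renamed method might return for a single BERT sequence under the 0-for-special / 1-for-sequence convention documented in this change; bert_tokenizer stands for an already instantiated BertTokenizer, and the word-piece ids are hypothetical:

    token_ids_0 = [7592, 2088]  # hypothetical word-piece ids, no special tokens yet
    mask = bert_tokenizer.get_special_tokens_mask(token_ids_0)
    # Expected shape of the result once [CLS]/[SEP] are accounted for: [0, 1, 1, 0]
    # (0 marks the added special-token positions, 1 the original sequence tokens).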
transformers/tokenization_roberta.py

@@ -100,7 +100,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
transformers/tokenization_utils.py

@@ -820,7 +820,7 @@ class PreTrainedTokenizer(object):
                 {
                     input_ids: list[int],
                     overflowing_tokens: list[int] if a ``max_length`` is specified, else None
-                    sequence_ids: list[int] if ``add_special_tokens`` if set to ``True``
+                    special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
                 }

             With the fields:

@@ -828,7 +828,7 @@ class PreTrainedTokenizer(object):
                 ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
-                ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
                 tokens and 1 specifying sequence tokens.
         """
         pair = bool(pair_ids is not None)
@@ -857,7 +857,7 @@ class PreTrainedTokenizer(object):
         if add_special_tokens:
             sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
             token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
-            encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids)
+            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
         else:
             sequence = ids + pair_ids if pair else ids
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
@@ -877,6 +877,7 @@ class PreTrainedTokenizer(object):
         if max_length and len(encoded_inputs["input_ids"]) > max_length:
             encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
             encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
+            encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]

         return encoded_inputs
@@ -892,7 +893,7 @@ class PreTrainedTokenizer(object):
             logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
         return token_ids_0 + token_ids_1

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
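A short usage sketch mirroring what the common tokenizer test above exercises; tokenizer stands for any PreTrainedTokenizer subclass instance:

    enc = tokenizer.encode_plus("Encode this.", add_special_tokens=True)
    input_ids = enc["input_ids"]
    special_tokens_mask = enc["special_tokens_mask"]  # 0 = added special token, 1 = sequence token
    assert len(special_tokens_mask) == len(input_ids)
    # Dropping positions where the mask is 0 recovers the encoding without special tokens.
    without_special = [tok for tok, keep in zip(input_ids, special_tokens_mask) if keep]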
transformers/tokenization_xlm.py

@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
transformers/tokenization_xlnet.py

@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.