Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
983c484f
Commit
983c484f
authored
Jan 06, 2020
by
Branden Chan
Committed by
Lysandre Debut
Jan 21, 2020
Browse files
add __getstate__ and __setstate__ to XLMRobertaTokenizer
parent
cefd51c5
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
28 additions
and
2 deletions
+28
-2
src/transformers/tokenization_xlm_roberta.py
src/transformers/tokenization_xlm_roberta.py
+28
-2
No files found.
src/transformers/tokenization_xlm_roberta.py
View file @
983c484f
...
@@ -19,8 +19,6 @@ import logging
...
@@ -19,8 +19,6 @@ import logging
import
os
import
os
from
shutil
import
copyfile
from
shutil
import
copyfile
import
sentencepiece
as
spm
from
transformers.tokenization_utils
import
PreTrainedTokenizer
from
transformers.tokenization_utils
import
PreTrainedTokenizer
from
.tokenization_xlnet
import
SPIECE_UNDERLINE
from
.tokenization_xlnet
import
SPIECE_UNDERLINE
...
@@ -87,6 +85,16 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
...
@@ -87,6 +85,16 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
)
)
self
.
max_len_single_sentence
=
self
.
max_len
-
2
# take into account special tokens
self
.
max_len_single_sentence
=
self
.
max_len
-
2
# take into account special tokens
self
.
max_len_sentences_pair
=
self
.
max_len
-
4
# take into account special tokens
self
.
max_len_sentences_pair
=
self
.
max_len
-
4
# take into account special tokens
try
:
import
sentencepiece
as
spm
except
ImportError
:
logger
.
warning
(
"You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece"
)
raise
self
.
sp_model
=
spm
.
SentencePieceProcessor
()
self
.
sp_model
=
spm
.
SentencePieceProcessor
()
self
.
sp_model
.
Load
(
str
(
vocab_file
))
self
.
sp_model
.
Load
(
str
(
vocab_file
))
self
.
vocab_file
=
vocab_file
self
.
vocab_file
=
vocab_file
...
@@ -106,6 +114,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
...
@@ -106,6 +114,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
self
.
fairseq_tokens_to_ids
[
"<mask>"
]
=
len
(
self
.
sp_model
)
+
len
(
self
.
fairseq_tokens_to_ids
)
self
.
fairseq_tokens_to_ids
[
"<mask>"
]
=
len
(
self
.
sp_model
)
+
len
(
self
.
fairseq_tokens_to_ids
)
self
.
fairseq_ids_to_tokens
=
{
v
:
k
for
k
,
v
in
self
.
fairseq_tokens_to_ids
.
items
()}
self
.
fairseq_ids_to_tokens
=
{
v
:
k
for
k
,
v
in
self
.
fairseq_tokens_to_ids
.
items
()}
def __getstate__(self):
    """Return the attribute dictionary to serialize for this instance.

    The result is a shallow copy of ``self.__dict__`` whose ``"sp_model"``
    entry is set to ``None``; the live instance is not modified.
    ``__setstate__`` rebuilds ``sp_model`` from ``vocab_file``.
    """
    # NOTE(review): presumably the SentencePiece processor is dropped because
    # it cannot be pickled -- confirm against the sentencepiece version in use.
    return {**self.__dict__, "sp_model": None}
def __setstate__(self, d):
    """Restore instance attributes from ``d`` and reload the SentencePiece model.

    Args:
        d: Attribute dictionary (as returned by ``__getstate__``). It becomes
            the instance ``__dict__`` and must provide ``vocab_file``, which
            is read to reload the model.

    Raises:
        ImportError: If the ``sentencepiece`` package cannot be imported; a
            warning is logged before the exception is re-raised.
    """
    self.__dict__ = d
    try:
        import sentencepiece as spm
    except ImportError:
        # The trailing space keeps the URL and the pip command from being
        # fused together when the two literals are concatenated.
        logger.warning(
            "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece "
            "pip install sentencepiece"
        )
        raise
    self.sp_model = spm.SentencePieceProcessor()
    # str() matches the Load(str(vocab_file)) call in __init__, so a path-like
    # vocab_file is handled the same way after unpickling.
    self.sp_model.Load(str(self.vocab_file))
def
build_inputs_with_special_tokens
(
self
,
token_ids_0
,
token_ids_1
=
None
):
def
build_inputs_with_special_tokens
(
self
,
token_ids_0
,
token_ids_1
=
None
):
"""
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment