chenpangpang / transformers · Commits · 983c484f

Commit 983c484f authored Jan 06, 2020 by Branden Chan, committed by Lysandre Debut on Jan 21, 2020
add __getstate__ and __setstate__ to XLMRobertaTokenizer
parent cefd51c5

Showing 1 changed file with 28 additions and 2 deletions.

src/transformers/tokenization_xlm_roberta.py (+28 −2)
@@ -19,8 +19,6 @@ import logging
 import os
 from shutil import copyfile
 
-import sentencepiece as spm
-
 from transformers.tokenization_utils import PreTrainedTokenizer
 
 from .tokenization_xlnet import SPIECE_UNDERLINE
@@ -87,6 +85,16 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         )
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
+
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning(
+                "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece\n"
+                "pip install sentencepiece"
+            )
+            raise
+
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(str(vocab_file))
         self.vocab_file = vocab_file
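The hunk above moves the sentencepiece import from module level into `__init__`, so the package stays an optional dependency and a missing install only fails, with an actionable message, when the tokenizer is actually constructed. A minimal sketch of the same deferred-import pattern in isolation (`_load_sp_model` is a hypothetical helper, not part of the commit):

def _load_sp_model(vocab_file):
    # Import lazily so sentencepiece remains an optional dependency.
    try:
        import sentencepiece as spm
    except ImportError:
        raise ImportError(
            "You need to install SentencePiece to use XLMRobertaTokenizer: "
            "pip install sentencepiece"
        )
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(str(vocab_file))  # Load() expects a file path string
    return sp_model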
@@ -106,6 +114,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning(
+                "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece\n"
+                "pip install sentencepiece"
+            )
+            raise
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
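`SentencePieceProcessor` is a SWIG-wrapped C++ object that cannot be pickled directly, so `__getstate__` drops it from the instance dict and `__setstate__` reconstructs it from `self.vocab_file`. A minimal sketch of the round trip this commit enables, useful e.g. when a tokenizer is shipped to multiprocessing workers (the checkpoint name and sample text are illustrative assumptions, not part of the commit):

import pickle

from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

payload = pickle.dumps(tokenizer)   # __getstate__ replaces sp_model with None
restored = pickle.loads(payload)    # __setstate__ reloads sp_model from vocab_file

assert restored.tokenize("Hello world") == tokenizer.tokenize("Hello world")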