Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
e02ed0ee
Unverified
Commit
e02ed0ee
authored
Sep 16, 2021
by
Benjamin Davidson
Committed by
GitHub
Sep 16, 2021
Browse files
XLMR tokenizer is fully picklable (#13577)
* made tokenizer fully picklable * remove whitespace * added testcase
parent
af5c6ae5
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
1 deletion
+12
-1
src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
...ansformers/models/xlm_roberta/tokenization_xlm_roberta.py
+2
-1
tests/test_tokenization_xlm_roberta.py
tests/test_tokenization_xlm_roberta.py
+10
-0
No files found.
src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
View file @
e02ed0ee
...
...
@@ -171,6 +171,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
def
__getstate__
(
self
):
state
=
self
.
__dict__
.
copy
()
state
[
"sp_model"
]
=
None
state
[
"sp_model_proto"
]
=
self
.
sp_model
.
serialized_model_proto
()
return
state
def
__setstate__
(
self
,
d
):
...
...
@@ -181,7 +182,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
self
.
sp_model_kwargs
=
{}
self
.
sp_model
=
spm
.
SentencePieceProcessor
(
**
self
.
sp_model_kwargs
)
self
.
sp_model
.
Load
(
self
.
vocab_file
)
self
.
sp_model
.
Load
FromSerializedProto
(
self
.
sp_model_proto
)
def
build_inputs_with_special_tokens
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
...
...
tests/test_tokenization_xlm_roberta.py
View file @
e02ed0ee
...
...
@@ -14,6 +14,9 @@
# limitations under the License.
import
os
import
pickle
import
shutil
import
tempfile
import
unittest
from
transformers
import
SPIECE_UNDERLINE
,
XLMRobertaTokenizer
,
XLMRobertaTokenizerFast
...
...
@@ -141,6 +144,13 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def
big_tokenizer
(
self
):
return
XLMRobertaTokenizer
.
from_pretrained
(
"xlm-roberta-base"
)
def
test_picklable_without_disk
(
self
):
with
tempfile
.
NamedTemporaryFile
()
as
f
:
shutil
.
copyfile
(
SAMPLE_VOCAB
,
f
.
name
)
tokenizer
=
XLMRobertaTokenizer
(
f
.
name
,
keep_accents
=
True
)
pickled_tokenizer
=
pickle
.
dumps
(
tokenizer
)
pickle
.
loads
(
pickled_tokenizer
)
def
test_rust_and_python_full_tokenizers
(
self
):
if
not
self
.
test_rust_tokenizer
:
return
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment