Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
e37ca8e1
Commit
e37ca8e1
authored
Dec 20, 2019
by
thomwolf
Browse files
fix camembert and XLM-R tokenizer
parent
ceae85ad
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
0 deletions
+12
-0
transformers/tokenization_camembert.py
transformers/tokenization_camembert.py
+6
-0
transformers/tokenization_xlm_roberta.py
transformers/tokenization_xlm_roberta.py
+6
-0
No files found.
transformers/tokenization_camembert.py
View file @
e37ca8e1
...
...
@@ -22,6 +22,7 @@ from shutil import copyfile
import
sentencepiece
as
spm
from
transformers.tokenization_utils
import
PreTrainedTokenizer
from
.tokenization_xlnet
import
SPIECE_UNDERLINE
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -145,6 +146,11 @@ class CamembertTokenizer(PreTrainedTokenizer):
return
self
.
fairseq_ids_to_tokens
[
index
]
return
self
.
sp_model
.
IdToPiece
(
index
-
self
.
fairseq_offset
)
def convert_tokens_to_string(self, tokens):
    """Reassemble a sequence of sub-word tokens into a single detokenized string.

    SentencePiece marks word boundaries with the special underline glyph
    (``SPIECE_UNDERLINE``); joining the pieces and mapping that marker back
    to a plain space recovers the original surface text.
    """
    joined = ''.join(tokens)
    # Map the sentencepiece word-boundary marker back to spaces and trim
    # any leading/trailing whitespace produced by a marker at either end.
    return joined.replace(SPIECE_UNDERLINE, ' ').strip()
def
save_vocabulary
(
self
,
save_directory
):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
...
...
transformers/tokenization_xlm_roberta.py
View file @
e37ca8e1
...
...
@@ -22,6 +22,7 @@ from shutil import copyfile
import
sentencepiece
as
spm
from
transformers.tokenization_utils
import
PreTrainedTokenizer
from
.tokenization_xlnet
import
SPIECE_UNDERLINE
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -161,6 +162,11 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return
self
.
fairseq_ids_to_tokens
[
index
]
return
self
.
sp_model
.
IdToPiece
(
index
-
self
.
fairseq_offset
)
def convert_tokens_to_string(self, tokens):
    """Reassemble a sequence of sub-word tokens into a single detokenized string.

    SentencePiece marks word boundaries with the special underline glyph
    (``SPIECE_UNDERLINE``); joining the pieces and mapping that marker back
    to a plain space recovers the original surface text.
    """
    joined = ''.join(tokens)
    # Map the sentencepiece word-boundary marker back to spaces and trim
    # any leading/trailing whitespace produced by a marker at either end.
    return joined.replace(SPIECE_UNDERLINE, ' ').strip()
def
save_vocabulary
(
self
,
save_directory
):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment