Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
e37ca8e1
You need to sign in or sign up before continuing.
Commit
e37ca8e1
authored
Dec 20, 2019
by
thomwolf
Browse files
fix camembert and XLM-R tokenizer
parent
ceae85ad
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
0 deletions
+12
-0
transformers/tokenization_camembert.py
transformers/tokenization_camembert.py
+6
-0
transformers/tokenization_xlm_roberta.py
transformers/tokenization_xlm_roberta.py
+6
-0
No files found.
transformers/tokenization_camembert.py
View file @
e37ca8e1
...
@@ -22,6 +22,7 @@ from shutil import copyfile
 import sentencepiece as spm

 from transformers.tokenization_utils import PreTrainedTokenizer
+from .tokenization_xlnet import SPIECE_UNDERLINE

 logger = logging.getLogger(__name__)
@@ -145,6 +146,11 @@ class CamembertTokenizer(PreTrainedTokenizer):
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)

+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        return out_string
+
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.
...
transformers/tokenization_xlm_roberta.py
View file @
e37ca8e1
...
@@ -22,6 +22,7 @@ from shutil import copyfile
 import sentencepiece as spm

 from transformers.tokenization_utils import PreTrainedTokenizer
+from .tokenization_xlnet import SPIECE_UNDERLINE

 logger = logging.getLogger(__name__)
@@ -161,6 +162,11 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)

+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        return out_string
+
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment