Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
e37ca8e1
Commit
e37ca8e1
authored
Dec 20, 2019
by
thomwolf
Browse files
fix camembert and XLM-R tokenizer
parent
ceae85ad
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
0 deletions
+12
-0
transformers/tokenization_camembert.py
transformers/tokenization_camembert.py
+6
-0
transformers/tokenization_xlm_roberta.py
transformers/tokenization_xlm_roberta.py
+6
-0
No files found.
transformers/tokenization_camembert.py
View file @
e37ca8e1
...
...
@@ -22,6 +22,7 @@ from shutil import copyfile
import
sentencepiece
as
spm
from
transformers.tokenization_utils
import
PreTrainedTokenizer
from
.tokenization_xlnet
import
SPIECE_UNDERLINE
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -145,6 +146,11 @@ class CamembertTokenizer(PreTrainedTokenizer):
return
self
.
fairseq_ids_to_tokens
[
index
]
return
self
.
sp_model
.
IdToPiece
(
index
-
self
.
fairseq_offset
)
def convert_tokens_to_string(self, tokens):
    """Reassemble a sequence of sub-word tokens into a single detokenized string.

    SentencePiece marks word boundaries with the special underline glyph
    (``SPIECE_UNDERLINE``); joining the pieces and mapping that marker back
    to a plain space recovers the original surface text.
    """
    joined = ''.join(tokens)
    # Map the sentencepiece word-boundary marker back to spaces and trim
    # any leading/trailing whitespace produced by a marker at either end.
    return joined.replace(SPIECE_UNDERLINE, ' ').strip()
def
save_vocabulary
(
self
,
save_directory
):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
...
...
transformers/tokenization_xlm_roberta.py
View file @
e37ca8e1
...
...
@@ -22,6 +22,7 @@ from shutil import copyfile
import
sentencepiece
as
spm
from
transformers.tokenization_utils
import
PreTrainedTokenizer
from
.tokenization_xlnet
import
SPIECE_UNDERLINE
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -161,6 +162,11 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return
self
.
fairseq_ids_to_tokens
[
index
]
return
self
.
sp_model
.
IdToPiece
(
index
-
self
.
fairseq_offset
)
def convert_tokens_to_string(self, tokens):
    """Reassemble a sequence of sub-word tokens into a single detokenized string.

    SentencePiece marks word boundaries with the special underline glyph
    (``SPIECE_UNDERLINE``); joining the pieces and mapping that marker back
    to a plain space recovers the original surface text.
    """
    joined = ''.join(tokens)
    # Map the sentencepiece word-boundary marker back to spaces and trim
    # any leading/trailing whitespace produced by a marker at either end.
    return joined.replace(SPIECE_UNDERLINE, ' ').strip()
def
save_vocabulary
(
self
,
save_directory
):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment