"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "0c92e7d9fa9f759b410b1ac85cb80f93208669d1"
Commit 908230d2 authored by Lysandre's avatar Lysandre
Browse files

Pickle CamemBERT tokenizer

parent 24d5ad1d
...@@ -169,6 +169,24 @@ class CamembertTokenizer(PreTrainedTokenizer): ...@@ -169,6 +169,24 @@ class CamembertTokenizer(PreTrainedTokenizer):
return self.fairseq_ids_to_tokens[index] return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset) return self.sp_model.IdToPiece(index - self.fairseq_offset)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import sentencepiece as spm
except ImportError:
logger.warning(
"You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece"
)
raise
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
def convert_tokens_to_string(self, tokens): def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string.""" """Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment