Unverified Commit dd8b7d28 authored by Arthur's avatar Arthur Committed by GitHub
Browse files

[`Sentencepiece`] make sure `legacy` does not require `protobuf` (#25684)

make sure the `legacy` code path does not require `protobuf`
parent 0770ce6c
...@@ -27,9 +27,10 @@ from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_ ...@@ -27,9 +27,10 @@ from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_
from tokenizers.models import BPE, Unigram, WordPiece from tokenizers.models import BPE, Unigram, WordPiece
from .utils import is_protobuf_available, requires_backends from .utils import is_protobuf_available, requires_backends
from .utils.import_utils import PROTOBUF_IMPORT_ERROR
def import_protobuf(): def import_protobuf(error_message=""):
if is_protobuf_available(): if is_protobuf_available():
import google.protobuf import google.protobuf
...@@ -37,7 +38,9 @@ def import_protobuf(): ...@@ -37,7 +38,9 @@ def import_protobuf():
from transformers.utils import sentencepiece_model_pb2 from transformers.utils import sentencepiece_model_pb2
else: else:
from transformers.utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2 from transformers.utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2
return sentencepiece_model_pb2 return sentencepiece_model_pb2
else:
raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message))
class SentencePieceExtractor: class SentencePieceExtractor:
......
...@@ -162,14 +162,17 @@ class LlamaTokenizer(PreTrainedTokenizer): ...@@ -162,14 +162,17 @@ class LlamaTokenizer(PreTrainedTokenizer):
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
def get_spm_processor(self): def get_spm_processor(self):
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
if self.legacy: # no dependency on protobuf
tokenizer.Load(self.vocab_file)
return tokenizer
with open(self.vocab_file, "rb") as f: with open(self.vocab_file, "rb") as f:
sp_model = f.read() sp_model = f.read()
model_pb2 = import_protobuf() model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
model = model_pb2.ModelProto.FromString(sp_model) model = model_pb2.ModelProto.FromString(sp_model)
if not self.legacy: normalizer_spec = model_pb2.NormalizerSpec()
normalizer_spec = model_pb2.NormalizerSpec() normalizer_spec.add_dummy_prefix = False
normalizer_spec.add_dummy_prefix = False model.normalizer_spec.MergeFrom(normalizer_spec)
model.normalizer_spec.MergeFrom(normalizer_spec)
sp_model = model.SerializeToString() sp_model = model.SerializeToString()
tokenizer.LoadFromSerializedProto(sp_model) tokenizer.LoadFromSerializedProto(sp_model)
return tokenizer return tokenizer
......
...@@ -195,14 +195,17 @@ class T5Tokenizer(PreTrainedTokenizer): ...@@ -195,14 +195,17 @@ class T5Tokenizer(PreTrainedTokenizer):
def get_spm_processor(self): def get_spm_processor(self):
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
if self.legacy: # no dependency on protobuf
tokenizer.Load(self.vocab_file)
return tokenizer
with open(self.vocab_file, "rb") as f: with open(self.vocab_file, "rb") as f:
sp_model = f.read() sp_model = f.read()
model_pb2 = import_protobuf() model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
model = model_pb2.ModelProto.FromString(sp_model) model = model_pb2.ModelProto.FromString(sp_model)
if not self.legacy: normalizer_spec = model_pb2.NormalizerSpec()
normalizer_spec = model_pb2.NormalizerSpec() normalizer_spec.add_dummy_prefix = False
normalizer_spec.add_dummy_prefix = False model.normalizer_spec.MergeFrom(normalizer_spec)
model.normalizer_spec.MergeFrom(normalizer_spec)
sp_model = model.SerializeToString() sp_model = model.SerializeToString()
tokenizer.LoadFromSerializedProto(sp_model) tokenizer.LoadFromSerializedProto(sp_model)
return tokenizer return tokenizer
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment