Unverified commit 1025a9b7, authored by SaulLu, committed by GitHub
Browse files

Add a warning in `SpmConverter` for sentencepiece models that use the byte fallback feature (#16629)

* update proto sentencepiece model

* Revert "update proto sentencepiece model"

This reverts commit b07f671747fec35773d0b3d4788b8b15aefa0229.

* add check

* add test

* Revert "Revert "update proto sentencepiece model""

This reverts commit 46108257b8927b73627ec8f4f3eed53a95fc700d.

* test for log level

* test for log level 2

* warning at the warning level

* clean

* format

* add explanation in docstring
parent 7c5d7991
...@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid ...@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
allow to make our dependency on SentencePiece optional. allow to make our dependency on SentencePiece optional.
""" """
import warnings
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
...@@ -429,6 +430,14 @@ class SpmConverter(Converter): ...@@ -429,6 +430,14 @@ class SpmConverter(Converter):
m.ParseFromString(f.read()) m.ParseFromString(f.read())
self.proto = m self.proto = m
if self.proto.trainer_spec.byte_fallback:
warnings.warn(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
" tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
"unknown tokens into a sequence of byte tokens matching the original piece of text."
)
def vocab(self, proto): def vocab(self, proto):
return [(piece.piece, piece.score) for piece in proto.pieces] return [(piece.piece, piece.score) for piece in proto.pieces]
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# Generated by the protocol buffer compiler. DO NOT EDIT! # Generated by the protocol buffer compiler. DO NOT EDIT!
# source: sentencepiece_model.proto # source: sentencepiece_model.proto
# Copyright 2020 The HuggingFace Team. All rights reserved. # Copyright 2022 The HuggingFace Team. All rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -15,13 +15,7 @@ ...@@ -15,13 +15,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode("latin1"))
from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pb2
from google.protobuf import message as _message from google.protobuf import message as _message
from google.protobuf import reflection as _reflection from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database from google.protobuf import symbol_database as _symbol_database
...@@ -36,11 +30,10 @@ DESCRIPTOR = _descriptor.FileDescriptor( ...@@ -36,11 +30,10 @@ DESCRIPTOR = _descriptor.FileDescriptor(
name="sentencepiece_model.proto", name="sentencepiece_model.proto",
package="sentencepiece", package="sentencepiece",
syntax="proto2", syntax="proto2",
serialized_pb=_b( serialized_options=b"H\003",
'\n\x19sentencepiece_model.proto\x12\rsentencepiece"\xf4\x08\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x05:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. 
\x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 "5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xba\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x1a\xc8\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL"J\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03' create_key=_descriptor._internal_create_key,
), serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece"\xa1\n\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! 
\x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 
\x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03',
) )
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
_TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor( _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor(
...@@ -48,16 +41,45 @@ _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor( ...@@ -48,16 +41,45 @@ _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor(
full_name="sentencepiece.TrainerSpec.ModelType", full_name="sentencepiece.TrainerSpec.ModelType",
filename=None, filename=None,
file=DESCRIPTOR, file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
values=[ values=[
_descriptor.EnumValueDescriptor(name="UNIGRAM", index=0, number=1, options=None, type=None), _descriptor.EnumValueDescriptor(
_descriptor.EnumValueDescriptor(name="BPE", index=1, number=2, options=None, type=None), name="UNIGRAM",
_descriptor.EnumValueDescriptor(name="WORD", index=2, number=3, options=None, type=None), index=0,
_descriptor.EnumValueDescriptor(name="CHAR", index=3, number=4, options=None, type=None), number=1,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="BPE",
index=1,
number=2,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="WORD",
index=2,
number=3,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="CHAR",
index=3,
number=4,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
], ],
containing_type=None, containing_type=None,
options=None, serialized_options=None,
serialized_start=1121, serialized_start=1294,
serialized_end=1174, serialized_end=1347,
) )
_sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE) _sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE)
...@@ -66,17 +88,61 @@ _MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor( ...@@ -66,17 +88,61 @@ _MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor(
full_name="sentencepiece.ModelProto.SentencePiece.Type", full_name="sentencepiece.ModelProto.SentencePiece.Type",
filename=None, filename=None,
file=DESCRIPTOR, file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
values=[ values=[
_descriptor.EnumValueDescriptor(name="NORMAL", index=0, number=1, options=None, type=None), _descriptor.EnumValueDescriptor(
_descriptor.EnumValueDescriptor(name="UNKNOWN", index=1, number=2, options=None, type=None), name="NORMAL",
_descriptor.EnumValueDescriptor(name="CONTROL", index=2, number=3, options=None, type=None), index=0,
_descriptor.EnumValueDescriptor(name="USER_DEFINED", index=3, number=4, options=None, type=None), number=1,
_descriptor.EnumValueDescriptor(name="UNUSED", index=4, number=5, options=None, type=None), serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="UNKNOWN",
index=1,
number=2,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="CONTROL",
index=2,
number=3,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="USER_DEFINED",
index=3,
number=4,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="BYTE",
index=4,
number=6,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="UNUSED",
index=5,
number=5,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
], ],
containing_type=None, containing_type=None,
options=None, serialized_options=None,
serialized_start=1869, serialized_start=2100,
serialized_end=1943, serialized_end=2184,
) )
_sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE) _sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE)
...@@ -87,6 +153,7 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -87,6 +153,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
filename=None, filename=None,
file=DESCRIPTOR, file=DESCRIPTOR,
containing_type=None, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[ fields=[
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="input", name="input",
...@@ -103,7 +170,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -103,7 +170,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="input_format", name="input_format",
...@@ -114,13 +183,15 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -114,13 +183,15 @@ _TRAINERSPEC = _descriptor.Descriptor(
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=False, has_default_value=False,
default_value=_b("").decode("utf-8"), default_value=b"".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="model_prefix", name="model_prefix",
...@@ -131,13 +202,15 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -131,13 +202,15 @@ _TRAINERSPEC = _descriptor.Descriptor(
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=False, has_default_value=False,
default_value=_b("").decode("utf-8"), default_value=b"".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="model_type", name="model_type",
...@@ -154,7 +227,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -154,7 +227,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="vocab_size", name="vocab_size",
...@@ -171,7 +246,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -171,7 +246,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="accept_language", name="accept_language",
...@@ -188,7 +265,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -188,7 +265,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="self_test_sample_size", name="self_test_sample_size",
...@@ -205,7 +284,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -205,7 +284,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="character_coverage", name="character_coverage",
...@@ -222,15 +303,17 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -222,15 +303,17 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="input_sentence_size", name="input_sentence_size",
full_name="sentencepiece.TrainerSpec.input_sentence_size", full_name="sentencepiece.TrainerSpec.input_sentence_size",
index=8, index=8,
number=11, number=11,
type=5, type=4,
cpp_type=1, cpp_type=4,
label=1, label=1,
has_default_value=True, has_default_value=True,
default_value=0, default_value=0,
...@@ -239,7 +322,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -239,7 +322,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="shuffle_input_sentence", name="shuffle_input_sentence",
...@@ -256,7 +341,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -256,7 +341,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="mining_sentence_size", name="mining_sentence_size",
...@@ -273,7 +360,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -273,7 +360,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b("\030\001")), serialized_options=b"\030\001",
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="training_sentence_size", name="training_sentence_size",
...@@ -290,7 +379,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -290,7 +379,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b("\030\001")), serialized_options=b"\030\001",
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="seed_sentencepiece_size", name="seed_sentencepiece_size",
...@@ -307,7 +398,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -307,7 +398,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="shrinking_factor", name="shrinking_factor",
...@@ -324,7 +417,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -324,7 +417,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="max_sentence_length", name="max_sentence_length",
...@@ -341,7 +436,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -341,7 +436,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="num_threads", name="num_threads",
...@@ -358,7 +455,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -358,7 +455,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="num_sub_iterations", name="num_sub_iterations",
...@@ -375,7 +474,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -375,7 +474,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="max_sentencepiece_length", name="max_sentencepiece_length",
...@@ -392,7 +493,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -392,7 +493,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="split_by_unicode_script", name="split_by_unicode_script",
...@@ -409,7 +512,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -409,7 +512,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="split_by_number", name="split_by_number",
...@@ -426,7 +531,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -426,7 +531,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="split_by_whitespace", name="split_by_whitespace",
...@@ -443,7 +550,9 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -443,7 +550,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="treat_whitespace_as_suffix", name="treat_whitespace_as_suffix",
...@@ -460,12 +569,33 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -460,12 +569,33 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="split_digits",
full_name="sentencepiece.TrainerSpec.split_digits",
index=22,
number=25,
type=8,
cpp_type=7,
label=1,
has_default_value=True,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="control_symbols", name="control_symbols",
full_name="sentencepiece.TrainerSpec.control_symbols", full_name="sentencepiece.TrainerSpec.control_symbols",
index=22, index=23,
number=30, number=30,
type=9, type=9,
cpp_type=9, cpp_type=9,
...@@ -477,12 +607,14 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -477,12 +607,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="user_defined_symbols", name="user_defined_symbols",
full_name="sentencepiece.TrainerSpec.user_defined_symbols", full_name="sentencepiece.TrainerSpec.user_defined_symbols",
index=23, index=24,
number=31, number=31,
type=9, type=9,
cpp_type=9, cpp_type=9,
...@@ -494,12 +626,71 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -494,12 +626,71 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="required_chars",
full_name="sentencepiece.TrainerSpec.required_chars",
index=25,
number=36,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="byte_fallback",
full_name="sentencepiece.TrainerSpec.byte_fallback",
index=26,
number=35,
type=8,
cpp_type=7,
label=1,
has_default_value=True,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="vocabulary_output_piece_score",
full_name="sentencepiece.TrainerSpec.vocabulary_output_piece_score",
index=27,
number=32,
type=8,
cpp_type=7,
label=1,
has_default_value=True,
default_value=True,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="hard_vocab_limit", name="hard_vocab_limit",
full_name="sentencepiece.TrainerSpec.hard_vocab_limit", full_name="sentencepiece.TrainerSpec.hard_vocab_limit",
index=24, index=28,
number=33, number=33,
type=8, type=8,
cpp_type=7, cpp_type=7,
...@@ -511,12 +702,14 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -511,12 +702,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="use_all_vocab", name="use_all_vocab",
full_name="sentencepiece.TrainerSpec.use_all_vocab", full_name="sentencepiece.TrainerSpec.use_all_vocab",
index=25, index=29,
number=34, number=34,
type=8, type=8,
cpp_type=7, cpp_type=7,
...@@ -528,12 +721,14 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -528,12 +721,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="unk_id", name="unk_id",
full_name="sentencepiece.TrainerSpec.unk_id", full_name="sentencepiece.TrainerSpec.unk_id",
index=26, index=30,
number=40, number=40,
type=5, type=5,
cpp_type=1, cpp_type=1,
...@@ -545,12 +740,14 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -545,12 +740,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="bos_id", name="bos_id",
full_name="sentencepiece.TrainerSpec.bos_id", full_name="sentencepiece.TrainerSpec.bos_id",
index=27, index=31,
number=41, number=41,
type=5, type=5,
cpp_type=1, cpp_type=1,
...@@ -562,12 +759,14 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -562,12 +759,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="eos_id", name="eos_id",
full_name="sentencepiece.TrainerSpec.eos_id", full_name="sentencepiece.TrainerSpec.eos_id",
index=28, index=32,
number=42, number=42,
type=5, type=5,
cpp_type=1, cpp_type=1,
...@@ -579,12 +778,14 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -579,12 +778,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="pad_id", name="pad_id",
full_name="sentencepiece.TrainerSpec.pad_id", full_name="sentencepiece.TrainerSpec.pad_id",
index=29, index=33,
number=43, number=43,
type=5, type=5,
cpp_type=1, cpp_type=1,
...@@ -596,92 +797,123 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -596,92 +797,123 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="unk_piece", name="unk_piece",
full_name="sentencepiece.TrainerSpec.unk_piece", full_name="sentencepiece.TrainerSpec.unk_piece",
index=30, index=34,
number=45, number=45,
type=9, type=9,
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=True, has_default_value=True,
default_value=_b("<unk>").decode("utf-8"), default_value=b"<unk>".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="bos_piece", name="bos_piece",
full_name="sentencepiece.TrainerSpec.bos_piece", full_name="sentencepiece.TrainerSpec.bos_piece",
index=31, index=35,
number=46, number=46,
type=9, type=9,
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=True, has_default_value=True,
default_value=_b("<s>").decode("utf-8"), default_value=b"<s>".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="eos_piece", name="eos_piece",
full_name="sentencepiece.TrainerSpec.eos_piece", full_name="sentencepiece.TrainerSpec.eos_piece",
index=32, index=36,
number=47, number=47,
type=9, type=9,
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=True, has_default_value=True,
default_value=_b("</s>").decode("utf-8"), default_value=b"</s>".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="pad_piece", name="pad_piece",
full_name="sentencepiece.TrainerSpec.pad_piece", full_name="sentencepiece.TrainerSpec.pad_piece",
index=33, index=37,
number=48, number=48,
type=9, type=9,
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=True, has_default_value=True,
default_value=_b("<pad>").decode("utf-8"), default_value=b"<pad>".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="unk_surface", name="unk_surface",
full_name="sentencepiece.TrainerSpec.unk_surface", full_name="sentencepiece.TrainerSpec.unk_surface",
index=34, index=38,
number=44, number=44,
type=9, type=9,
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=True, has_default_value=True,
default_value=_b(" \342\201\207 ").decode("utf-8"), default_value=b" \342\201\207 ".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="train_extremely_large_corpus",
full_name="sentencepiece.TrainerSpec.train_extremely_large_corpus",
index=39,
number=49,
type=8,
cpp_type=7,
label=1,
has_default_value=True,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
], ],
extensions=[], extensions=[],
...@@ -689,7 +921,7 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -689,7 +921,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
enum_types=[ enum_types=[
_TRAINERSPEC_MODELTYPE, _TRAINERSPEC_MODELTYPE,
], ],
options=None, serialized_options=None,
is_extendable=True, is_extendable=True,
syntax="proto2", syntax="proto2",
extension_ranges=[ extension_ranges=[
...@@ -697,7 +929,7 @@ _TRAINERSPEC = _descriptor.Descriptor( ...@@ -697,7 +929,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
], ],
oneofs=[], oneofs=[],
serialized_start=45, serialized_start=45,
serialized_end=1185, serialized_end=1358,
) )
...@@ -707,6 +939,7 @@ _NORMALIZERSPEC = _descriptor.Descriptor( ...@@ -707,6 +939,7 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
filename=None, filename=None,
file=DESCRIPTOR, file=DESCRIPTOR,
containing_type=None, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[ fields=[
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="name", name="name",
...@@ -717,13 +950,15 @@ _NORMALIZERSPEC = _descriptor.Descriptor( ...@@ -717,13 +950,15 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=False, has_default_value=False,
default_value=_b("").decode("utf-8"), default_value=b"".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="precompiled_charsmap", name="precompiled_charsmap",
...@@ -734,13 +969,15 @@ _NORMALIZERSPEC = _descriptor.Descriptor( ...@@ -734,13 +969,15 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=False, has_default_value=False,
default_value=_b(""), default_value=b"",
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="add_dummy_prefix", name="add_dummy_prefix",
...@@ -757,7 +994,9 @@ _NORMALIZERSPEC = _descriptor.Descriptor( ...@@ -757,7 +994,9 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="remove_extra_whitespaces", name="remove_extra_whitespaces",
...@@ -774,7 +1013,9 @@ _NORMALIZERSPEC = _descriptor.Descriptor( ...@@ -774,7 +1013,9 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="escape_whitespaces", name="escape_whitespaces",
...@@ -791,7 +1032,9 @@ _NORMALIZERSPEC = _descriptor.Descriptor( ...@@ -791,7 +1032,9 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="normalization_rule_tsv", name="normalization_rule_tsv",
...@@ -802,27 +1045,29 @@ _NORMALIZERSPEC = _descriptor.Descriptor( ...@@ -802,27 +1045,29 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=False, has_default_value=False,
default_value=_b("").decode("utf-8"), default_value=b"".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
], ],
extensions=[], extensions=[],
nested_types=[], nested_types=[],
enum_types=[], enum_types=[],
options=None, serialized_options=None,
is_extendable=True, is_extendable=True,
syntax="proto2", syntax="proto2",
extension_ranges=[ extension_ranges=[
(200, 536870912), (200, 536870912),
], ],
oneofs=[], oneofs=[],
serialized_start=1188, serialized_start=1361,
serialized_end=1397, serialized_end=1570,
) )
...@@ -832,6 +1077,7 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor( ...@@ -832,6 +1077,7 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor(
filename=None, filename=None,
file=DESCRIPTOR, file=DESCRIPTOR,
containing_type=None, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[ fields=[
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="input", name="input",
...@@ -842,13 +1088,15 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor( ...@@ -842,13 +1088,15 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor(
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=False, has_default_value=False,
default_value=_b("").decode("utf-8"), default_value=b"".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="expected", name="expected",
...@@ -859,25 +1107,27 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor( ...@@ -859,25 +1107,27 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor(
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=False, has_default_value=False,
default_value=_b("").decode("utf-8"), default_value=b"".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
], ],
extensions=[], extensions=[],
nested_types=[], nested_types=[],
enum_types=[], enum_types=[],
options=None, serialized_options=None,
is_extendable=False, is_extendable=False,
syntax="proto2", syntax="proto2",
extension_ranges=[], extension_ranges=[],
oneofs=[], oneofs=[],
serialized_start=1468, serialized_start=1641,
serialized_end=1509, serialized_end=1682,
) )
_SELFTESTDATA = _descriptor.Descriptor( _SELFTESTDATA = _descriptor.Descriptor(
...@@ -886,6 +1136,7 @@ _SELFTESTDATA = _descriptor.Descriptor( ...@@ -886,6 +1136,7 @@ _SELFTESTDATA = _descriptor.Descriptor(
filename=None, filename=None,
file=DESCRIPTOR, file=DESCRIPTOR,
containing_type=None, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[ fields=[
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="samples", name="samples",
...@@ -902,7 +1153,9 @@ _SELFTESTDATA = _descriptor.Descriptor( ...@@ -902,7 +1153,9 @@ _SELFTESTDATA = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
], ],
extensions=[], extensions=[],
...@@ -910,15 +1163,15 @@ _SELFTESTDATA = _descriptor.Descriptor( ...@@ -910,15 +1163,15 @@ _SELFTESTDATA = _descriptor.Descriptor(
_SELFTESTDATA_SAMPLE, _SELFTESTDATA_SAMPLE,
], ],
enum_types=[], enum_types=[],
options=None, serialized_options=None,
is_extendable=True, is_extendable=True,
syntax="proto2", syntax="proto2",
extension_ranges=[ extension_ranges=[
(200, 536870912), (200, 536870912),
], ],
oneofs=[], oneofs=[],
serialized_start=1399, serialized_start=1572,
serialized_end=1520, serialized_end=1693,
) )
...@@ -928,6 +1181,7 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( ...@@ -928,6 +1181,7 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
filename=None, filename=None,
file=DESCRIPTOR, file=DESCRIPTOR,
containing_type=None, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[ fields=[
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="piece", name="piece",
...@@ -938,13 +1192,15 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( ...@@ -938,13 +1192,15 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
cpp_type=9, cpp_type=9,
label=1, label=1,
has_default_value=False, has_default_value=False,
default_value=_b("").decode("utf-8"), default_value=b"".decode("utf-8"),
message_type=None, message_type=None,
enum_type=None, enum_type=None,
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="score", name="score",
...@@ -961,7 +1217,9 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( ...@@ -961,7 +1217,9 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="type", name="type",
...@@ -978,7 +1236,9 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( ...@@ -978,7 +1236,9 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
], ],
extensions=[], extensions=[],
...@@ -986,15 +1246,15 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( ...@@ -986,15 +1246,15 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
enum_types=[ enum_types=[
_MODELPROTO_SENTENCEPIECE_TYPE, _MODELPROTO_SENTENCEPIECE_TYPE,
], ],
options=None, serialized_options=None,
is_extendable=True, is_extendable=True,
syntax="proto2", syntax="proto2",
extension_ranges=[ extension_ranges=[
(200, 536870912), (200, 536870912),
], ],
oneofs=[], oneofs=[],
serialized_start=1754, serialized_start=1985,
serialized_end=1954, serialized_end=2195,
) )
_MODELPROTO = _descriptor.Descriptor( _MODELPROTO = _descriptor.Descriptor(
...@@ -1003,6 +1263,7 @@ _MODELPROTO = _descriptor.Descriptor( ...@@ -1003,6 +1263,7 @@ _MODELPROTO = _descriptor.Descriptor(
filename=None, filename=None,
file=DESCRIPTOR, file=DESCRIPTOR,
containing_type=None, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[ fields=[
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="pieces", name="pieces",
...@@ -1019,7 +1280,9 @@ _MODELPROTO = _descriptor.Descriptor( ...@@ -1019,7 +1280,9 @@ _MODELPROTO = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="trainer_spec", name="trainer_spec",
...@@ -1036,7 +1299,9 @@ _MODELPROTO = _descriptor.Descriptor( ...@@ -1036,7 +1299,9 @@ _MODELPROTO = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="normalizer_spec", name="normalizer_spec",
...@@ -1053,7 +1318,9 @@ _MODELPROTO = _descriptor.Descriptor( ...@@ -1053,7 +1318,9 @@ _MODELPROTO = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name="self_test_data", name="self_test_data",
...@@ -1070,7 +1337,28 @@ _MODELPROTO = _descriptor.Descriptor( ...@@ -1070,7 +1337,28 @@ _MODELPROTO = _descriptor.Descriptor(
containing_type=None, containing_type=None,
is_extension=False, is_extension=False,
extension_scope=None, extension_scope=None,
options=None, serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="denormalizer_spec",
full_name="sentencepiece.ModelProto.denormalizer_spec",
index=4,
number=5,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
), ),
], ],
extensions=[], extensions=[],
...@@ -1078,15 +1366,15 @@ _MODELPROTO = _descriptor.Descriptor( ...@@ -1078,15 +1366,15 @@ _MODELPROTO = _descriptor.Descriptor(
_MODELPROTO_SENTENCEPIECE, _MODELPROTO_SENTENCEPIECE,
], ],
enum_types=[], enum_types=[],
options=None, serialized_options=None,
is_extendable=True, is_extendable=True,
syntax="proto2", syntax="proto2",
extension_ranges=[ extension_ranges=[
(200, 536870912), (200, 536870912),
], ],
oneofs=[], oneofs=[],
serialized_start=1523, serialized_start=1696,
serialized_end=1965, serialized_end=2206,
) )
_TRAINERSPEC.fields_by_name["model_type"].enum_type = _TRAINERSPEC_MODELTYPE _TRAINERSPEC.fields_by_name["model_type"].enum_type = _TRAINERSPEC_MODELTYPE
...@@ -1100,50 +1388,52 @@ _MODELPROTO.fields_by_name["pieces"].message_type = _MODELPROTO_SENTENCEPIECE ...@@ -1100,50 +1388,52 @@ _MODELPROTO.fields_by_name["pieces"].message_type = _MODELPROTO_SENTENCEPIECE
_MODELPROTO.fields_by_name["trainer_spec"].message_type = _TRAINERSPEC _MODELPROTO.fields_by_name["trainer_spec"].message_type = _TRAINERSPEC
_MODELPROTO.fields_by_name["normalizer_spec"].message_type = _NORMALIZERSPEC _MODELPROTO.fields_by_name["normalizer_spec"].message_type = _NORMALIZERSPEC
_MODELPROTO.fields_by_name["self_test_data"].message_type = _SELFTESTDATA _MODELPROTO.fields_by_name["self_test_data"].message_type = _SELFTESTDATA
_MODELPROTO.fields_by_name["denormalizer_spec"].message_type = _NORMALIZERSPEC
DESCRIPTOR.message_types_by_name["TrainerSpec"] = _TRAINERSPEC DESCRIPTOR.message_types_by_name["TrainerSpec"] = _TRAINERSPEC
DESCRIPTOR.message_types_by_name["NormalizerSpec"] = _NORMALIZERSPEC DESCRIPTOR.message_types_by_name["NormalizerSpec"] = _NORMALIZERSPEC
DESCRIPTOR.message_types_by_name["SelfTestData"] = _SELFTESTDATA DESCRIPTOR.message_types_by_name["SelfTestData"] = _SELFTESTDATA
DESCRIPTOR.message_types_by_name["ModelProto"] = _MODELPROTO DESCRIPTOR.message_types_by_name["ModelProto"] = _MODELPROTO
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
TrainerSpec = _reflection.GeneratedProtocolMessageType( TrainerSpec = _reflection.GeneratedProtocolMessageType(
"TrainerSpec", "TrainerSpec",
(_message.Message,), (_message.Message,),
dict( {
DESCRIPTOR=_TRAINERSPEC, "DESCRIPTOR": _TRAINERSPEC,
__module__="sentencepiece_model_pb2" "__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.TrainerSpec) # @@protoc_insertion_point(class_scope:sentencepiece.TrainerSpec)
), },
) )
_sym_db.RegisterMessage(TrainerSpec) _sym_db.RegisterMessage(TrainerSpec)
NormalizerSpec = _reflection.GeneratedProtocolMessageType( NormalizerSpec = _reflection.GeneratedProtocolMessageType(
"NormalizerSpec", "NormalizerSpec",
(_message.Message,), (_message.Message,),
dict( {
DESCRIPTOR=_NORMALIZERSPEC, "DESCRIPTOR": _NORMALIZERSPEC,
__module__="sentencepiece_model_pb2" "__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.NormalizerSpec) # @@protoc_insertion_point(class_scope:sentencepiece.NormalizerSpec)
), },
) )
_sym_db.RegisterMessage(NormalizerSpec) _sym_db.RegisterMessage(NormalizerSpec)
SelfTestData = _reflection.GeneratedProtocolMessageType( SelfTestData = _reflection.GeneratedProtocolMessageType(
"SelfTestData", "SelfTestData",
(_message.Message,), (_message.Message,),
dict( {
Sample=_reflection.GeneratedProtocolMessageType( "Sample": _reflection.GeneratedProtocolMessageType(
"Sample", "Sample",
(_message.Message,), (_message.Message,),
dict( {
DESCRIPTOR=_SELFTESTDATA_SAMPLE, "DESCRIPTOR": _SELFTESTDATA_SAMPLE,
__module__="sentencepiece_model_pb2" "__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData.Sample) # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData.Sample)
},
), ),
), "DESCRIPTOR": _SELFTESTDATA,
DESCRIPTOR=_SELFTESTDATA, "__module__": "sentencepiece_model_pb2"
__module__="sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData) # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData)
), },
) )
_sym_db.RegisterMessage(SelfTestData) _sym_db.RegisterMessage(SelfTestData)
_sym_db.RegisterMessage(SelfTestData.Sample) _sym_db.RegisterMessage(SelfTestData.Sample)
...@@ -1151,33 +1441,26 @@ _sym_db.RegisterMessage(SelfTestData.Sample) ...@@ -1151,33 +1441,26 @@ _sym_db.RegisterMessage(SelfTestData.Sample)
ModelProto = _reflection.GeneratedProtocolMessageType( ModelProto = _reflection.GeneratedProtocolMessageType(
"ModelProto", "ModelProto",
(_message.Message,), (_message.Message,),
dict( {
SentencePiece=_reflection.GeneratedProtocolMessageType( "SentencePiece": _reflection.GeneratedProtocolMessageType(
"SentencePiece", "SentencePiece",
(_message.Message,), (_message.Message,),
dict( {
DESCRIPTOR=_MODELPROTO_SENTENCEPIECE, "DESCRIPTOR": _MODELPROTO_SENTENCEPIECE,
__module__="sentencepiece_model_pb2" "__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.ModelProto.SentencePiece) # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto.SentencePiece)
},
), ),
), "DESCRIPTOR": _MODELPROTO,
DESCRIPTOR=_MODELPROTO, "__module__": "sentencepiece_model_pb2"
__module__="sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.ModelProto) # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto)
), },
) )
_sym_db.RegisterMessage(ModelProto) _sym_db.RegisterMessage(ModelProto)
_sym_db.RegisterMessage(ModelProto.SentencePiece) _sym_db.RegisterMessage(ModelProto.SentencePiece)
DESCRIPTOR.has_options = True DESCRIPTOR._options = None
DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b("H\003")) _TRAINERSPEC.fields_by_name["mining_sentence_size"]._options = None
_TRAINERSPEC.fields_by_name["mining_sentence_size"].has_options = True _TRAINERSPEC.fields_by_name["training_sentence_size"]._options = None
_TRAINERSPEC.fields_by_name["mining_sentence_size"]._options = _descriptor._ParseOptions(
descriptor_pb2.FieldOptions(), _b("\030\001")
)
_TRAINERSPEC.fields_by_name["training_sentence_size"].has_options = True
_TRAINERSPEC.fields_by_name["training_sentence_size"]._options = _descriptor._ParseOptions(
descriptor_pb2.FieldOptions(), _b("\030\001")
)
# @@protoc_insertion_point(module_scope) # @@protoc_insertion_point(module_scope)
import unittest
import warnings
from dataclasses import dataclass
from transformers.convert_slow_tokenizer import SpmConverter
from transformers.testing_utils import get_tests_dir
@dataclass
class FakeOriginalTokenizer:
    """Minimal stand-in for a slow tokenizer object handed to `SpmConverter`.

    It carries nothing but the path of a sentencepiece model file, so the
    converter can be exercised without building a real tokenizer.
    """

    # Path to the serialized sentencepiece model (`*.model`) file.
    # NOTE(review): presumably the only attribute the converter's constructor
    # reads from the tokenizer - confirm against `SpmConverter.__init__`.
    vocab_file: str
class ConvertSlowTokenizerTest(unittest.TestCase):
    """Tests for the slow-to-fast tokenizer conversion helpers."""

    def test_spm_converter_bytefallback_warning(self):
        """Check that `SpmConverter` warns only for models using byte fallback.

        A sentencepiece model without the byte fallback option must convert
        without emitting any warning, while a model with it must emit exactly
        one warning whose message explains the limitation.
        """
        spm_model_file_without_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece.model"
        spm_model_file_with_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece_with_bytefallback.model"

        original_tokenizer_without_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_without_bytefallback)

        with warnings.catch_warnings(record=True) as w:
            # Make sure user-level warnings are recorded regardless of the
            # ambient filter configuration (e.g. `-W ignore`); the filter is
            # restored automatically when the context exits.
            warnings.simplefilter("always", category=UserWarning)
            _ = SpmConverter(original_tokenizer_without_bytefallback)
        self.assertEqual(len(w), 0)

        original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)

        with warnings.catch_warnings(record=True) as w:
            # Same reasoning as above: without this, an "ignore" filter would
            # hide the very warning this test is asserting on.
            warnings.simplefilter("always", category=UserWarning)
            _ = SpmConverter(original_tokenizer_with_bytefallback)
        self.assertEqual(len(w), 1)

        # Only the stable prefix of the message is checked, not the full text.
        self.assertIn(
            (
                "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
                " which is not implemented in the fast tokenizers."
            ),
            str(w[0].message),
        )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment