Unverified Commit 1025a9b7 authored by SaulLu's avatar SaulLu Committed by GitHub
Browse files

add a warning in `SpmConverter` for sentencepiece's model using the byte fallback feature (#16629)

* update proto sentencepiece model

* Revert "update proto sentencepiece model"

This reverts commit b07f671747fec35773d0b3d4788b8b15aefa0229.

* add check

* add test

* Revert "Revert "update proto sentencepiece model""

This reverts commit 46108257b8927b73627ec8f4f3eed53a95fc700d.

* test for log level

* test for log level 2

* warning at the warning level

* clean

* format

* add explanation in docstring
parent 7c5d7991
......@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
allow to make our dependency on SentencePiece optional.
"""
import warnings
from typing import Dict, List, Tuple
from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
......@@ -429,6 +430,14 @@ class SpmConverter(Converter):
m.ParseFromString(f.read())
self.proto = m
if self.proto.trainer_spec.byte_fallback:
warnings.warn(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
" tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
"unknown tokens into a sequence of byte tokens matching the original piece of text."
)
def vocab(self, proto):
return [(piece.piece, piece.score) for piece in proto.pieces]
......
......@@ -2,7 +2,7 @@
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: sentencepiece_model.proto
# Copyright 2020 The HuggingFace Team. All rights reserved.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -15,13 +15,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode("latin1"))
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pb2
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
......@@ -36,11 +30,10 @@ DESCRIPTOR = _descriptor.FileDescriptor(
name="sentencepiece_model.proto",
package="sentencepiece",
syntax="proto2",
serialized_pb=_b(
'\n\x19sentencepiece_model.proto\x12\rsentencepiece"\xf4\x08\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x05:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 "5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xba\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x1a\xc8\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL"J\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
),
serialized_options=b"H\003",
create_key=_descriptor._internal_create_key,
serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece"\xa1\n\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03',
)
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
_TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor(
......@@ -48,16 +41,45 @@ _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor(
full_name="sentencepiece.TrainerSpec.ModelType",
filename=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
values=[
_descriptor.EnumValueDescriptor(name="UNIGRAM", index=0, number=1, options=None, type=None),
_descriptor.EnumValueDescriptor(name="BPE", index=1, number=2, options=None, type=None),
_descriptor.EnumValueDescriptor(name="WORD", index=2, number=3, options=None, type=None),
_descriptor.EnumValueDescriptor(name="CHAR", index=3, number=4, options=None, type=None),
_descriptor.EnumValueDescriptor(
name="UNIGRAM",
index=0,
number=1,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="BPE",
index=1,
number=2,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="WORD",
index=2,
number=3,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="CHAR",
index=3,
number=4,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
],
containing_type=None,
options=None,
serialized_start=1121,
serialized_end=1174,
serialized_options=None,
serialized_start=1294,
serialized_end=1347,
)
_sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE)
......@@ -66,17 +88,61 @@ _MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor(
full_name="sentencepiece.ModelProto.SentencePiece.Type",
filename=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
values=[
_descriptor.EnumValueDescriptor(name="NORMAL", index=0, number=1, options=None, type=None),
_descriptor.EnumValueDescriptor(name="UNKNOWN", index=1, number=2, options=None, type=None),
_descriptor.EnumValueDescriptor(name="CONTROL", index=2, number=3, options=None, type=None),
_descriptor.EnumValueDescriptor(name="USER_DEFINED", index=3, number=4, options=None, type=None),
_descriptor.EnumValueDescriptor(name="UNUSED", index=4, number=5, options=None, type=None),
_descriptor.EnumValueDescriptor(
name="NORMAL",
index=0,
number=1,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="UNKNOWN",
index=1,
number=2,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="CONTROL",
index=2,
number=3,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="USER_DEFINED",
index=3,
number=4,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="BYTE",
index=4,
number=6,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.EnumValueDescriptor(
name="UNUSED",
index=5,
number=5,
serialized_options=None,
type=None,
create_key=_descriptor._internal_create_key,
),
],
containing_type=None,
options=None,
serialized_start=1869,
serialized_end=1943,
serialized_options=None,
serialized_start=2100,
serialized_end=2184,
)
_sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE)
......@@ -87,6 +153,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="input",
......@@ -103,7 +170,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="input_format",
......@@ -114,13 +183,15 @@ _TRAINERSPEC = _descriptor.Descriptor(
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode("utf-8"),
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="model_prefix",
......@@ -131,13 +202,15 @@ _TRAINERSPEC = _descriptor.Descriptor(
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode("utf-8"),
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="model_type",
......@@ -154,7 +227,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="vocab_size",
......@@ -171,7 +246,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="accept_language",
......@@ -188,7 +265,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="self_test_sample_size",
......@@ -205,7 +284,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="character_coverage",
......@@ -222,15 +303,17 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="input_sentence_size",
full_name="sentencepiece.TrainerSpec.input_sentence_size",
index=8,
number=11,
type=5,
cpp_type=1,
type=4,
cpp_type=4,
label=1,
has_default_value=True,
default_value=0,
......@@ -239,7 +322,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="shuffle_input_sentence",
......@@ -256,7 +341,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="mining_sentence_size",
......@@ -273,7 +360,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b("\030\001")),
serialized_options=b"\030\001",
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="training_sentence_size",
......@@ -290,7 +379,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b("\030\001")),
serialized_options=b"\030\001",
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="seed_sentencepiece_size",
......@@ -307,7 +398,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="shrinking_factor",
......@@ -324,7 +417,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="max_sentence_length",
......@@ -341,7 +436,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="num_threads",
......@@ -358,7 +455,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="num_sub_iterations",
......@@ -375,7 +474,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="max_sentencepiece_length",
......@@ -392,7 +493,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="split_by_unicode_script",
......@@ -409,7 +512,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="split_by_number",
......@@ -426,7 +531,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="split_by_whitespace",
......@@ -443,7 +550,9 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="treat_whitespace_as_suffix",
......@@ -460,12 +569,33 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="split_digits",
full_name="sentencepiece.TrainerSpec.split_digits",
index=22,
number=25,
type=8,
cpp_type=7,
label=1,
has_default_value=True,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="control_symbols",
full_name="sentencepiece.TrainerSpec.control_symbols",
index=22,
index=23,
number=30,
type=9,
cpp_type=9,
......@@ -477,12 +607,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="user_defined_symbols",
full_name="sentencepiece.TrainerSpec.user_defined_symbols",
index=23,
index=24,
number=31,
type=9,
cpp_type=9,
......@@ -494,12 +626,71 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="required_chars",
full_name="sentencepiece.TrainerSpec.required_chars",
index=25,
number=36,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="byte_fallback",
full_name="sentencepiece.TrainerSpec.byte_fallback",
index=26,
number=35,
type=8,
cpp_type=7,
label=1,
has_default_value=True,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="vocabulary_output_piece_score",
full_name="sentencepiece.TrainerSpec.vocabulary_output_piece_score",
index=27,
number=32,
type=8,
cpp_type=7,
label=1,
has_default_value=True,
default_value=True,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="hard_vocab_limit",
full_name="sentencepiece.TrainerSpec.hard_vocab_limit",
index=24,
index=28,
number=33,
type=8,
cpp_type=7,
......@@ -511,12 +702,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="use_all_vocab",
full_name="sentencepiece.TrainerSpec.use_all_vocab",
index=25,
index=29,
number=34,
type=8,
cpp_type=7,
......@@ -528,12 +721,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="unk_id",
full_name="sentencepiece.TrainerSpec.unk_id",
index=26,
index=30,
number=40,
type=5,
cpp_type=1,
......@@ -545,12 +740,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="bos_id",
full_name="sentencepiece.TrainerSpec.bos_id",
index=27,
index=31,
number=41,
type=5,
cpp_type=1,
......@@ -562,12 +759,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="eos_id",
full_name="sentencepiece.TrainerSpec.eos_id",
index=28,
index=32,
number=42,
type=5,
cpp_type=1,
......@@ -579,12 +778,14 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="pad_id",
full_name="sentencepiece.TrainerSpec.pad_id",
index=29,
index=33,
number=43,
type=5,
cpp_type=1,
......@@ -596,92 +797,123 @@ _TRAINERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="unk_piece",
full_name="sentencepiece.TrainerSpec.unk_piece",
index=30,
index=34,
number=45,
type=9,
cpp_type=9,
label=1,
has_default_value=True,
default_value=_b("<unk>").decode("utf-8"),
default_value=b"<unk>".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="bos_piece",
full_name="sentencepiece.TrainerSpec.bos_piece",
index=31,
index=35,
number=46,
type=9,
cpp_type=9,
label=1,
has_default_value=True,
default_value=_b("<s>").decode("utf-8"),
default_value=b"<s>".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="eos_piece",
full_name="sentencepiece.TrainerSpec.eos_piece",
index=32,
index=36,
number=47,
type=9,
cpp_type=9,
label=1,
has_default_value=True,
default_value=_b("</s>").decode("utf-8"),
default_value=b"</s>".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="pad_piece",
full_name="sentencepiece.TrainerSpec.pad_piece",
index=33,
index=37,
number=48,
type=9,
cpp_type=9,
label=1,
has_default_value=True,
default_value=_b("<pad>").decode("utf-8"),
default_value=b"<pad>".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="unk_surface",
full_name="sentencepiece.TrainerSpec.unk_surface",
index=34,
index=38,
number=44,
type=9,
cpp_type=9,
label=1,
has_default_value=True,
default_value=_b(" \342\201\207 ").decode("utf-8"),
default_value=b" \342\201\207 ".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="train_extremely_large_corpus",
full_name="sentencepiece.TrainerSpec.train_extremely_large_corpus",
index=39,
number=49,
type=8,
cpp_type=7,
label=1,
has_default_value=True,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
......@@ -689,7 +921,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
enum_types=[
_TRAINERSPEC_MODELTYPE,
],
options=None,
serialized_options=None,
is_extendable=True,
syntax="proto2",
extension_ranges=[
......@@ -697,7 +929,7 @@ _TRAINERSPEC = _descriptor.Descriptor(
],
oneofs=[],
serialized_start=45,
serialized_end=1185,
serialized_end=1358,
)
......@@ -707,6 +939,7 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="name",
......@@ -717,13 +950,15 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode("utf-8"),
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="precompiled_charsmap",
......@@ -734,13 +969,15 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b(""),
default_value=b"",
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="add_dummy_prefix",
......@@ -757,7 +994,9 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="remove_extra_whitespaces",
......@@ -774,7 +1013,9 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="escape_whitespaces",
......@@ -791,7 +1032,9 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="normalization_rule_tsv",
......@@ -802,27 +1045,29 @@ _NORMALIZERSPEC = _descriptor.Descriptor(
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode("utf-8"),
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
serialized_options=None,
is_extendable=True,
syntax="proto2",
extension_ranges=[
(200, 536870912),
],
oneofs=[],
serialized_start=1188,
serialized_end=1397,
serialized_start=1361,
serialized_end=1570,
)
......@@ -832,6 +1077,7 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor(
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="input",
......@@ -842,13 +1088,15 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor(
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode("utf-8"),
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="expected",
......@@ -859,25 +1107,27 @@ _SELFTESTDATA_SAMPLE = _descriptor.Descriptor(
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode("utf-8"),
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
serialized_options=None,
is_extendable=False,
syntax="proto2",
extension_ranges=[],
oneofs=[],
serialized_start=1468,
serialized_end=1509,
serialized_start=1641,
serialized_end=1682,
)
_SELFTESTDATA = _descriptor.Descriptor(
......@@ -886,6 +1136,7 @@ _SELFTESTDATA = _descriptor.Descriptor(
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="samples",
......@@ -902,7 +1153,9 @@ _SELFTESTDATA = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
......@@ -910,15 +1163,15 @@ _SELFTESTDATA = _descriptor.Descriptor(
_SELFTESTDATA_SAMPLE,
],
enum_types=[],
options=None,
serialized_options=None,
is_extendable=True,
syntax="proto2",
extension_ranges=[
(200, 536870912),
],
oneofs=[],
serialized_start=1399,
serialized_end=1520,
serialized_start=1572,
serialized_end=1693,
)
......@@ -928,6 +1181,7 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="piece",
......@@ -938,13 +1192,15 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode("utf-8"),
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="score",
......@@ -961,7 +1217,9 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="type",
......@@ -978,7 +1236,9 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
......@@ -986,15 +1246,15 @@ _MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor(
enum_types=[
_MODELPROTO_SENTENCEPIECE_TYPE,
],
options=None,
serialized_options=None,
is_extendable=True,
syntax="proto2",
extension_ranges=[
(200, 536870912),
],
oneofs=[],
serialized_start=1754,
serialized_end=1954,
serialized_start=1985,
serialized_end=2195,
)
_MODELPROTO = _descriptor.Descriptor(
......@@ -1003,6 +1263,7 @@ _MODELPROTO = _descriptor.Descriptor(
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="pieces",
......@@ -1019,7 +1280,9 @@ _MODELPROTO = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="trainer_spec",
......@@ -1036,7 +1299,9 @@ _MODELPROTO = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="normalizer_spec",
......@@ -1053,7 +1318,9 @@ _MODELPROTO = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="self_test_data",
......@@ -1070,7 +1337,28 @@ _MODELPROTO = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="denormalizer_spec",
full_name="sentencepiece.ModelProto.denormalizer_spec",
index=4,
number=5,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
......@@ -1078,15 +1366,15 @@ _MODELPROTO = _descriptor.Descriptor(
_MODELPROTO_SENTENCEPIECE,
],
enum_types=[],
options=None,
serialized_options=None,
is_extendable=True,
syntax="proto2",
extension_ranges=[
(200, 536870912),
],
oneofs=[],
serialized_start=1523,
serialized_end=1965,
serialized_start=1696,
serialized_end=2206,
)
_TRAINERSPEC.fields_by_name["model_type"].enum_type = _TRAINERSPEC_MODELTYPE
......@@ -1100,50 +1388,52 @@ _MODELPROTO.fields_by_name["pieces"].message_type = _MODELPROTO_SENTENCEPIECE
_MODELPROTO.fields_by_name["trainer_spec"].message_type = _TRAINERSPEC
_MODELPROTO.fields_by_name["normalizer_spec"].message_type = _NORMALIZERSPEC
_MODELPROTO.fields_by_name["self_test_data"].message_type = _SELFTESTDATA
_MODELPROTO.fields_by_name["denormalizer_spec"].message_type = _NORMALIZERSPEC
DESCRIPTOR.message_types_by_name["TrainerSpec"] = _TRAINERSPEC
DESCRIPTOR.message_types_by_name["NormalizerSpec"] = _NORMALIZERSPEC
DESCRIPTOR.message_types_by_name["SelfTestData"] = _SELFTESTDATA
DESCRIPTOR.message_types_by_name["ModelProto"] = _MODELPROTO
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
TrainerSpec = _reflection.GeneratedProtocolMessageType(
"TrainerSpec",
(_message.Message,),
dict(
DESCRIPTOR=_TRAINERSPEC,
__module__="sentencepiece_model_pb2"
{
"DESCRIPTOR": _TRAINERSPEC,
"__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.TrainerSpec)
),
},
)
_sym_db.RegisterMessage(TrainerSpec)
NormalizerSpec = _reflection.GeneratedProtocolMessageType(
"NormalizerSpec",
(_message.Message,),
dict(
DESCRIPTOR=_NORMALIZERSPEC,
__module__="sentencepiece_model_pb2"
{
"DESCRIPTOR": _NORMALIZERSPEC,
"__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.NormalizerSpec)
),
},
)
_sym_db.RegisterMessage(NormalizerSpec)
SelfTestData = _reflection.GeneratedProtocolMessageType(
"SelfTestData",
(_message.Message,),
dict(
Sample=_reflection.GeneratedProtocolMessageType(
{
"Sample": _reflection.GeneratedProtocolMessageType(
"Sample",
(_message.Message,),
dict(
DESCRIPTOR=_SELFTESTDATA_SAMPLE,
__module__="sentencepiece_model_pb2"
{
"DESCRIPTOR": _SELFTESTDATA_SAMPLE,
"__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData.Sample)
),
},
),
DESCRIPTOR=_SELFTESTDATA,
__module__="sentencepiece_model_pb2"
"DESCRIPTOR": _SELFTESTDATA,
"__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData)
),
},
)
_sym_db.RegisterMessage(SelfTestData)
_sym_db.RegisterMessage(SelfTestData.Sample)
......@@ -1151,33 +1441,26 @@ _sym_db.RegisterMessage(SelfTestData.Sample)
ModelProto = _reflection.GeneratedProtocolMessageType(
"ModelProto",
(_message.Message,),
dict(
SentencePiece=_reflection.GeneratedProtocolMessageType(
{
"SentencePiece": _reflection.GeneratedProtocolMessageType(
"SentencePiece",
(_message.Message,),
dict(
DESCRIPTOR=_MODELPROTO_SENTENCEPIECE,
__module__="sentencepiece_model_pb2"
{
"DESCRIPTOR": _MODELPROTO_SENTENCEPIECE,
"__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.ModelProto.SentencePiece)
),
},
),
DESCRIPTOR=_MODELPROTO,
__module__="sentencepiece_model_pb2"
"DESCRIPTOR": _MODELPROTO,
"__module__": "sentencepiece_model_pb2"
# @@protoc_insertion_point(class_scope:sentencepiece.ModelProto)
),
},
)
_sym_db.RegisterMessage(ModelProto)
_sym_db.RegisterMessage(ModelProto.SentencePiece)
DESCRIPTOR.has_options = True
DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b("H\003"))
_TRAINERSPEC.fields_by_name["mining_sentence_size"].has_options = True
_TRAINERSPEC.fields_by_name["mining_sentence_size"]._options = _descriptor._ParseOptions(
descriptor_pb2.FieldOptions(), _b("\030\001")
)
_TRAINERSPEC.fields_by_name["training_sentence_size"].has_options = True
_TRAINERSPEC.fields_by_name["training_sentence_size"]._options = _descriptor._ParseOptions(
descriptor_pb2.FieldOptions(), _b("\030\001")
)
DESCRIPTOR._options = None
_TRAINERSPEC.fields_by_name["mining_sentence_size"]._options = None
_TRAINERSPEC.fields_by_name["training_sentence_size"]._options = None
# @@protoc_insertion_point(module_scope)
import unittest
import warnings
from dataclasses import dataclass
from transformers.convert_slow_tokenizer import SpmConverter
from transformers.testing_utils import get_tests_dir
@dataclass
class FakeOriginalTokenizer:
vocab_file: str
class ConvertSlowTokenizerTest(unittest.TestCase):
def test_spm_converter_bytefallback_warning(self):
spm_model_file_without_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece.model"
spm_model_file_with_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece_with_bytefallback.model"
original_tokenizer_without_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_without_bytefallback)
with warnings.catch_warnings(record=True) as w:
_ = SpmConverter(original_tokenizer_without_bytefallback)
self.assertEqual(len(w), 0)
original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
with warnings.catch_warnings(record=True) as w:
_ = SpmConverter(original_tokenizer_with_bytefallback)
self.assertEqual(len(w), 1)
self.assertIn(
(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers."
),
str(w[0].message),
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment