Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
dd8b7d28
Unverified
Commit
dd8b7d28
authored
Aug 25, 2023
by
Arthur
Committed by
GitHub
Aug 25, 2023
Browse files
[`Sentencepiece`] make sure `legacy` does not require `protobuf` (#25684)
make sure legacy does not require `protobuf`
parent
0770ce6c
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
21 additions
and
12 deletions
+21
-12
src/transformers/convert_slow_tokenizer.py
src/transformers/convert_slow_tokenizer.py
+5
-2
src/transformers/models/llama/tokenization_llama.py
src/transformers/models/llama/tokenization_llama.py
+8
-5
src/transformers/models/t5/tokenization_t5.py
src/transformers/models/t5/tokenization_t5.py
+8
-5
No files found.
src/transformers/convert_slow_tokenizer.py
View file @
dd8b7d28
...
...
@@ -27,9 +27,10 @@ from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_
from
tokenizers.models
import
BPE
,
Unigram
,
WordPiece
from
.utils
import
is_protobuf_available
,
requires_backends
from
.utils.import_utils
import
PROTOBUF_IMPORT_ERROR
def
import_protobuf
():
def
import_protobuf
(
error_message
=
""
):
if
is_protobuf_available
():
import
google.protobuf
...
...
@@ -38,6 +39,8 @@ def import_protobuf():
else
:
from
transformers.utils
import
sentencepiece_model_pb2_new
as
sentencepiece_model_pb2
return
sentencepiece_model_pb2
else
:
raise
ImportError
(
PROTOBUF_IMPORT_ERROR
.
format
(
error_message
))
class
SentencePieceExtractor
:
...
...
src/transformers/models/llama/tokenization_llama.py
View file @
dd8b7d28
...
...
@@ -162,11 +162,14 @@ class LlamaTokenizer(PreTrainedTokenizer):
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
def
get_spm_processor
(
self
):
tokenizer
=
spm
.
SentencePieceProcessor
(
**
self
.
sp_model_kwargs
)
if
self
.
legacy
:
# no dependency on protobuf
tokenizer
.
Load
(
self
.
vocab_file
)
return
tokenizer
with
open
(
self
.
vocab_file
,
"rb"
)
as
f
:
sp_model
=
f
.
read
()
model_pb2
=
import_protobuf
()
model_pb2
=
import_protobuf
(
f
"The new behaviour of
{
self
.
__class__
.
__name__
}
(with `self.legacy = False`)"
)
model
=
model_pb2
.
ModelProto
.
FromString
(
sp_model
)
if
not
self
.
legacy
:
normalizer_spec
=
model_pb2
.
NormalizerSpec
()
normalizer_spec
.
add_dummy_prefix
=
False
model
.
normalizer_spec
.
MergeFrom
(
normalizer_spec
)
...
...
src/transformers/models/t5/tokenization_t5.py
View file @
dd8b7d28
...
...
@@ -195,11 +195,14 @@ class T5Tokenizer(PreTrainedTokenizer):
def
get_spm_processor
(
self
):
tokenizer
=
spm
.
SentencePieceProcessor
(
**
self
.
sp_model_kwargs
)
if
self
.
legacy
:
# no dependency on protobuf
tokenizer
.
Load
(
self
.
vocab_file
)
return
tokenizer
with
open
(
self
.
vocab_file
,
"rb"
)
as
f
:
sp_model
=
f
.
read
()
model_pb2
=
import_protobuf
()
model_pb2
=
import_protobuf
(
f
"The new behaviour of
{
self
.
__class__
.
__name__
}
(with `self.legacy = False`)"
)
model
=
model_pb2
.
ModelProto
.
FromString
(
sp_model
)
if
not
self
.
legacy
:
normalizer_spec
=
model_pb2
.
NormalizerSpec
()
normalizer_spec
.
add_dummy_prefix
=
False
model
.
normalizer_spec
.
MergeFrom
(
normalizer_spec
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment