Commit d06980df (unverified)
Authored Nov 30, 2023 by Woosuk Kwon; committed by GitHub on Nov 30, 2023

Fix Baichuan tokenizer error (#1874)

parent 66785cc0
Showing 3 changed files with 281 additions and 0 deletions (+281 -0)
vllm/transformers_utils/tokenizer.py            (+13 -0)
vllm/transformers_utils/tokenizers/__init__.py  (+5 -0)
vllm/transformers_utils/tokenizers/baichuan.py  (+263 -0)
vllm/transformers_utils/tokenizer.py
@@ -4,6 +4,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)
 
 from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizers import *
 
 logger = init_logger(__name__)
@@ -61,6 +62,18 @@ def get_tokenizer(
             raise RuntimeError(err_msg) from e
         else:
             raise e
+    except AttributeError as e:
+        if "BaichuanTokenizer" in str(e):
+            # This is for the error "'BaichuanTokenizer' object has no
+            # attribute 'sp_model'".
+            tokenizer = BaichuanTokenizer.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                tokenizer_revision=tokenizer_revision,
+                **kwargs)
+        else:
+            raise e
 
     if not isinstance(tokenizer, PreTrainedTokenizerFast):
         logger.warning(
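
A quick illustration of the new fallback path (a sketch, not part of the commit; the model name is only an example, and get_tokenizer's other defaults are assumed from the hunk above):

# Sketch: when the checkpoint's remote tokenizer code raises
# "'BaichuanTokenizer' object has no attribute 'sp_model'", the new
# except-branch retries with vLLM's bundled BaichuanTokenizer.
from vllm.transformers_utils.tokenizer import get_tokenizer

# Example checkpoint; any Baichuan model that triggers the AttributeError
# is routed through the fallback.
tokenizer = get_tokenizer("baichuan-inc/Baichuan2-13B-Chat",
                          trust_remote_code=True)
print(type(tokenizer).__name__)  # "BaichuanTokenizer" when the fallback fires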
vllm/transformers_utils/tokenizers/__init__.py (new file, 0 → 100644)
from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer

__all__ = [
    "BaichuanTokenizer",
]
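
Because __all__ lists only BaichuanTokenizer, the wildcard import added to tokenizer.py binds exactly that one name. A minimal sketch of this behavior:

# Sketch: "from ... import *" respects __all__, so only BaichuanTokenizer
# becomes visible in the importing module (here, tokenizer.py).
from vllm.transformers_utils.tokenizers import *

assert "BaichuanTokenizer" in globals()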
vllm/transformers_utils/tokenizers/baichuan.py (new file, 0 → 100644)
# yapf: disable
# Adapted from
# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
# This includes a fix suggested in
# https://github.com/vllm-project/vllm/issues/1403#issuecomment-1767503058
# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}


class BaichuanTokenizer(PreTrainedTokenizer):
    """
    Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False)
            if isinstance(pad_token, str)
            else pad_token
        )
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self,
                        save_directory,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
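
For reference, a minimal usage sketch of the bundled tokenizer (the vocab path is a placeholder; assumes a local SentencePiece tokenizer.model is available):

import pickle

from vllm.transformers_utils.tokenizers import BaichuanTokenizer

# Placeholder path; point this at a downloaded Baichuan tokenizer.model.
tok = BaichuanTokenizer(vocab_file="/path/to/tokenizer.model")

ids = tok.encode("hello world")          # add_bos_token=True prepends <s>
text = tok.decode(ids, skip_special_tokens=True)

# __getstate__/__setstate__ drop and re-Load the SentencePiece processor,
# so the tokenizer survives pickling (useful across worker processes).
tok2 = pickle.loads(pickle.dumps(tok))
assert tok2.encode("hello world") == ids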