OpenDAS / Lmdeploy · Commits

Commit 64936449 (unverified), authored Jun 29, 2023 by q.yao, committed by GitHub on Jun 29, 2023

use huggingface tokenizer (#26)

* add hf tokenizer
* format
* fix for comment
* don't skip special tokens
parent 0cc48011

Showing 6 changed files with 211 additions and 104 deletions:

  docker/Dockerfile                                                            +1  -1
  examples/cpp/llama/tokenizer.py                                              +27 -11
  llmdeploy/serve/fastertransformer/chatbot.py                                 +35 -29
  llmdeploy/serve/fastertransformer/deploy.py                                  +50 -38
  llmdeploy/serve/fastertransformer/triton_models/postprocessing/1/model.py    +48 -7
  llmdeploy/serve/fastertransformer/triton_models/preprocessing/1/model.py     +50 -18
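The common thread across the Python files below is a single dispatch pattern: everywhere the code used to build a sentencepiece.SentencePieceProcessor unconditionally, it now chooses between SentencePiece and a Hugging Face tokenizer at runtime. A minimal sketch of that pattern under the same assumptions the diffs make (a *.model file means SentencePiece, anything else is treated as an AutoTokenizer directory); the class name here is illustrative, not from the commit:

    from typing import List


    class DualBackendTokenizer:
        """Sketch of the dispatch idea used throughout this commit."""

        def __init__(self, model_path: str):
            if model_path.endswith('.model'):
                # SentencePiece flatfile, e.g. LLaMA's tokenizer.model
                from sentencepiece import SentencePieceProcessor
                self.model = SentencePieceProcessor(model_file=model_path)
                self.bos_id = self.model.bos_id()
                self.eos_id = self.model.eos_id()
            else:
                # Hugging Face tokenizer directory (tokenizer_config.json etc.)
                from transformers import AutoTokenizer
                self.model = AutoTokenizer.from_pretrained(model_path)
                self.bos_id = self.model.bos_token_id
                self.eos_id = self.model.eos_token_id

        def encode(self, s: str) -> List[int]:
            # SentencePieceProcessor exposes Encode/Decode; AutoTokenizer
            # exposes encode/decode, so hasattr() is enough to tell them apart.
            if hasattr(self.model, 'Encode'):
                return self.model.Encode(s, add_bos=True)
            return self.model.encode(s, add_special_tokens=True)

        def decode(self, ids: List[int]) -> str:
            if hasattr(self.model, 'Decode'):
                return self.model.Decode(ids)
            return self.model.decode(ids, skip_special_tokens=False)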
docker/Dockerfile  +1 -1

@@ -5,6 +5,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/*

 RUN python3 -m pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
-RUN python3 -m pip install sentencepiece cmake
+RUN python3 -m pip install sentencepiece cmake transformers protobuf==3.20.3

 ENV NCCL_LAUNCH_MODE=GROUP
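One likely reason for the protobuf pin (an inference, not stated in the commit): transformers pulls protobuf in for sentencepiece-based tokenizers, and protobuf 4.x raised import-time descriptor errors with the generated code shipped by several of those packages at the time, so a 3.20.x pin is the commonly recommended combination. A quick sanity check that could be run inside the built image:

    # Hypothetical check inside the image; the versions printed depend on the
    # pip resolution at build time, but protobuf should come out as 3.20.3.
    import google.protobuf
    import sentencepiece
    import transformers

    print('protobuf     ', google.protobuf.__version__)
    print('sentencepiece', sentencepiece.__version__)
    print('transformers ', transformers.__version__)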
examples/cpp/llama/tokenizer.py  +27 -11

The standalone tokenizer script keeps its SentencePiece path but now dispatches on the file name: a *.model file is loaded with SentencePieceProcessor, anything else is treated as a Hugging Face tokenizer directory and loaded with AutoTokenizer.

-from sentencepiece import SentencePieceProcessor
 from typing import List

 import fire
+import sys


 class Tokenizer:

     def __init__(self, model_file: str):
-        self.model = SentencePieceProcessor(model_file=model_file)
-        self.vocab_size = self.model.vocab_size()
-        self.start_id = self.model.bos_id()
-        self.end_id = self.model.eos_id()
-        self.pad_id = self.model.pad_id()
+        if model_file.endswith('.model'):
+            from sentencepiece import SentencePieceProcessor
+            self.model = SentencePieceProcessor(model_file=model_file)
+            self.vocab_size = self.model.vocab_size()
+            self.start_id = self.model.bos_id()
+            self.end_id = self.model.eos_id()
+            self.pad_id = self.model.pad_id()
+        else:
+            from transformers import AutoTokenizer
+            self.model = AutoTokenizer.from_pretrained(model_file)
+            self.vocab_size = self.model.vocab_size
+            self.start_id = self.model.bos_token_id
+            self.end_id = self.model.eos_token_id
+            self.pad_id = self.model.pad_token_id
         print(f'vocab_size = {self.vocab_size}')
         print(f'start_id = {self.start_id}')
         print(f'end_id = {self.end_id}')
         print(f'pad_id = {self.pad_id}')

     def encode(self, s: str):
-        return self.model.Encode(s, add_bos=True)
+        if hasattr(self.model, 'Encode'):
+            return self.model.Encode(s, add_bos=True)
+        else:
+            return self.model.encode(s, add_special_tokens=True)

     def decode(self, t: List[int]):
-        return self.model.Decode(t)
+        if hasattr(self.model, 'Decode'):
+            return self.model.Decode(t)
+        else:
+            return self.model.decode(t)


 def main(model_file: str = '/data/llama/model/tokenizer.model',
          encode_file: str = None,
          decode_file: str = None):
     tokenizer = Tokenizer(model_file)
     if encode_file:
         with open(encode_file, 'r') as f:
             ...

@@ -54,4 +70,4 @@ def main(model_file: str = '/data/llama/model/tokenizer.model',

 if __name__ == '__main__':
     fire.Fire(main)
\ No newline at end of file
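fire.Fire(main) exposes main()'s keyword arguments as command-line flags, so the updated script accepts either kind of tokenizer through --model_file. A hypothetical invocation and the equivalent in-process use (the paths are placeholders, not from the commit):

    # Shell:  python tokenizer.py --model_file /path/to/hf_model --encode_file prompt.txt
    # In-process equivalent:
    from tokenizer import Tokenizer  # examples/cpp/llama/tokenizer.py

    sp_tok = Tokenizer('/path/to/tokenizer.model')  # takes the SentencePiece branch
    hf_tok = Tokenizer('/path/to/hf_model')         # takes the AutoTokenizer branch

    ids = hf_tok.encode('hello world')
    print(ids)
    print(hf_tok.decode(ids))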
llmdeploy/serve/fastertransformer/chatbot.py  +35 -29

Most of the changes in this file come from the "format" pass: long call expressions are re-wrapped so the opening parenthesis ends the first line and every argument gets its own, consistently indented line. The only non-formatting change visible here is the new self.reset_session() call before the final return in the second hunk. The affected hunks, as they read after the commit:

@@ -107,13 +107,14 @@ class Chatbot:
            stop_words = None
            bad_words = np.array([[[self.eos_id], [1]]], dtype=np.int32)
        self.cfg = mmengine.Config(
            dict(
                session_len=session_len,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                stop_words=stop_words,
                bad_words=bad_words))
        self.log_level = log_level
        self.display = display
        self.profile_generation = profile_generation

@@ -200,13 +201,16 @@ class Chatbot:
            return StatusCode.TRITON_SESSION_CLOSED
        self._session.status = 0
        for status, _, _ in self._stream_infer(
                self._session,
                prompt='',
                request_output_len=0,
                sequence_start=False,
                sequence_end=True):
            if status != StatusCode.TRITON_STREAM_END:
                return status
        self.reset_session()
        return StatusCode.TRITON_STREAM_END

    def cancel(self, session_id: int, *args, **kwargs):

@@ -238,12 +242,13 @@ class Chatbot:
            return StatusCode.TRITON_SESSION_CLOSED
        prev_session = self._session
        for status, res, _ in self._stream_infer(
                self._session,
                prompt='',
                request_output_len=0,
                sequence_start=False,
                sequence_end=False,
                cancel=True):
            if status.value < 0:
                break
        if status == StatusCode.TRITON_STREAM_END:

@@ -336,11 +341,11 @@ class Chatbot:
        session.response = ''
        que = queue.Queue()
        producer = threading.Thread(
            target=self._stream_producer,
            args=(self.tritonserver_addr, session, que, self.cfg, input_ids,
                  input_lengths, request_output_len, sequence_start,
                  sequence_end, preseq_length, cancel))
        producer.start()
        for state, res, tokens in self.stream_consumer(
                self.postprocess, que, session, preseq_length, cancel, logger,

@@ -411,12 +416,13 @@ class Chatbot:
            random_seed * np.ones((1, 1), dtype=np.uint64))
        ]
        client.start_stream(callback)
        client.async_stream_infer(
            'fastertransformer',
            inputs,
            sequence_id=session.session_id,
            request_id=session.request_id,
            sequence_start=sequence_start,
            sequence_end=sequence_end)
        que.put(None)

    @staticmethod
llmdeploy/serve/fastertransformer/deploy.py  +50 -38

Alongside the formatting pass (the cfg dict in export(), the torch.zeros() call and the _qkv list comprehension in deploy_llama(), and the export(...) and os.symlink(...) calls further down are all re-wrapped), this file picks up two real changes: the two "tokenizer model ... does not exist" messages become real f-strings, and deploy_hf() now copies the Hugging Face tokenizer's JSON files next to tokenizer.model so the Triton pre/postprocessing models can load an AutoTokenizer. The hunks after the commit:

@@ -127,28 +127,29 @@ def export(model_name: str,
    vocab_size, bos_id, eos_id = tokenizer_info(tokenizer_path)
    assert _vocab_size == vocab_size, \
        f'different vocab size {_vocab_size} vs {vocab_size}'
    cfg = dict(
        llama=dict(
            model_name=model_name,
            head_num=head_num,
            size_per_head=size_per_head,
            vocab_size=vocab_size,
            num_layer=num_layer,
            rotary_embedding=size_per_head,
            inter_size=inter_size,
            norm_eps=norm_eps,
            attn_bias=attn_bias,
            start_id=bos_id,
            end_id=eos_id,
            weight_type='fp16',
            # parameters for fastertransformer
            max_batch_size=32,
            max_context_token_num=4,
            session_len=2048,
            step_length=1,
            cache_max_entry_count=48,
            cache_chunk_size=8,
            use_context_fmha=1,
            quant_policy=0))
    config = configparser.ConfigParser()
    for section, key_values in cfg.items():

@@ -166,7 +167,7 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
        shutil.copy(tokenizer_path,
                    osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
    else:
        print(f'tokenizer model {tokenizer_path} does not exist')
        return False
    # read model arguments from params.json
    try:

@@ -190,9 +191,8 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
    def get_param(_name, _size):
        print(_name, _size)
        if _name not in model_params:
            model_params[_name] = torch.zeros(
                _size, dtype=torch.float16, device='cpu')
        return model_params[_name]

    for i, ckpt_path in enumerate(checkpoints):

@@ -204,7 +204,8 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
            size = param_data.size(0)
            if ext == 'weight':
                param = get_param(
                    param_name, [size * n_ckpt, param_data.size(1)])
                param.data[size * i:size * (i + 1), :] = param_data
            else:  # bias
                param = get_param(param_name, [size * n_ckpt])

@@ -235,8 +236,9 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
    # concat qkv projection
    for t in ['weight', 'bias']:
        for i in range(1000):
            _qkv = [
                f'layers.{i}.attention.{k}.{t}' for k in ['wq', 'wk', 'wv']
            ]
            try:
                qkv = tuple(map(model_params.pop, _qkv))
            except KeyError:

@@ -278,8 +280,15 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
    if osp.exists(tokenizer_path):
        shutil.copy(tokenizer_path,
                    osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
        for json_file in os.listdir(model_path):
            if json_file.endswith('.json') and \
                    json_file != 'pytorch_model.bin.index.json':
                json_path = osp.join(model_path, json_file)
                shutil.copy(json_path,
                            osp.join(triton_models_path, 'tokenizer', json_file))
    else:
        print(f'tokenizer model {tokenizer_path} does not exist')
        exit(-1)
    # read model arguments from params.json

@@ -371,19 +380,22 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
    for ft, hf in other:
        model_params[ft] = get_tensor(hf)

    return export(model_name, num_layer, norm_eps, model_params,
                  tokenizer_path, triton_models_path, tp)


def pack_model_repository(workspace_path: str):
    model_repo_dir = osp.join(workspace_path, 'model_repository')
    os.makedirs(model_repo_dir, exist_ok=True)
    os.symlink(
        src=osp.join('../triton_models/interactive'),
        dst=osp.join(model_repo_dir, 'fastertransformer'))
    os.symlink(
        src=osp.join('../triton_models/preprocessing'),
        dst=osp.join(model_repo_dir, 'preprocessing'))
    os.symlink(
        src=osp.join('../triton_models/postprocessing'),
        dst=osp.join(model_repo_dir, 'postprocessing'))


def main(model_name: str,
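The reason deploy_hf() copies the JSON files is that the Triton pre/postprocessing models call AutoTokenizer.from_pretrained() on the folder that holds tokenizer.model, and AutoTokenizer only works if tokenizer_config.json (and ideally tokenizer.json) sit in that same folder. A hedged sketch of the expected result; the workspace path and file list are assumptions, not taken from the commit:

    import os.path as osp

    from transformers import AutoTokenizer

    workspace = '/path/to/workspace'  # placeholder
    # Expected contents of <workspace>/triton_models/tokenizer after deploy_hf():
    #   tokenizer.model
    #   tokenizer_config.json
    #   tokenizer.json              (if the HF repo ships one)
    #   special_tokens_map.json
    tok = AutoTokenizer.from_pretrained(osp.join(workspace, 'triton_models/tokenizer'))
    print(tok.bos_token_id, tok.eos_token_id)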
llmdeploy/serve/fastertransformer/triton_models/postprocessing/1/model.py  +48 -7

@@ -6,22 +6,63 @@ from typing import List

The module-level sentencepiece import goes away and the Tokenizer helper used by the Triton postprocessing model becomes backend-aware: it probes for a tokenizer_config.json next to the tokenizer.model path it is given and only falls back to SentencePiece when none is found. Previously the class wrapped a SentencePieceProcessor unconditionally and called plain Encode/Decode. After the change the hunk reads (osp is the module's existing os.path alias, imported above the hunk):

import numpy as np
import triton_python_backend_utils as pb_utils


class Tokenizer:

    def __init__(self, model_file: str):
        model_folder = osp.split(model_file)[0]
        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
        use_hf_model = osp.exists(tokenizer_config_file)
        self.use_hf_model = use_hf_model
        if not self.use_hf_model:
            from sentencepiece import SentencePieceProcessor
            self.model = SentencePieceProcessor(model_file=model_file)
            self.vocab_size = self.model.vocab_size()
            self.start_id = self.model.bos_id()
            self.end_id = self.model.eos_id()
        else:
            from transformers import AutoTokenizer
            backend_tokenizer_file = osp.join(model_folder, 'tokenizer.json')
            if not osp.exists(backend_tokenizer_file):
                print('WARNING: Can not find tokenizer.json. '
                      'It may take long time to initialize the tokenizer.')
            self.model = AutoTokenizer.from_pretrained(model_folder)
            self.vocab_size = self.model.vocab_size
            self.start_id = self.model.bos_token_id
            self.end_id = self.model.eos_token_id
            # save tokenizer.json to reuse
            if not osp.exists(backend_tokenizer_file):
                self.model.backend_tokenizer.save(backend_tokenizer_file)

    def encode(self, s: str):
        if not self.use_hf_model:
            add_bos = False
            add_eos = False
            if s.find('<BOS>') != -1:
                s = s.replace('<BOS>', '')
                add_bos = True
            if s == '<EOS>':
                s = ''
                add_eos = True
            return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
        else:
            add_special_tokens = False
            if s.find('<BOS>') != -1:
                s = s.replace('<BOS>', '<s>')
            if s == '<EOS>':
                s = '</s>'
            if len(s) == 0:
                add_special_tokens = True
            return self.model.encode(s, add_special_tokens=add_special_tokens)

    def decode(self, t: List[int]):
        if not self.use_hf_model:
            return self.model.Decode(t)
        else:
            skip_special_tokens = False
            return self.model.decode(t, skip_special_tokens=skip_special_tokens)


class TritonPythonModel:
    ...
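The skip_special_tokens=False choice matches the "don't skip special tokens" item in the commit message: decoded text keeps the EOS marker instead of having it silently dropped, so downstream consumers can still see where generation stopped. Roughly, for a LLaMA-style vocabulary (path and prompt are illustrative, not from the commit):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained('/path/to/triton_models/tokenizer')  # placeholder
    ids = tok.encode('hello world', add_special_tokens=False) + [tok.eos_token_id]

    print(tok.decode(ids, skip_special_tokens=True))   # 'hello world'
    print(tok.decode(ids, skip_special_tokens=False))  # roughly 'hello world</s>'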
llmdeploy/serve/fastertransformer/triton_models/preprocessing/1/model.py  +50 -18

@@ -7,31 +7,64 @@ from typing import List

The preprocessing model gets the same treatment. The module-level sentencepiece import is dropped here as well, and its Tokenizer, which already handled the <BOS>/<EOS> markers for SentencePiece, becomes the same backend-aware class as in postprocessing/1/model.py above: the tokenizer_config.json probe, the lazily imported SentencePiece and AutoTokenizer branches, the marker handling in encode(), and skip_special_tokens=False in decode() are identical. The surviving module imports in this hunk are:

import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from torch.nn.utils.rnn import pad_sequence

@@ -157,7 +190,6 @@ class TritonPythonModel:
            for s in query
        ]
        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
        start_ids = pad_sequence(
            start_ids, batch_first=True, padding_value=self.end_id)
        return start_ids, start_lengths
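pad_sequence is what turns the per-request token lists into one rectangular batch, and padding_value=self.end_id means shorter prompts get right-padded with the EOS id while start_lengths keeps the true lengths. A minimal, self-contained illustration (the token ids are made up):

    import torch
    from torch.nn.utils.rnn import pad_sequence

    end_id = 2  # assumed EOS id, e.g. a LLaMA-style </s>
    start_ids = [torch.IntTensor([1, 306, 4658]), torch.IntTensor([1, 15043])]

    start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
    batch = pad_sequence(start_ids, batch_first=True, padding_value=end_id)
    # batch has shape (2, 3); the second row ends with end_id as padding,
    # while start_lengths records the true lengths [[3], [2]].
    print(batch, start_lengths)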