OpenDAS / Lmdeploy / Commits / d7117b95

Commit d7117b95, authored Mar 22, 2024 by zhouxiang

    Sync 0.2.6 code

Parent: 5f83e392
Changes: 151

Showing 20 changed files with 849 additions and 701 deletions (+849 −701):
lmdeploy/turbomind/__init__.py                          +23    −1
lmdeploy/turbomind/chat.py                              +41   −20
lmdeploy/turbomind/deploy/converter.py                   +4    −5
lmdeploy/turbomind/deploy/source_model/__init__.py       +1    −0
lmdeploy/turbomind/deploy/source_model/baichuan.py       +6    −4
lmdeploy/turbomind/deploy/source_model/baichuan_awq.py   +6    −4
lmdeploy/turbomind/deploy/source_model/llama.py         +11    −8
lmdeploy/turbomind/deploy/source_model/llama_awq.py      +3    −2
lmdeploy/turbomind/deploy/source_model/qwen.py           +3    −2
lmdeploy/turbomind/deploy/source_model/qwen_awq.py       +3    −2
lmdeploy/turbomind/deploy/target_model/base.py          +51   −15
lmdeploy/turbomind/generate_gemm_config.py              +43    −1
lmdeploy/turbomind/hf_repo/config.json                   +0   −11
lmdeploy/turbomind/hf_repo/configuration_lmdeploy.py     +0   −36
lmdeploy/turbomind/hf_repo/modeling_lmdeploy.py          +0  −226
lmdeploy/turbomind/turbomind.py                        +440  −244
lmdeploy/turbomind/utils.py                             +27   −94
lmdeploy/utils.py                                      +183   −23
lmdeploy/version.py                                      +2    −2
requirements/docs.txt                                    +2    −1
lmdeploy/turbomind/__init__.py

 # Copyright (c) OpenMMLab. All rights reserved.
-from .turbomind import TurboMind
+
+
+def bootstrap():
+    import os
+    import sys
+
+    has_turbomind = False
+    pwd = os.path.dirname(__file__)
+    if os.path.exists(os.path.join(pwd, 'lib')):
+        has_turbomind = True
+    if os.name == 'nt' and has_turbomind:
+        if sys.version_info[:2] >= (3, 8):
+            CUDA_PATH = os.getenv('CUDA_PATH')
+            assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH'
+            dll_path = os.path.join(CUDA_PATH, 'bin')
+            print(f'Add dll path {dll_path}, please note cuda version '
+                  'should >= 11.3 when compiled with cuda 11')
+            os.add_dll_directory(dll_path)
+
+
+bootstrap()
+
+from .turbomind import TurboMind  # noqa: E402

 __all__ = ['TurboMind']
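A hedged usage sketch of the re-exported entry point (the workspace path is a placeholder, not part of this commit): importing the subpackage runs bootstrap() as a side effect, so on Windows the CUDA bin directory is added to the DLL search path before the compiled _turbomind extension is loaded.

# Minimal usage sketch; the path below is a placeholder.
from lmdeploy.turbomind import TurboMind  # bootstrap() has already run here

tm_model = TurboMind.from_pretrained('/path/to/turbomind/workspace')
generator = tm_model.create_instance()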
lmdeploy/turbomind/chat.py

 # Copyright (c) OpenMMLab. All rights reserved.
-import dataclasses
 import os
 import random

-from lmdeploy.turbomind.utils import get_gen_param
+from lmdeploy.messages import EngineGenerationConfig
+from lmdeploy.model import ChatTemplateConfig
+from lmdeploy.tokenizer import DetokenizeState

 os.environ['TM_LOG_LEVEL'] = 'ERROR'
...
@@ -29,32 +30,51 @@ def valid_str(string, coding='utf-8'):
     return ret


-def main(model_path,
+def main(model_path: str,
+         model_name: str = None,
          session_id: int = 1,
          cap: str = 'chat',
          tp: int = 1,
          stream_output: bool = True,
-         request_output_len: int = 512,
+         request_output_len: int = 1024,
+         chat_template_cfg: ChatTemplateConfig = None,
          **kwargs):
     """An example to perform model inference through the command line
     interface.

     Args:
         model_path (str): the path of the deployed model
+        model_name (str): the name of deployed model
         session_id (int): the identical id of a session
         cap (str): the capability of a model. For example, codellama has
             the ability among ['completion', 'infilling', 'chat', 'python']
         tp (int): GPU number used in tensor parallelism
         stream_output (bool): indicator for streaming output or not
         request_output_len (int): output token nums
+        chat_template_cfg (ChatTemplateConfig): Chat template config
+        **kwarg (dict): other arguments for initializing model's chat template
     """
     from lmdeploy import turbomind as tm
-    tm_model = tm.TurboMind.from_pretrained(model_path,
-                                            tp=tp,
-                                            capability=cap,
-                                            **kwargs)
+    if chat_template_cfg is None:
+        chat_template_cfg = ChatTemplateConfig(model_name=model_name,
+                                               capability=cap)
+    new_kwargs = {}
+    for k, v in kwargs.items():
+        if hasattr(chat_template_cfg, k):
+            setattr(chat_template_cfg, k, v)
+        else:
+            new_kwargs[k] = v
+    kwargs = new_kwargs
+    tm_model = tm.TurboMind.from_pretrained(model_path,
+                                            model_name=model_name,
+                                            tp=tp,
+                                            capability=cap,
+                                            chat_template_config=chat_template_cfg,
+                                            **kwargs)
     tokenizer = tm_model.tokenizer
     generator = tm_model.create_instance()
+    gen_config = EngineGenerationConfig(top_k=40)

     nth_round = 1
     step = 0
...
@@ -90,29 +110,30 @@ def main(model_path,
                   ' Please end the session.')
             continue

-        gen_param = get_gen_param(cap, model.sampling_param, nth_round,
-                                  step, request_output_len, **kwargs)
         sequence_start = (nth_round == 1)
         sequence_end = False
         if cap != 'chat':
             # not interactive for other capability
             sequence_start, sequence_end = True, True
             step = 0

         print(f'{prompt}', end='', flush=True)
-        response_size = 0
+        state = DetokenizeState()
         for outputs in generator.stream_infer(
                 session_id=session_id,
                 input_ids=[input_ids],
                 sequence_start=sequence_start,
                 sequence_end=sequence_end,
                 step=step,
                 stream_output=stream_output,
-                **dataclasses.asdict(gen_param),
+                gen_config=gen_config,
                 ignore_eos=False,
                 random_seed=seed if nth_round == 1 else None):
-            res, tokens = outputs[0]
+            _, res, tokens = outputs
             # decode res
-            response = tokenizer.decode(res.tolist(), offset=response_size)
-            # utf-8 char at the end means it's a potential unfinished
-            # byte sequence, continue to concate it with the next
-            # sequence and decode them together
-            if response.endswith('�'):
-                continue
+            response, state = tokenizer.detokenize_incrementally(res,
+                                                                 state=state)
             response = valid_str(response)
             print(f'{response}', end='', flush=True)
-            response_size = tokens

         # update step
         step += len(input_ids) + tokens
...
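A hedged sketch of driving the updated entry point directly from Python; the model path and model name are placeholders, and the shell form through fire (something like `python -m lmdeploy.turbomind.chat /path/to/model --model-name internlm-chat-7b`) is an assumption, not taken from this diff.

from lmdeploy.model import ChatTemplateConfig
from lmdeploy.turbomind.chat import main

main('/path/to/model',                       # placeholder path
     model_name='internlm-chat-7b',          # placeholder model name
     tp=1,
     request_output_len=1024,
     chat_template_cfg=ChatTemplateConfig(model_name='internlm-chat-7b',
                                          capability='chat'))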
lmdeploy/turbomind/deploy/converter.py

...
@@ -7,10 +7,9 @@ from pathlib import Path
 import fire
 import torch
-from huggingface_hub import snapshot_download

 from lmdeploy.model import MODELS
-from lmdeploy.turbomind.utils import create_hf_download_args
+from lmdeploy.utils import get_model

 from .source_model.base import INPUT_MODELS
 from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig
...
@@ -19,7 +18,8 @@ supported_formats = ['llama', 'hf', 'awq', None]
 special_input_model_map = {
     'qwen': 'qwen',
     'baichuan': 'baichuan',
-    'baichuan2': 'baichuan2'
+    'baichuan2': 'baichuan2',
+    'internlm2': 'internlm2'
 }
...
@@ -241,8 +241,7 @@ def main(model_name: str,
     if not os.path.exists(model_path):
         print(f'can\'t find model from local_path {model_path}, '
               'try to download from huggingface')
-        download_kwargs = create_hf_download_args(**kwargs)
-        model_path = snapshot_download(model_path, **download_kwargs)
+        model_path = get_model(model_path)
         print(f'load model from {model_path}')

     # get tokenizer path
...
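A hedged sketch of the new download path: the converter now defers to lmdeploy.utils.get_model (added by this commit's lmdeploy/utils.py changes) instead of calling snapshot_download itself. The repo id below is illustrative only.

from lmdeploy.utils import get_model

# Resolves a non-local model id to a local directory (downloading if needed).
local_dir = get_model('internlm/internlm-chat-7b')  # example repo id
print(local_dir)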
lmdeploy/turbomind/deploy/source_model/__init__.py

 # Copyright (c) OpenMMLab. All rights reserved.
 from .baichuan import Baichuan2Model, BaichuanModel  # noqa: F401
 from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel  # noqa: F401
+from .internlm2 import InternLM2AwqModel, InternLM2Model  # noqa: F401
 from .llama import LlamaModel  # noqa: F401
 from .llama_awq import LlamaAwqModel  # noqa: F401
 from .meta_llama import MetaLlamaModel  # noqa: F401
...
lmdeploy/turbomind/deploy/source_model/baichuan.py

...
@@ -9,8 +9,9 @@ from .llama import LlamaModel, LlamaReader
 class BaichuanReader(LlamaReader):
     """BaichuanReader."""

-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
-        super().__init__(new_params, unused_params, last_bin)
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+                 model_cfg: dict):
+        super().__init__(new_params, unused_params, last_bin, model_cfg)

     def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
         """Get q, k, v, o kind for layer i."""
...
@@ -34,8 +35,9 @@ class BaichuanReader(LlamaReader):
 class Baichuan2Reader(BaichuanReader):
     """Baichuan2Reader."""

-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
-        super().__init__(new_params, unused_params, last_bin)
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+                 model_cfg: dict):
+        super().__init__(new_params, unused_params, last_bin, model_cfg)

     def output_weight(self):
         """Get output."""
...
lmdeploy/turbomind/deploy/source_model/baichuan_awq.py

...
@@ -9,8 +9,9 @@ from .llama_awq import ensure_fp16orint32
 class BaichuanAwqReader(BaichuanReader):
     """BaichuanAwqReader."""

-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
-        super().__init__(new_params, unused_params, last_bin)
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+                 model_cfg: dict):
+        super().__init__(new_params, unused_params, last_bin, model_cfg)

     def attn(self, i: int):
         """Get q, k, v, o qweight for layer i."""
...
@@ -40,8 +41,9 @@ class BaichuanAwqReader(BaichuanReader):
 class Baichuan2AwqReader(BaichuanAwqReader):
     """Baichuan2AwqReader."""

-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
-        super().__init__(new_params, unused_params, last_bin)
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+                 model_cfg: dict):
+        super().__init__(new_params, unused_params, last_bin, model_cfg)

     def output_weight(self):
         """Get output."""
...
lmdeploy/turbomind/deploy/source_model/llama.py

...
@@ -2,6 +2,7 @@
 import json
 import os
 import os.path as osp
+from glob import glob

 import torch
 from safetensors.torch import load_file
...
@@ -19,11 +20,13 @@ class LlamaReader(BaseReader):
     norm_weight_key = 'model.norm.weight'
     output_weight_key = 'lm_head.weight'

-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+                 model_cfg: dict):
         super().__init__()
         self.params = unused_params
         self.params.update(new_params)
         self.last_bin = last_bin
+        self.model_cfg = model_cfg
         self.init_layer_id()

     def init_layer_id(self):
...
@@ -128,13 +131,11 @@ class LlamaModel(BaseInputModel):
     def get_ckpt(self):
         """Get weight files."""
-        suffixes = ['.safetensors', '.bin']
+        patterns = ['*.safetensors', 'pytorch_model*.bin']
         files = []
-        for suffix in suffixes:
-            files = [
-                file for file in os.listdir(self.ckpt_path)
-                if file.endswith(suffix)
-            ]
+        for pattern in patterns:
+            files = glob(os.path.join(self.ckpt_path, pattern))
+            files = [os.path.basename(file) for file in files]
             if len(files) > 0:
                 break
         files = sorted(files)
...
@@ -159,7 +160,7 @@ class LlamaModel(BaseInputModel):
             else:
                 new_params = load_file(osp.join(self.ckpt_path, ckpt))
             ret = self.Reader(new_params, unused_params,
-                              i == self.nmgrs - 1)
+                              i == self.nmgrs - 1, self.model_info())
             yield ret
             ret.clean_up(is_last_bin)
         except GeneratorExit:
...
@@ -181,6 +182,7 @@ class LlamaModel(BaseInputModel):
             model_arg = json.load(f)
             num_layer = model_arg['num_hidden_layers']
             norm_eps = model_arg['rms_norm_eps']
+            attn_head_num = model_arg['num_attention_heads']
             if 'num_key_value_heads' in model_arg:
                 kv_head_num = model_arg['num_key_value_heads']
             else:
...
@@ -192,6 +194,7 @@ class LlamaModel(BaseInputModel):
         return dict(
             num_layer=num_layer,
             norm_eps=norm_eps,
+            attn_head_num=attn_head_num,
             kv_head_num=kv_head_num,
             rope_theta=rope_theta,
             max_position_embeddings=max_position_embeddings,
...
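A small standalone sketch (not lmdeploy code) of the new weight-file discovery rule in get_ckpt(): prefer *.safetensors, then fall back to pytorch_model*.bin, which avoids matching unrelated *.bin files in the checkpoint directory.

import os
from glob import glob


def find_weight_files(ckpt_path: str):
    """Mirror of the glob-based lookup above: first pattern that matches wins."""
    patterns = ['*.safetensors', 'pytorch_model*.bin']
    files = []
    for pattern in patterns:
        files = [os.path.basename(f) for f in glob(os.path.join(ckpt_path, pattern))]
        if files:
            break
    return sorted(files)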
lmdeploy/turbomind/deploy/source_model/llama_awq.py

...
@@ -23,8 +23,9 @@ def ensure_fp16orint32(tensors: torch.Tensor):
 class LlamaAwqReader(LlamaReader):
     """LlamaAwqReader."""

-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
-        super().__init__(new_params, unused_params, last_bin)
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+                 model_cfg: dict):
+        super().__init__(new_params, unused_params, last_bin, model_cfg)

     def attn(self, i: int):
         """Get q, k, v, o qweight for layer i."""
...
lmdeploy/turbomind/deploy/source_model/qwen.py

...
@@ -16,8 +16,9 @@ class QwenReader(LlamaReader):
     norm_weight_key = 'transformer.ln_f.weight'
     output_weight_key = 'lm_head.weight'

-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
-        super().__init__(new_params, unused_params, last_bin)
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+                 model_cfg: dict):
+        super().__init__(new_params, unused_params, last_bin, model_cfg)

     def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
         """Get q, k, v, o kind for layer i."""
...
lmdeploy/turbomind/deploy/source_model/qwen_awq.py

...
@@ -7,8 +7,9 @@ from .qwen import QwenModel, QwenReader
 class QwenAwqReader(QwenReader):
     """QwenAwqReader."""

-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
-        super().__init__(new_params, unused_params, last_bin)
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+                 model_cfg: dict):
+        super().__init__(new_params, unused_params, last_bin, model_cfg)

     def attn(self, i: int):
         """Get q, k, v, o qweight for layer i."""
...
lmdeploy/turbomind/deploy/target_model/base.py

 # Copyright (c) OpenMMLab. All rights reserved.
 import configparser
 import copy
 import inspect
 import io
 import json
 import os.path as osp
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
+from configparser import ConfigParser

 import torch
 import tqdm
 from mmengine import Registry
+from pydantic.dataclasses import dataclass

+from lmdeploy.messages import TurbomindEngineConfig
 from lmdeploy.model import MODELS

 from ..source_model.base import BaseInputModel, BaseReader
...
@@ -30,18 +35,18 @@ def tprint(*args, **kwargs):
 @dataclass
 class TurbomindModelConfig:
     """Config for turbomind model."""
-    model_name: str
-    tensor_para_size: int
-    head_num: int
-    kv_head_num: int
-    vocab_size: int
-    num_layer: int
-    inter_size: int
-    norm_eps: float
-    attn_bias: int
-    start_id: int
-    end_id: int
-    session_len: int
+    model_name: str = None
+    tensor_para_size: int = None
+    head_num: int = None
+    kv_head_num: int = None
+    vocab_size: int = None
+    num_layer: int = None
+    inter_size: int = None
+    norm_eps: float = None
+    attn_bias: int = None
+    start_id: int = None
+    end_id: int = None
+    session_len: int = None
     weight_type: str = 'fp16'
     rotary_embedding: int = 128
     rope_theta: float = 10000.0
...
@@ -50,9 +55,12 @@ class TurbomindModelConfig:
     max_batch_size: int = 64
     max_context_token_num: int = 1
     step_length: int = 1
-    cache_max_entry_count: float = 0.5
+    cache_max_entry_count: float = 0.8
     cache_block_seq_len: int = 128
-    cache_chunk_size: int = 1
+    cache_chunk_size: int = -1
+    num_tokens_per_iter: int = 0
+    max_prefill_iters: int = 1
+    extra_tokens_per_iter: int = 0
     use_context_fmha: int = 1
     quant_policy: int = 0
     max_position_embeddings: int = 0
...
@@ -74,6 +82,34 @@ class TurbomindModelConfig:
         default.update(used)
         return cls(**default)

+    @classmethod
+    def from_engine_config(cls, config: TurbomindEngineConfig):
+        env = copy.deepcopy(config.__dict__)
+        env['tensor_para_size'] = env['tp']
+        ret = TurbomindModelConfig.from_dict(env, allow_none=True)
+        ret.rotary_embedding = ret.size_per_head
+        # workround to support `max_prefill_token_num` in turbomind engine
+        if config.max_prefill_token_num is not None and \
+                config.session_len is not None:
+            ret.num_tokens_per_iter = config.max_prefill_token_num
+            ret.max_prefill_iters = (config.session_len +
+                                     config.max_prefill_token_num -
+                                     1) // config.max_prefill_token_num
+        return ret
+
+    def toini(self):
+        config = copy.deepcopy(self.__dict__)
+        parser = ConfigParser()
+        parser['llama'] = config
+        with io.StringIO() as ss:
+            parser.write(ss)
+            ss.seek(0)
+            ini = ss.read()
+        return ini
+
+    def __str__(self):
+        return json.dumps(self.__dict__, indent=2)
+
     @property
     def valid(self):
         """Check if cfg is valid."""
...
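A standalone sketch of the prefill workaround added in from_engine_config: when both session_len and max_prefill_token_num are set, prefill is split into ceil(session_len / max_prefill_token_num) iterations of at most max_prefill_token_num tokens each. The function name here is illustrative; only the arithmetic comes from the code above.

def prefill_schedule(session_len: int, max_prefill_token_num: int):
    num_tokens_per_iter = max_prefill_token_num
    # ceiling division, as in from_engine_config above
    max_prefill_iters = (session_len + max_prefill_token_num -
                         1) // max_prefill_token_num
    return num_tokens_per_iter, max_prefill_iters


print(prefill_schedule(4096, 8192))   # (8192, 1): whole prompt in one pass
print(prefill_schedule(32768, 8192))  # (8192, 4): prefill split into 4 chunks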
lmdeploy/turbomind/generate_gemm_config.py

 # Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
 import subprocess


 def get_llama_gemm():
     """get the executable binary llama_gemm."""
     import os.path as osp

     import lmdeploy
...
@@ -13,12 +15,52 @@ def get_llama_gemm():
     return bin_path


+def read_config(ini_path: str):
+    """read turbomind config from turbomind.
+
+    Args:
+        ini_path (str): the path of `config.ini` file in turbomind model
+    """
+    from configparser import ConfigParser
+
+    from lmdeploy.turbomind.deploy.target_model.base import \
+        TurbomindModelConfig
+
+    with open(ini_path, 'r') as f:
+        parser = ConfigParser()
+        parser.read_file(f)
+    section_name = 'llama'
+    _cfg = parser._sections[section_name]
+    cfg = TurbomindModelConfig.from_dict(_cfg)
+    return cfg.head_num, cfg.size_per_head, cfg.inter_size, \
+        cfg.vocab_size, cfg.tensor_para_size
+
+
 def main(head_num: int = 32,
          size_per_head: int = 128,
          vocab_size: int = 32000,
          inter_size: int = 11008,
          tensor_para_size: int = 1,
-         max_batch_size: int = 64):
+         max_batch_size: int = 64,
+         model_path: str = None):
+    if model_path is not None:
+        from lmdeploy.turbomind.turbomind import get_model_source
+        from lmdeploy.turbomind.utils import ModelSource
+        model_source = get_model_source(model_path)
+        if model_source == ModelSource.WORKSPACE:
+            head_num, size_per_head, inter_size, vocab_size, \
+                tensor_para_size = read_config(
+                    osp.join(model_path, 'triton_models', 'weights',
+                             'config.ini'))
+        else:
+            from transformers import AutoConfig
+            config = AutoConfig.from_pretrained(model_path,
+                                                trust_remote_code=True)
+            head_num = config.num_attention_heads
+            size_per_head = config.hidden_size // head_num
+            inter_size = config.intermediate_size
+            vocab_size = config.vocab_size
     for bsz in range(1, max_batch_size + 1):
         subprocess.call(
             f'{get_llama_gemm()} {bsz} 1 1 {head_num} {size_per_head}'
...
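A hedged usage sketch of the new model_path argument: the gemm-tuning script can now read head_num, size_per_head, inter_size, vocab_size and tensor_para_size either from a converted workspace's config.ini or from a HF checkpoint's config.json. The path is a placeholder, and running it assumes a CUDA environment with the llama_gemm binary built.

from lmdeploy.turbomind.generate_gemm_config import main

main(tensor_para_size=1, max_batch_size=64, model_path='./workspace')  # placeholder path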
lmdeploy/turbomind/hf_repo/config.json  (deleted, 100644 → 0; last revision 5f83e392)

Removed content:

{
  "architectures": [
    "LMDeployForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_lmdeploy.LMDeployConfig",
    "AutoModel": "modeling_lmdeploy.LMDeployForCausalLM",
    "AutoModelForCausalLM": "modeling_lmdeploy.LMDeployForCausalLM"
  },
  "turbomind": {}
}
lmdeploy/turbomind/hf_repo/configuration_lmdeploy.py  (deleted, 100644 → 0; last revision 5f83e392)

Removed content:

# Copyright (c) OpenMMLab. All rights reserved.
import copy

from transformers import PretrainedConfig

from lmdeploy.turbomind.deploy.target_model.base import TurbomindModelConfig
from lmdeploy.version import __version__ as lm_version


class LMDeployConfig(PretrainedConfig):
    """Lmdeploy config."""

    def __init__(self, turbomind: dict = None, **kwargs):
        default_tm_cfg = copy.deepcopy(
            TurbomindModelConfig.from_dict({}, allow_none=True).__dict__)
        if turbomind is not None:
            default_tm_cfg.update(turbomind)
        self.turbomind = default_tm_cfg
        self.lmdeploy_version = lm_version
        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
        config, kwargs = super().from_pretrained(
            pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs)
        for k, v in kwargs.items():
            if k in config.turbomind.keys():
                config.turbomind[k] = v
        if 'tp' in kwargs:
            config.turbomind['tensor_para_size'] = kwargs['tp']
        if return_unused_kwargs:
            return config, kwargs
        else:
            return config
lmdeploy/turbomind/hf_repo/modeling_lmdeploy.py  (deleted, 100644 → 0; last revision 5f83e392)

Removed content:

# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import os
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import count
from queue import Queue
from typing import List, Optional, Tuple, Union

from huggingface_hub import snapshot_download
from transformers import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging

from lmdeploy.turbomind import TurboMind
from lmdeploy.turbomind.utils import get_gen_param

from .configuration_lmdeploy import LMDeployConfig

logger = logging.get_logger(__name__)


@dataclass
class Session:
    _count = count()
    _session_id: int = None
    _message: List[Tuple[str, str]] = field(default_factory=list)
    _step: int = 0
    _nth_round: int = 0
    _error: int = 0

    def __init__(self):
        self._session_id = next(Session._count)
        self._message = []
        self._step = 0
        self._nth_round = 0

    @property
    def session_id(self):
        return self._session_id

    @property
    def message(self):
        return self._message

    @property
    def step(self):
        return self._step

    @property
    def nth_round(self):
        return self._nth_round

    @property
    def error(self):
        return self._error


class LMDeployForCausalLM(PreTrainedModel):
    config_class = LMDeployConfig

    def __init__(self,
                 config: LMDeployConfig,
                 *inputs,
                 model_path: str = None,
                 **kwargs):
        super().__init__(config)
        self.tm_model = TurboMind.from_pretrained(model_path, **kwargs)
        que = Queue()
        for _ in range(config.turbomind['max_batch_size']):
            que.put(self.tm_model.create_instance())
        self.que = que

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        *model_args,
                        config: Optional[Union[PretrainedConfig, str,
                                               os.PathLike]] = None,
                        cache_dir: Optional[Union[str, os.PathLike]] = None,
                        force_download: bool = False,
                        local_files_only: bool = False,
                        token: Optional[Union[str, bool]] = None,
                        revision: str = 'main',
                        **kwargs):
        """Instantiate a LM model with turbomind backend."""
        resume_download = kwargs.pop('resume_download', True)
        proxies = kwargs.pop('proxies', None)

        if os.path.isdir(pretrained_model_name_or_path):
            local_folder = pretrained_model_name_or_path
        else:
            local_folder = snapshot_download(
                pretrained_model_name_or_path,
                revision=revision,
                cache_dir=cache_dir,
                proxies=proxies,
                resume_download=resume_download,
                force_download=force_download,
                token=token,
                local_files_only=local_files_only,
            )

        if not isinstance(config, PretrainedConfig):
            config_path = config if config is not None else local_folder
            kwargs.pop('return_unused_kwargs')
            config, model_kwargs = cls.config_class.from_pretrained(
                config_path, return_unused_kwargs=True, **kwargs)
        else:
            model_kwargs = kwargs

        model = cls(config, *model_args, model_path=local_folder,
                    **model_kwargs)

        generation_config = model.tm_model.model.sampling_param
        for k, v in dataclasses.asdict(generation_config).items():
            if hasattr(model.generation_config, k):
                base_value = getattr(model.generation_config, k)
                setattr(generation_config, k, base_value)
            if k in kwargs:
                setattr(generation_config, k, v)
        model.generation_config = generation_config

        return model

    @contextmanager
    def managed_generator(self, session: Session):
        generator = self.que.get()
        try:
            yield generator
        except:  # noqa E722
            for _ in generator.stream_infer(session.session_id, [0],
                                            request_output_len=0,
                                            sequence_start=False,
                                            sequence_end=False,
                                            stop=True):
                pass
            session._error = 1
        finally:
            self.que.put(generator)

    def generate(
        self,
        input_ids: List[int],
        session: Session,
        **kwargs,
    ):
        """Generates sequences of token ids for models with a language
        modeling head.

        Args:
            input_ids (List(int)): list of input token ids
            session (Session) session information
            kwargs (dict): hoc parametrization of generation
        """
        with self.managed_generator(session) as generator:
            for outputs in generator.stream_infer(
                    session_id=session.session_id,
                    input_ids=[input_ids],
                    **kwargs,
            ):
                res, tokens = outputs[0]
                yield res, tokens

    def chat(
        self,
        query: str,
        session: Optional[Session] = None,
        cap: str = 'chat',
        request_output_len: int = 512,
        stream_output: bool = False,
        ignore_eos=False,
        random_seed: Optional[int] = None,
        **kwargs,
    ) -> Tuple[str, Session]:
        """chat."""

        if session is None:
            session = Session()
        assert session._error == 0, 'An error occurred before, ' \
                                    'please start a new session.'

        session._message.append([query, ''])

        prompt = self.tm_model.model.get_prompt(query, session.nth_round == 0)
        input_ids = self.tm_model.tokenizer.encode(prompt)

        if len(input_ids) + session.step + request_output_len >= \
                self.tm_model.session_len:
            logger.error(
                f'session_length exceeded {self.tm_model.session_len}')
            session._error = 1
            yield '', session
        else:
            gen_param = get_gen_param(cap, self.generation_config,
                                      session.nth_round + 1, session.step,
                                      request_output_len, **kwargs)
            gen_kwargs = dataclasses.asdict(gen_param)
            gen_kwargs.update(
                random_seed=random_seed if session.nth_round == 0 else None,
                stream_output=stream_output,
                ignore_eos=ignore_eos,
                **kwargs)

            _step = session._step
            _nth_round = session._nth_round
            response_size = 0

            for res, tokens in self.generate(input_ids,
                                             session=session,
                                             **gen_kwargs):
                response = self.tm_model.tokenizer.decode(
                    res.tolist(), offset=response_size)
                if response.endswith('�'):
                    continue
                response_size = tokens

                session._message[-1][-1] += response
                session._nth_round = _nth_round + 1
                session._step = _step + response_size

                yield response, session
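For context, a hedged sketch of how the removed hf_repo wrapper used to be consumed through transformers' trust_remote_code auto_map before this commit dropped it; the repo id below is purely illustrative.

from transformers import AutoModel

model = AutoModel.from_pretrained('some-org/some-lmdeploy-packed-model',  # hypothetical repo
                                  trust_remote_code=True)
session = None
for response, session in model.chat('Hello, introduce yourself', session):
    print(response, end='', flush=True)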
lmdeploy/turbomind/turbomind.py

 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
 import copy
 import io
 import json
 import logging
 import os.path as osp
 import sys
 from configparser import ConfigParser
 from contextlib import contextmanager
-from queue import Queue
+from queue import LifoQueue, Queue
 from threading import Thread
-from typing import Iterable, List, Optional
+from typing import Iterable, List, Optional, Union

 import numpy as np
 import torch
-from huggingface_hub import snapshot_download
 from torch.nn.utils.rnn import pad_sequence

 import lmdeploy
-from lmdeploy.model import MODELS, BaseModel
+from lmdeploy.messages import (EngineGenerationConfig, ResponseType,
+                               TurbomindEngineConfig)
+from lmdeploy.model import (MODELS, BaseModel, ChatTemplateConfig,
+                            best_match_model)
 from lmdeploy.tokenizer import Tokenizer
-from lmdeploy.utils import get_logger
+from lmdeploy.utils import _stop_words, get_logger, get_model
 from .deploy.converter import (get_model_format, supported_formats,
                                update_config_weight_type,
                                update_output_format)
 from .deploy.source_model.base import INPUT_MODELS
 from .deploy.target_model.base import OUTPUT_MODELS, TurbomindModelConfig
-from .utils import (ModelSource, check_tm_model_input,
-                    create_hf_download_args, get_hf_config_content,
-                    get_model_source)
+from .utils import ModelSource, get_model_from_config, get_model_source

 # TODO: find another way import _turbomind
 lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
 sys.path.append(osp.join(lmdeploy_dir, 'lib'))
 import _turbomind as _tm  # noqa: E402

-logger = logging.getLogger(__name__)
+logger = get_logger('lmdeploy')


-def _stop_words(stop_words: List[str], tokenizer: Tokenizer):
-    """return list of stop-words to numpy.ndarray."""
-    if stop_words is None:
-        return None
-    assert isinstance(stop_words, List) and \
-        all(isinstance(elem, str) for elem in stop_words), \
-        f'stop_words must be a list but got {type(stop_words)}'
-    stop_words = [
-        tokenizer.encode(stop_word, False)[-1] for stop_word in stop_words
-    ]
-    assert isinstance(stop_words, List) and all(
-        isinstance(elem, int) for elem in stop_words), 'invalid stop_words'
-    # each id in stop_words represents a stop word
-    # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
-    # detailed explanation about fastertransformer's stop_words
-    stop_word_offsets = range(1, len(stop_words) + 1)
-    stop_words = np.array([[stop_words, stop_word_offsets]]).astype(np.int32)
-    return stop_words
+def _construct_stop_or_bad_words(words: List[int] = None):
+    if words is None or len(words) == 0:
+        return None
+    offsets = range(1, len(words) + 1)
+    combined = np.array([[words, offsets]]).astype(np.int32)
+    return combined


 def _np_dict_to_tm_dict(np_dict: dict):
...
@@ -77,6 +64,59 @@ def _tm_dict_to_torch_dict(tm_dict: _tm.TensorMap):
     return ret


+def _update_engine_config(config: TurbomindEngineConfig, **kwargs):
+    if config is None:
+        config = TurbomindEngineConfig()
+    for k, v in kwargs.items():
+        if v and hasattr(config, k):
+            setattr(config, k, v)
+            logger.warning(f'kwargs {k} is deprecated to initialize model, '
+                           'use TurbomindEngineConfig instead.')
+    if config.model_name is not None:
+        logger.warning('model_name is deprecated in TurbomindEngineConfig '
+                       'and has no effect')
+    return config
+
+
+def _update_tm_config(dst: TurbomindModelConfig, src: TurbomindEngineConfig):
+    # A workaround to support max token number of each iteration in prefill
+    if src.max_prefill_token_num is not None and src.session_len is not None:
+        dst.num_tokens_per_iter = src.max_prefill_token_num
+        dst.max_prefill_iters = (src.session_len + src.max_prefill_token_num -
+                                 1) // src.max_prefill_token_num
+    dst_dict = copy.deepcopy(dst.__dict__)
+    src_dict = copy.deepcopy(src.__dict__)
+    src_dict['tensor_para_size'] = src_dict['tp']
+    for k, v in src_dict.items():
+        if v is not None and k in dst_dict:
+            dst_dict[k] = v
+    return TurbomindModelConfig.from_dict(dst_dict)
+
+
+def _compare_individual_gpu_memory(tp: int):
+    logger.setLevel(level=logging.INFO)
+    try:
+        total_mem = []
+        free_mem = []
+        for i in range(tp):
+            torch.cuda.set_device(i)
+            free, total = torch.cuda.mem_get_info()
+            total_mem.append(total / (1024**2))
+            free_mem.append(free / (1024**2))
+        all_total_equal = all(total == total_mem[0] for total in total_mem)
+        all_free_equal = all(free == free_mem[0] for free in free_mem)
+        if not all_total_equal or not all_free_equal:
+            logger.warning(
+                f'Memory discrepancy detected: Total Memory={total_mem} MB, \
+                Free Memory={free_mem} MB')
+    except Exception as e:
+        logger.error(f'An exception occurred: {e}')
+
+
 @contextmanager
 def cuda_ctx(device_id):
     old_device = torch.cuda.current_device()
...
@@ -102,34 +142,75 @@ class TurboMind:
     def __init__(self,
                  model_path: str,
+                 engine_config: TurbomindEngineConfig = None,
                  model_source: ModelSource = ModelSource.WORKSPACE,
                  model_name: Optional[str] = None,
                  model_format: Optional[str] = None,
                  group_size: Optional[int] = None,
                  tp: Optional[int] = None,
+                 chat_template_config: Optional[ChatTemplateConfig] = None,
                  **kwargs):
-        self.gpu_count = tp if tp is not None else 1
-        assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
+        # check memory equality when tp
+        if tp is not None:
+            assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
+            if tp > 1:
+                _compare_individual_gpu_memory(tp)
+        elif engine_config is not None and engine_config.tp is not None:
+            if engine_config.tp > 1:
+                _compare_individual_gpu_memory(engine_config.tp)
+
+        # if loading from workspace and engine_config is None, use config.ini
+        # and ignore passed args like model_format, tp, etc.
+        if model_source == ModelSource.WORKSPACE and engine_config is None:
+
+            def _catch_args(**kwargs):
+                args = []
+                for k, v in kwargs.items():
+                    if v and hasattr(TurbomindEngineConfig, k):
+                        args.append(k)
+                return args
+
+            args = _catch_args(**kwargs, model_format=model_format, tp=tp)
+            if len(args) > 0:
+                logger.warning(
+                    f'loading from workspace, ignore args {args} '
+                    'please use TurbomindEngineConfig or modify config.ini')
+        else:
+            engine_config = _update_engine_config(engine_config,
+                                                  model_format=model_format,
+                                                  group_size=group_size,
+                                                  tp=tp,
+                                                  **kwargs)
+
+        tp = engine_config.tp if engine_config is not None else 1
+        assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
+        self.gpu_count = tp

         if model_source == ModelSource.WORKSPACE:
             tokenizer_model_path = osp.join(model_path, 'triton_models',
                                             'tokenizer')
             self.tokenizer = Tokenizer(tokenizer_model_path)
-            self.model_comm = self._from_workspace(model_path)
+            self.model_comm = self._from_workspace(model_path=model_path,
+                                                   engine_config=engine_config)
         else:
+            if not osp.exists(model_path):
+                model_path = get_model(model_path, engine_config.download_dir,
+                                       engine_config.revision)
             self.tokenizer = Tokenizer(model_path)
             self.model_comm = self._from_hf(model_source=model_source,
                                             model_path=model_path,
-                                            model_name=model_name,
-                                            model_format=model_format,
-                                            group_size=group_size,
-                                            tp=tp,
-                                            **kwargs)
+                                            engine_config=engine_config)

-        self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
-        self.session_len = self.model.session_len
+        if chat_template_config:
+            if chat_template_config.model_name is None:
+                chat_template_config.model_name = self.model_name
+                logger.warning(f'Input chat template with model_name is None. '
+                               f'Forcing to use {self.model_name}')
+            self.model = chat_template_config.chat_template
+        else:
+            self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
+        self.session_len = self.config.session_len
         self.eos_id = self.tokenizer.eos_token_id
         self.stop_words = _stop_words(self.model.stop_words, self.tokenizer)

     def _create_weight(self, model_comm):
...
@@ -194,88 +275,61 @@ class TurboMind:
                 tm_params[k] = []
             tm_params[k].append(v)

-    def _from_hf(self,
-                 model_source: ModelSource,
-                 model_path: str,
-                 model_name: Optional[str] = None,
-                 model_format: Optional[str] = None,
-                 group_size: Optional[int] = None,
-                 tp: Optional[int] = None,
-                 **kwargs):
+    def _from_hf(self, model_source: ModelSource, model_path: str,
+                 engine_config: TurbomindEngineConfig):
         """Load model which is in hf format."""
-        # get model_name, group_size if is lmdeploy managed.
-        if model_source == ModelSource.HF_LMDEPLOY:
-            config = get_hf_config_content(model_path, local_files_only=True)
-            tm_config = config['turbomind']
-            tm_config.update(kwargs)
-            var_shoud_be_none = dict(model_name=model_name,
-                                     model_format=model_format,
-                                     group_size=group_size)
-            for key, value in var_shoud_be_none.items():
-                assert value is None, f'{key} should be None when model is ' \
-                    f'from {model_source}'
-            model_name = tm_config['model_name']
-            group_size = tm_config['group_size']
-            if tm_config['weight_type'] == 'int4':
-                model_format = 'awq'
-        else:
-            assert model_name is not None, 'please supply model_name when ' \
-                f'model is form {model_source}'
-            if osp.exists(osp.join(model_path, 'outputs_stats.pth')):
-                model_format = 'awq' if model_format is None else model_format
-                group_size = 128 if group_size is None else group_size
-            tm_config = kwargs
-
-        assert model_name in MODELS.module_dict.keys(), \
-            f"'{model_name}' is not supported. " \
-            f'The supported models are: {MODELS.module_dict.keys()}'
-        assert model_format in supported_formats, 'the model format ' \
-            f'should be in {supported_formats}'
+        assert model_source == ModelSource.HF_MODEL, \
+            f'{model_source} is not supported'
+        assert engine_config.model_format in supported_formats, \
+            f'The model format should be in {supported_formats}'
+
+        # update model_format if not supplied and outputs_stats.pth exists
+        if osp.exists(osp.join(model_path, 'outputs_stats.pth')) and \
+                engine_config.model_format is None:
+            engine_config.model_format = 'awq'
+
+        # when convert model, use architectures in config.json
+        model_arch = get_model_from_config(model_path)

         data_type = 'fp16'
         output_format = 'fp16'
-        inferred_model_format = get_model_format(model_name, model_format)
-        cfg = TurbomindModelConfig.from_dict(tm_config, allow_none=True)
-
-        # overwrite with input params
-        cfg.model_name = model_name
-        cfg.tensor_para_size = 1 if tp is None else tp
-        cfg.rotary_embedding = cfg.size_per_head
-        cfg.group_size = group_size
+        inferred_model_format = get_model_format(model_arch,
+                                                 engine_config.model_format)
+        cfg = TurbomindModelConfig.from_engine_config(engine_config)
+        match_name = best_match_model(model_path)
+        # for session len
+        cfg.model_name = match_name \
+            if match_name is not None else 'base'
         if inferred_model_format.find('awq') != -1:
             cfg.weight_type = 'int4'
             output_format = 'w4'
             data_type = 'int4'
-            assert group_size > 0, f'group_size: {group_size} should > 0'
+            cfg.group_size = 128
         else:
-            output_format = update_output_format(model_name,
+            output_format = update_output_format(cfg.model_name,
                                                  inferred_model_format,
                                                  model_path, output_format)
             data_type = output_format
             update_config_weight_type(output_format, cfg)

-        self.config = cfg
-        self.model_name = model_name
-        self.data_type = data_type
-
         input_model = INPUT_MODELS.get(inferred_model_format)(
             model_path=model_path, tokenizer_path=model_path, ckpt_path=None)

         output_model = OUTPUT_MODELS.get(output_format)(
             input_model=input_model, cfg=cfg, to_file=False, out_dir='')

-        config = copy.deepcopy(output_model.cfg.__dict__)
-        logger.warning(f'model_config:\n{json.dumps(config, indent=2)}')
-        parser = ConfigParser()
-        parser['llama'] = config
-        with io.StringIO() as ss:
-            parser.write(ss)
-            ss.seek(0)
-            config = ss.read()
+        cfg = output_model.cfg
+        if engine_config.session_len is not None:
+            cfg.session_len = engine_config.session_len
+        self.model_name = cfg.model_name
+        self.config = cfg
+        self.data_type = data_type
+        logger.warning(f'model_config:\n\n{cfg.toini()}')

         model_comm = _tm.AbstractTransformerModel.create_llama_model(
             model_dir='',
-            config=config,
+            config=cfg.toini(),
             tensor_para_size=self.gpu_count,
             data_type=data_type)
...
@@ -289,35 +343,48 @@ class TurboMind:
         output_model.export()

         # load kv qparams
-        self._load_kv_qparams(model_path, tm_params, **kwargs)
+        self._load_kv_qparams(model_path, tm_params, kv_sym=False, kv_bits=8)
         assert len(tm_params) == 0, f'missing {tm_params.keys()}'

         return model_comm

-    def _from_workspace(self, model_path: str):
+    def _from_workspace(self, model_path: str,
+                        engine_config: TurbomindEngineConfig):
         """Load model which is converted by `lmdeploy convert`"""
         ini_path = osp.join(model_path, 'triton_models', 'weights',
                             'config.ini')
+        # load cfg
         with open(ini_path, 'r') as f:
             parser = ConfigParser()
             parser.read_file(f)
-            section_name = 'llama'
-            tp_cfg = parser.getint(section_name, 'tensor_para_size')
-
-            if tp_cfg != 1 and tp_cfg != self.gpu_count:
-                get_logger('turbomind').info(
-                    f'found tp={tp_cfg} in config.ini.')
-                self.gpu_count = tp_cfg
-            self.model_name = parser.get(section_name, 'model_name')
-            self.data_type = parser.get(section_name, 'weight_type')
-            cfg = parser._sections[section_name]
-            cfg = TurbomindModelConfig.from_dict(cfg)
-            self.config = cfg
+        section_name = 'llama'
+        _cfg = parser._sections[section_name]
+        cfg = TurbomindModelConfig.from_dict(_cfg)
+
+        # check whether input tp is valid
+        if cfg.tensor_para_size != 1 and \
+                self.gpu_count != cfg.tensor_para_size:
+            logger.info(f'found tp={cfg.tensor_para_size} in config.ini.')
+            self.gpu_count = cfg.tensor_para_size
+
+        # update cfg
+        if engine_config is not None:
+            engine_config.tp = cfg.tensor_para_size
+            cfg = _update_tm_config(cfg, engine_config)
+            if engine_config.session_len is not None:
+                cfg.session_len = engine_config.session_len
+
+        # update cls
+        self.config = cfg
+        self.model_name = cfg.model_name
+        self.data_type = cfg.weight_type

+        # create model
+        logger.warning(f'model_config:\n\n{cfg.toini()}')
         weight_dir = osp.join(model_path, 'triton_models', 'weights')
         model_comm = _tm.AbstractTransformerModel.create_llama_model(
-            weight_dir,
+            model_dir=weight_dir,
+            config=cfg.toini(),
             tensor_para_size=self.gpu_count,
             data_type=self.data_type)
...
@@ -326,13 +393,16 @@ class TurboMind:
         return model_comm

     @classmethod
     def from_pretrained(cls,
                         pretrained_model_name_or_path: str,
+                        engine_config: TurbomindEngineConfig = None,
                         model_name: Optional[str] = None,
                         model_format: Optional[str] = None,
                         group_size: Optional[int] = None,
                         tp: Optional[int] = None,
+                        chat_template_config: Optional[ChatTemplateConfig] = None,
                         **kwargs):
         """LMDeploy's turbomind inference engine.

         Args:
...
@@ -346,7 +416,7 @@ class TurboMind:
                 "InternLM/internlm-chat-20b-4bit",
                 "lmdeploy/llama2-chat-70b-4bit", etc.
               - iii) The model_id of a model hosted inside a model repo
-                on huggingface.co, such as "InternLM/internlm-chat-7b",
+                on huggingface.co, such as "internlm/internlm-chat-7b",
                 "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
                 and so on.
             model_name (str): needed when pretrained_model_name_or_path is c)
...
@@ -357,26 +427,14 @@ class TurboMind:
             Can be used to update configuration when initialize the engine.
         """
         model_source = get_model_source(pretrained_model_name_or_path)
-        if model_source == ModelSource.WORKSPACE:
-            local_path = pretrained_model_name_or_path
-        else:
-            check_tm_model_input(pretrained_model_name_or_path,
-                                 model_name=model_name,
-                                 **kwargs)
-            if not osp.exists(pretrained_model_name_or_path):
-                download_kwargs = create_hf_download_args(**kwargs)
-                local_path = snapshot_download(pretrained_model_name_or_path,
-                                               **download_kwargs)
-            else:
-                local_path = pretrained_model_name_or_path
-
         logger.warning(f'model_source: {model_source}')
-        return cls(model_source=model_source,
-                   model_path=local_path,
-                   model_name=model_name,
+        return cls(model_path=pretrained_model_name_or_path,
+                   engine_config=engine_config,
+                   model_source=model_source,
                    model_format=model_format,
                    group_size=group_size,
                    tp=tp,
+                   chat_template_config=chat_template_config,
                    **kwargs)

     def create_instance(self, cuda_stream_id=0):
...
@@ -406,8 +464,6 @@ class TurboMindInstance:
         self.gpu_count = tm_model.gpu_count

         self.stop_words = tm_model.stop_words
-        self.stop_tokens = [] if self.stop_words is None else \
-            self.stop_words.flatten().tolist()
         self.eos_id = tm_model.eos_id
         self.session_len = tm_model.session_len
...
@@ -456,23 +512,92 @@ class TurboMindInstance:
             t.start()
             self.threads[device_id] = t

+    def _async_forward_callback(self, result, ctx, que: LifoQueue):
+        que.put((False, result))
+
+    def _async_forward_thread(self, inputs, que: LifoQueue):
+        instance_comm = self.tm_model.model_comm.create_instance_comm(
+            self.gpu_count)
+
+        def _func(device_id, enque_output):
+            with cuda_ctx(device_id):
+                output = self.model_insts[device_id].forward(
+                    inputs, instance_comm)
+                if enque_output:
+                    que.put((True, output))
+
+        for device_id in range(self.gpu_count):
+            t = Thread(target=_func,
+                       args=(device_id, device_id == 0),
+                       daemon=True)
+            t.start()
+            self.threads[device_id] = t
+
+    def _update_generation_config(self, config: EngineGenerationConfig,
+                                  **kwargs: dict):
+        if config is None:
+            config = EngineGenerationConfig()
+
+        # backward compatibility
+        # if doesn't supply stop words, use default
+        if config.stop_words is None and self.stop_words is not None:
+            config.stop_words = self.stop_words[0][0].tolist()
+
+        deprecated_kwargs = []
+        for k, v in kwargs.items():
+            if k in config.__dict__:
+                config.__dict__[k] = v
+                deprecated_kwargs.append(k)
+        if 'request_output_len' in kwargs:
+            config.max_new_tokens = kwargs['request_output_len']
+            deprecated_kwargs.append('request_output_len')
+        for k in deprecated_kwargs:
+            logger.warning(f'kwargs {k} is deprecated for inference, '
+                           'use GenerationConfig instead.')
+        return config
+
     def end(self, session_id: int):
         """End the given session."""
         input_ids = [self.tm_model.tokenizer.eos_token_id]
         end_generator = self.tm_model.create_instance()
         for outputs in end_generator.stream_infer(session_id,
                                                   input_ids,
                                                   request_output_len=0,
                                                   sequence_start=False,
                                                   sequence_end=True):
             pass

+    async def async_end(self, session_id: int):
+        """End the given session."""
+        self.end(session_id)
+        await asyncio.sleep(0.002)
+
     def cancel(self, session_id: int):
         """Stop current streaming inference."""
         input_ids = [self.tm_model.tokenizer.eos_token_id]
         stop_generator = self.tm_model.create_instance()
         for outputs in stop_generator.stream_infer(session_id,
                                                    input_ids,
                                                    request_output_len=0,
                                                    sequence_start=False,
                                                    sequence_end=False,
                                                    stop=True):
             pass

+    async def async_cancel(self, session_id: int):
+        """End the given session."""
+        self.cancel(session_id)
+        await asyncio.sleep(0.002)
+
     def prepare_inputs(self,
                        session_id,
                        input_ids,
+                       gen_config: EngineGenerationConfig,
                        input_embeddings=None,
                        input_embedding_ranges=None,
-                       request_output_len: int = 512,
                        sequence_start: bool = True,
                        sequence_end: bool = False,
                        step=0,
-                       stop=False,
-                       top_p=0.8,
-                       top_k=40,
-                       temperature=0.8,
-                       repetition_penalty=1.0,
-                       ignore_eos=False,
-                       random_seed=None,
-                       stream_output=False):
+                       stop=False):
         """Convert inputs format."""
         if len(input_ids) == 0:
             input_ids = [[]]
...
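A standalone sketch (plain dataclass, not the real EngineGenerationConfig) of the backward-compatibility rule implemented by _update_generation_config above: any deprecated kwarg whose name matches a config field overrides it, and request_output_len is mapped onto max_new_tokens.

from dataclasses import dataclass, asdict


@dataclass
class FakeGenConfig:  # stand-in for EngineGenerationConfig, illustration only
    top_k: int = 40
    top_p: float = 0.8
    temperature: float = 0.8
    max_new_tokens: int = 512


def update_generation_config(config=None, **kwargs):
    config = config or FakeGenConfig()
    for k, v in kwargs.items():
        if k in config.__dict__:  # deprecated per-call kwarg overrides the field
            config.__dict__[k] = v
    if 'request_output_len' in kwargs:  # legacy name maps to max_new_tokens
        config.max_new_tokens = kwargs['request_output_len']
    return config


print(asdict(update_generation_config(top_k=1, request_output_len=128)))
# {'top_k': 1, 'top_p': 0.8, 'temperature': 0.8, 'max_new_tokens': 128}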
@@ -504,19 +629,16 @@ class TurboMindInstance:
             input_ids=input_ids,
             input_lengths=input_lengths,
             request_output_len=np.full(input_lengths.shape,
-                                       request_output_len,
+                                       gen_config.max_new_tokens,
                                        dtype=np.uint32),
-            runtime_top_k=_broadcast_np(top_k, np.uint32),
-            runtime_top_p=_broadcast_np(top_p, np.float32),
-            temperature=_broadcast_np(temperature, np.float32),
-            repetition_penalty=_broadcast_np(repetition_penalty, np.float32),
+            runtime_top_k=_broadcast_np(gen_config.top_k, np.uint32),
+            runtime_top_p=_broadcast_np(gen_config.top_p, np.float32),
+            temperature=_broadcast_np(gen_config.temperature, np.float32),
+            repetition_penalty=_broadcast_np(gen_config.repetition_penalty,
+                                             np.float32),
             step=step,

             # session input
             session_len=self.session_len *
             np.ones([batch_size], dtype=np.uint32),
             START=_broadcast_np((1 if sequence_start else 0), np.int32),
             END=_broadcast_np((1 if sequence_end else 0), np.int32),
             CORRID=np.array(session_id, dtype=np.uint64),
...
@@ -560,20 +682,29 @@ class TurboMindInstance:
             inputs['input_embeddings'] = input_embeddings
             inputs['input_embedding_ranges'] = input_embedding_ranges

-        if ignore_eos:
+        if gen_config.min_new_tokens is not None:
+            inputs['min_length'] = _broadcast_np(gen_config.min_new_tokens,
+                                                 np.int32)
+
+        bad_words = []
+        if gen_config.bad_words is not None:
+            bad_words.extend(gen_config.bad_words)
+        if gen_config.ignore_eos:
             stop_words = None
-            bad_words = torch.tensor([[[self.eos_id], [1]]], dtype=torch.int32)
+            bad_words.append(self.eos_id)
         else:
-            stop_words = self.stop_words
-            bad_words = None
+            stop_words = gen_config.stop_words
+        stop_words = _construct_stop_or_bad_words(stop_words)
+        bad_words = _construct_stop_or_bad_words(bad_words)

         if stop_words is not None:
             inputs['stop_words_list'] = stop_words
         if bad_words is not None:
             inputs['bad_words_list'] = bad_words

-        if random_seed is not None:
-            inputs['random_seed'] = _broadcast_np(random_seed, np.uint64)
+        if gen_config.random_seed is not None:
+            inputs['random_seed'] = _broadcast_np(gen_config.random_seed,
+                                                  np.uint64)
         return inputs, input_lengths

     async def async_stream_infer(self,
...
@@ -581,18 +712,13 @@ class TurboMindInstance:
                              input_ids,
                              input_embeddings=None,
                              input_embedding_ranges=None,
-                             request_output_len: int = 512,
                              sequence_start: bool = True,
                              sequence_end: bool = False,
                              step=0,
                              stop=False,
-                             top_p=0.8,
-                             top_k=40,
-                             temperature=0.8,
-                             repetition_penalty=1.0,
-                             ignore_eos=False,
-                             random_seed=None,
-                             stream_output=False):
+                             gen_config: EngineGenerationConfig = None,
+                             stream_output=False,
+                             **kwargs):
         """Perform model inference.

         Args:
...
@@ -601,60 +727,46 @@ class TurboMindInstance:
             input_embeddings (List[numpy.ndarray]): embeddings features
             input_embedding_ranges (List[Tuple[int,int]]): the begin/end
                 offsets of input_embeddings to input_ids
-            request_output_len (int): the max number of to-be-generated tokens
             sequence_start (bool): indicator for starting a sequence
             sequence_end (bool): indicator for ending a sequence
             step (int): the offset of the k/v cache
             stop (bool): indicator for cancelling the session
-            top_p (float): If set to float < 1, only the smallest set of most
-                probable tokens with probabilities that add up to top_p or
-                higher are kept for generation.
-            top_k (int): The number of the highest probability vocabulary
-                tokens to keep for top-k-filtering
-            temperature (float): to modulate the next token probability
-            repetition_penalty (float): The parameter for repetition penalty.
-                1.0 means no penalty
-            ignore_eos (bool): indicator for ignoring eos
-            random_seed (int): seed used by sampling
+            gen_config (EngineGenerationConfig): generation config
             stream_output (bool): indicator for stream output
+            kwargs (dict): kwargs for backward compatibility
         """
+        # start forward thread
+        que = LifoQueue()
+        from functools import partial
+        _forward_callback = partial(self._async_forward_callback, que=que)
+        _forward_thread = partial(self._async_forward_thread, que=que)
         if stream_output and not stop:
-            self.model_insts[0].register_callback(self._forward_callback)
+            self.model_insts[0].register_callback(_forward_callback)

+        gen_config = self._update_generation_config(gen_config, **kwargs)
         inputs, input_lengths = self.prepare_inputs(
             session_id=session_id,
             input_ids=input_ids,
             input_embeddings=input_embeddings,
             input_embedding_ranges=input_embedding_ranges,
-            request_output_len=request_output_len,
             sequence_start=sequence_start,
             sequence_end=sequence_end,
             step=step,
             stop=stop,
-            top_p=top_p,
-            top_k=top_k,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            ignore_eos=ignore_eos,
-            random_seed=random_seed,
-            stream_output=stream_output)
+            gen_config=gen_config)

         tm_inputs = _np_dict_to_tm_dict(inputs)
-        # start forward thread
-        self.que = Queue()
-        self._forward_thread(tm_inputs)
+        _forward_thread(tm_inputs)

         seq_start = input_lengths + input_lengths.new_tensor(step)
+        prev_len = 0

         # generator
         while True:
-            # Thanks for https://github.com/frankxyy and his issue
-            # https://github.com/InternLM/lmdeploy/issues/832
-            while self.que.qsize() == 0:
-                await asyncio.sleep(0)
-            while self.que.qsize() > 1:
-                self.que.get()
+            while que.qsize() == 0:  # let other requests in
+                await asyncio.sleep(0.002)

-            finish, tm_outputs = self.que.get()
+            finish, tm_outputs = que.get()

             outputs = _tm_dict_to_torch_dict(tm_outputs)
...
@@ -667,22 +779,27 @@ class TurboMindInstance:
             sequence_length -= seq_start.to(sequence_length.device)

-            outputs = []
+            status = ResponseType.FINISH if finish else ResponseType.SUCCESS
             for output, len_ in zip(output_ids, sequence_length):
                 output, len_ = output, len_.item()
-                if len(output) > 0 and output[-1].item() == self.eos_id \
-                        and not ignore_eos:
-                    outputs.append((output[:-1], len_ - 1))
-                elif len(output) > 0 and output[-1].item() in self.stop_tokens:
-                    outputs.append((output[:-1], len_))
+                if len(output) > 0 and output[-1].item() == self.eos_id \
+                        and not gen_config.ignore_eos:
+                    outputs = (status, output[:-1].tolist(), len_ - 1)
+                elif len(output) > 0 and \
+                        gen_config.stop_words is not None and \
+                        output[-1].item() in gen_config.stop_words:
+                    outputs = (status, output[:-1].tolist(), len_)
                 else:
-                    outputs.append((output, len_))
+                    outputs = (status, output.tolist(), len_)

+            if outputs[-1] < prev_len and not finish:
+                continue
+            else:
+                prev_len = outputs[-1]
+
             yield outputs

             if finish:
                 for t in self.threads:
                     t.join()
                 while self.que.qsize() > 0:
                     self.que.get()
                 break

         if stream_output and not stop:
...
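A hedged sketch of consuming the reworked streaming output: each yielded item is now a (status, token_ids, length) tuple rather than a list of (tensor, length) pairs, and sampling parameters travel in an EngineGenerationConfig. The model path is a placeholder.

from lmdeploy.messages import EngineGenerationConfig
from lmdeploy.turbomind import TurboMind

tm_model = TurboMind.from_pretrained('/path/to/model')  # placeholder path
generator = tm_model.create_instance()
input_ids = tm_model.tokenizer.encode('Hello')
gen_config = EngineGenerationConfig(top_k=40, max_new_tokens=128)
for status, token_ids, length in generator.stream_infer(session_id=1,
                                                         input_ids=[input_ids],
                                                         gen_config=gen_config,
                                                         sequence_start=True,
                                                         sequence_end=True,
                                                         stream_output=True):
    print(tm_model.tokenizer.decode(token_ids))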
@@ -693,18 +810,13 @@ class TurboMindInstance:
                      input_ids,
                      input_embeddings=None,
                      input_embedding_ranges=None,
-                     request_output_len: int = 512,
                      sequence_start: bool = True,
                      sequence_end: bool = False,
                      step=0,
                      stop=False,
-                     top_p=0.8,
-                     top_k=40,
-                     temperature=0.8,
-                     repetition_penalty=1.0,
-                     ignore_eos=False,
-                     random_seed=None,
-                     stream_output=False):
+                     gen_config: EngineGenerationConfig = None,
+                     stream_output=False,
+                     **kwargs):
         """Perform model inference.

         Args:
...
@@ -713,42 +825,28 @@ class TurboMindInstance:
             input_embeddings (List[numpy.ndarray]): embeddings features
             input_embedding_ranges (List[Tuple[int,int]]): the begin/end
                 offsets of input_embeddings to input_ids
-            request_output_len (int): the max number of to-be-generated tokens
             sequence_start (bool): indicator for starting a sequence
             sequence_end (bool): indicator for ending a sequence
             step (int): the offset of the k/v cache
             stop (bool): indicator for cancelling the session
-            top_p (float): If set to float < 1, only the smallest set of most
-                probable tokens with probabilities that add up to top_p or
-                higher are kept for generation.
-            top_k (int): The number of the highest probability vocabulary
-                tokens to keep for top-k-filtering
-            temperature (float): to modulate the next token probability
-            repetition_penalty (float): The parameter for repetition penalty.
-                1.0 means no penalty
-            ignore_eos (bool): indicator for ignoring eos
-            random_seed (int): seed used by sampling
+            gen_config (EngineGenerationConfig): generation config
             stream_output (bool): indicator for stream output
+            kwargs (dict): kwargs for backward compatibility
         """
         if stream_output and not stop:
             self.model_insts[0].register_callback(self._forward_callback)

+        gen_config = self._update_generation_config(gen_config, **kwargs)
         inputs, input_lengths = self.prepare_inputs(
             session_id=session_id,
             input_ids=input_ids,
             input_embeddings=input_embeddings,
             input_embedding_ranges=input_embedding_ranges,
-            request_output_len=request_output_len,
             sequence_start=sequence_start,
             sequence_end=sequence_end,
             step=step,
             stop=stop,
-            top_p=top_p,
-            top_k=top_k,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            ignore_eos=ignore_eos,
-            random_seed=random_seed,
-            stream_output=stream_output)
+            gen_config=gen_config)

         tm_inputs = _np_dict_to_tm_dict(inputs)

         # start forward thread
...
@@ -775,15 +873,18 @@ class TurboMindInstance:
             sequence_length -= seq_start.to(sequence_length.device)

-            outputs = []
+            status = ResponseType.FINISH if finish else ResponseType.SUCCESS
             for output, len_ in zip(output_ids, sequence_length):
                 output, len_ = output, len_.item()
-                if len(output) > 0 and output[-1].item() == self.eos_id \
-                        and not ignore_eos:
-                    outputs.append((output[:-1], len_ - 1))
-                elif len(output) > 0 and output[-1].item() in self.stop_tokens:
-                    outputs.append((output[:-1], len_))
+                if len(output) > 0 and output[-1].item() == self.eos_id \
+                        and not gen_config.ignore_eos:
+                    outputs = (status, output[:-1].tolist(), len_ - 1)
+                elif len(output) > 0 and \
+                        gen_config.stop_words is not None and \
+                        output[-1].item() in gen_config.stop_words:
+                    outputs = (status, output[:-1].tolist(), len_)
                 else:
-                    outputs.append((output, len_))
+                    outputs = (status, output.tolist(), len_)

             yield outputs

             if finish:
...
@@ -796,17 +897,27 @@ class TurboMindInstance:
        if stream_output and not stop:
            self.model_insts[0].unregister_callback()

    def decode(self, input_ids):
    def decode(self,
               input_ids,
               steps: List[int] = None,
               sequence_start: bool = True,
               sequence_end: bool = True):
"""Perform context decode on input tokens.
Args:
input_ids (numpy.ndarray): the batch of input token ids
steps (List[int]): the offset of the k/v cache
sequence_start (bool): indicator for starting a sequence
sequence_end (bool): indicator for ending a sequence
"""
        if len(input_ids) == 0:
            input_ids = [[]]
        if isinstance(input_ids[0], int):
            input_ids = [input_ids]
        if steps is None:
            steps = [0] * len(input_ids)
        assert isinstance(steps, List) and len(steps) == len(input_ids)
# append an extra token since input_len-1 tokens will be
# decoded by context decoder
...
...
@@ -827,11 +938,16 @@ class TurboMindInstance:
        input_ids = pad_sequence(input_ids,
                                 batch_first=True,
                                 padding_value=self.eos_id)
        steps = torch.IntTensor([step for step in steps])

        inputs = dict(input_ids=input_ids,
                      input_lengths=input_lengths,
                      request_output_len=_broadcast_np(0, dtype=np.uint32),
                      is_return_logits=_broadcast_np(1, np.uint32))
                      is_return_logits=_broadcast_np(1, np.uint32),
                      START=_broadcast_np((1 if sequence_start else 0),
                                          np.int32),
                      END=_broadcast_np((1 if sequence_end else 0), np.int32),
                      step=steps)

        tm_inputs = _np_dict_to_tm_dict(inputs)
...
...
@@ -844,3 +960,83 @@ class TurboMindInstance:
        logits = outputs['logits']

        return logits[:, :-1, :]

    def get_ppl(self, input_ids: Union[List[int], List[List[int]]]):
        """Get perplexity scores given a list of input tokens.

        Args:
            input_ids (Union[List[int], List[List[int]]]): the batch of input token ids
        """  # noqa 501
        if len(input_ids) == 0:
            input_ids = [[]]
        if isinstance(input_ids[0], int):
            input_ids = [input_ids]

        max_input_len = 16 * 1024
        # max_input_len = 16
        n_max_iter = np.ceil(
            max([len(input_id)
                 for input_id in input_ids]) / max_input_len).astype(int)

        device = 'cpu' if n_max_iter > 1 else 'cuda'

        index_range_starts = []
        index_range_ends = []
        for input_id in input_ids:
            index_range_start = np.array(
                [i * max_input_len for i in range(n_max_iter)])
            index_range_end = index_range_start + max_input_len
            index_range_start[index_range_start >= len(input_id)] = len(
                input_id)
            index_range_end[index_range_end >= len(input_id)] = len(input_id)
            index_range_starts.append(index_range_start)
            index_range_ends.append(index_range_end)

        logits = []
        for i in range(n_max_iter):
            steps = [start[i] for start in index_range_starts]
            _input_ids = [
                input_id[start[i]:end[i]] for input_id, start, end in zip(
                    input_ids, index_range_starts, index_range_ends)
            ]
            _logits = self.decode(_input_ids,
                                  steps,
                                  sequence_start=(i == 0),
                                  sequence_end=(i == n_max_iter - 1))
            _logits = _logits.to(device=device)
            logits.append(_logits)

        # concat logits. Shape is [bsz, seq_len, vocab_size]
        logits = torch.cat(logits, dim=1)

        # get target ids
        padding_token_id = -100
        target_ids = [(_input_ids + [padding_token_id])[1:]
                      for _input_ids in input_ids]
        target_ids = [
            torch.Tensor(torch.LongTensor(_target_ids))
            for _target_ids in target_ids
        ]
        target_ids = pad_sequence(target_ids,
                                  batch_first=True,
                                  padding_value=padding_token_id)
        target_ids = target_ids.to(logits.device)
        target_mask = target_ids != padding_token_id
        target_count = torch.sum(target_mask, dim=-1)

        # compute cross entropy loss
        bsz, seq_len, vocab_size = logits.shape
        flat_logits = logits.contiguous().view(-1, vocab_size)
        flat_target_ids = target_ids.contiguous().view(-1)
        flat_loss_matrix = torch.nn.functional.cross_entropy(
            flat_logits,
            flat_target_ids,
            reduction='none',
            ignore_index=padding_token_id)
        loss_matrix = flat_loss_matrix.view(bsz, seq_len)
        loss_sum = torch.sum(loss_matrix * target_mask, dim=1)
        loss_avg = loss_sum / target_count
        loss_avg = loss_avg.cpu().numpy()
        return loss_avg
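Since loss_avg is the mean token-level cross-entropy (natural log) per sequence rather than the exponentiated perplexity, a caller converts it with np.exp. A small hedged example, assuming a tokenizer object with an encode() method and `generator` being a TurboMindInstance:

import numpy as np

token_ids = [tokenizer.encode(p) for p in ['Hello world', 'lmdeploy is fast']]
loss_avg = generator.get_ppl(token_ids)   # mean negative log-likelihood per sequence
ppl = np.exp(loss_avg)                    # conventional perplexity
print(ppl)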
lmdeploy/turbomind/utils.py
View file @
d7117b95
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import json
import logging
import os

from huggingface_hub import hf_hub_download
from transformers.utils import ExplicitEnum

logger = logging.getLogger(__name__)
from lmdeploy.utils import get_logger

logger = get_logger('lmdeploy')


class ModelSource(ExplicitEnum):
    """Turbomind model source."""
    WORKSPACE = 'workspace'
    HF_MODEL = 'hf_model'
    HF_LMDEPLOY = 'hf_lmdeploy'


def create_hf_download_args(**kwargs) -> dict:
    download_kwargs = {
        'revision': None,
        'cache_dir': None,
        'proxies': None,
        'resume_download': True,
        'force_download': False,
        'token': None,
        'local_files_only': False
    }
    for k in download_kwargs.keys():
        if k in kwargs:
            download_kwargs[k] = kwargs[k]
    return download_kwargs


def get_hf_config_path(pretrained_model_name_or_path, **kwargs) -> str:
    """Get local hf config local file path."""
    if os.path.exists(pretrained_model_name_or_path):
        config_path = os.path.join(pretrained_model_name_or_path,
                                   'config.json')
    else:
        download_kwargs = create_hf_download_args(**kwargs)
        config_path = hf_hub_download(pretrained_model_name_or_path,
                                      'config.json', **download_kwargs)
    return config_path


def get_hf_config_content(pretrained_model_name_or_path, **kwargs) -> dict:
    """Get config content of a hf model."""
    config_path = get_hf_config_path(pretrained_model_name_or_path, **kwargs)
    with open(config_path, 'r') as f:
        config = json.load(f)
    return config


def get_model_source(pretrained_model_name_or_path: str,
...
...
@@ -60,61 +21,33 @@ def get_model_source(pretrained_model_name_or_path: str,
                                     'triton_models')
    if os.path.exists(triton_model_path):
        return ModelSource.WORKSPACE
    config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
    model_source = ModelSource.HF_LMDEPLOY if 'turbomind' in config \
        else ModelSource.HF_MODEL
    return model_source


def check_tm_model_input(pretrained_model_name_or_path, **kwargs):
    """Check if single input pretrained_model_name_or_path is enough to use."""
    if kwargs.get('model_name', None):
        return
    model_source = get_model_source(pretrained_model_name_or_path, **kwargs)
    if model_source == ModelSource.WORKSPACE:
        return
    return ModelSource.HF_MODEL
    config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
    if 'turbomind' in config and config['turbomind']['model_name'] != '':
        return
    assert (0), '\nCan not get model name from input model, ' \
        'please supply model name with arg --model-name,' \
        'you can list supported models by `lmdeploy list`'


def get_model_from_config(model_dir: str):
    import json
    config_file = os.path.join(model_dir, 'config.json')
    default = 'llama'
    if not os.path.exists(config_file):
        return default

    with open(config_file) as f:
        config = json.load(f)


@dataclasses.dataclass
class GenParam:
    top_p: float
    top_k: float
    temperature: float
    repetition_penalty: float
    sequence_start: bool = False
    sequence_end: bool = False
    step: int = 0
    request_output_len: int = 512
    ARCH_MAP = {
        'LlavaLlamaForCausalLM': default,
        'LlamaForCausalLM': default,
        'InternLM2ForCausalLM': 'internlm2',
        'InternLMForCausalLM': default,
        'BaiChuanForCausalLM': 'baichuan',  # Baichuan-7B
        'BaichuanForCausalLM': 'baichuan2',  # not right for Baichuan-13B-Chat
        'QWenLMHeadModel': 'qwen',
    }


def get_gen_param(cap,
                  sampling_param,
                  nth_round,
                  step,
                  request_output_len=512,
                  **kwargs):
    """return parameters used by token generation."""
    gen_param = GenParam(**dataclasses.asdict(sampling_param),
                         request_output_len=request_output_len)
    # Fix me later. turbomind.py doesn't support None top_k
    if gen_param.top_k is None:
        gen_param.top_k = 40
    arch = 'LlamaForCausalLM'
    if 'auto_map' in config:
        arch = config['auto_map']['AutoModelForCausalLM'].split('.')[-1]
    elif 'architectures' in config:
        arch = config['architectures'][0]
    if cap == 'chat':
        gen_param.sequence_start = (nth_round == 1)
        gen_param.sequence_end = False
        gen_param.step = step
    else:
        gen_param.sequence_start = True
        gen_param.sequence_end = True
        gen_param.step = 0
    return gen_param
    return ARCH_MAP[arch]
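So the new helper derives the turbomind model name directly from config.json (auto_map first, then architectures) via ARCH_MAP, replacing the removed GenParam/get_gen_param path. A hedged example with a hypothetical checkpoint directory:

# suppose /models/internlm2-chat-7b/config.json contains
#   {"architectures": ["InternLM2ForCausalLM"], ...}
name = get_model_from_config('/models/internlm2-chat-7b')
print(name)   # 'internlm2' via ARCH_MAP; a missing config.json yields 'llama'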
lmdeploy/utils.py
View file @
d7117b95
# Copyright (c) OpenMMLab. All rights reserved.
import asyncio
import functools
import json
import logging
import os
import sys
import time
from contextlib import contextmanager
from logging import Logger, LogRecord
from typing import List, Optional

from huggingface_hub import hf_hub_download

logger_initialized = {}


def get_logger(name: str,
               log_file: Optional[str] = None,
               log_level: int = logging.INFO,
               file_mode: str = 'w'):
class _ASNI_COLOR:
    BRIGHT_RED = '\033[91m'
    RED = '\033[31m'
    YELLOW = '\033[33m'
    WHITE = '\033[37m'
    GREEN = '\033[32m'


class ColorFormatter(logging.Formatter):

    _LEVELNAME_COLOR_MAP = dict(CRITICAL=_ASNI_COLOR.BRIGHT_RED,
                                ERROR=_ASNI_COLOR.RED,
                                WARN=_ASNI_COLOR.YELLOW,
                                WARNING=_ASNI_COLOR.YELLOW,
                                INFO=_ASNI_COLOR.WHITE,
                                DEBUG=_ASNI_COLOR.GREEN)
    _RESET_COLOR = '\033[0m'

    def format(self, record: LogRecord):
        """format."""
        if sys.platform == 'win32':
            # windows does not support ASNI color
            return super().format(record)
        levelname = record.levelname
        level_color = self._LEVELNAME_COLOR_MAP.get(levelname,
                                                    self._RESET_COLOR)
        levelname = f'{level_color}{levelname}{self._RESET_COLOR}'
        record.levelname = levelname
        return super().format(record)


class FilterDuplicateWarning(logging.Filter):
    """Filter the repeated warning message.

    Args:
        name (str): name of the filter.
    """

    def __init__(self, name: str = 'lmdeploy'):
        super().__init__(name)
        self.seen: set = set()

    def filter(self, record: LogRecord) -> bool:
        """Filter the repeated warning message.

        Args:
            record (LogRecord): The log record.

        Returns:
            bool: Whether to output the log record.
        """
        if record.levelno != logging.WARNING:
            return True
        if record.msg not in self.seen:
            self.seen.add(record.msg)
            return True
        return False


def get_logger(
    name: Optional[str] = None,
    log_file: Optional[str] = None,
    log_level: int = logging.INFO,
    file_mode: str = 'w',
    log_formatter: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
) -> Logger:
"""Initialize and get a logger by name.
If the logger has not been initialized, this method will initialize the
...
...
@@ -22,25 +96,10 @@ def get_logger(name: str,
log_level (int): The logger level.
file_mode (str): The file mode used in opening log file.
Defaults to 'w'.
log_formatter (str): The logger output format.
Returns:
logging.Logger: The expected logger.
"""
    # use logger in mmengine if exists.
    try:
        from mmengine.logging import MMLogger
        if MMLogger.check_instance_created(name):
            logger = MMLogger.get_instance(name)
        else:
            logger = MMLogger.get_instance(name,
                                           logger_name=name,
                                           log_file=log_file,
                                           log_level=log_level,
                                           file_mode=file_mode)
        return logger
    except Exception:
        pass

    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger
...
...
@@ -56,7 +115,7 @@ def get_logger(name: str,
        if type(handler) is logging.StreamHandler:
            handler.setLevel(logging.ERROR)

    stream_handler = logging.StreamHandler()
    stream_handler = logging.StreamHandler(stream=sys.stdout)
    handlers = [stream_handler]

    if log_file is not None:
...
...
@@ -66,14 +125,15 @@ def get_logger(name: str,
        file_handler = logging.FileHandler(log_file, file_mode)
        handlers.append(file_handler)

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    formatter = ColorFormatter(log_formatter)
    for handler in handlers:
        handler.setFormatter(formatter)
        handler.setLevel(log_level)
        handler.addFilter(FilterDuplicateWarning(name))
        logger.addHandler(handler)

    logger.setLevel(log_level)
    logger.propagate = False
    logger_initialized[name] = True

    return logger
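With the new signature, a single call configures colored console output on stdout plus the duplicate-warning filter; passing a custom log_formatter is optional. A short usage sketch:

import logging

logger = get_logger('lmdeploy', log_level=logging.INFO)
logger.warning('the same warning text is only printed once')
logger.warning('the same warning text is only printed once')  # filtered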
...
...
@@ -95,3 +155,103 @@ def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
        if response.endswith(item):
            response = response[:len(response) - len(item)]
    return response
def
_stop_words
(
stop_words
:
List
[
str
],
tokenizer
:
object
):
"""return list of stop-words to numpy.ndarray."""
import
numpy
as
np
if
stop_words
is
None
:
return
None
assert
isinstance
(
stop_words
,
List
)
and
\
all
(
isinstance
(
elem
,
str
)
for
elem
in
stop_words
),
\
f
'stop_words must be a list but got
{
type
(
stop_words
)
}
'
stop_indexes
=
[]
for
stop_word
in
stop_words
:
stop_indexes
+=
tokenizer
.
indexes_containing_token
(
stop_word
)
assert
isinstance
(
stop_indexes
,
List
)
and
all
(
isinstance
(
elem
,
int
)
for
elem
in
stop_indexes
),
'invalid stop_words'
# each id in stop_indexes represents a stop word
# refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
# detailed explanation about fastertransformer's stop_indexes
stop_word_offsets
=
range
(
1
,
len
(
stop_indexes
)
+
1
)
stop_words
=
np
.
array
([[
stop_indexes
,
stop_word_offsets
]]).
astype
(
np
.
int32
)
return
stop_words
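The returned array follows the FasterTransformer stop-word layout: shape [1, 2, N], where the first row carries token ids and the second row cumulative offsets. A tiny illustration of the shape, with made-up token ids:

import numpy as np

stop_indexes = [92542, 2]                       # hypothetical token ids
stop_word_offsets = range(1, len(stop_indexes) + 1)
stop_words = np.array([[stop_indexes, stop_word_offsets]]).astype(np.int32)
print(stop_words.shape)                         # (1, 2, 2)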
def get_hf_config_content(pretrained_model_name_or_path: str,
                          **kwargs) -> dict:
    """Get config content of a hf model."""
    if os.path.exists(pretrained_model_name_or_path):
        config_path = os.path.join(pretrained_model_name_or_path,
                                   'config.json')
    else:
        config_path = hf_hub_download(pretrained_model_name_or_path,
                                      'config.json')
    with open(config_path, 'r') as f:
        config = json.load(f)
    return config
def get_model(pretrained_model_name_or_path: str,
              download_dir: str = None,
              revision: str = None):
    """Get model from huggingface or modelscope."""
    import os
    if os.getenv('LMDEPLOY_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download
    else:
        from huggingface_hub import snapshot_download

    download_kwargs = {}
    if download_dir is not None:
        download_kwargs['cache_dir'] = download_dir
    if revision is not None:
        download_kwargs['revision'] = revision

    model_path = snapshot_download(pretrained_model_name_or_path,
                                   **download_kwargs)
    return model_path
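Whether snapshot_download resolves against the Hugging Face Hub or ModelScope is decided solely by the LMDEPLOY_USE_MODELSCOPE environment variable, so the call site stays the same either way. For example (repo id and paths are illustrative):

# export LMDEPLOY_USE_MODELSCOPE=True to pull from modelscope instead
local_dir = get_model('internlm/internlm2-chat-7b',
                      download_dir='/tmp/lmdeploy_models',
                      revision='main')
print(local_dir)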
def logging_timer(op_name: str, logger: Logger, level: int = logging.DEBUG):
    """logging timer."""

    @contextmanager
    def __timer():
        """timer."""
        start = time.perf_counter()
        yield
        end = time.perf_counter()
        duration = (end - start) * 1000
        logger.log(level, f'<{op_name}> take time: {duration:.2f} ms')

    def __inner(func):
        """inner."""

        @functools.wraps(func)
        def __func_warpper(*args, **kwargs):
            """func warpper."""
            if logger.level > level:
                return func(*args, **kwargs)
            with __timer():
                return func(*args, **kwargs)

        @functools.wraps(func)
        def __async_warpper(*args, **kwargs):
            """async warpper."""

            async def __tmp():
                if logger.level > level:
                    return (await func(*args, **kwargs))
                with __timer():
                    return (await func(*args, **kwargs))

            return __tmp()

        if asyncio.iscoroutinefunction(func):
            return __async_warpper
        else:
            return __func_warpper

    return __inner
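logging_timer wraps both plain functions and coroutines, and it skips the timing context entirely whenever the logger level is above the requested level. A hedged usage sketch, assuming get_logger and logging_timer from this module are importable:

import logging

logger = get_logger('lmdeploy', log_level=logging.DEBUG)

@logging_timer('forward', logger, logging.DEBUG)
def forward_once():
    # dummy workload, only for illustration
    return sum(range(1_000_000))

forward_once()   # logs '<forward> take time: x.xx ms' at DEBUG level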
lmdeploy/version.py
View file @
d7117b95
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

__dcu_version__ = '0.1.0'
__version__ = '0.1.0'
__dcu_version__ = '0.2.6'
__version__ = '0.2.6'

short_version = __version__
...
...
requirements/docs.txt
View file @
d7117b95
...
...
@@ -3,9 +3,10 @@ m2r==0.2.1
markdown>=3.4.0
mistune==0.8.4
myst-parser
-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
-e git+https://github.com/InternLM/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
recommonmark
sphinx==4.0.2
sphinx-copybutton
sphinx-tabs
sphinx_markdown_tables>=0.0.16
sphinxcontrib-mermaid