ModelZoo / Qwen-7B_fastllm / Commits / d8b9f285

Commit d8b9f285, authored Oct 26, 2023 by zhouxiang
Update: fix an occasional issue where generation keeps streaming output without stopping
parent a9bd16ac

Showing 6 changed files with 281 additions and 159 deletions (+281 / -159)
README.md                                     +1    -1
cli_demo.py                                   +1    -1
package/fastllm_pytools/hf_model.py           +57   -56
package/fastllm_pytools/libfastllm_tools.so   +0    -0
package/fastllm_pytools/llm.py                +194  -87
package/fastllm_pytools/torch2flm.py          +28   -14
README.md

...
@@ -3,7 +3,7 @@
 ## Model Introduction
 Tongyi Qianwen-7B (Qwen-7B) is the 7-billion-parameter model in the Tongyi Qianwen series of large models developed by Alibaba Cloud. Qwen-7B is a Transformer-based large language model trained on very large-scale pretraining data. The pretraining data is diverse and broad in coverage, including large amounts of web text, professional books, code, and more. On top of Qwen-7B, an alignment mechanism was used to build Qwen-7B-Chat, an AI assistant based on the large language model.
-This project focuses on optimizing the inference performance of Qwen-7B-Chat on the DCU platform, to deliver fast conversational responses on DCU.)
+This project focuses on optimizing the inference performance of Qwen-7B-Chat on the DCU platform, to deliver fast conversational responses on DCU.
 ## Model Structure
...
cli_demo.py

...
@@ -24,7 +24,7 @@ if __name__ == "__main__":
             continue
         print("AI:", end = "")
         curResponse = ""
-        for response in model.stream_response(query, history = history):
+        for response in model.stream_response(query, history = history, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0):
             curResponse += response
             print(response, flush = True, end = "")
         history.append((query, curResponse))
\ No newline at end of file
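For context, a minimal usage sketch of the fastllm_pytools API touched by this commit (llm.from_hf plus the stream_response sampling parameters that cli_demo.py now passes explicitly). This sketch is not part of the repository; the Hugging Face model path, the trust_remote_code flags, and the dtype are illustrative assumptions.

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from fastllm_pytools import llm

    # Load a local copy of Qwen-7B-Chat (the path is an assumption).
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code = True)
    hf_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code = True).eval()

    # Convert the HF weights into a fastllm model; "float16" is the default dtype in from_hf,
    # and "int4" also appears in fastllm_data_type_dict in hf_model.py.
    model = llm.from_hf(hf_model, tokenizer, dtype = "float16")

    # Stream a reply with the same explicit sampling parameters that cli_demo.py now uses.
    for piece in model.stream_response("你好", history = [], do_sample = True, top_p = 0.8,
                                       top_k = 1, temperature = 1.0, repeat_penalty = 1.0):
        print(piece, end = "", flush = True)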
package/fastllm_pytools/hf_model.py

-from fastllm_pytools import llm
-import torch
-import ctypes
-import numpy as np
+from fastllm_pytools import llm;
+import torch;
+import ctypes;
+import numpy as np;

 fastllm_data_type_dict = {
     "int4": 8,
...
@@ -22,60 +22,61 @@ def create(model,
            history_sep = None,
            dtype = "float16"):
     if (dtype not in fastllm_data_type_dict):
-        print("dtype should in ", list(fastllm_data_type_dict.keys()))
-        exit(0)
+        print("dtype should in ", list(fastllm_data_type_dict.keys()));
+        exit(0);

     # 0.1 model info
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
     if (pre_prompt):
-        modelInfo["pre_prompt"] = pre_prompt
+        modelInfo["pre_prompt"] = pre_prompt;
     if (user_role):
-        modelInfo["user_role"] = user_role
+        modelInfo["user_role"] = user_role;
     if (bot_role):
-        modelInfo["bot_role"] = bot_role
+        modelInfo["bot_role"] = bot_role;
     if (history_sep):
-        modelInfo["history_sep"] = history_sep
+        modelInfo["history_sep"] = history_sep;
     if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
         # Baichuan 2
-        modelInfo["use_alibi"] = "1"
-        modelInfo["pre_prompt"] = ""
-        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
-        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
-        modelInfo["history_sep"] = ""
+        modelInfo["use_alibi"] = "1";
+        modelInfo["pre_prompt"] = "";
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else "";
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
+        modelInfo["history_sep"] = "";
     if (modelInfo["model_type"] == "qwen"):
         if modelInfo["chat_format"] == "chatml":
             modelInfo["im_end_id"] = tokenizer.im_end_id
             modelInfo["im_start_id"] = tokenizer.im_start_id

-    weight_type_dict = {}
-    module_dict = {}
-    weight_bits = {}
+    weight_type_dict = {};
+    module_dict = {};
+    weight_bits = {};
     for key, m in model.named_modules():
         if (str(type(m)).find("QuantizedLinear") != -1):
-            weight_type_dict[key + ".weight"] = "QuantizedLinear"
-            weight_bits[key + ".weight"] = m.weight_bit_width
+            weight_type_dict[key + ".weight"] = "QuantizedLinear";
+            weight_bits[key + ".weight"] = m.weight_bit_width;
         if (isinstance(m, torch.nn.Linear)):
-            weight_type_dict[key + ".weight"] = "linear"
-            module_dict[key + ".weight"] = m
+            weight_type_dict[key + ".weight"] = "linear";
+            module_dict[key + ".weight"] = m;
         if (isinstance(m, torch.nn.Embedding)):
-            weight_type_dict[key] = "embedding"
+            weight_type_dict[key] = "embedding";

     peft_config = {}
     active_adapter = ""
     if hasattr(model, "peft_config"):
         peft_config = model.peft_config
-    if hasattr(model, "active_adapter"):
+    if hasattr(model, "active_adapter") and isinstance(model.active_adapter, str):
+        # in transformers >= 4.33.0, active_adapter is a function in model, ignore it now
         active_adapter = model.active_adapter

-    model = model.cpu()
-    dict = model.state_dict()
-    model_type = model.config.__dict__["model_type"]
-    model = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
+    model = model.cpu();
+    dict = model.state_dict();
+    model_type = model.config.__dict__["model_type"];
+    model = llm.fastllm_lib.create_empty_llm_model(model_type.encode());
     for it in modelInfo.keys():
-        llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode())
+        llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode());

     for adapter_name in peft_config.keys():
         adapter_dict = peft_config[adapter_name].__dict__
...
@@ -90,39 +91,39 @@ def create(model,
     if modelInfo["model_type"] == "qwen":
         pass
     else:
-        tokenizer = tokenizer.tokenizer
+        tokenizer = tokenizer.tokenizer;
     if (hasattr(tokenizer, "sp_model")):
-        piece_size = tokenizer.sp_model.piece_size()
+        piece_size = tokenizer.sp_model.piece_size();
         for i in range(piece_size):
             llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
-                                                         i, ctypes.c_float(tokenizer.sp_model.get_score(i)))
+                                                         i, ctypes.c_float(tokenizer.sp_model.get_score(i)));
     else:
-        vocab = tokenizer.get_vocab()
+        vocab = tokenizer.get_vocab();
         for v in vocab.keys():
             if (modelInfo["model_type"] == "moss"):
-                vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
-                llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0))
+                vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v];
+                llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0));
             elif (modelInfo["model_type"] == "qwen"):
-                llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0))
+                llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0));
             else:
-                llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0))
-    tot = 0
+                llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0));
+    tot = 0;
     for key in dict:
-        ori_data_type = 0
-        ori_np_data_type = np.float32
-        cur_weight_type = 0
+        ori_data_type = 0;
+        ori_np_data_type = np.float32;
+        cur_weight_type = 0;
         if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
-            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
-        to_data_type = 0
+            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]];
+        to_data_type = 0;
         if (cur_weight_type == 1):
-            to_data_type = fastllm_data_type_dict[dtype]
+            to_data_type = fastllm_data_type_dict[dtype];
             if (to_data_type == 7):
-                ori_data_type = 7
-                ori_np_data_type = np.float16
+                ori_data_type = 7;
+                ori_np_data_type = np.float16;
         elif (cur_weight_type == 2):
             # TODO bfloat
-            to_data_type = 0
+            to_data_type = 0;

         weight_name = key
         if peft_config is not None:
...
@@ -133,19 +134,19 @@ def create(model,
                                                  (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
                                                  weight_bits[key],
                                                  dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
-                                                 dict[key].numpy().ctypes.data_as(ctypes.c_void_p))
+                                                 dict[key].numpy().ctypes.data_as(ctypes.c_void_p));
         else:
             llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(), len(dict[key].shape),
                                                  (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
                                                  to_data_type, cur_weight_type, ori_data_type,
-                                                 dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
-        tot += 1
-        print("convert (", tot, "/", len(dict), end = " )\r")
-    print("")
-    llm.fastllm_lib.init_params_llm_model(model)
-    llm.fastllm_lib.warmup_llm_model(model)
-    ret = llm.model("", id = model)
-    return ret
+                                                 dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p));
+        tot += 1;
+        print("convert (", tot, "/", len(dict), end = " )\r");
+    print("");
+    llm.fastllm_lib.init_params_llm_model(model);
+    llm.fastllm_lib.warmup_llm_model(model);
+    ret = llm.model("", id = model);
+    return ret;
package/fastllm_pytools/libfastllm_tools.so

No preview for this file type.
package/fastllm_pytools/llm.py

-import ctypes
-import os
-from typing import Optional, Tuple, Union, List, Callable, Dict, Any
+import ctypes;
+import math
+import os;
+import threading
+from typing import Optional, Tuple, Union, List, Callable, Dict, Any;
 import platform
 if platform.system() == 'Windows':
...
@@ -11,6 +13,12 @@ else:
 fastllm_lib.create_llm_model.argtypes = [ctypes.c_char_p]
 fastllm_lib.create_llm_model.restype = ctypes.c_int

+fastllm_lib.token_decode.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_char_p]
+fastllm_lib.token_decode.restype = ctypes.c_int
+
+fastllm_lib.token_encode_string.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
+fastllm_lib.token_encode_string.restype = ctypes.c_int
+
+fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_bool]
...
@@ -46,106 +54,178 @@ fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_
 fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]

 def set_cpu_threads(threads: int):
-    fastllm_lib.set_cpu_threads(threads)
+    fastllm_lib.set_cpu_threads(threads);

 def get_cpu_threads() -> int:
-    return fastllm_lib.get_cpu_threads()
+    return fastllm_lib.get_cpu_threads();

 def print_ins_info():
-    fastllm_lib.print_cpu_ins()
+    fastllm_lib.print_cpu_ins();

 def set_cpu_kvcache(cpu_kvcache):
-    fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))
+    fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache));

 def get_cpu_kvcache():
-    return fastllm_lib.get_kvcache_in_cpu()
+    return fastllm_lib.get_kvcache_in_cpu();

 def set_cpu_low_mem(low_mem):
-    fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))
+    fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem));

 def get_cpu_low_mem():
-    return fastllm_lib.get_cpu_low_mem()
+    return fastllm_lib.get_cpu_low_mem();

 def set_device_map(device_map):
-    devices = []
-    values = []
+    devices = [];
+    values = [];
     if (isinstance(device_map, str)):
-        devices.append(device_map)
-        values.append(1)
+        devices.append(device_map);
+        values.append(1);
     elif (isinstance(device_map, list)):
-        devices = [str(x) for x in device_map]
-        values = [1 for x in device_map]
+        devices = [str(x) for x in device_map];
+        values = [1 for x in device_map];
     elif (isinstance(device_map, dict)):
-        devices = [str(x) for x in device_map.keys()]
-        values = [int(device_map[x]) for x in device_map.keys()]
+        devices = [str(x) for x in device_map.keys()];
+        values = [int(device_map[x]) for x in device_map.keys()];
     else:
-        print("set_device_map error.")
-        return
-    device_str = ''.join(devices)
-    device_len = [len(x) for x in devices]
+        print("set_device_map error.");
+        return;
+    device_str = ''.join(devices);
+    device_len = [len(x) for x in devices];
     fastllm_lib.set_device_map(len(device_len), (ctypes.c_int * len(device_len))(*device_len), device_str.encode(),
-                               (ctypes.c_int * len(values))(*values))
+                               (ctypes.c_int * len(values))(*values));

 def from_hf(model, tokenizer = None, dtype = "float16"):
-    from fastllm_pytools import hf_model
-    return hf_model.create(model, tokenizer, dtype = dtype)
+    from fastllm_pytools import hf_model;
+    return hf_model.create(model, tokenizer, dtype = dtype);

 class model:
     def __init__(self, path: str, id: int = -99999):
         if (id != -99999):
-            self.model = id
+            self.model = id;
         else:
-            self.model = fastllm_lib.create_llm_model(path.encode())
-        self.direct_query = False
+            self.model = fastllm_lib.create_llm_model(path.encode());
+        self.direct_query = False;
+
+        # Thread-local object pool, used to avoid repeatedly allocating and freeing buffer objects
+        self.thread_local_obj = threading.local()
+        self.thread_local_obj.tokenizer_encode_string__output_buffer = None
+        self.thread_local_obj.tokenizer_decode_token__output_buffer = None
+
+        # Static cache for tokenizer_decode_token results, built manually on demand.
+        # Since the number of tokens is limited and not too large, caching the results to reduce calls is a good fit.
+        # It is not built automatically, to avoid locking the cache dict in multi-threaded calls and to leave room for different usage scenarios.
+        self.tokenizer_decode_token_cache = None

     def get_prompt(self, query: str, history: List[Tuple[str, str]] = None) -> str:
         if (not(history)):
-            history = []
-        prompt = ""
+            history = [];
+        prompt = "";
         for i, (old_query, response) in enumerate(history):
-            prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
-        prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
-        return prompt
+            prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode();
+        prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode();
+        return prompt;

     def save(self, path: str):
-        fastllm_lib.save_llm_model(self.model, path.encode())
+        fastllm_lib.save_llm_model(self.model, path.encode());

     def eval(self):
-        pass
+        pass;

+    def build_tokenizer_decode_token_cache(self):
+        if self.tokenizer_decode_token_cache is not None:
+            return
+        cache_dict = dict()
+        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
+        for token_id in range(vocab_size):
+            cache_dict[token_id] = self.tokenizer_decode_token(token_id)
+        self.tokenizer_decode_token_cache = cache_dict
+
+    def tokenizer_encode_string(self, content: str) -> List[int]:
+        output_buffer_init_len = 1024
+        if self.thread_local_obj.tokenizer_encode_string__output_buffer is None:
+            self.thread_local_obj.tokenizer_encode_string__output_buffer = (ctypes.c_int * output_buffer_init_len)()
+        buffer = self.thread_local_obj.tokenizer_encode_string__output_buffer
+        buffer_len = len(buffer)
+        result_len = fastllm_lib.token_encode_string(self.model, content.encode(), buffer_len, buffer)
+        if result_len > buffer_len:
+            if result_len > 10240:
+                # The data to process is very long; use a one-off buffer
+                temp_buffer = (ctypes.c_int * result_len)()
+                ret = fastllm_lib.token_encode_string(self.model, content.encode(), result_len, temp_buffer)
+                return [i for i in temp_buffer]
+            else:
+                # Grow the buffer
+                new_buffer_len = round(math.ceil(result_len / 1024.0)) * 1024
+                buffer = (ctypes.c_int * new_buffer_len)()
+                self.thread_local_obj.tokenizer_encode_string__output_buffer = buffer
+                result_len = fastllm_lib.token_encode_string(self.model, content.encode(), new_buffer_len, buffer)
+        return [buffer[i] for i in range(result_len)]
+
+    def tokenizer_decode_token(self, token_id: int) -> bytes:
+        if self.tokenizer_decode_token_cache is not None:
+            cache_result = self.tokenizer_decode_token_cache.get(token_id)
+            if cache_result is not None:
+                return cache_result
+        output_buffer_init_len = 256
+        if self.thread_local_obj.tokenizer_decode_token__output_buffer is None:
+            self.thread_local_obj.tokenizer_decode_token__output_buffer = ctypes.create_string_buffer(output_buffer_init_len)
+        buffer = self.thread_local_obj.tokenizer_decode_token__output_buffer
+        ret = fastllm_lib.token_decode(self.model, token_id, len(buffer), buffer)
+        if ret > 0:
+            # The buffer is too small; grow it
+            new_buffer_len = round(math.ceil(ret / 16.0)) * 16
+            buffer = ctypes.create_string_buffer(new_buffer_len)
+            self.thread_local_obj.tokenizer_decode_token__output_buffer = buffer
+            ret = fastllm_lib.token_decode(self.model, token_id, len(buffer), buffer)
+            assert ret == 0
+        buffer_bytes = buffer.raw
+        result_len = len(buffer_bytes)
+        for i in range(len(buffer_bytes)):
+            if buffer_bytes[i] == 0:
+                result_len = i
+                break
+        return buffer_bytes[:result_len]

     def response_logits(self, query: str, history: List[Tuple[str, str]] = None, tokenizer = None) -> str:
-        prompt = query if self.direct_query else self.get_prompt(query, history)
+        prompt = query if self.direct_query else self.get_prompt(query, history);
         if (tokenizer == None):
             handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
-                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
+                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True));
         else:
-            input = tokenizer.encode(prompt)
+            input = tokenizer.encode(prompt);
             handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
-                                                           1, False, 1, 1, 1, 1, True)
-        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
+                                                           1, False, 1, 1, 1, 1, True);
+        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model);
         logits = list(range(vocab_size))
-        array = (ctypes.c_float * (vocab_size * 4))(*logits)
-        ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
-        out = list(array)[:vocab_size]
+        array = (ctypes.c_float * (vocab_size * 4))(*logits);
+        ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
+        out = list(array)[:vocab_size];
         while (ret != -1):
-            ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
-        return out
+            ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
+        return out;

     def response(self, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
-        ret = ""
+        ret = "";
         for i in self.stream_response(query = query, history = history, max_length = max_length,
...
@@ -154,84 +234,111 @@ class model:
                                       temperature = temperature, repeat_penalty = repeat_penalty, one_by_one = True):
-            ret += i
-        return ret
+            ret += i;
+        return ret;

     def stream_response(self, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, one_by_one = True):
-        prompt = query if self.direct_query else self.get_prompt(query, history)
+        prompt = query if self.direct_query else self.get_prompt(query, history);
         handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
-                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
-        res = ""
-        ret = b''
-        fail_cnt = 0
+                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False));
+        res = "";
+        ret = b'';
+        fail_cnt = 0;
         while True:
-            ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
-            cur = ""
+            ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle);
+            cur = "";
             try:
-                cur = ret.decode()
-                ret = b''
+                cur = ret.decode();
+                ret = b'';
             except:
-                fail_cnt += 1
+                fail_cnt += 1;
                 if (fail_cnt == 20):
-                    break
+                    break;
                 else:
-                    continue
-            fail_cnt = 0
+                    continue;
+            fail_cnt = 0;
+            if (cur == "<flmeos>"):
+                break;
+            if one_by_one:
+                yield cur;
+            else:
+                res += cur;
+                yield res;

+    def stream_response_raw(self, input_tokens: List[int], max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, one_by_one = True):
+        handle = fastllm_lib.launch_response_llm_model(self.model, len(input_tokens), (ctypes.c_int * len(input_tokens))(*input_tokens),
+                                                       ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
+                                                       ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
+
+        # A long-tail character may need several tokens to be generated, so only bytes are returned and the
+        # string.decode strategy is left to the caller. This also makes it easy to count output tokens and to
+        # control how decoding is handled when the UTF-8 sequence is still incomplete.
+        total_bytes = b''
+        while True:
+            cur_token = fastllm_lib.fetch_response_llm_model(self.model, handle)
+            if cur_token == -1:
+                break
+            cur_bytes = self.tokenizer_decode_token(cur_token)
             if one_by_one:
-                yield cur
+                yield cur_bytes
             else:
-                res += cur
-                yield res
+                total_bytes += cur_bytes
+                yield total_bytes

     def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
         if (not(history)):
-            history = []
-        prompt = query if self.direct_query else self.get_prompt(query, history)
-        input = tokenizer.encode(prompt)
+            history = [];
+        prompt = query if self.direct_query else self.get_prompt(query, history);
+        input = tokenizer.encode(prompt);
         handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
-                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False)
-        result = []
+                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False);
+        result = [];
         while True:
-            cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
+            cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
             if (cur == -1):
-                break
-            result.append(cur)
-        response = tokenizer.decode(result)
-        history = history + [(query, response)]
-        return response, history
+                break;
+            result.append(cur);
+        response = tokenizer.decode(result);
+        history = history + [(query, response)];
+        return response, history;

     def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None, max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, return_past_key_values = False, **kwargs) -> str:
         if (not(history)):
-            history = []
-        prompt = query if self.direct_query else self.get_prompt(query, history)
-        input = tokenizer.encode(prompt)
+            history = [];
+        prompt = query if self.direct_query else self.get_prompt(query, history);
+        input = tokenizer.encode(prompt);
         handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
-                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False)
-        tokens = []
+                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False);
+        tokens = [];
         while True:
-            cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
+            cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
             if (cur == -1):
-                break
-            tokens.append(cur)
-        response = tokenizer.decode(tokens)
-        new_history = history + [(query, response)]
+                break;
+            tokens.append(cur);
+        response = tokenizer.decode(tokens);
+        new_history = history + [(query, response)];
         if return_past_key_values:
-            yield response, new_history, None
+            yield response, new_history, None;
         else:
-            yield response, new_history
+            yield response, new_history;

     def set_adapter(self, name: str):
         fastllm_lib.set_adapter(self.model, str(name).encode())
...
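The new stream_response_raw above yields raw bytes per generated token because, as its comment notes, a single character can span several tokens; decoding is deliberately left to the caller. A minimal, hypothetical caller illustrating that contract (the helper name and the incremental-decode policy below are illustrative assumptions, not code from this repository):

    def print_stream(model, input_tokens):
        # Accumulate bytes until they form valid UTF-8, then print and reset.
        pending = b""
        for chunk in model.stream_response_raw(input_tokens, one_by_one = True):
            pending += chunk
            try:
                text = pending.decode("utf-8")
                pending = b""
                print(text, end = "", flush = True)
            except UnicodeDecodeError:
                # Incomplete multi-byte character; wait for the next token's bytes.
                continue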
package/fastllm_pytools/torch2flm.py

...
@@ -21,8 +21,8 @@ fastllm_weight_type_dict = {
     "embedding": 2
 }
-v = np.random.randint(-127, 127, [10, 20])
-temp = v
+v = np.random.randint(-127, 127, [10, 20]);
+temp = v;
 c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
 c_scale = c_max / 127.0
 v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
...
@@ -34,23 +34,31 @@ def write_int8(fo, v):
     fo.write(struct.pack('i', 3))
     fo.write(struct.pack('i', 0))
     for i in range(c_max.shape[0]):
-        fo.write(struct.pack('f', -c_max[i][0]))
-        fo.write(struct.pack('f', c_max[i][0]))
+        fo.write(struct.pack('f', -c_max[i][0]));
+        fo.write(struct.pack('f', c_max[i][0]));
     fo.write(v.data)

 def write_int4(fo, v):
-    c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
-    c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
-    c_scale = c_max / 7.0
-    c_min = c_scale * -8.0
+    # c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
+    # c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
+    # c_scale = c_max / 7.0
+    # c_min = c_scale * -8.0
+    c_min = np.expand_dims(v.min(axis = -1), -1)
+    c_max = np.expand_dims(v.max(axis = -1), -1)
+    c_scale = (c_max - c_min) / 15.0
+    c_zero = np.round(0.0 - c_min / c_scale)
+    c_zero = c_zero.clip(0, 15)
+    c_min = -c_scale * c_zero
     v = (v - c_min) / c_scale
     v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
     v = v[:, 0::2] * 16 + v[:, 1::2]
     fo.write(struct.pack('i', 8))
     fo.write(struct.pack('i', 0))
     for i in range(c_min.shape[0]):
-        fo.write(struct.pack('f', c_min[i][0]))
-        fo.write(struct.pack('f', c_max[i][0]))
+        fo.write(struct.pack('f', c_min[i][0]));
+        fo.write(struct.pack('f', c_max[i][0]));
     fo.write(v.data)

 def tofile(exportPath,
...
@@ -91,8 +99,14 @@ def tofile(exportPath,
         # Baichuan 2
         modelInfo["use_alibi"] = "1"
         modelInfo["pre_prompt"] = ""
-        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
-        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + ">") if hasattr(model.generation_config, "user_token_id") else "";
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
         modelInfo["history_sep"] = ""
+    if (modelInfo["model_type"] == "baichuan" and modelInfo["vocab_size"] == 125696):
+        # Baichuan 2, 7B
+        modelInfo["pre_prompt"] = ""
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + ">") if hasattr(model.generation_config, "user_token_id") else "";
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
+        modelInfo["history_sep"] = ""
     if modelInfo["model_type"] == "qwen":
         if modelInfo["chat_format"] == "chatml":
...
@@ -140,10 +154,10 @@ def tofile(exportPath,
         for v in vocab.keys():
-            if (modelInfo['model_type'] == "qwen"):
-                s = v
-            elif (modelInfo["model_type"] == "moss"):
-                s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
-            else:
-                s = v.encode()
+            if (modelInfo["model_type"] == "moss"):
+                s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
+            else:
+                s = v.encode()
             fo.write(struct.pack('i', len(s)))
             for c in s:
                 fo.write(struct.pack('i', c))
...
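The write_int4 change above replaces the earlier symmetric scheme (scale = max(|v|)/7, min = -8·scale) with per-row asymmetric min/max quantization over 16 levels and a zero point clipped to [0, 15]. Below is a standalone numpy sketch of that scheme for illustration only; the function names and the round-trip check are assumptions, not code from this repository.

    import numpy as np

    def quantize_int4_rows(v):
        # Per-row asymmetric quantization, mirroring the math in the new write_int4.
        c_min = np.expand_dims(v.min(axis = -1), -1)
        c_max = np.expand_dims(v.max(axis = -1), -1)
        c_scale = (c_max - c_min) / 15.0
        c_zero = np.round(0.0 - c_min / c_scale).clip(0, 15)
        c_min = -c_scale * c_zero          # re-derive min so that 0.0 lands exactly on an integer level
        q = np.clip((v - c_min) / c_scale + 0.5, 0, 15).astype(np.uint8)   # clip before casting for safety
        return q, c_min, c_scale

    def dequantize_int4_rows(q, c_min, c_scale):
        return q.astype(np.float32) * c_scale + c_min

    w = np.random.randn(4, 16).astype(np.float32)
    q, c_min, c_scale = quantize_int4_rows(w)
    print(np.abs(dequantize_int4_rows(q, c_min, c_scale) - w).max())   # error is at most about c_scale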