OpenDAS / fastllm · Commits

Commit 56215723, authored Jan 31, 2024 by zhouxiang
Parent: 44be91d3

1. Sync with the latest upstream version; 2. add a batch inference interface; 3. fix a memory leak; 4. fix choppy streaming output for the llama model family.
Showing 3 changed files with 130 additions and 8 deletions:

tools/scripts/qwen2flm.py  +4 -3
tools/scripts/setup.py     +2 -2
tools/src/pytools.cpp      +124 -3
tools/scripts/qwen2flm.py

@@ -4,9 +4,10 @@ from transformers.generation import GenerationConfig
 from fastllm_pytools import torch2flm
 
 if __name__ == "__main__":
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True, fp32=True).eval()
-    model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)  # generation length, top_p, and other hyperparameters can be set here
+    model_path = sys.argv[3] if len(sys.argv) >= 4 else "Qwen/Qwen-7B-Chat"
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True, fp32=True).eval()
+    model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)  # generation length, top_p, and other hyperparameters can be set here
     dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
     exportPath = sys.argv[1] if len(sys.argv) >= 2 else "qwen-7b-" + dtype + ".flm"
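With this change the export script reads an optional third command-line argument for the model path, falling back to Qwen/Qwen-7B-Chat; the first two arguments remain the output path and the dtype. An illustrative invocation (paths and output name are placeholders):

    python3 qwen2flm.py qwen-7b-float16.flm float16 /path/to/local/Qwen-7B-Chat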
tools/scripts/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup (
     name = "fastllm_pytools",
-    version = "0.0.1",
+    version = "0.1.0",
     author = "huangyuyang",
     author_email = "ztxz16@foxmail.com",
     description = "Fastllm pytools",
@@ -10,6 +10,6 @@ setup (
     packages = ['fastllm_pytools'],
     package_data = {
-        '': ['*.dll', '*.so']
+        '': ['*.dll', '*.so', '*.dylib']
     }
 )
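The added '*.dylib' pattern lets the package bundle macOS shared libraries alongside the Windows '*.dll' and Linux '*.so' binaries it already shipped.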
tools/src/pytools.cpp

@@ -71,7 +71,7 @@ extern "C" {
     DLL_EXPORT char *string_to_chars(const std::string &s) {
         char *svalue = new char[s.size() + 1];
         memcpy(svalue, s.data(), s.size());
         svalue[s.size()] = 0;
         return svalue;
     }
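Because string_to_chars returns a fresh heap allocation, every string the C API hands out is owned by the caller. The freeChars and freeCharArray exports added later in this commit give bindings a matching way to release that memory, which is likely the memory-leak fix named in the commit message.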
@@ -117,6 +117,34 @@ extern "C" {
         return;
     }
 
+    DLL_EXPORT int token_decode(int modelId, int tokenId, int output_buffer_len, char *output_buffer) {
+        // Returns 0 on success; if the output buffer is too small, returns the number of bytes required for the output, including the trailing '\0'
+        if (tokenId == -1) {
+            output_buffer[0] = '\0';
+            return 0;
+        }
+        auto model = models.GetModel(modelId);
+        std::string s = model->weight.tokenizer.DecodeTokens(std::vector <int> {tokenId});
+        if (s.length() + 1 > output_buffer_len) {
+            return (int)s.length() + 1;
+        }
+        memcpy(output_buffer, s.c_str(), s.length() + 1);
+        return 0;
+    }
+
+    DLL_EXPORT int token_encode_string(int modelId, char *content, int output_buffer_len, int *output_buffer) {
+        // Returns the number of tokens written to output_buffer; when the buffer is too small, only the part that fits is written
+        auto model = models.GetModel(modelId);
+        auto v = model->weight.tokenizer.Encode(content);
+        for (int i = 0; i < v.Count(0); i++) {
+            if (i >= output_buffer_len) {
+                break;
+            }
+            output_buffer[i] = (int)((float *)v.cpuData)[i];
+        }
+        return (int)v.Count(0);
+    }
+
     DLL_EXPORT void add_dict_llm_model(int modelId, char *key, char *value) {
         auto model = models.GetModel(modelId);
         model->weight.AddDict(key, value);
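On the binding side, token_decode's contract (0 on success, otherwise the byte count the caller must allocate) lends itself to a retry pattern. A minimal ctypes sketch, assuming the tools library is built as libfastllm_tools.so and that a model id has already been obtained; both names are assumptions, not part of this diff:

    import ctypes

    # Assumed library name; adjust to the actual build artifact (.dll/.so/.dylib).
    lib = ctypes.cdll.LoadLibrary("./libfastllm_tools.so")
    lib.token_decode.restype = ctypes.c_int
    lib.token_decode.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_char_p]

    def decode_token(model_id: int, token_id: int) -> str:
        # First try a small buffer; a positive return value is the exact size
        # (including the trailing NUL) needed for a second, successful call.
        buf = ctypes.create_string_buffer(64)
        needed = lib.token_decode(model_id, token_id, 64, buf)
        if needed > 0:
            buf = ctypes.create_string_buffer(needed)
            lib.token_decode(model_id, token_id, needed, buf)
        return buf.value.decode("utf-8", errors="ignore")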
@@ -141,6 +169,11 @@ extern "C" {
         return;
     }
 
+    DLL_EXPORT void release_memory(int modelId) {
+        auto model = models.GetModel(modelId);
+        model->weight.ReleaseWeight();
+        return;
+    }
     DLL_EXPORT void init_params_llm_model(int modelId) {
         auto model = models.GetModel(modelId);
         model->InitParams();
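The new release_memory export gives a binding an explicit way to free a model's weights once it is done with them, instead of holding them for the life of the process; it is likely another part of the memory-leak fix named in the commit message.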
@@ -207,7 +240,8 @@ extern "C" {
     DLL_EXPORT int launch_response_str_llm_model(int modelId, char *content,
                                                  int max_length, bool do_sample, float top_p, int top_k,
-                                                 float temperature, float repeat_penalty, bool output_logits) {
+                                                 float temperature, float repeat_penalty, bool output_logits,
+                                                 int stop_token_len, int *stop_token_ids) {
         auto model = models.GetModel(modelId);
         std::vector <int> tokens;
         auto v = model->weight.tokenizer.Encode(content);
@@ -215,6 +249,10 @@ extern "C" {
             tokens.push_back((int)((float *)v.cpuData)[i]);
         }
         auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
+        for (int i = 0; i < stop_token_len; i++) {
+            config.stop_token_ids.insert(stop_token_ids[i]);
+        }
         return model->LaunchResponseTokens(tokens, config);
     }
@@ -227,12 +265,17 @@ extern "C" {
     DLL_EXPORT int launch_response_llm_model(int modelId, int len, int *values,
                                              int max_length, bool do_sample, float top_p, int top_k,
-                                             float temperature, float repeat_penalty, bool output_logits) {
+                                             float temperature, float repeat_penalty, bool output_logits,
+                                             int stop_token_len, int *stop_token_ids) {
         std::vector <int> input;
         for (int i = 0; i < len; i++) {
             input.push_back(values[i]);
         }
         auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
+        for (int i = 0; i < stop_token_len; i++) {
+            config.stop_token_ids.insert(stop_token_ids[i]);
+        }
         auto model = models.GetModel(modelId);
         return model->LaunchResponseTokens(input, config);
     }
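Both launch entry points now take a stop-token list as a trailing (length, pointer) pair, so existing bindings must pass two extra arguments. A hedged ctypes sketch continuing the assumptions above; the prompt, sampling values, and stop ids are illustrative only:

    lib.launch_response_str_llm_model.restype = ctypes.c_int
    lib.launch_response_str_llm_model.argtypes = [
        ctypes.c_int, ctypes.c_char_p,                              # modelId, content
        ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,  # max_length, do_sample, top_p, top_k
        ctypes.c_float, ctypes.c_float, ctypes.c_bool,              # temperature, repeat_penalty, output_logits
        ctypes.c_int, ctypes.POINTER(ctypes.c_int),                 # stop_token_len, stop_token_ids (new)
    ]

    stop_ids = (ctypes.c_int * 2)(2, 13)  # hypothetical token ids; the real ones are model-dependent
    handle = lib.launch_response_str_llm_model(
        model_id, b"Hello", 256, True, 0.8, 1, 1.0, 1.0, False,
        len(stop_ids), stop_ids)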
@@ -251,4 +294,82 @@ extern "C" {
         }
         return ret;
     }
+
+    DLL_EXPORT char *get_llm_model_type(int modelId) {
+        auto model = models.GetModel(modelId);
+        return string_to_chars(model->model_type);
+    }
+
+    char **convertToCharArray(const std::vector <std::string> &strings) {
+        // Allocate the char** array
+        char **charArray = new char *[strings.size()];
+        // Walk the std::vector <std::string>
+        for (size_t i = 0; i < strings.size(); i++) {
+            // Get the current string
+            const std::string &str = strings[i];
+            // Allocate memory and copy the string contents
+            charArray[i] = new char[str.length() + 1];
+            std::strcpy(charArray[i], str.c_str());
+        }
+        return charArray;
+    }
+
+    DLL_EXPORT void freeCharArray(char **charArray, size_t size) {
+        // Free each string
+        for (size_t i = 0; i < size; i++) {
+            delete[] charArray[i];
+        }
+        // Free the char** array itself
+        delete[] charArray;
+    }
+
+    DLL_EXPORT char **response_batch_str_llm_model(int modelId, char **content, int content_size,
+                                                   int max_length, bool do_sample, float top_p, int top_k,
+                                                   float temperature, float repeat_penalty, bool output_logits) {
+        std::vector <std::string> inputs;
+        std::vector <std::string> outputs;
+        inputs.resize(content_size);
+        outputs.resize(content_size);
+        for (int i = 0; i < content_size; ++i) {
+            inputs[i] = content[i];
+        }
+        auto model = models.GetModel(modelId);
+        auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
+        model->ResponseBatch(inputs, outputs, NULL, config);
+        return convertToCharArray(outputs);
+    }
+
+    DLL_EXPORT char **response_batch_tokens_llm_model(int modelId, int batch, int *tokens_lens, int *tokens,
+                                                      int max_length, bool do_sample, float top_p, int top_k,
+                                                      float temperature, float repeat_penalty, bool output_logits) {
+        std::vector <std::vector <float>> inputTokens;
+        inputTokens.resize(batch);
+        int index = 0;
+        for (int i = 0; i < batch; i++) {
+            for (int j = 0; j < tokens_lens[i]; j++) {
+                inputTokens[i].push_back(tokens[index++]);
+            }
+        }
+        std::vector <std::string> outputs;
+        auto model = models.GetModel(modelId);
+        auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
+        model->ResponseBatch(inputTokens, outputs, NULL, config);
+        return convertToCharArray(outputs);
+    }
+
+    DLL_EXPORT void freeChars(char *charArray) {
+        if (charArray != nullptr) {
+            // Free the string's memory
+            delete[] charArray;
+        }
+    }
 };
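The new batch entry points return a heap-allocated char** that the caller owns, and the matching freeCharArray/freeChars exports exist precisely so bindings can hand that memory back to the library. A sketch under the same ctypes assumptions as above:

    lib.response_batch_str_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)
    lib.response_batch_str_llm_model.argtypes = [
        ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int,  # modelId, content, content_size
        ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,    # max_length, do_sample, top_p, top_k
        ctypes.c_float, ctypes.c_float, ctypes.c_bool,                # temperature, repeat_penalty, output_logits
    ]
    lib.freeCharArray.argtypes = [ctypes.POINTER(ctypes.c_char_p), ctypes.c_size_t]

    prompts = [b"Hello", b"What is fastllm?"]
    arr = (ctypes.c_char_p * len(prompts))(*prompts)
    out = lib.response_batch_str_llm_model(model_id, arr, len(prompts),
                                           256, True, 0.8, 1, 1.0, 1.0, False)
    replies = [out[i].decode("utf-8", errors="ignore") for i in range(len(prompts))]
    lib.freeCharArray(out, len(prompts))  # the caller owns the result; free it through the library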