ModelZoo / ChatGLM2-6B_fastllm · Commits

Commit 7d96fda9, authored Sep 07, 2023 by zhouxiang
Commit message: 更新版本 ("Update version")
Parent: 8e2381d6

Showing 5 changed files with 210 additions and 337 deletions:

chatglm_export.py (+5, -177)
package/fastllm_pytools/hf_model.py (+79, -59)
package/fastllm_pytools/libfastllm_tools.so (+0, -0)
package/fastllm_pytools/llm.py (+94, -88)
package/fastllm_pytools/torch2flm.py (+32, -13)
chatglm_export.py (View file @ 7d96fda9)
 import sys
 from transformers import AutoTokenizer, AutoModel
-import struct
-import numpy as np
-import torch
-
-def writeString(fo, s):
-    fo.write(struct.pack('i', len(s)))
-    fo.write(s.encode())
-
-def writeKeyValue(fo, key, value):
-    writeString(fo, key)
-    writeString(fo, value)
-
-fastllm_data_type_dict = {
-    "int4": 8,
-    "int8": 3,
-    "float16": 7,
-    "float32": 0,
-}
-fastllm_weight_type_dict = {
-    "linear": 1,
-    "embedding": 2
-}
-
-v = np.random.randint(-127, 127, [10, 20])
-temp = v
-c_max = np.expand_dims(np.abs(v).max(axis=-1), -1)
-c_scale = c_max / 127.0
-v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
-
-def write_int8(fo, v):
-    c_max = np.expand_dims(np.abs(v).max(axis=-1), -1).clip(0.1, 1e100)
-    c_scale = c_max / 127.0
-    v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
-    fo.write(struct.pack('i', 3))
-    fo.write(struct.pack('i', 0))
-    for i in range(c_max.shape[0]):
-        fo.write(struct.pack('f', -c_max[i][0]))
-        fo.write(struct.pack('f', c_max[i][0]))
-    fo.write(v.data)
-
-def write_int4(fo, v):
-    c_min = np.expand_dims(-np.abs(v).max(axis=-1), -1)
-    c_max = np.expand_dims(np.abs(v).max(axis=-1), -1)
-    c_scale = c_max / 7.0
-    c_min = c_scale * -8.0
-    v = (v - c_min) / c_scale
-    v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
-    v = v[:, 0::2] * 16 + v[:, 1::2]
-    fo.write(struct.pack('i', 8))
-    fo.write(struct.pack('i', 0))
-    for i in range(c_min.shape[0]):
-        fo.write(struct.pack('f', c_min[i][0]))
-        fo.write(struct.pack('f', c_max[i][0]))
-    fo.write(v.data)
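For orientation, write_int8 above stores one pair of float bounds (-max, +max) per output row followed by the row's uint8 codes, and write_int4 packs two 4-bit codes per byte. A minimal round-trip sketch of the int8 scheme follows; it is illustrative only, and the dequantization step is an assumption about how the native loader interprets the stored bounds, not code from this repository:

```python
import numpy as np

# Illustrative weight matrix; any 2-D float32 array works.
w = np.array([[0.02, -0.75, 0.4]], dtype=np.float32)

# Quantize exactly as write_int8 does: one symmetric scale per row.
c_max = np.expand_dims(np.abs(w).max(axis=-1), -1).clip(0.1, 1e100)
scale = c_max / 127.0
q = (w / scale + 128.5).clip(1, 255).astype(np.uint8)

# Assumed inverse mapping (uint8 value 128 corresponds to 0.0);
# the reconstruction error stays within roughly one scale step.
w_approx = (q.astype(np.float32) - 128.0) * scale
print(np.abs(w - w_approx).max() <= scale.max())
```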
-def tofile(exportPath,
-           model,
-           tokenizer = None,
-           pre_prompt = None,
-           user_role = None,
-           bot_role = None,
-           history_sep = None,
-           dtype = "float16"):
-    if (dtype not in fastllm_data_type_dict):
-        print("dtype should in ", list(fastllm_data_type_dict.keys()))
-        exit(0)
-
-    dict = model.state_dict()
-    fo = open(exportPath, "wb")
-
-    # 0. version id
-    fo.write(struct.pack('i', 2))
-
-    # 0.1 model info
-    modelInfo = model.config.__dict__
-    if model.generation_config is not None:
-        modelInfo.update(model.generation_config.__dict__)
-    if ("model_type" not in modelInfo):
-        print("unknown model_type.")
-        exit(0)
-    if (pre_prompt):
-        modelInfo["pre_prompt"] = pre_prompt
-    if (user_role):
-        modelInfo["user_role"] = user_role
-    if (bot_role):
-        modelInfo["bot_role"] = bot_role
-    if (history_sep):
-        modelInfo["history_sep"] = history_sep
-    modelInfo["tokenizer_use_score"] = "1"  # 分词带分数 (tokens carry scores)
-    fo.write(struct.pack('i', len(modelInfo)))
-    for it in modelInfo.keys():
-        writeKeyValue(fo, str(it), str(modelInfo[it]))
-
-    # 1. vocab
-    if (tokenizer):
-        if (hasattr(tokenizer, "tokenizer")):
-            if (modelInfo['model_type'] == "qwen"):
-                pass
-            else:
-                tokenizer = tokenizer.tokenizer
-        if (hasattr(tokenizer, "sp_model")):
-            piece_size = tokenizer.sp_model.piece_size()
-            fo.write(struct.pack('i', piece_size))
-            for i in range(piece_size):
-                s = tokenizer.sp_model.id_to_piece(i).encode()
-                fo.write(struct.pack('i', len(s)))
-                for c in s:
-                    fo.write(struct.pack('i', c))
-                fo.write(struct.pack('i', i))
-                fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
-        else:
-            vocab = tokenizer.get_vocab()
-            fo.write(struct.pack('i', len(vocab)))
-            for v in vocab.keys():
-                if (modelInfo['model_type'] == "qwen"):
-                    s = v
-                else:
-                    s = v.decode()
-                if (modelInfo["model_type"] == "moss"):
-                    s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
-                fo.write(struct.pack('i', len(s)))
-                for c in s:
-                    fo.write(struct.pack('i', c))
-                fo.write(struct.pack('i', vocab[v]))
-                fo.write(struct.pack('f', 1.0))
-    else:
-        fo.write(struct.pack('i', 0))
-
-    weight_type_dict = {}
-    module_dict = {}
-    for key, m in model.named_modules():
-        if (isinstance(m, torch.nn.Linear)):
-            weight_type_dict[key + ".weight"] = "linear"
-            module_dict[key + ".weight"] = m
-        if (isinstance(m, torch.nn.Embedding)):
-            weight_type_dict[key] = "embedding"
-
-    # 2. weight
-    fo.write(struct.pack('i', len(dict)))
-    tot = 0
-    for key in dict:
-        ori_data_type = 0
-        ori_np_data_type = np.float32
-        cur_weight_type = 0
-        if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
-            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
-        to_data_type = 0
-        if (cur_weight_type == 1):
-            to_data_type = fastllm_data_type_dict[dtype]
-            if (to_data_type == 7):
-                ori_data_type = 7
-                ori_np_data_type = np.float16
-        cur = dict[key].numpy().astype(ori_np_data_type)
-        fo.write(struct.pack('i', len(key)))
-        fo.write(key.encode())
-        fo.write(struct.pack('i', len(cur.shape)))
-        for i in cur.shape:
-            fo.write(struct.pack('i', i))
-        if (to_data_type == 3):
-            write_int8(fo, cur)
-        elif (to_data_type == 8):
-            write_int4(fo, cur)
-        else:
-            fo.write(struct.pack('i', to_data_type))
-            fo.write(cur.data)
-        tot += 1
-        print("output (", tot, "/", len(dict), end = " )\r")
-    print("\nfinish.")
-    fo.close()
+from fastllm_pytools import torch2flm

 if __name__ == "__main__":
-    tokenizer = AutoTokenizer.from_pretrained("./chatglm2-6b/", trust_remote_code=True)
-    model = AutoModel.from_pretrained("./chatglm2_model/chatglm2-6b/", trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
+    model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
     model = model.eval()

     dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
-    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".bin"
-    tofile(exportPath, model, tokenizer, dtype = dtype)
+    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".flm"
+    torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
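After this change the script delegates the whole export to torch2flm. A hedged invocation sketch, where the output file name and dtype are illustrative (the script itself reads the export path from argv[1] and the data type from argv[2]):

```python
# Roughly equivalent to:  python chatglm_export.py chatglm2-6b-int8.flm int8
from transformers import AutoTokenizer, AutoModel
from fastllm_pytools import torch2flm

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).eval()

# dtype may be any key of fastllm_data_type_dict: "int4", "int8", "float16", "float32".
torch2flm.tofile("chatglm2-6b-int8.flm", model, tokenizer, dtype="int8")
```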
package/fastllm_pytools/hf_model.py (View file @ 7d96fda9)
-from fastllm_pytools import llm;
-import torch;
-import ctypes;
-import numpy as np;
+from fastllm_pytools import llm
+import torch
+import ctypes
+import numpy as np

 fastllm_data_type_dict = {
     "int4": 8,

@@ -22,50 +22,67 @@ def create(model,
            history_sep = None,
            dtype = "float16"):
     if (dtype not in fastllm_data_type_dict):
-        print("dtype should in ", list(fastllm_data_type_dict.keys()));
-        exit(0);
+        print("dtype should in ", list(fastllm_data_type_dict.keys()))
+        exit(0)

     # 0.1 model info
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
     if (pre_prompt):
-        modelInfo["pre_prompt"] = pre_prompt;
+        modelInfo["pre_prompt"] = pre_prompt
     if (user_role):
-        modelInfo["user_role"] = user_role;
+        modelInfo["user_role"] = user_role
     if (bot_role):
-        modelInfo["bot_role"] = bot_role;
+        modelInfo["bot_role"] = bot_role
     if (history_sep):
-        modelInfo["history_sep"] = history_sep;
+        modelInfo["history_sep"] = history_sep
     if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
         # Baichuan 2代 (second-generation Baichuan)
-        modelInfo["use_alibi"] = "1";
-        modelInfo["pre_prompt"] = "";
-        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else "";
-        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
-        modelInfo["history_sep"] = "";
+        modelInfo["use_alibi"] = "1"
+        modelInfo["pre_prompt"] = ""
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
+        modelInfo["history_sep"] = ""
     if (modelInfo["model_type"] == "qwen"):
-        modelInfo["im_end_id"] = tokenizer.im_end_id
-        modelInfo["im_start_id"] = tokenizer.im_start_id
+        if modelInfo["chat_format"] == "chatml":
+            modelInfo["im_end_id"] = tokenizer.im_end_id
+            modelInfo["im_start_id"] = tokenizer.im_start_id

-    weight_type_dict = {};
-    module_dict = {};
-    weight_bits = {};
+    weight_type_dict = {}
+    module_dict = {}
+    weight_bits = {}
     for key, m in model.named_modules():
         if (str(type(m)).find("QuantizedLinear") != -1):
-            weight_type_dict[key + ".weight"] = "QuantizedLinear";
-            weight_bits[key + ".weight"] = m.weight_bit_width;
+            weight_type_dict[key + ".weight"] = "QuantizedLinear"
+            weight_bits[key + ".weight"] = m.weight_bit_width
         if (isinstance(m, torch.nn.Linear)):
-            weight_type_dict[key + ".weight"] = "linear";
-            module_dict[key + ".weight"] = m;
+            weight_type_dict[key + ".weight"] = "linear"
+            module_dict[key + ".weight"] = m
         if (isinstance(m, torch.nn.Embedding)):
-            weight_type_dict[key] = "embedding";
+            weight_type_dict[key] = "embedding"

-    model = model.cpu();
-    dict = model.state_dict();
-    model_type = model.config.__dict__["model_type"];
-    model = llm.fastllm_lib.create_empty_llm_model(model_type.encode());
+    peft_config = {}
+    active_adapter = ""
+    if hasattr(model, "peft_config"):
+        peft_config = model.peft_config
+    if hasattr(model, "active_adapter"):
+        active_adapter = model.active_adapter
+
+    model = model.cpu()
+    dict = model.state_dict()
+    model_type = model.config.__dict__["model_type"]
+    model = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
     for it in modelInfo.keys():
-        llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode());
+        llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode())
+
+    for adapter_name in peft_config.keys():
+        adapter_dict = peft_config[adapter_name].__dict__
+        for it in adapter_dict.keys():
+            llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode())
+    if len(active_adapter) != 0:
+        llm.fastllm_lib.set_adapter(model, str(active_adapter).encode())

     # 1. vocab
     if (tokenizer):

@@ -73,59 +90,62 @@ def create(model,
             if modelInfo["model_type"] == "qwen":
                 pass
             else:
-                tokenizer = tokenizer.tokenizer;
+                tokenizer = tokenizer.tokenizer
         if (hasattr(tokenizer, "sp_model")):
-            piece_size = tokenizer.sp_model.piece_size();
+            piece_size = tokenizer.sp_model.piece_size()
             for i in range(piece_size):
                 llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
-                                                             i, ctypes.c_float(tokenizer.sp_model.get_score(i)));
+                                                             i, ctypes.c_float(tokenizer.sp_model.get_score(i)))
         else:
-            vocab = tokenizer.get_vocab();
+            vocab = tokenizer.get_vocab()
             for v in vocab.keys():
                 if (modelInfo["model_type"] == "moss"):
-                    vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v];
-                    llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0));
+                    vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
+                    llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0))
                 elif (modelInfo["model_type"] == "qwen"):
-                    llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0));
+                    llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0))
                 else:
-                    llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0));
-    tot = 0;
+                    llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0))
+    tot = 0
     for key in dict:
-        ori_data_type = 0;
-        ori_np_data_type = np.float32;
-        cur_weight_type = 0;
+        ori_data_type = 0
+        ori_np_data_type = np.float32
+        cur_weight_type = 0
         if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
-            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]];
-        to_data_type = 0;
+            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
+        to_data_type = 0
         if (cur_weight_type == 1):
-            to_data_type = fastllm_data_type_dict[dtype];
+            to_data_type = fastllm_data_type_dict[dtype]
             if (to_data_type == 7):
-                ori_data_type = 7;
-                ori_np_data_type = np.float16;
+                ori_data_type = 7
+                ori_np_data_type = np.float16
         elif (cur_weight_type == 2):
             # TODO bfloat
-            to_data_type = 0;
+            to_data_type = 0
+
+        weight_name = key
+        if peft_config is not None:
+            weight_name = weight_name.replace('base_model.model.', '')

         if (cur_weight_type == 111):
-            llm.fastllm_lib.add_qlinear_weight_llm_model(model, key.encode(),
+            llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(),
                                                          len(dict[key].shape), (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
                                                          weight_bits[key],
                                                          dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
-                                                         dict[key].numpy().ctypes.data_as(ctypes.c_void_p));
+                                                         dict[key].numpy().ctypes.data_as(ctypes.c_void_p))
         else:
-            llm.fastllm_lib.add_weight_llm_model(model, key.encode(),
+            llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(),
                                                  len(dict[key].shape), (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
                                                  to_data_type, cur_weight_type, ori_data_type,
-                                                 dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p));
-        tot += 1;
-        print("convert (", tot, "/", len(dict), end = " )\r");
+                                                 dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
+        tot += 1
+        print("convert (", tot, "/", len(dict), end = " )\r")

-    print("");
-    llm.fastllm_lib.init_params_llm_model(model);
-    llm.fastllm_lib.warmup_llm_model(model);
-    ret = llm.model("", id = model);
-    return ret;
+    print("")
+    llm.fastllm_lib.init_params_llm_model(model)
+    llm.fastllm_lib.warmup_llm_model(model)
+    ret = llm.model("", id = model)
+    return ret
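hf_model.create is normally reached through llm.from_hf (defined in llm.py below). A minimal sketch of this in-memory conversion path, assuming the native fastllm library is built and a ChatGLM2 checkpoint is reachable (the model name and prompt are illustrative):

```python
from transformers import AutoTokenizer, AutoModel
from fastllm_pytools import llm

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
hf = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).eval()

# from_hf calls hf_model.create(...), which copies config, vocab and weights into fastllm.
fast_model = llm.from_hf(hf, tokenizer, dtype="int8")
print(fast_model.response("Hello"))
```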
package/fastllm_pytools/libfastllm_tools.so (View file @ 7d96fda9)
No preview for this file type.
package/fastllm_pytools/llm.py (View file @ 7d96fda9)
-import ctypes;
-import os;
-from typing import Optional, Tuple, Union, List, Callable, Dict, Any;
+import ctypes
+import os
+from typing import Optional, Tuple, Union, List, Callable, Dict, Any
 import platform
 if platform.system() == 'Windows':

@@ -46,106 +46,106 @@ fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_
 fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]

 def set_cpu_threads(threads: int):
-    fastllm_lib.set_cpu_threads(threads);
+    fastllm_lib.set_cpu_threads(threads)

 def get_cpu_threads() -> int:
-    return fastllm_lib.get_cpu_threads();
+    return fastllm_lib.get_cpu_threads()

 def print_ins_info():
-    fastllm_lib.print_cpu_ins();
+    fastllm_lib.print_cpu_ins()

 def set_cpu_kvcache(cpu_kvcache):
-    fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache));
+    fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))

 def get_cpu_kvcache():
-    return fastllm_lib.get_kvcache_in_cpu();
+    return fastllm_lib.get_kvcache_in_cpu()

 def set_cpu_low_mem(low_mem):
-    fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem));
+    fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))

 def get_cpu_low_mem():
-    return fastllm_lib.get_cpu_low_mem();
+    return fastllm_lib.get_cpu_low_mem()

 def set_device_map(device_map):
-    devices = [];
-    values = [];
+    devices = []
+    values = []
     if (isinstance(device_map, str)):
-        devices.append(device_map);
-        values.append(1);
+        devices.append(device_map)
+        values.append(1)
     elif (isinstance(device_map, list)):
-        devices = [str(x) for x in device_map];
-        values = [1 for x in device_map];
+        devices = [str(x) for x in device_map]
+        values = [1 for x in device_map]
     elif (isinstance(device_map, dict)):
-        devices = [str(x) for x in device_map.keys()];
-        values = [int(device_map[x]) for x in device_map.keys()];
+        devices = [str(x) for x in device_map.keys()]
+        values = [int(device_map[x]) for x in device_map.keys()]
     else:
-        print("set_device_map error.");
-        return;
-    device_str = ''.join(devices);
-    device_len = [len(x) for x in devices];
+        print("set_device_map error.")
+        return
+    device_str = ''.join(devices)
+    device_len = [len(x) for x in devices]
     fastllm_lib.set_device_map(len(device_len), (ctypes.c_int * len(device_len))(*device_len), device_str.encode(),
-                               (ctypes.c_int * len(values))(*values));
+                               (ctypes.c_int * len(values))(*values))
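A usage sketch for the module-level helpers above: set_device_map accepts a single device name, a list of names, or a dict of name to weight, and forwards the packed arrays to the native library. The device names and weights below are illustrative, assume a CUDA-enabled build, and the splitting semantics of the weights are an assumption, not something stated in this diff:

```python
from fastllm_pytools import llm

llm.set_cpu_threads(8)                       # worker threads for the CPU backend
llm.set_device_map("cuda:0")                 # run everything on one device, or ...
llm.set_device_map({"cuda:0": 2, "cpu": 1})  # ... split work 2:1 between GPU and CPU (assumed meaning)
```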
 def from_hf(model, tokenizer = None, dtype = "float16"):
-    from fastllm_pytools import hf_model;
-    return hf_model.create(model, tokenizer, dtype = dtype);
+    from fastllm_pytools import hf_model
+    return hf_model.create(model, tokenizer, dtype = dtype)

 class model:
     def __init__(self, path: str, id: int = -99999):
         if (id != -99999):
-            self.model = id;
+            self.model = id
         else:
-            self.model = fastllm_lib.create_llm_model(path.encode());
-        self.direct_query = False;
+            self.model = fastllm_lib.create_llm_model(path.encode())
+        self.direct_query = False

     def get_prompt(self, query: str, history: List[Tuple[str, str]] = None) -> str:
         if (not (history)):
-            history = [];
-        prompt = "";
+            history = []
+        prompt = ""
         for i, (old_query, response) in enumerate(history):
-            prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode();
-        prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode();
-        return prompt;
+            prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
+        prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
+        return prompt

     def save(self, path: str):
-        fastllm_lib.save_llm_model(self.model, path.encode());
+        fastllm_lib.save_llm_model(self.model, path.encode())

     def eval(self):
-        pass;
+        pass

     def response_logits(self, query: str, history: List[Tuple[str, str]] = None, tokenizer = None) -> str:
-        prompt = query if self.direct_query else self.get_prompt(query, history);
+        prompt = query if self.direct_query else self.get_prompt(query, history)
         if (tokenizer == None):
             handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
-                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True));
+                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
         else:
-            input = tokenizer.encode(prompt);
+            input = tokenizer.encode(prompt)
             handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
-                                                           1, False, 1, 1, 1, 1, True);
-        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model);
+                                                           1, False, 1, 1, 1, 1, True)
+        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
         logits = list(range(vocab_size))
-        array = (ctypes.c_float * (vocab_size * 4))(*logits);
-        ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
-        out = list(array)[:vocab_size];
+        array = (ctypes.c_float * (vocab_size * 4))(*logits)
+        ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
+        out = list(array)[:vocab_size]
         while (ret != -1):
-            ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
-        return out;
+            ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
+        return out

     def response(self, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
                  do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
-        ret = "";
+        ret = ""
         for i in self.stream_response(query = query, history = history, max_length = max_length,

@@ -154,81 +154,87 @@ class model:
                                       temperature = temperature, repeat_penalty = repeat_penalty, one_by_one = True):
-            ret += i;
-        return ret;
+            ret += i
+        return ret

     def stream_response(self, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
                         do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, one_by_one = True):
-        prompt = query if self.direct_query else self.get_prompt(query, history);
+        prompt = query if self.direct_query else self.get_prompt(query, history)
         handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
-                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False));
-        res = "";
-        ret = b'';
-        fail_cnt = 0;
+                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
+        res = ""
+        ret = b''
+        fail_cnt = 0
         while True:
-            ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle);
-            cur = "";
+            ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
+            cur = ""
             try:
-                cur = ret.decode();
-                ret = b'';
+                cur = ret.decode()
+                ret = b''
             except:
-                fail_cnt += 1;
+                fail_cnt += 1
                 if (fail_cnt == 20):
-                    break;
+                    break
                 else:
-                    continue;
-            fail_cnt = 0;
+                    continue
+            fail_cnt = 0
             if (cur == "<flmeos>"):
-                break;
+                break
             if one_by_one:
-                yield cur;
+                yield cur
             else:
-                res += cur;
-                yield res;
+                res += cur
+                yield res
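Typical streaming use of the wrapper, assuming `fast_model` is the converted model from the earlier sketch (the prompt is illustrative; the generation parameters shown are the defaults of stream_response):

```python
# Pieces arrive incrementally until the internal "<flmeos>" end marker is seen.
for piece in fast_model.stream_response("Introduce yourself", history=[],
                                        max_length=8192, top_p=0.8, temperature=1.0):
    print(piece, end="", flush=True)

# Or collect the whole answer at once:
answer = fast_model.response("Introduce yourself")
```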
     def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
              do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
         if (not (history)):
-            history = [];
-        prompt = query if self.direct_query else self.get_prompt(query, history);
-        input = tokenizer.encode(prompt);
+            history = []
+        prompt = query if self.direct_query else self.get_prompt(query, history)
+        input = tokenizer.encode(prompt)
         handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                        max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
-                                                       False);
-        result = [];
+                                                       False)
+        result = []
         while True:
-            cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
+            cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
             if (cur == -1):
-                break;
-            result.append(cur);
-        response = tokenizer.decode(result);
-        history = history + [(query, response)];
-        return response, history;
+                break
+            result.append(cur)
+        response = tokenizer.decode(result)
+        history = history + [(query, response)]
+        return response, history

     def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
                     max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0,
                     repeat_penalty = 1.0, return_past_key_values = False, **kwargs) -> str:
         if (not (history)):
-            history = [];
-        prompt = query if self.direct_query else self.get_prompt(query, history);
-        input = tokenizer.encode(prompt);
+            history = []
+        prompt = query if self.direct_query else self.get_prompt(query, history)
+        input = tokenizer.encode(prompt)
         handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                        max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
-                                                       False);
-        tokens = [];
+                                                       False)
+        tokens = []
         while True:
-            cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
+            cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
             if (cur == -1):
-                break;
-            tokens.append(cur);
-            response = tokenizer.decode(tokens);
-            new_history = history + [(query, response)];
+                break
+            tokens.append(cur)
+            response = tokenizer.decode(tokens)
+            new_history = history + [(query, response)]
             if return_past_key_values:
-                yield response, new_history, None;
+                yield response, new_history, None
             else:
-                yield response, new_history;
+                yield response, new_history

+    def set_adapter(self, name: str):
+        fastllm_lib.set_adapter(self.model, str(name).encode())
+
+    def disable_adapter(self):
+        fastllm_lib.disable_adapter(self.model)
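The wrapper can also persist and reload converted weights: save() writes a .flm file through the native library, and the constructor loads one when given a path. A short sketch, reusing `fast_model` from the earlier example (the file name is illustrative):

```python
# Persist the converted model, then reload it without touching the original HF checkpoint.
fast_model.save("chatglm2-6b-int8.flm")

from fastllm_pytools import llm
reloaded = llm.model("chatglm2-6b-int8.flm")
print(reloaded.response("Hello"))
```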
package/fastllm_pytools/torch2flm.py (View file @ 7d96fda9)
@@ -21,8 +21,8 @@ fastllm_weight_type_dict = {
     "embedding": 2
 }

-v = np.random.randint(-127, 127, [10, 20]);
-temp = v;
+v = np.random.randint(-127, 127, [10, 20])
+temp = v
 c_max = np.expand_dims(np.abs(v).max(axis=-1), -1)
 c_scale = c_max / 127.0
 v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)

@@ -34,8 +34,8 @@ def write_int8(fo, v):
     fo.write(struct.pack('i', 3))
     fo.write(struct.pack('i', 0))
     for i in range(c_max.shape[0]):
-        fo.write(struct.pack('f', -c_max[i][0]));
-        fo.write(struct.pack('f', c_max[i][0]));
+        fo.write(struct.pack('f', -c_max[i][0]))
+        fo.write(struct.pack('f', c_max[i][0]))
     fo.write(v.data)

 def write_int4(fo, v):

@@ -49,8 +49,8 @@ def write_int4(fo, v):
     fo.write(struct.pack('i', 8))
     fo.write(struct.pack('i', 0))
     for i in range(c_min.shape[0]):
-        fo.write(struct.pack('f', c_min[i][0]));
-        fo.write(struct.pack('f', c_max[i][0]));
+        fo.write(struct.pack('f', c_min[i][0]))
+        fo.write(struct.pack('f', c_max[i][0]))
     fo.write(v.data)

 def tofile(exportPath,

@@ -91,19 +91,32 @@ def tofile(exportPath,
         # Baichuan 2代 (second-generation Baichuan)
         modelInfo["use_alibi"] = "1"
         modelInfo["pre_prompt"] = ""
-        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else "";
-        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
         modelInfo["history_sep"] = ""
     if modelInfo["model_type"] == "qwen":
-        modelInfo["im_end_id"] = tokenizer.im_end_id
-        modelInfo["im_start_id"] = tokenizer.im_start_id
+        if modelInfo["chat_format"] == "chatml":
+            modelInfo["im_end_id"] = tokenizer.im_end_id
+            modelInfo["im_start_id"] = tokenizer.im_start_id
     modelInfo["tokenizer_use_score"] = "1"  # 分词带分数 (tokens carry scores)
+    if hasattr(model, "peft_config"):
+        adapter_size = len(model.peft_config)
+        modelInfo["peft_size"] = adapter_size

     fo.write(struct.pack('i', len(modelInfo)))
     for it in modelInfo.keys():
         writeKeyValue(fo, str(it), str(modelInfo[it]))

+    if hasattr(model, "peft_config"):
+        for adapter_name in model.peft_config.keys():
+            adapter_dict = model.peft_config[adapter_name].__dict__
+            writeString(fo, adapter_name)
+            fo.write(struct.pack('i', len(adapter_dict)))
+            for it in adapter_dict.keys():
+                writeKeyValue(fo, str(it), str(adapter_dict[it]))
+
     # 1. vocab
     if (tokenizer):
         if (hasattr(tokenizer, "tokenizer")):

@@ -128,7 +141,7 @@ def tofile(exportPath,
                 if (modelInfo['model_type'] == "qwen"):
                     s = v
                 else:
-                    s = v.decode()
+                    s = v.encode()
                 if (modelInfo["model_type"] == "moss"):
                     s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
                 fo.write(struct.pack('i', len(s)))

@@ -165,8 +178,14 @@ def tofile(exportPath,
                 ori_np_data_type = np.float16
         cur = dict[key].numpy().astype(ori_np_data_type)
-        fo.write(struct.pack('i', len(key)))
-        fo.write(key.encode())
+        if hasattr(model, "peft_config"):
+            weight_name = key.replace('base_model.model.', '')
+            fo.write(struct.pack('i', len(weight_name)))
+            fo.write(weight_name.encode())
+        else:
+            fo.write(struct.pack('i', len(key)))
+            fo.write(key.encode())
         fo.write(struct.pack('i', len(cur.shape)))
         for i in cur.shape:
             fo.write(struct.pack('i', i))
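Beyond dtype, tofile also accepts prompt-template fields that are written into the model-info block of the .flm file. A hedged sketch of such a call; the template strings are illustrative, and `model`/`tokenizer` are assumed to be a Hugging Face pair loaded as in chatglm_export.py above:

```python
from fastllm_pytools import torch2flm

torch2flm.tofile("custom-int4.flm", model, tokenizer,
                 pre_prompt="",            # text prepended once per conversation
                 user_role="User: ",       # prefix for each user turn
                 bot_role="Assistant: ",   # prefix for each model turn
                 history_sep="\n",         # separator between history rounds
                 dtype="int4")
```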