"tests/vscode:/vscode.git/clone" did not exist on "2f7ca41459da49148664371292063c202d88835b"
Commit 7d96fda9 authored by zhouxiang

Update version

parent 8e2381d6
import sys
from transformers import AutoTokenizer, AutoModel
from fastllm_pytools import torch2flm
import numpy as np
import torch
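# The helpers below serialize strings and tensors into fastllm's binary .flm layout:
# fields are written with struct.pack ('i' for int32, 'f' for float32), and strings as
# an int32 byte-length prefix followed by the raw UTF-8 bytes.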
def writeString(fo, s):
fo.write(struct.pack('i', len(s)))
fo.write(s.encode())
def writeKeyValue(fo, key, value):
writeString(fo, key)
writeString(fo, value)
fastllm_data_type_dict = {
"int4": 8,
"int8": 3,
"float16": 7,
"float32": 0,
}
fastllm_weight_type_dict = {
"linear": 1,
"embedding": 2
}
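# NOTE: the block below appears to be leftover scratch code sketching the int8 mapping
# used by write_int8; nothing in tofile() references these module-level variables.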
v = np.random.randint(-127, 127, [10, 20])
temp = v
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
def write_int8(fo, v):
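    # Per-row symmetric int8 quantization: scale = max(|row|) / 127; values are mapped
    # to uint8 around a zero point of 128 and clipped to [1, 255].
    # Layout: type id 3 (int8), a second int field (0), per-row (min, max) floats, then the bytes.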
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
fo.write(struct.pack('i', 3))
fo.write(struct.pack('i', 0))
for i in range(c_max.shape[0]):
fo.write(struct.pack('f', -c_max[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
def write_int4(fo, v):
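    # Per-row int4 quantization: scale = max(|row|) / 7 with stored range [-8 * scale, 7 * scale];
    # two 4-bit codes are packed per byte, the even-indexed column in the high nibble.
    # Layout: type id 8 (int4), a second int field (0), per-row (min, max) floats, then the bytes.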
c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 7.0
c_min = c_scale * -8.0
v = (v - c_min) / c_scale
v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
v = v[:, 0::2] * 16 + v[:, 1::2]
fo.write(struct.pack('i', 8))
fo.write(struct.pack('i', 0))
for i in range(c_min.shape[0]):
fo.write(struct.pack('f', c_min[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
def tofile(exportPath,
model,
tokenizer = None,
pre_prompt = None,
user_role = None,
bot_role = None,
history_sep = None,
dtype = "float16"):
if (dtype not in fastllm_data_type_dict):
print("dtype should in ", list(fastllm_data_type_dict.keys()))
exit(0)
dict = model.state_dict()
fo = open(exportPath, "wb")
# 0. version id
fo.write(struct.pack('i', 2))
# 0.1 model info
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if ("model_type" not in modelInfo):
print("unknown model_type.")
exit(0)
if (pre_prompt):
modelInfo["pre_prompt"] = pre_prompt
if (user_role):
modelInfo["user_role"] = user_role
if (bot_role):
modelInfo["bot_role"] = bot_role
if (history_sep):
modelInfo["history_sep"] = history_sep
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
fo.write(struct.pack('i', len(modelInfo)))
for it in modelInfo.keys():
writeKeyValue(fo, str(it), str(modelInfo[it]))
# 1. vocab
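    # Two vocab paths: SentencePiece tokenizers are dumped piece by piece with their
    # scores; other tokenizers fall back to get_vocab() with a constant score of 1.0.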
if (tokenizer):
if (hasattr(tokenizer, "tokenizer")):
if (modelInfo['model_type'] == "qwen"):
pass
else:
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
fo.write(struct.pack('i', piece_size))
for i in range(piece_size):
s = tokenizer.sp_model.id_to_piece(i).encode()
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', i))
fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
else:
vocab = tokenizer.get_vocab()
fo.write(struct.pack('i', len(vocab)))
for v in vocab.keys():
if (modelInfo['model_type'] == "qwen"):
s = v
else:
s = v.decode()
if (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', vocab[v]))
fo.write(struct.pack('f', 1.0))
else:
fo.write(struct.pack('i', 0))
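    # Classify parameters so the weight loop below knows what to quantize: only
    # nn.Linear weights (weight type 1) are converted to the requested dtype,
    # everything else is written as float32.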
weight_type_dict = {}
module_dict = {}
for key, m in model.named_modules():
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
# 2. weight
fo.write(struct.pack('i', len(dict)))
tot = 0
for key in dict:
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7
ori_np_data_type = np.float16
cur = dict[key].numpy().astype(ori_np_data_type)
fo.write(struct.pack('i', len(key)))
fo.write(key.encode())
fo.write(struct.pack('i', len(cur.shape)))
for i in cur.shape:
fo.write(struct.pack('i', i))
if (to_data_type == 3):
write_int8(fo, cur)
elif (to_data_type == 8):
write_int4(fo, cur)
else:
fo.write(struct.pack('i', to_data_type))
fo.write(cur.data)
tot += 1
print("output (", tot, "/", len(dict), end = " )\r")
print("\nfinish.")
fo.close()
if __name__ == "__main__": if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("./chatglm2-6b/", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("./chatglm2_model/chatglm2-6b/", trust_remote_code=True) model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = model.eval() model = model.eval()
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16" dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-' + dtype + '.bin" exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-' + dtype + '.flm"
tofile(exportPath, model, tokenizer, dtype = dtype) torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
from fastllm_pytools import llm
import torch
import ctypes
import numpy as np
fastllm_data_type_dict = {
    "int4": 8,
@@ -22,50 +22,67 @@ def create(model,
           history_sep = None,
           dtype = "float16"):
    if (dtype not in fastllm_data_type_dict):
        print("dtype should be in ", list(fastllm_data_type_dict.keys()))
        exit(0)
    # 0.1 model info
    modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
    if (pre_prompt):
        modelInfo["pre_prompt"] = pre_prompt
    if (user_role):
        modelInfo["user_role"] = user_role
    if (bot_role):
        modelInfo["bot_role"] = bot_role
    if (history_sep):
        modelInfo["history_sep"] = history_sep
    if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
        # Baichuan 2 (second generation)
modelInfo["use_alibi"] = "1"; modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""; modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""; modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""; modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""; modelInfo["history_sep"] = ""
if (modelInfo["model_type"] == "qwen"): if (modelInfo["model_type"] == "qwen"):
modelInfo["im_end_id"] = tokenizer.im_end_id if modelInfo["chat_format"] == "chatml":
modelInfo["im_start_id"] = tokenizer.im_start_id modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
    weight_type_dict = {}
    module_dict = {}
    weight_bits = {}
    for key, m in model.named_modules():
        if (str(type(m)).find("QuantizedLinear") != -1):
            weight_type_dict[key + ".weight"] = "QuantizedLinear"
            weight_bits[key + ".weight"] = m.weight_bit_width
        if (isinstance(m, torch.nn.Linear)):
            weight_type_dict[key + ".weight"] = "linear"
            module_dict[key + ".weight"] = m
        if (isinstance(m, torch.nn.Embedding)):
            weight_type_dict[key] = "embedding"
    peft_config = {}
    active_adapter = ""
    if hasattr(model, "peft_config"):
        peft_config = model.peft_config
if hasattr(model, "active_adapter"):
active_adapter = model.active_adapter
model = model.cpu()
dict = model.state_dict()
model_type = model.config.__dict__["model_type"]
model = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
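    # From here on, config entries, adapter settings, the vocab and the weights are all
    # pushed into the native fastllm model through ctypes calls.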
    for it in modelInfo.keys():
        llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode())
for adapter_name in peft_config.keys():
adapter_dict = peft_config[adapter_name].__dict__
for it in adapter_dict.keys():
llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode())
if len(active_adapter) != 0:
llm.fastllm_lib.set_adapter(model, str(active_adapter).encode())
    # 1. vocab
    if (tokenizer):
@@ -73,59 +90,62 @@ def create(model,
if modelInfo["model_type"] == "qwen": if modelInfo["model_type"] == "qwen":
pass pass
else: else:
tokenizer = tokenizer.tokenizer; tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")): if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size(); piece_size = tokenizer.sp_model.piece_size()
for i in range(piece_size): for i in range(piece_size):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(), llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
i, ctypes.c_float(tokenizer.sp_model.get_score(i))); i, ctypes.c_float(tokenizer.sp_model.get_score(i)))
else: else:
vocab = tokenizer.get_vocab(); vocab = tokenizer.get_vocab()
for v in vocab.keys(): for v in vocab.keys():
if (modelInfo["model_type"] == "moss"): if (modelInfo["model_type"] == "moss"):
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]; vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0)); llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0))
elif (modelInfo["model_type"] == "qwen"): elif (modelInfo["model_type"] == "qwen"):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0)); llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0))
else: else:
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0)); llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0))
    tot = 0
    for key in dict:
        ori_data_type = 0
        ori_np_data_type = np.float32
        cur_weight_type = 0
        if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
        to_data_type = 0
        if (cur_weight_type == 1):
            to_data_type = fastllm_data_type_dict[dtype]
            if (to_data_type == 7):
                ori_data_type = 7
                ori_np_data_type = np.float16
        elif (cur_weight_type == 2):
            # TODO bfloat
            to_data_type = 0
weight_name = key
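        # PEFT wraps the base model and prefixes parameter names with 'base_model.model.';
        # strip it so the exported weight names match the plain model's.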
if peft_config is not None:
weight_name = weight_name.replace('base_model.model.', '')
        if (cur_weight_type == 111):
            llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(),
                                                         len(dict[key].shape),
                                                         (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
                                                         weight_bits[key],
                                                         dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
                                                         dict[key].numpy().ctypes.data_as(ctypes.c_void_p))
        else:
            llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(),
                                                 len(dict[key].shape),
                                                 (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
                                                 to_data_type, cur_weight_type, ori_data_type,
                                                 dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
        tot += 1
        print("convert (", tot, "/", len(dict), end = " )\r")
    print("")
    llm.fastllm_lib.init_params_llm_model(model)
    llm.fastllm_lib.warmup_llm_model(model)
    ret = llm.model("", id = model)
    return ret
import ctypes
import os
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
import platform
if platform.system() == 'Windows':
@@ -46,106 +46,106 @@ fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_
fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
def set_cpu_threads(threads: int):
    fastllm_lib.set_cpu_threads(threads)
def get_cpu_threads() -> int:
    return fastllm_lib.get_cpu_threads()
def print_ins_info():
    fastllm_lib.print_cpu_ins()
def set_cpu_kvcache(cpu_kvcache):
    fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))
def get_cpu_kvcache():
    return fastllm_lib.get_kvcache_in_cpu()
def set_cpu_low_mem(low_mem):
    fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))
def get_cpu_low_mem():
    return fastllm_lib.get_cpu_low_mem()
def set_device_map(device_map):
    devices = []
    values = []
    if (isinstance(device_map, str)):
        devices.append(device_map)
        values.append(1)
    elif (isinstance(device_map, list)):
        devices = [str(x) for x in device_map]
        values = [1 for x in device_map]
    elif (isinstance(device_map, dict)):
        devices = [str(x) for x in device_map.keys()]
        values = [int(device_map[x]) for x in device_map.keys()]
    else:
        print("set_device_map error.")
        return
    device_str = ''.join(devices)
    device_len = [len(x) for x in devices]
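    # The native entry point takes the device names as one concatenated string plus an
    # array of their individual lengths, together with the per-device values.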
    fastllm_lib.set_device_map(len(device_len),
                               (ctypes.c_int * len(device_len))(*device_len),
                               device_str.encode(),
                               (ctypes.c_int * len(values))(*values))
def from_hf(model,
            tokenizer = None,
            dtype = "float16"):
    from fastllm_pytools import hf_model
    return hf_model.create(model, tokenizer, dtype = dtype)
class model:
    def __init__ (self, path : str,
                  id : int = -99999):
        if (id != -99999):
            self.model = id
        else:
            self.model = fastllm_lib.create_llm_model(path.encode())
        self.direct_query = False
    def get_prompt(self,
                   query: str,
                   history: List[Tuple[str, str]] = None) -> str:
        if (not(history)):
            history = []
        prompt = ""
        for i, (old_query, response) in enumerate(history):
            prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
        prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
        return prompt
    def save(self, path : str):
        fastllm_lib.save_llm_model(self.model, path.encode())
    def eval(self):
        pass
    def response_logits(self,
                        query: str,
                        history: List[Tuple[str, str]] = None,
                        tokenizer = None) -> str:
        prompt = query if self.direct_query else self.get_prompt(query, history)
        if (tokenizer == None):
            handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                               ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
        else:
            input = tokenizer.encode(prompt)
            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                           1, False, 1, 1, 1, 1, True)
        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
        logits = list(range(vocab_size))
        array = (ctypes.c_float * (vocab_size * 4))(*logits)
        ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
        out = list(array)[:vocab_size]
        while (ret != -1):
            ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
        return out
    def response(self,
                 query: str,
                 history: List[Tuple[str, str]] = None,
                 max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
        ret = ""
        for i in self.stream_response(query = query,
                                      history = history,
                                      max_length = max_length,
@@ -154,81 +154,87 @@ class model:
                                      temperature = temperature,
                                      repeat_penalty = repeat_penalty,
                                      one_by_one = True):
            ret += i
        return ret
    def stream_response(self,
                        query: str,
                        history: List[Tuple[str, str]] = None,
                        max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
                        one_by_one = True):
        prompt = query if self.direct_query else self.get_prompt(query, history)
        handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                           ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
        res = ""
        ret = b''
        fail_cnt = 0
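        # The native side streams raw bytes; a chunk can end in the middle of a UTF-8
        # sequence, so keep accumulating until it decodes (give up after 20 consecutive
        # failures). "<flmeos>" marks the end of the stream.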
        while True:
            ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
            cur = ""
            try:
                cur = ret.decode()
                ret = b''
            except:
                fail_cnt += 1
                if (fail_cnt == 20):
                    break
                else:
                    continue
            fail_cnt = 0
            if (cur == "<flmeos>"):
                break
            if one_by_one:
                yield cur
            else:
                res += cur
                yield res
    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
             do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
        if (not(history)):
            history = []
        prompt = query if self.direct_query else self.get_prompt(query, history)
        input = tokenizer.encode(prompt)
        handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                       False)
        result = []
        while True:
            cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
            if (cur == -1):
                break
            result.append(cur)
        response = tokenizer.decode(result)
        history = history + [(query, response)]
        return response, history
    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
                    max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
                    return_past_key_values = False, **kwargs) -> str:
        if (not(history)):
            history = []
        prompt = query if self.direct_query else self.get_prompt(query, history)
        input = tokenizer.encode(prompt)
        handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                       False)
        tokens = []
        while True:
            cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
            if (cur == -1):
                break
            tokens.append(cur)
        response = tokenizer.decode(tokens)
        new_history = history + [(query, response)]
        if return_past_key_values:
            yield response, new_history, None
        else:
            yield response, new_history
def set_adapter(self, name: str):
fastllm_lib.set_adapter(self.model, str(name).encode())
def disable_adapter(self):
fastllm_lib.disable_adapter(self.model)
@@ -21,8 +21,8 @@ fastllm_weight_type_dict = {
"embedding": 2 "embedding": 2
} }
v = np.random.randint(-127, 127, [10, 20]); v = np.random.randint(-127, 127, [10, 20])
temp = v; temp = v
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1) c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0 c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8) v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
@@ -34,8 +34,8 @@ def write_int8(fo, v):
    fo.write(struct.pack('i', 3))
    fo.write(struct.pack('i', 0))
    for i in range(c_max.shape[0]):
        fo.write(struct.pack('f', -c_max[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
def write_int4(fo, v):
@@ -49,8 +49,8 @@ def write_int4(fo, v):
    fo.write(struct.pack('i', 8))
    fo.write(struct.pack('i', 0))
    for i in range(c_min.shape[0]):
        fo.write(struct.pack('f', c_min[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
def tofile(exportPath,
@@ -91,19 +91,32 @@ def tofile(exportPath,
        # Baichuan 2 (second generation)
modelInfo["use_alibi"] = "1" modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = "" modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""; modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""; modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = "" modelInfo["history_sep"] = ""
if modelInfo["model_type"] == "qwen": if modelInfo["model_type"] == "qwen":
modelInfo["im_end_id"] = tokenizer.im_end_id if modelInfo["chat_format"] == "chatml":
modelInfo["im_start_id"] = tokenizer.im_start_id modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
modelInfo["tokenizer_use_score"] = "1" # 分词带分数 modelInfo["tokenizer_use_score"] = "1" # 分词带分数
if hasattr(model, "peft_config"):
adapter_size = len(model.peft_config)
modelInfo["peft_size"] = adapter_size
    fo.write(struct.pack('i', len(modelInfo)))
    for it in modelInfo.keys():
        writeKeyValue(fo, str(it), str(modelInfo[it]))
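    # Each PEFT adapter is written right after the model info: its name, then its
    # config entries as key/value string pairs.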
if hasattr(model, "peft_config"):
for adapter_name in model.peft_config.keys():
adapter_dict = model.peft_config[adapter_name].__dict__
writeString(fo, adapter_name)
fo.write(struct.pack('i', len(adapter_dict)))
for it in adapter_dict.keys():
writeKeyValue(fo, str(it), str(adapter_dict[it]))
    # 1. vocab
    if (tokenizer):
        if (hasattr(tokenizer, "tokenizer")):
@@ -128,7 +141,7 @@ def tofile(exportPath,
if (modelInfo['model_type'] == "qwen"): if (modelInfo['model_type'] == "qwen"):
s = v s = v
else: else:
s = v.decode() s = v.encode()
if (modelInfo["model_type"] == "moss"): if (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v] s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
fo.write(struct.pack('i', len(s))) fo.write(struct.pack('i', len(s)))
@@ -165,8 +178,14 @@ def tofile(exportPath,
                ori_np_data_type = np.float16
        cur = dict[key].numpy().astype(ori_np_data_type)
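        # With PEFT, state_dict keys carry a 'base_model.model.' prefix; drop it so the
        # names stored in the .flm file match the plain base model.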
        if hasattr(model, "peft_config"):
weight_name = key.replace('base_model.model.', '')
fo.write(struct.pack('i', len(weight_name)))
fo.write(weight_name.encode())
else:
fo.write(struct.pack('i', len(key)))
fo.write(key.encode())
        fo.write(struct.pack('i', len(cur.shape)))
        for i in cur.shape:
            fo.write(struct.pack('i', i))
...