"vscode:/vscode.git/clone" did not exist on "03bb324576d9fe86a15a7b86a43638a838234fbd"
Commit 7d96fda9 authored by zhouxiang

Update version

parent 8e2381d6
import sys
from transformers import AutoTokenizer, AutoModel
import struct
import numpy as np
import torch
def writeString(fo, s):
    fo.write(struct.pack('i', len(s)))
    fo.write(s.encode())

def writeKeyValue(fo, key, value):
    writeString(fo, key)
    writeString(fo, value)
fastllm_data_type_dict = {
    "int4": 8,
    "int8": 3,
    "float16": 7,
    "float32": 0,
}
fastllm_weight_type_dict = {
    "linear": 1,
    "embedding": 2
}
# Scratch example of the symmetric int8 quantization used by write_int8 below.
v = np.random.randint(-127, 127, [10, 20])
temp = v
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
def write_int8(fo, v):
    # Per-row absolute max, clipped away from zero so the scale is never 0.
    c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
    c_scale = c_max / 127.0
    v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
    fo.write(struct.pack('i', 3))  # data type id: 3 = int8 (fastllm_data_type_dict)
    fo.write(struct.pack('i', 0))
    for i in range(c_max.shape[0]):
        # Per-row min/max of the quantization range.
        fo.write(struct.pack('f', -c_max[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
def write_int4(fo, v):
    c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
    c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
    c_scale = c_max / 7.0
    c_min = c_scale * -8.0
    v = (v - c_min) / c_scale
    v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
    # Pack two 4-bit values per byte: even columns in the high nibble.
    v = v[:, 0::2] * 16 + v[:, 1::2]
    fo.write(struct.pack('i', 8))  # data type id: 8 = int4 (fastllm_data_type_dict)
    fo.write(struct.pack('i', 0))
    for i in range(c_min.shape[0]):
        fo.write(struct.pack('f', c_min[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
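# A minimal round-trip sanity check for the int8 scheme above (illustrative
# sketch, not part of the original file; the helper name is arbitrary).
# Dequantized values should land within one quantization step of the originals.
def _int8_roundtrip_check():
    w = np.random.randn(4, 8).astype(np.float32)
    c_max = np.expand_dims(np.abs(w).max(axis = -1), -1).clip(0.1, 1e100)
    c_scale = c_max / 127.0
    q = (w / c_scale + 128.5).clip(1, 255).astype(np.uint8)
    deq = (q.astype(np.float32) - 128.0) * c_scale  # maps 1..255 back to -c_max..c_max
    assert np.abs(deq - w).max() <= c_scale.max() + 1e-6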
def tofile(exportPath,
           model,
           tokenizer = None,
           pre_prompt = None,
           user_role = None,
           bot_role = None,
           history_sep = None,
           dtype = "float16"):
    if (dtype not in fastllm_data_type_dict):
        print("dtype should be in ", list(fastllm_data_type_dict.keys()))
        exit(0)

    dict = model.state_dict()
    fo = open(exportPath, "wb")

    # 0. version id
    fo.write(struct.pack('i', 2))

    # 0.1 model info
    modelInfo = model.config.__dict__
    if model.generation_config is not None:
        modelInfo.update(model.generation_config.__dict__)
    if ("model_type" not in modelInfo):
        print("unknown model_type.")
        exit(0)
    if (pre_prompt):
        modelInfo["pre_prompt"] = pre_prompt
    if (user_role):
        modelInfo["user_role"] = user_role
    if (bot_role):
        modelInfo["bot_role"] = bot_role
    if (history_sep):
        modelInfo["history_sep"] = history_sep
    modelInfo["tokenizer_use_score"] = "1"  # tokenizer carries scores
    fo.write(struct.pack('i', len(modelInfo)))
    for it in modelInfo.keys():
        writeKeyValue(fo, str(it), str(modelInfo[it]))
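    # Illustrative sketch: a reader can recover the header written above by
    # mirroring writeString/writeKeyValue (read_str below is a hypothetical
    # helper that unpacks an 'i' length and then reads that many bytes):
    #   version = struct.unpack('i', f.read(4))[0]
    #   count = struct.unpack('i', f.read(4))[0]
    #   info = {read_str(f): read_str(f) for _ in range(count)}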
    # 1. vocab
    if (tokenizer):
        if (hasattr(tokenizer, "tokenizer")):
            if (modelInfo['model_type'] == "qwen"):
                pass
            else:
                tokenizer = tokenizer.tokenizer
        if (hasattr(tokenizer, "sp_model")):
            piece_size = tokenizer.sp_model.piece_size()
            fo.write(struct.pack('i', piece_size))
            for i in range(piece_size):
                s = tokenizer.sp_model.id_to_piece(i).encode()
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', i))
                fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
        else:
            vocab = tokenizer.get_vocab()
            fo.write(struct.pack('i', len(vocab)))
            for v in vocab.keys():
                if (modelInfo['model_type'] == "qwen"):
                    s = v
                else:
                    s = v.encode()
                if (modelInfo["model_type"] == "moss"):
                    s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', vocab[v]))
                fo.write(struct.pack('f', 1.0))
    else:
        fo.write(struct.pack('i', 0))
    weight_type_dict = {}
    module_dict = {}
    for key, m in model.named_modules():
        if (isinstance(m, torch.nn.Linear)):
            weight_type_dict[key + ".weight"] = "linear"
            module_dict[key + ".weight"] = m
        if (isinstance(m, torch.nn.Embedding)):
            weight_type_dict[key] = "embedding"
    # 2. weight
    fo.write(struct.pack('i', len(dict)))
    tot = 0
    for key in dict:
        ori_data_type = 0
        ori_np_data_type = np.float32
        cur_weight_type = 0
        if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
        to_data_type = 0
        if (cur_weight_type == 1):
            to_data_type = fastllm_data_type_dict[dtype]
            if (to_data_type == 7):
                ori_data_type = 7
                ori_np_data_type = np.float16
        cur = dict[key].numpy().astype(ori_np_data_type)
        fo.write(struct.pack('i', len(key)))
        fo.write(key.encode())
        fo.write(struct.pack('i', len(cur.shape)))
        for i in cur.shape:
            fo.write(struct.pack('i', i))
        if (to_data_type == 3):
            write_int8(fo, cur)
        elif (to_data_type == 8):
            write_int4(fo, cur)
        else:
            fo.write(struct.pack('i', to_data_type))
            fo.write(cur.data)
        tot += 1
        print("output (", tot, "/", len(dict), end = " )\r")
    print("\nfinish.")
    fo.close()
from fastllm_pytools import torch2flm

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
    model = model.eval()

    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".flm"
    torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
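# Illustrative sketch: the exported .flm can be loaded back with the
# fastllm_pytools bindings shown later in this commit (the path assumes the
# default export name above):
#   from fastllm_pytools import llm
#   m = llm.model("chatglm-6b-float16.flm")
#   for piece in m.stream_response("Hello"):
#       print(piece, end = "", flush = True)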
from fastllm_pytools import llm
import torch
import ctypes
import numpy as np
fastllm_data_type_dict = {
    "int4": 8,
@@ -22,50 +22,67 @@ def create(model,
           history_sep = None,
           dtype = "float16"):
    if (dtype not in fastllm_data_type_dict):
        print("dtype should be in ", list(fastllm_data_type_dict.keys()))
        exit(0)

    # 0.1 model info
    modelInfo = model.config.__dict__
    if model.generation_config is not None:
        modelInfo.update(model.generation_config.__dict__)
    if (pre_prompt):
        modelInfo["pre_prompt"] = pre_prompt
    if (user_role):
        modelInfo["user_role"] = user_role
    if (bot_role):
        modelInfo["bot_role"] = bot_role
    if (history_sep):
        modelInfo["history_sep"] = history_sep
    if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
        # Baichuan 2
        modelInfo["use_alibi"] = "1"
        modelInfo["pre_prompt"] = ""
        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
        modelInfo["history_sep"] = ""
    if (modelInfo["model_type"] == "qwen"):
        if modelInfo["chat_format"] == "chatml":
            modelInfo["im_end_id"] = tokenizer.im_end_id
            modelInfo["im_start_id"] = tokenizer.im_start_id

    weight_type_dict = {}
    module_dict = {}
    weight_bits = {}
    for key, m in model.named_modules():
        if (str(type(m)).find("QuantizedLinear") != -1):
            weight_type_dict[key + ".weight"] = "QuantizedLinear"
            weight_bits[key + ".weight"] = m.weight_bit_width
        if (isinstance(m, torch.nn.Linear)):
            weight_type_dict[key + ".weight"] = "linear"
            module_dict[key + ".weight"] = m
        if (isinstance(m, torch.nn.Embedding)):
            weight_type_dict[key] = "embedding"

    peft_config = {}
    active_adapter = ""
    if hasattr(model, "peft_config"):
        peft_config = model.peft_config
    if hasattr(model, "active_adapter"):
        active_adapter = model.active_adapter

    model = model.cpu()
    dict = model.state_dict()
    model_type = model.config.__dict__["model_type"]
    model = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
    for it in modelInfo.keys():
        llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode())

    for adapter_name in peft_config.keys():
        adapter_dict = peft_config[adapter_name].__dict__
        for it in adapter_dict.keys():
            llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode())
    if len(active_adapter) != 0:
        llm.fastllm_lib.set_adapter(model, str(active_adapter).encode())

    # 1. vocab
    if (tokenizer):
@@ -73,59 +90,62 @@ def create(model,
if modelInfo["model_type"] == "qwen":
pass
else:
tokenizer = tokenizer.tokenizer;
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size();
piece_size = tokenizer.sp_model.piece_size()
for i in range(piece_size):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
i, ctypes.c_float(tokenizer.sp_model.get_score(i)));
i, ctypes.c_float(tokenizer.sp_model.get_score(i)))
else:
vocab = tokenizer.get_vocab();
vocab = tokenizer.get_vocab()
for v in vocab.keys():
if (modelInfo["model_type"] == "moss"):
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v];
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0));
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0))
elif (modelInfo["model_type"] == "qwen"):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0));
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0))
else:
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0));
tot = 0;
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0))
tot = 0
for key in dict:
ori_data_type = 0;
ori_np_data_type = np.float32;
cur_weight_type = 0;
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]];
to_data_type = 0;
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype];
to_data_type = fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7;
ori_np_data_type = np.float16;
ori_data_type = 7
ori_np_data_type = np.float16
elif (cur_weight_type == 2):
# TODO bfloat
to_data_type = 0;
to_data_type = 0
weight_name = key
if peft_config is not None:
weight_name = weight_name.replace('base_model.model.', '')
if (cur_weight_type == 111):
llm.fastllm_lib.add_qlinear_weight_llm_model(model, key.encode(),
llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
weight_bits[key],
dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
dict[key].numpy().ctypes.data_as(ctypes.c_void_p));
dict[key].numpy().ctypes.data_as(ctypes.c_void_p))
else:
llm.fastllm_lib.add_weight_llm_model(model, key.encode(),
llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
to_data_type, cur_weight_type, ori_data_type,
dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p));
tot += 1;
print("convert (", tot, "/", len(dict), end = " )\r");
dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
tot += 1
print("convert (", tot, "/", len(dict), end = " )\r")
print("");
llm.fastllm_lib.init_params_llm_model(model);
llm.fastllm_lib.warmup_llm_model(model);
ret = llm.model("", id = model);
return ret;
print("")
llm.fastllm_lib.init_params_llm_model(model)
llm.fastllm_lib.warmup_llm_model(model)
ret = llm.model("", id = model)
return ret
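# Illustrative sketch: create() is normally reached through llm.from_hf
# (defined in llm.py below); the model name and dtype here are examples only.
#   from transformers import AutoModel, AutoTokenizer
#   hf = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).eval()
#   tok = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
#   m = llm.from_hf(hf, tok, dtype = "int8")  # convert in memory
#   print(m.response("Hello"))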
import ctypes
import os
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
import platform
if platform.system() == 'Windows':
@@ -46,106 +46,106 @@ fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_
fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
def set_cpu_threads(threads: int):
    fastllm_lib.set_cpu_threads(threads)

def get_cpu_threads() -> int:
    return fastllm_lib.get_cpu_threads()

def print_ins_info():
    fastllm_lib.print_cpu_ins()

def set_cpu_kvcache(cpu_kvcache):
    fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))

def get_cpu_kvcache():
    return fastllm_lib.get_kvcache_in_cpu()

def set_cpu_low_mem(low_mem):
    fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))

def get_cpu_low_mem():
    return fastllm_lib.get_cpu_low_mem()
def set_device_map(device_map):
    devices = []
    values = []
    if (isinstance(device_map, str)):
        devices.append(device_map)
        values.append(1)
    elif (isinstance(device_map, list)):
        devices = [str(x) for x in device_map]
        values = [1 for x in device_map]
    elif (isinstance(device_map, dict)):
        devices = [str(x) for x in device_map.keys()]
        values = [int(device_map[x]) for x in device_map.keys()]
    else:
        print("set_device_map error.")
        return
    device_str = ''.join(devices)
    device_len = [len(x) for x in devices]
    fastllm_lib.set_device_map(len(device_len),
                               (ctypes.c_int * len(device_len))(*device_len),
                               device_str.encode(),
                               (ctypes.c_int * len(values))(*values))
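# Illustrative sketch of the accepted forms, matching the branches above
# (device names are examples):
#   set_device_map("cuda:0")                  # single device
#   set_device_map(["cuda:0", "cuda:1"])      # even split across devices
#   set_device_map({"cuda:0": 3, "cpu": 1})   # weighted 3:1 split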
def from_hf(model,
            tokenizer = None,
            dtype = "float16"):
    from fastllm_pytools import hf_model
    return hf_model.create(model, tokenizer, dtype = dtype)
class model:
    def __init__ (self, path : str,
                  id : int = -99999):
        if (id != -99999):
            self.model = id
        else:
            self.model = fastllm_lib.create_llm_model(path.encode())
        self.direct_query = False
    def get_prompt(self,
                   query: str,
                   history: List[Tuple[str, str]] = None) -> str:
        if (not(history)):
            history = []
        prompt = ""
        for i, (old_query, response) in enumerate(history):
            prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
        prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
        return prompt
    def save(self, path : str):
        fastllm_lib.save_llm_model(self.model, path.encode())

    def eval(self):
        pass
    def response_logits(self,
                        query: str,
                        history: List[Tuple[str, str]] = None,
                        tokenizer = None) -> str:
        prompt = query if self.direct_query else self.get_prompt(query, history)
        if (tokenizer == None):
            handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                               ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
        else:
            input = tokenizer.encode(prompt)
            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                           1, False, 1, 1, 1, 1, True)
        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
        logits = list(range(vocab_size))
        array = (ctypes.c_float * (vocab_size * 4))(*logits)
        ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
        out = list(array)[:vocab_size]
        while (ret != -1):
            ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
        return out
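    # Illustrative sketch: greedy readout of the logits list returned above
    # ('m' is a loaded model, 'tokenizer' a matching HF tokenizer; both are
    # placeholders):
    #   logits = m.response_logits("Hello", tokenizer = tokenizer)
    #   next_token = max(range(len(logits)), key = lambda i: logits[i])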
    def response(self,
                 query: str,
                 history: List[Tuple[str, str]] = None,
                 max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
        ret = ""
        for i in self.stream_response(query = query,
                                      history = history,
                                      max_length = max_length,
@@ -154,81 +154,87 @@ class model:
                                      temperature = temperature,
                                      repeat_penalty = repeat_penalty,
                                      one_by_one = True):
            ret += i
        return ret
    def stream_response(self,
                        query: str,
                        history: List[Tuple[str, str]] = None,
                        max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
                        one_by_one = True):
        prompt = query if self.direct_query else self.get_prompt(query, history)
        handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                           ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
        res = ""
        ret = b''
        fail_cnt = 0
        while True:
            ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
            cur = ""
            try:
                cur = ret.decode()
                ret = b''
            except:
                # A partial UTF-8 sequence cannot be decoded yet: keep
                # accumulating bytes, and give up after 20 failed attempts.
                fail_cnt += 1
                if (fail_cnt == 20):
                    break
                else:
                    continue
            fail_cnt = 0
            if (cur == "<flmeos>"):
                break
            if one_by_one:
                yield cur
            else:
                res += cur
                yield res
    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
             do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
        if (not(history)):
            history = []
        prompt = query if self.direct_query else self.get_prompt(query, history)
        input = tokenizer.encode(prompt)
        handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                       False)
        result = []
        while True:
            cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
            if (cur == -1):
                break
            result.append(cur)
        response = tokenizer.decode(result)
        history = history + [(query, response)]
        return response, history
    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
                    max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
                    return_past_key_values = False, **kwargs) -> str:
        if (not(history)):
            history = []
        prompt = query if self.direct_query else self.get_prompt(query, history)
        input = tokenizer.encode(prompt)
        handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                       False)
        tokens = []
        while True:
            cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
            if (cur == -1):
                break
            tokens.append(cur)
            response = tokenizer.decode(tokens)
            new_history = history + [(query, response)]
            if return_past_key_values:
                yield response, new_history, None
            else:
                yield response, new_history
    def set_adapter(self, name: str):
        fastllm_lib.set_adapter(self.model, str(name).encode())

    def disable_adapter(self):
        fastllm_lib.disable_adapter(self.model)
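# Illustrative sketch: toggling a LoRA adapter recorded via peft_config during
# export ("default" is a placeholder adapter name, the path an example):
#   m = model("chatglm2-6b-int8.flm")
#   m.set_adapter("default")
#   print(m.response("Hello"))
#   m.disable_adapter()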
@@ -21,8 +21,8 @@ fastllm_weight_type_dict = {
    "embedding": 2
}

v = np.random.randint(-127, 127, [10, 20])
temp = v
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
@@ -34,8 +34,8 @@ def write_int8(fo, v):
    fo.write(struct.pack('i', 3))
    fo.write(struct.pack('i', 0))
    for i in range(c_max.shape[0]):
        fo.write(struct.pack('f', -c_max[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
def write_int4(fo, v):
@@ -49,8 +49,8 @@ def write_int4(fo, v):
    fo.write(struct.pack('i', 8))
    fo.write(struct.pack('i', 0))
    for i in range(c_min.shape[0]):
        fo.write(struct.pack('f', c_min[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
def tofile(exportPath,
@@ -91,19 +91,32 @@ def tofile(exportPath,
        # Baichuan 2
        modelInfo["use_alibi"] = "1"
        modelInfo["pre_prompt"] = ""
        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
        modelInfo["history_sep"] = ""
    if modelInfo["model_type"] == "qwen":
        if modelInfo["chat_format"] == "chatml":
            modelInfo["im_end_id"] = tokenizer.im_end_id
            modelInfo["im_start_id"] = tokenizer.im_start_id
    modelInfo["tokenizer_use_score"] = "1"  # tokenizer carries scores

    if hasattr(model, "peft_config"):
        adapter_size = len(model.peft_config)
        modelInfo["peft_size"] = adapter_size

    fo.write(struct.pack('i', len(modelInfo)))
    for it in modelInfo.keys():
        writeKeyValue(fo, str(it), str(modelInfo[it]))

    if hasattr(model, "peft_config"):
        for adapter_name in model.peft_config.keys():
            adapter_dict = model.peft_config[adapter_name].__dict__
            writeString(fo, adapter_name)
            fo.write(struct.pack('i', len(adapter_dict)))
            for it in adapter_dict.keys():
                writeKeyValue(fo, str(it), str(adapter_dict[it]))
    # 1. vocab
    if (tokenizer):
        if (hasattr(tokenizer, "tokenizer")):
@@ -128,7 +141,7 @@ def tofile(exportPath,
                if (modelInfo['model_type'] == "qwen"):
                    s = v
                else:
                    s = v.encode()
                if (modelInfo["model_type"] == "moss"):
                    s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
                fo.write(struct.pack('i', len(s)))
@@ -165,8 +178,14 @@ def tofile(exportPath,
                ori_np_data_type = np.float16
        cur = dict[key].numpy().astype(ori_np_data_type)

        if hasattr(model, "peft_config"):
            # Strip the PEFT wrapper prefix so names match the base model.
            weight_name = key.replace('base_model.model.', '')
            fo.write(struct.pack('i', len(weight_name)))
            fo.write(weight_name.encode())
        else:
            fo.write(struct.pack('i', len(key)))
            fo.write(key.encode())
        fo.write(struct.pack('i', len(cur.shape)))
        for i in cur.shape:
            fo.write(struct.pack('i', i))