Commit 7d96fda9 authored by zhouxiang

Update version

parent 8e2381d6
import sys
from transformers import AutoTokenizer, AutoModel
import struct
import numpy as np
import torch

def writeString(fo, s):
    fo.write(struct.pack('i', len(s)))
    fo.write(s.encode())

def writeKeyValue(fo, key, value):
    writeString(fo, key)
    writeString(fo, value)
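# Illustrative layout note (not part of the original file): writeString emits a
# 4-byte native-endian length followed by the UTF-8 bytes, so
# writeKeyValue(fo, "model_type", "chatglm") produces, on a little-endian machine:
#   0a 00 00 00 'model_type'  07 00 00 00 'chatglm'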
fastllm_data_type_dict = {
    "int4": 8,
    "int8": 3,
    "float16": 7,
    "float32": 0,
}
fastllm_weight_type_dict = {
    "linear": 1,
    "embedding": 2
}
v = np.random.randint(-127, 127, [10, 20])
temp = v
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
def write_int8(fo, v):
    c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
    c_scale = c_max / 127.0
    v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
    fo.write(struct.pack('i', 3))
    fo.write(struct.pack('i', 0))
    for i in range(c_max.shape[0]):
        fo.write(struct.pack('f', -c_max[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
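# Illustrative sketch (not part of the original file): inverting the mapping used by
# write_int8 above. A value x is stored as q = clip(round(x / (c_max / 127)) + 128, 1, 255),
# so q == 128 maps back to zero and the per-row scale is c_max / 127.
def dequant_int8_sketch(q, c_max):
    # q: uint8 array of shape [rows, cols]; c_max: per-row absolute maximum, shape [rows, 1]
    return (q.astype(np.float32) - 128.0) * (c_max / 127.0)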
def write_int4(fo, v):
    c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
    c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
    c_scale = c_max / 7.0
    c_min = c_scale * -8.0
    v = (v - c_min) / c_scale
    v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
    v = v[:, 0::2] * 16 + v[:, 1::2]
    fo.write(struct.pack('i', 8))
    fo.write(struct.pack('i', 0))
    for i in range(c_min.shape[0]):
        fo.write(struct.pack('f', c_min[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
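# Illustrative sketch (not part of the original file): unpacking the nibble-packed rows
# written by write_int4 above. Two quantized values share one byte (high nibble first),
# and each nibble q maps back to roughly c_min + q * (c_max / 7), with c_min = -8 * c_max / 7.
def dequant_int4_sketch(packed, c_min, c_max):
    hi = packed >> 4
    lo = packed & 0x0F
    q = np.stack([hi, lo], axis = -1).reshape(packed.shape[0], -1).astype(np.float32)
    return c_min + q * (c_max / 7.0)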
def tofile(exportPath,
           model,
           tokenizer = None,
           pre_prompt = None,
           user_role = None,
           bot_role = None,
           history_sep = None,
           dtype = "float16"):
    if (dtype not in fastllm_data_type_dict):
        print("dtype should in ", list(fastllm_data_type_dict.keys()))
        exit(0)

    dict = model.state_dict()
    fo = open(exportPath, "wb")

    # 0. version id
    fo.write(struct.pack('i', 2))

    # 0.1 model info
    modelInfo = model.config.__dict__
    if model.generation_config is not None:
        modelInfo.update(model.generation_config.__dict__)
    if ("model_type" not in modelInfo):
        print("unknown model_type.")
        exit(0)
    if (pre_prompt):
        modelInfo["pre_prompt"] = pre_prompt
    if (user_role):
        modelInfo["user_role"] = user_role
    if (bot_role):
        modelInfo["bot_role"] = bot_role
    if (history_sep):
        modelInfo["history_sep"] = history_sep
    modelInfo["tokenizer_use_score"] = "1"  # tokenizer entries carry scores

    fo.write(struct.pack('i', len(modelInfo)))
    for it in modelInfo.keys():
        writeKeyValue(fo, str(it), str(modelInfo[it]))

    # 1. vocab
    if (tokenizer):
        if (hasattr(tokenizer, "tokenizer")):
            if (modelInfo['model_type'] == "qwen"):
                pass
            else:
                tokenizer = tokenizer.tokenizer
        if (hasattr(tokenizer, "sp_model")):
            piece_size = tokenizer.sp_model.piece_size()
            fo.write(struct.pack('i', piece_size))
            for i in range(piece_size):
                s = tokenizer.sp_model.id_to_piece(i).encode()
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', i))
                fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
        else:
            vocab = tokenizer.get_vocab()
            fo.write(struct.pack('i', len(vocab)))
            for v in vocab.keys():
                if (modelInfo['model_type'] == "qwen"):
                    s = v
                else:
                    s = v.encode()
                if (modelInfo["model_type"] == "moss"):
                    s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', vocab[v]))
                fo.write(struct.pack('f', 1.0))
    else:
        fo.write(struct.pack('i', 0))

    weight_type_dict = {}
    module_dict = {}
    for key, m in model.named_modules():
        if (isinstance(m, torch.nn.Linear)):
            weight_type_dict[key + ".weight"] = "linear"
            module_dict[key + ".weight"] = m
        if (isinstance(m, torch.nn.Embedding)):
            weight_type_dict[key] = "embedding"

    # 2. weight
    fo.write(struct.pack('i', len(dict)))
    tot = 0
    for key in dict:
        ori_data_type = 0
        ori_np_data_type = np.float32
        cur_weight_type = 0
        if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
        to_data_type = 0
        if (cur_weight_type == 1):
            to_data_type = fastllm_data_type_dict[dtype]
            if (to_data_type == 7):
                ori_data_type = 7
                ori_np_data_type = np.float16
        cur = dict[key].numpy().astype(ori_np_data_type)
        fo.write(struct.pack('i', len(key)))
        fo.write(key.encode())
        fo.write(struct.pack('i', len(cur.shape)))
        for i in cur.shape:
            fo.write(struct.pack('i', i))
        if (to_data_type == 3):
            write_int8(fo, cur)
        elif (to_data_type == 8):
            write_int4(fo, cur)
        else:
            fo.write(struct.pack('i', to_data_type))
            fo.write(cur.data)
        tot += 1
        print("output (", tot, "/", len(dict), end = " )\r")
    print("\nfinish.")
    fo.close()
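# Illustrative sketch (not part of the original file): reading back the header that
# tofile writes, as a quick sanity check of an export. The layout mirrors the writer:
# a version int, then a count of model-info entries, each entry being two
# length-prefixed strings (see writeKeyValue above).
def read_flm_header_sketch(path):
    with open(path, "rb") as fi:
        def read_str():
            n = struct.unpack('i', fi.read(4))[0]
            return fi.read(n).decode()
        version = struct.unpack('i', fi.read(4))[0]
        info = {}
        for _ in range(struct.unpack('i', fi.read(4))[0]):
            key = read_str()
            info[key] = read_str()
        return version, info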
from fastllm_pytools import torch2flm

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("./chatglm2-6b/", trust_remote_code=True)
    model = AutoModel.from_pretrained("./chatglm2_model/chatglm2-6b/", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
    model = model.eval()

    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".bin"
    tofile(exportPath, model, tokenizer, dtype = dtype)
    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".flm"
    torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
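# Illustrative invocation (script and output file names are placeholders):
#   python3 chatglm2_export.py chatglm2-6b-int8.flm int8
# argv[1] is the export path and argv[2] the fastllm dtype
# ("float32", "float16", "int8" or "int4").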
from fastllm_pytools import llm;
import torch;
import ctypes;
import numpy as np;
from fastllm_pytools import llm
import torch
import ctypes
import numpy as np
fastllm_data_type_dict = {
"int4": 8,
@@ -22,50 +22,67 @@ def create(model,
history_sep = None,
dtype = "float16"):
if (dtype not in fastllm_data_type_dict):
print("dtype should in ", list(fastllm_data_type_dict.keys()));
exit(0);
print("dtype should in ", list(fastllm_data_type_dict.keys()))
exit(0)
# 0.1 model info
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if (pre_prompt):
modelInfo["pre_prompt"] = pre_prompt;
modelInfo["pre_prompt"] = pre_prompt
if (user_role):
modelInfo["user_role"] = user_role;
modelInfo["user_role"] = user_role
if (bot_role):
modelInfo["bot_role"] = bot_role;
modelInfo["bot_role"] = bot_role
if (history_sep):
modelInfo["history_sep"] = history_sep;
modelInfo["history_sep"] = history_sep
if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
# Baichuan 2
modelInfo["use_alibi"] = "1";
modelInfo["pre_prompt"] = "";
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else "";
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
modelInfo["history_sep"] = "";
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""
if (modelInfo["model_type"] == "qwen"):
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
weight_type_dict = {};
module_dict = {};
weight_bits = {};
weight_type_dict = {}
module_dict = {}
weight_bits = {}
for key, m in model.named_modules():
if (str(type(m)).find("QuantizedLinear") != -1):
weight_type_dict[key + ".weight"] = "QuantizedLinear";
weight_bits[key + ".weight"] = m.weight_bit_width;
weight_type_dict[key + ".weight"] = "QuantizedLinear"
weight_bits[key + ".weight"] = m.weight_bit_width
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear";
module_dict[key + ".weight"] = m;
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding";
weight_type_dict[key] = "embedding"
model = model.cpu();
dict = model.state_dict();
model_type = model.config.__dict__["model_type"];
model = llm.fastllm_lib.create_empty_llm_model(model_type.encode());
peft_config = {}
active_adapter = ""
if hasattr(model, "peft_config"):
peft_config = model.peft_config
if hasattr(model, "active_adapter"):
active_adapter = model.active_adapter
model = model.cpu()
dict = model.state_dict()
model_type = model.config.__dict__["model_type"]
model = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
for it in modelInfo.keys():
llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode());
llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode())
for adapter_name in peft_config.keys():
adapter_dict = peft_config[adapter_name].__dict__
for it in adapter_dict.keys():
llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode())
if len(active_adapter) != 0:
llm.fastllm_lib.set_adapter(model, str(active_adapter).encode())
# 1. vocab
if (tokenizer):
@@ -73,59 +90,62 @@ def create(model,
if modelInfo["model_type"] == "qwen":
pass
else:
tokenizer = tokenizer.tokenizer;
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size();
piece_size = tokenizer.sp_model.piece_size()
for i in range(piece_size):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
i, ctypes.c_float(tokenizer.sp_model.get_score(i)));
i, ctypes.c_float(tokenizer.sp_model.get_score(i)))
else:
vocab = tokenizer.get_vocab();
vocab = tokenizer.get_vocab()
for v in vocab.keys():
if (modelInfo["model_type"] == "moss"):
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v];
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0));
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0))
elif (modelInfo["model_type"] == "qwen"):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0));
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0))
else:
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0));
tot = 0;
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0))
tot = 0
for key in dict:
ori_data_type = 0;
ori_np_data_type = np.float32;
cur_weight_type = 0;
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]];
to_data_type = 0;
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype];
to_data_type = fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7;
ori_np_data_type = np.float16;
ori_data_type = 7
ori_np_data_type = np.float16
elif (cur_weight_type == 2):
# TODO bfloat
to_data_type = 0;
to_data_type = 0
weight_name = key
if peft_config is not None:
weight_name = weight_name.replace('base_model.model.', '')
if (cur_weight_type == 111):
llm.fastllm_lib.add_qlinear_weight_llm_model(model, key.encode(),
llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
weight_bits[key],
dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
dict[key].numpy().ctypes.data_as(ctypes.c_void_p));
dict[key].numpy().ctypes.data_as(ctypes.c_void_p))
else:
llm.fastllm_lib.add_weight_llm_model(model, key.encode(),
llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
to_data_type, cur_weight_type, ori_data_type,
dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p));
tot += 1;
print("convert (", tot, "/", len(dict), end = " )\r");
dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
tot += 1
print("convert (", tot, "/", len(dict), end = " )\r")
print("");
llm.fastllm_lib.init_params_llm_model(model);
llm.fastllm_lib.warmup_llm_model(model);
ret = llm.model("", id = model);
return ret;
print("")
llm.fastllm_lib.init_params_llm_model(model)
llm.fastllm_lib.warmup_llm_model(model)
ret = llm.model("", id = model)
return ret
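# Illustrative only (not part of the diff): when the incoming HuggingFace model is
# PEFT-wrapped, create() above registers each adapter's config and activates
# model.active_adapter; the resulting handle can then switch adapters through the
# llm.model wrapper further below, e.g. (adapter name is a placeholder):
#   flm = from_hf(peft_model, tokenizer, dtype = "float16")
#   flm.set_adapter("default")
#   flm.disable_adapter()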
import ctypes;
import os;
from typing import Optional, Tuple, Union, List, Callable, Dict, Any;
import ctypes
import os
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
import platform
if platform.system() == 'Windows':
@@ -46,106 +46,106 @@ fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_
fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
def set_cpu_threads(threads: int):
fastllm_lib.set_cpu_threads(threads);
fastllm_lib.set_cpu_threads(threads)
def get_cpu_threads() -> int:
return fastllm_lib.get_cpu_threads();
return fastllm_lib.get_cpu_threads()
def print_ins_info():
fastllm_lib.print_cpu_ins();
fastllm_lib.print_cpu_ins()
def set_cpu_kvcache(cpu_kvcache):
fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache));
fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))
def get_cpu_kvcache():
return fastllm_lib.get_kvcache_in_cpu();
return fastllm_lib.get_kvcache_in_cpu()
def set_cpu_low_mem(low_mem):
fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem));
fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))
def get_cpu_low_mem():
return fastllm_lib.get_cpu_low_mem();
return fastllm_lib.get_cpu_low_mem()
def set_device_map(device_map):
devices = [];
values = [];
devices = []
values = []
if (isinstance(device_map, str)):
devices.append(device_map);
values.append(1);
devices.append(device_map)
values.append(1)
elif (isinstance(device_map, list)):
devices = [str(x) for x in device_map];
values = [1 for x in device_map];
devices = [str(x) for x in device_map]
values = [1 for x in device_map]
elif (isinstance(device_map, dict)):
devices = [str(x) for x in device_map.keys()];
values = [int(device_map[x]) for x in device_map.keys()];
devices = [str(x) for x in device_map.keys()]
values = [int(device_map[x]) for x in device_map.keys()]
else:
print("set_device_map error.");
return;
device_str = ''.join(devices);
device_len = [len(x) for x in devices];
print("set_device_map error.")
return
device_str = ''.join(devices)
device_len = [len(x) for x in devices]
fastllm_lib.set_device_map(len(device_len),
(ctypes.c_int * len(device_len))(*device_len),
device_str.encode(),
(ctypes.c_int * len(values))(*values));
(ctypes.c_int * len(values))(*values))
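# Illustrative only (not part of the diff): set_device_map accepts a single device
# string, a list of devices, or a {device: weight} dict (device names here are
# placeholders), e.g.
#   set_device_map("cuda:0")
#   set_device_map({"cuda:0": 10, "cpu": 1})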
def from_hf(model,
tokenizer = None,
dtype = "float16"):
from fastllm_pytools import hf_model;
return hf_model.create(model, tokenizer, dtype = dtype);
from fastllm_pytools import hf_model
return hf_model.create(model, tokenizer, dtype = dtype)
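# Illustrative only (not part of the diff): a typical conversion flow, assuming a
# HuggingFace model and tokenizer loaded elsewhere (names and paths are placeholders):
#   hf_tok = AutoTokenizer.from_pretrained(path, trust_remote_code = True)
#   hf_mod = AutoModel.from_pretrained(path, trust_remote_code = True).eval()
#   flm_model = from_hf(hf_mod, hf_tok, dtype = "int8")
#   flm_model.save("model-int8.flm")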
class model:
def __init__ (self, path : str,
id : int = -99999):
if (id != -99999):
self.model = id;
self.model = id
else:
self.model = fastllm_lib.create_llm_model(path.encode());
self.direct_query = False;
self.model = fastllm_lib.create_llm_model(path.encode())
self.direct_query = False
def get_prompt(self,
query: str,
history: List[Tuple[str, str]] = None) -> str:
if (not(history)):
history = [];
prompt = "";
history = []
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode();
prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode();
return prompt;
prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
return prompt
def save(self, path : str):
fastllm_lib.save_llm_model(self.model, path.encode());
fastllm_lib.save_llm_model(self.model, path.encode())
def eval(self):
pass;
pass
def response_logits(self,
query: str,
history: List[Tuple[str, str]] = None,
tokenizer = None) -> str:
prompt = query if self.direct_query else self.get_prompt(query, history);
prompt = query if self.direct_query else self.get_prompt(query, history)
if (tokenizer == None):
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True));
ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
else:
input = tokenizer.encode(prompt);
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
1, False, 1, 1, 1, 1, True);
vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model);
1, False, 1, 1, 1, 1, True)
vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
logits = list(range(vocab_size))
array = (ctypes.c_float * (vocab_size * 4))(*logits);
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
out = list(array)[:vocab_size];
array = (ctypes.c_float * (vocab_size * 4))(*logits)
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
out = list(array)[:vocab_size]
while (ret != -1):
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
return out;
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
return out
def response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
ret = "";
ret = ""
for i in self.stream_response(query = query,
history = history,
max_length = max_length,
@@ -154,81 +154,87 @@ class model:
temperature = temperature,
repeat_penalty = repeat_penalty,
one_by_one = True):
ret += i;
return ret;
ret += i
return ret
def stream_response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
one_by_one = True):
prompt = query if self.direct_query else self.get_prompt(query, history);
prompt = query if self.direct_query else self.get_prompt(query, history)
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False));
res = "";
ret = b'';
fail_cnt = 0;
ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
res = ""
ret = b''
fail_cnt = 0
while True:
ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle);
cur = "";
ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
cur = ""
try:
cur = ret.decode();
ret = b'';
cur = ret.decode()
ret = b''
except:
fail_cnt += 1;
fail_cnt += 1
if (fail_cnt == 20):
break;
break
else:
continue;
fail_cnt = 0;
continue
fail_cnt = 0
if (cur == "<flmeos>"):
break;
break
if one_by_one:
yield cur;
yield cur
else:
res += cur;
yield res;
res += cur
yield res
def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
if (not(history)):
history = [];
prompt = query if self.direct_query else self.get_prompt(query, history);
input = tokenizer.encode(prompt);
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False);
False)
result = [];
result = []
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
break;
result.append(cur);
response = tokenizer.decode(result);
history = history + [(query, response)];
return response, history;
break
result.append(cur)
response = tokenizer.decode(result)
history = history + [(query, response)]
return response, history
def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
return_past_key_values = False, **kwargs) -> str:
if (not(history)):
history = [];
prompt = query if self.direct_query else self.get_prompt(query, history);
input = tokenizer.encode(prompt);
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False);
tokens = [];
False)
tokens = []
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
break;
tokens.append(cur);
response = tokenizer.decode(tokens);
new_history = history + [(query, response)];
break
tokens.append(cur)
response = tokenizer.decode(tokens)
new_history = history + [(query, response)]
if return_past_key_values:
yield response, new_history, None;
yield response, new_history, None
else:
yield response, new_history;
yield response, new_history
def set_adapter(self, name: str):
fastllm_lib.set_adapter(self.model, str(name).encode())
def disable_adapter(self):
fastllm_lib.disable_adapter(self.model)
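# Illustrative only (not part of the diff): driving the wrapper above from user code
# once a model handle exists, either loaded from a .flm file or converted with
# from_hf (file name and prompt are placeholders):
#   m = llm.model("chatglm2-6b-int8.flm")
#   print(m.response("Hello"))
#   for chunk in m.stream_response("Hello"):
#       print(chunk, end = "", flush = True)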
@@ -21,8 +21,8 @@ fastllm_weight_type_dict = {
"embedding": 2
}
v = np.random.randint(-127, 127, [10, 20]);
temp = v;
v = np.random.randint(-127, 127, [10, 20])
temp = v
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
@@ -34,8 +34,8 @@ def write_int8(fo, v):
fo.write(struct.pack('i', 3))
fo.write(struct.pack('i', 0))
for i in range(c_max.shape[0]):
fo.write(struct.pack('f', -c_max[i][0]));
fo.write(struct.pack('f', c_max[i][0]));
fo.write(struct.pack('f', -c_max[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
def write_int4(fo, v):
@@ -49,8 +49,8 @@ def write_int4(fo, v):
fo.write(struct.pack('i', 8))
fo.write(struct.pack('i', 0))
for i in range(c_min.shape[0]):
fo.write(struct.pack('f', c_min[i][0]));
fo.write(struct.pack('f', c_max[i][0]));
fo.write(struct.pack('f', c_min[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
def tofile(exportPath,
@@ -91,19 +91,32 @@ def tofile(exportPath,
# Baichuan 2
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else "";
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""
if modelInfo["model_type"] == "qwen":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
if hasattr(model, "peft_config"):
adapter_size = len(model.peft_config)
modelInfo["peft_size"] = adapter_size
fo.write(struct.pack('i', len(modelInfo)))
for it in modelInfo.keys():
writeKeyValue(fo, str(it), str(modelInfo[it]))
if hasattr(model, "peft_config"):
for adapter_name in model.peft_config.keys():
adapter_dict = model.peft_config[adapter_name].__dict__
writeString(fo, adapter_name)
fo.write(struct.pack('i', len(adapter_dict)))
for it in adapter_dict.keys():
writeKeyValue(fo, str(it), str(adapter_dict[it]))
# 1. vocab
if (tokenizer):
if (hasattr(tokenizer, "tokenizer")):
@@ -128,7 +141,7 @@ def tofile(exportPath,
if (modelInfo['model_type'] == "qwen"):
s = v
else:
s = v.decode()
s = v.encode()
if (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
fo.write(struct.pack('i', len(s)))
@@ -165,8 +178,14 @@ def tofile(exportPath,
ori_np_data_type = np.float16
cur = dict[key].numpy().astype(ori_np_data_type)
fo.write(struct.pack('i', len(key)))
fo.write(key.encode())
if hasattr(model, "peft_config"):
weight_name = key.replace('base_model.model.', '')
fo.write(struct.pack('i', len(weight_name)))
fo.write(weight_name.encode())
else:
fo.write(struct.pack('i', len(key)))
fo.write(key.encode())
fo.write(struct.pack('i', len(cur.shape)))
for i in cur.shape:
fo.write(struct.pack('i', i))