import sys
import struct

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

def writeString(fo, s):
    # A string is serialized as a 4-byte length followed by its UTF-8 bytes.
    fo.write(struct.pack('i', len(s)))
    fo.write(s.encode())

def writeKeyValue(fo, key, value):
    writeString(fo, key)
    writeString(fo, value)

# Data-type ids used in the fastllm file header.
fastllm_data_type_dict = {
    "int4": 8,
    "int8": 3,
    "float16": 7,
    "float32": 0,
}
fastllm_weight_type_dict = {
    "linear": 1,
    "embedding": 2,
}

def write_int8(fo, v):
    # Per-row symmetric int8 quantization: values in [-max, max] are mapped
    # to [1, 255] around the midpoint 128. The clip on c_max guards against
    # division by zero for all-zero rows.
    c_max = np.expand_dims(np.abs(v).max(axis=-1), -1).clip(0.1, 1e100)
    c_scale = c_max / 127.0
    v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
    fo.write(struct.pack('i', 3))  # data type: int8
    fo.write(struct.pack('i', 0))  # always 0 in this exporter
    for i in range(c_max.shape[0]):
        # Per-row quantization range [min, max].
        fo.write(struct.pack('f', -c_max[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)

def write_int4(fo, v):
    # Per-row int4 quantization: the range [-8 * scale, 7 * scale] is mapped
    # into [0, 15], then two 4-bit values are packed per byte (assumes an
    # even column count). The clip mirrors write_int8 and avoids division
    # by zero for all-zero rows.
    c_max = np.expand_dims(np.abs(v).max(axis=-1), -1).clip(0.1, 1e100)
    c_scale = c_max / 7.0
    c_min = c_scale * -8.0
    v = (v - c_min) / c_scale
    v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
    v = v[:, 0::2] * 16 + v[:, 1::2]
    fo.write(struct.pack('i', 8))  # data type: int4
    fo.write(struct.pack('i', 0))  # always 0 in this exporter
    for i in range(c_min.shape[0]):
        fo.write(struct.pack('f', c_min[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)

def tofile(exportPath, model, tokenizer=None,
           pre_prompt=None, user_role=None, bot_role=None, history_sep=None,
           dtype="float16"):
    if dtype not in fastllm_data_type_dict:
        print("dtype should be one of", list(fastllm_data_type_dict.keys()))
        exit(0)

    state_dict = model.state_dict()
    fo = open(exportPath, "wb")

    # 0. version id
    fo.write(struct.pack('i', 2))

    # 0.1 model info
    modelInfo = model.config.__dict__
    if model.generation_config is not None:
        modelInfo.update(model.generation_config.__dict__)
    if "model_type" not in modelInfo:
        print("unknown model_type.")
        exit(0)
    if pre_prompt:
        modelInfo["pre_prompt"] = pre_prompt
    if user_role:
        modelInfo["user_role"] = user_role
    if bot_role:
        modelInfo["bot_role"] = bot_role
    if history_sep:
        modelInfo["history_sep"] = history_sep
    modelInfo["tokenizer_use_score"] = "1"  # vocab entries carry a score
    fo.write(struct.pack('i', len(modelInfo)))
    for it in modelInfo.keys():
        writeKeyValue(fo, str(it), str(modelInfo[it]))

    # 1. vocab
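    # Layout of the vocab section written below: a 4-byte entry count, then
    # for each token its byte length, each byte widened to a 4-byte int, the
    # token id, and a float score (the SentencePiece score when available,
    # otherwise a constant 1.0).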
    if tokenizer:
        if hasattr(tokenizer, "tokenizer") and modelInfo['model_type'] != "qwen":
            # Unwrap nested tokenizer wrappers; qwen's wrapper is used directly.
            tokenizer = tokenizer.tokenizer
        if hasattr(tokenizer, "sp_model"):
            # SentencePiece tokenizer: write piece bytes, id, and score.
            piece_size = tokenizer.sp_model.piece_size()
            fo.write(struct.pack('i', piece_size))
            for i in range(piece_size):
                s = tokenizer.sp_model.id_to_piece(i).encode()
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', i))
                fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
        else:
            vocab = tokenizer.get_vocab()
            fo.write(struct.pack('i', len(vocab)))
            for v in vocab.keys():
                if modelInfo['model_type'] == "qwen":
                    s = v  # qwen vocab keys are already bytes
                else:
                    s = v.encode()
                if modelInfo["model_type"] == "moss":
                    s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', vocab[v]))
                fo.write(struct.pack('f', 1.0))  # constant score
    else:
        fo.write(struct.pack('i', 0))  # no tokenizer: empty vocab

    # Record which weights belong to Linear / Embedding modules, so that
    # Linear weights can be quantized and everything else kept in float.
    weight_type_dict = {}
    module_dict = {}
    for key, m in model.named_modules():
        if isinstance(m, torch.nn.Linear):
            weight_type_dict[key + ".weight"] = "linear"
            module_dict[key + ".weight"] = m
        if isinstance(m, torch.nn.Embedding):
            weight_type_dict[key] = "embedding"

    # 2. weight
    fo.write(struct.pack('i', len(state_dict)))
    tot = 0
    for key in state_dict:
        ori_data_type = 0
        ori_np_data_type = np.float32
        cur_weight_type = 0
        if key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict:
            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
        to_data_type = 0
        if cur_weight_type == 1:
            # Only Linear weights are converted to the target dtype.
            to_data_type = fastllm_data_type_dict[dtype]
            if to_data_type == 7:
                ori_data_type = 7
                ori_np_data_type = np.float16
        cur = state_dict[key].numpy().astype(ori_np_data_type)
        fo.write(struct.pack('i', len(key)))
        fo.write(key.encode())
        fo.write(struct.pack('i', len(cur.shape)))
        for i in cur.shape:
            fo.write(struct.pack('i', i))
        if to_data_type == 3:
            write_int8(fo, cur)
        elif to_data_type == 8:
            write_int4(fo, cur)
        else:
            fo.write(struct.pack('i', to_data_type))
            fo.write(cur.data)
        tot += 1
        print("output (", tot, "/", len(state_dict), end=" )\r")
    print("\nfinish.")
    fo.close()

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("./chatglm2-6b/", trust_remote_code=True)
    model = AutoModel.from_pretrained("./chatglm2_model/chatglm2-6b/", trust_remote_code=True)
    model = model.eval()
    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".bin"
    tofile(exportPath, model, tokenizer, dtype=dtype)
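# Example invocation (a sketch: the file name export_chatglm2.py is an
# assumption, and the checkpoint paths hard-coded above must exist locally):
#   python3 export_chatglm2.py chatglm2-6b-int8.flm int8
#   python3 export_chatglm2.py chatglm2-6b-fp16.flm float16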