Commit 56215723 authored by zhouxiang

1. Sync to the latest upstream version; 2. Add a batch inference interface; 3. Fix a memory leak; 4. Fix choppy streaming output for the llama family of models.

parent 44be91d3
......@@ -34,14 +34,16 @@ namespace fastllm {
cos.resize(max_positions);
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
invFreq.push_back(1.0 / pow(10000, (float)i / rotary_dim));
int base = this->bot_role.empty() ? 10000 : 10000 * rope;
invFreq.push_back(1.0 / pow(base, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
sin[i][j] = ::sin((float)i / rope * invFreq[j]);
cos[i][j] = ::cos((float)i / rope * invFreq[j]);
float scale = this->bot_role.empty() ? rope : 1.0f;
sin[i][j] = ::sin((float)i / scale * invFreq[j]);
cos[i][j] = ::cos((float)i / scale * invFreq[j]);
}
}
......@@ -59,8 +61,9 @@ namespace fastllm {
ChatGLMModel::ChatGLMModel() {
this->model_type = "chatglm";
this->bos_token_id = 130004;
this->eos_token_id = 130005;
this->bos_token_id = 130004; // bos token of later V1 releases; can be overridden via config.json
this->eos_token_id = 130005; // eos token of later V1 releases; can be overridden via config.json
this->gmask_token_id= 150001; // earliest V1 release (150528-token vocab); some config.json files omit gmask_token_id, so fall back to this default.
this->rope = -1.0;
this->UpdateSinCos(1.0f);
......@@ -68,6 +71,33 @@ namespace fastllm {
weight.embeddingNames.insert("transformer.embedding.word_embeddings.weight");
}
void ChatGLMModel::InitParams() {
basellm::InitParams();
if (GetVersion() == 1) {
if (this->weight.dicts.find("gmask_token_id") != this->weight.dicts.end()) {
this->gmask_token_id = atoi(this->weight.dicts["gmask_token_id"].c_str());
}
} else if (GetVersion() == 2) {
this->gmask_token_id = 64790;
this->bos_token_id = 64792;
}
if (this->weight.dicts.find("rope_ratio") != this->weight.dicts.end()) {
UpdateSinCos(atof(this->weight.dicts["rope_ratio"].c_str()));
}
if (model_type == "chatglm3"){
int special_id = 64789;
this->mask_token_id = special_id++;
this->gmask_token_id = special_id++;
this->smask_token_id = special_id++;
this->bos_token_id = special_id++;
this->eop_token_id = special_id++;
this->system_token_id = special_id++;
this->user_token_id = special_id++;
this->assistant_token_id = special_id++;
this->observation_token_id = special_id++;
}
}
int ChatGLMModel::Forward(const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<Data, Data>> &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
......@@ -86,9 +116,6 @@ namespace fastllm {
const GenerationConfig &generationConfig,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
if (this->weight.dicts.find("rope_ratio") != this->weight.dicts.end()) {
UpdateSinCos(atof(this->weight.dicts["rope_ratio"].c_str()));
}
int maxLen = inputIds.dims[1];
Data inputEmbeddings;
Data attenInput;
......@@ -218,18 +245,16 @@ namespace fastllm {
CatDirect(pastKey, k, 1);
CatDirect(pastValue, v, 1);
std::vector<int> outputSize = {q.dims[1], q.dims[2], q.dims[0], pastKey.dims[1]};
q.Reshape({q.dims[0], q.dims[1] * q.dims[2], q.dims[3]});
PermuteSelf(q, {1, 0, 2});
//Attention(q, pastKey, pastValue, attentionMask, contextLayer, q.dims[0] / pastKey.dims[0], 1.0 / scale_attn, 1);
Attention(q, pastKey, pastValue, attentionMask, contextLayer, q.dims[0] / pastKey.dims[0], 1.0 / scale_attn, 1);
/*
// 1.2 Attention
// 1.2.0 q * k^T
q.Reshape({pastKey.dims[0], -1, q.dims[2]});
MatMulTransB(q, pastKey, attnProbs, 1.0 / (scale_attn * (i + 1)));
attnProbs.Reshape(outputSize);
// 1.2.1 Mask
if (attentionMask.dims.size() != 0) {
AttentionMask(attnProbs, attentionMask, -10000);
......@@ -243,6 +268,8 @@ namespace fastllm {
attnProbs.Reshape({pastValue.dims[0], -1, attnProbs.dims[2]});
MatMul(attnProbs, pastValue, contextLayer);
*/
contextLayer.Reshape({batch, num_attention_heads, maxLen, -1});
PermuteSelf(contextLayer, {2, 0, 1, 3});
contextLayer.Reshape({contextLayer.dims[0], contextLayer.dims[1], embed_dim});
......@@ -286,34 +313,47 @@ namespace fastllm {
}
Data logits, topk;
if (version == 1) {
LayerNorm(hiddenStates, weight["transformer.final_layernorm.weight"],
weight["transformer.final_layernorm.bias"], -1, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
Data tempHiddenStates;
Data *lastHiddenStates;
if (maxLen > 1) {
Split(hiddenStates, 0, maxLen - 1, maxLen, tempHiddenStates);
lastHiddenStates = &tempHiddenStates;
} else {
RMSNorm(hiddenStates, weight["transformer.encoder.final_layernorm.weight"], 1e-5, hiddenStates);
Linear(hiddenStates, weight["transformer.output_layer.weight"], Data(), logits);
lastHiddenStates = &hiddenStates;
}
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
(*retLogits)[b]->resize(size);
memcpy((float*)(*retLogits)[b]->data(), ((float*)logits.cpuData) + base * size, size * logits.unitSize);
{
auto &hiddenStates = *lastHiddenStates;
if (version == 1) {
LayerNorm(hiddenStates, weight["transformer.final_layernorm.weight"],
weight["transformer.final_layernorm.bias"], -1, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
} else {
RMSNorm(hiddenStates, weight["transformer.encoder.final_layernorm.weight"], 1e-5, hiddenStates);
Linear(hiddenStates, weight["transformer.output_layer.weight"], Data(), logits);
}
}
if (generationConfig.IsSimpleGreedy()) {
TopK(logits, topk, 1);
topk.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
lastRet.push_back((int) (((float *) topk.cpuData)[base * 2] + 1e-3));
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = b;
(*retLogits)[b]->resize(size);
memcpy((float *) (*retLogits)[b]->data(), ((float *) logits.cpuData) + base * size,
size * logits.unitSize);
}
}
} else if (!lastTokens.units.empty()) {
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
lastRet.push_back(LLMSampling(logits, base, generationConfig, lastTokens.units[b]));
if (generationConfig.IsSimpleGreedy()) {
TopK(logits, topk, 1);
topk.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = b;
lastRet.push_back((int) (((float *) topk.cpuData)[base * 2] + 1e-3));
}
} else if (!lastTokens.units.empty()) {
for (int b = 0; b < batch; b++) {
int base = b;
lastRet.push_back(LLMSampling(logits, base, generationConfig, lastTokens.units[b]));
}
}
}
return lastRet;
......@@ -329,9 +369,6 @@ namespace fastllm {
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
if (this->weight.dicts.find("rope_ratio") != this->weight.dicts.end()) {
UpdateSinCos(atof(this->weight.dicts["rope_ratio"].c_str()));
}
int seqLen = inputIds.dims[1];
sinData.ToDevice(DataDevice::CUDA);
cosData.ToDevice(DataDevice::CUDA);
......@@ -344,7 +381,6 @@ namespace fastllm {
weightPre = "transformer.encoder.layers.";
weightMiddle = ".self_attention";
}
Data inputEmbeddings;
Data inputIdsPermute;
Permute(inputIds, {1, 0}, inputIdsPermute);
......@@ -352,7 +388,6 @@ namespace fastllm {
".word_embeddings.weight"], inputEmbeddings);
Data &hiddenStates = inputEmbeddings;
hiddenStates.ToDevice(DataDevice::CUDA);
Data attenInput;
Data qkv, q, k, v;
Data attnOutput;
......@@ -365,7 +400,6 @@ namespace fastllm {
curKs.resize(batch);
curVs.resize(batch);
curQs.resize(batch);
bool all1 = true;
for (int i = 0; i < batch; i++) {
all1 &= (seqLens[i] == 1);
......@@ -392,7 +426,6 @@ namespace fastllm {
std::vector <std::vector <int> > outputSizes;
outputSizes.resize(batch);
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
if (version == 1) {
......@@ -451,7 +484,6 @@ namespace fastllm {
Data contextLayer = Data(DataType::FLOAT32);
int total = 0;
if (all1 && batch > 1) {
for (int b = 0; b < batch; b++) {
pointersK[b] = (&curKs[b]);
......@@ -482,6 +514,7 @@ namespace fastllm {
total += seqLens[b];
}
}
for (int b = 0; b < batch; b++) {
auto &q = curQs[b], &k = curKs[b], &v = curVs[b];
Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt +
......@@ -717,8 +750,6 @@ namespace fastllm {
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
int gmask_token_id = this->weight.dicts.find("gmask_token_id") != this->weight.dicts.end() ?
atoi(this->weight.dicts["gmask_token_id"].c_str()) : 130001;
int index = params.find("index")->second;
int promptLen = params.find("promptLen")->second;
......@@ -728,9 +759,9 @@ namespace fastllm {
ids.push_back(gmask_token_id);
ids.push_back(bos_token_id);
} else if (GetVersion() == 2) {
if (ids.size() < 2 || ids[0] != 64790 || ids[1] != 64792) {
ids.insert(ids.begin(), 64792);
ids.insert(ids.begin(), 64790);
if (ids.size() < 2 || ids[0] != this->gmask_token_id || ids[1] != this->bos_token_id) {
ids.insert(ids.begin(), this->bos_token_id);
ids.insert(ids.begin(), this->gmask_token_id);
}
}
}
......@@ -780,8 +811,6 @@ namespace fastllm {
int batch = inputTokens.size();
int index = params[0].find("index")->second;
if (index == 0) {
int gmask_token_id = this->weight.dicts.find("gmask_token_id") != this->weight.dicts.end() ?
atoi(this->weight.dicts["gmask_token_id"].c_str()) : 130001;
std::vector<int> seqLens;
seqLens.resize(batch);
int maxLen = 0;
......@@ -820,8 +849,8 @@ namespace fastllm {
} else {
auto &tokens = inputTokens[i];
int len = tokens.size(), base = maxLen - 2 - len;
ids[i * maxLen + base] = 64790;
ids[i * maxLen + base + 1] = 64792;
ids[i * maxLen + base] = gmask_token_id;
ids[i * maxLen + base + 1] = bos_token_id;
for (int j = 0; j < len; j++) {
ids[i * maxLen + base + 2 + j] = tokens[j];
}
......@@ -894,28 +923,32 @@ namespace fastllm {
}
std::string ChatGLMModel::MakeInput(const std::string &history, int round, const std::string &input) {
if (round == 0 && GetVersion() == 1) {
return input;
if (this->bot_role != "") {
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role;
} else {
if (GetVersion() == 2)
round++;
if (round == 0 && GetVersion() == 1) {
return input;
} else {
#if defined(_WIN32) or defined(_WIN64)
std::vector <uint8_t> vask = {233, 151, 174, 239, 188, 154, 0};
std::vector <uint8_t> vans = {231, 173, 148, 239, 188, 154, 0};
std::string sask = (char*)vask.data();
std::string sans = (char*)vans.data();
return (history + ("[Round " + std::to_string(round) + "]\n\n" + sask + input + "\n\n" + sans));
return history + ("[Round " + std::to_string(round) + u8"]\n\n问:" + input + u8"\n\n答:");
#else
return history + ("[Round " + std::to_string(round) + "]\n\n问:" + input + "\n\n答:");
return history + ("[Round " + std::to_string(round) + "]\n\n问:" + input + "\n\n答:");
#endif
}
}
}
std::string ChatGLMModel::MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) {
if (this->bot_role != "") {
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role + output + history_sep;
}
if (GetVersion() == 2)
round++;
#if defined(_WIN32) or defined(_WIN64)
std::vector <uint8_t> vask = {233, 151, 174, 239, 188, 154, 0};
std::vector <uint8_t> vans = {231, 173, 148, 239, 188, 154, 0};
std::string sask = (char*)vask.data();
std::string sans = (char*)vans.data();
return (history + ("[Round " + std::to_string(round) + "]\n\n" + sask + input + "\n\n" + sans + output + "\n"));
return (history + ("[Round " + std::to_string(round) + u8"]\n\n问:" + input + u8"\n\n答:" + output + "\n"));
#else
return (history + ("[Round " + std::to_string(round) + "]\n\n问:" + input + "\n\n答:" + output + "\n\n"));
#endif
......
//
// Created by huangyuyang on 5/11/23.
//
#include "utils.h"
#include "glm.h"
#include <cmath>
#include <chrono>
#include <algorithm>
#include <map>
#include <sstream>
#include <unordered_map>
#include <cstring>
#ifdef USE_CUDA
#include "fastllm-cuda.cuh"
#endif
namespace fastllm {
GLMModel::GLMModel() {
this->model_type = "glm";
this->bos_token_id = 50006;//<|startofpiece|>
this->eos_token_id = 50007;//<|endofpiece|>
weight.embeddingNames.insert("word_embeddings.weight");
weight.embeddingNames.insert("transformer.position_embeddings.weight");
weight.embeddingNames.insert("transformer.block_position_embeddings.weight");
weight.tokenizer.type=Tokenizer::GLM;
weight.tokenizer.Insert("[MASK]",mask_token_id);
weight.tokenizer.Insert("[sMASK]",smask_token_id);
weight.tokenizer.Insert("[gMASK]",gmask_token_id);
}
int GLMModel::Forward(const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<Data, Data>> &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
std::vector <float> *logits) {
std::vector <std::vector <float>*> batchLogits;
batchLogits.push_back(logits);
return ForwardBatch(1, inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, lastTokens, &batchLogits)[0];
}
std::vector <int> GLMModel::ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
int maxLen = inputIds.dims[1];
Data attentionMask4D;
Data attnScoreAdds;
Data inputEmbeddings;
Data position_ids_1D;
Data block_position_ids_1D;
Data positionEmbeddings;
Data blockPositionEmbeddings;
Data attenInput;
Data qkv, q, k, v,q0;
Data attnScores;
Data attnProbs;
Data attnOutput;
Data contextLayer;
Data contextLayerPermute;
Data mlpInput;
Data mlpOutput;
Data middle, middle2;
Data toSave;
Data mem2,mem3;
std::vector<int> lastRet;
// GLMBlock
std::string weightPre, weightMiddle;
weightPre = "transformer.layers.";
weightMiddle = ".attention";
{
Data attentionMask4D_1x;
attentionMask4D_1x.CopyFrom(attentionMask);
attentionMask4D_1x.Reshape({1,1,attentionMask.dims[0],attentionMask.dims[1]});
std::vector<Data*> masks(num_attention_heads);
for(int i=0;i<num_attention_heads;i++){
masks[i]=&attentionMask4D_1x;
}
CatBatch(masks,1,attentionMask4D);
// Build attnScoreAdds = -65504 * (1 - mask); it is added to the attention scores below so masked positions vanish after Softmax
std::vector<float> one(attentionMask4D.Count(0),-65504.0);
attnScoreAdds.CopyFrom(Data(DataType::FLOAT32,attentionMask4D.dims,one));
AddTo(attnScoreAdds,attentionMask4D,65504.0);
}
Embedding(inputIds, this->weight["word_embeddings.weight"], inputEmbeddings);
Data &hiddenStates = inputEmbeddings;
Split(positionIds,0,0,1,position_ids_1D);
Split(positionIds,0,1,2,block_position_ids_1D);
Embedding(position_ids_1D, this->weight["transformer.position_embeddings.weight"], positionEmbeddings);
AddTo(hiddenStates,positionEmbeddings);
Embedding(block_position_ids_1D, this->weight["transformer.block_position_embeddings.weight"], blockPositionEmbeddings);
AddTo(hiddenStates,blockPositionEmbeddings);
int memory_length=(pastKeyValues[0].first.dims.size()==0?0:pastKeyValues[0].first.dims.at(1));
int query_length=hiddenStates.dims.at(1);
int new_memory_length=memory_length+query_length;
if(new_memory_length<=query_length){
Split(hiddenStates,1,hiddenStates.dims.at(1)-new_memory_length,hiddenStates.dims.at(1),toSave);
}else{
Split(hiddenStates,1,0,hiddenStates.dims.at(1),toSave);//Copy
}
for (int i = 0; i < block_cnt; i++) {
Data &mem=pastKeyValues[i].first;
bool hasMem=(mem.dims.size()!=0);
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
std::string inputLNWeightName = "transformer.layers." + std::to_string(i) + ".input_layernorm.weight";
std::string inputLNBiasName = "transformer.layers." + std::to_string(i) + ".input_layernorm.bias";
LayerNorm(hiddenStates, weight[inputLNWeightName], weight[inputLNBiasName], -1, attenInput);
std::string qkvWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.weight";
std::string qkvBiasName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.bias";
if(!hasMem){
Linear(attenInput, weight[qkvWeightName], weight[qkvBiasName], qkv);
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
}else{
LayerNorm(mem, weight[inputLNWeightName], weight[inputLNBiasName], -1, mem2);
Cat(mem2,attenInput,1,mem3);
Linear(mem3, weight[qkvWeightName], weight[qkvBiasName], qkv);
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q0);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
int tLen=q0.dims.at(1);
Split(q0,1,tLen-attenInput.dims.at(1),tLen,q);
}
q.Reshape({q.dims[0], q.dims[1], num_attention_heads, -1});
PermuteSelf(q,{0,2,1,3});
k.Reshape({k.dims[0], k.dims[1], num_attention_heads, -1});
//PermuteSelf(k,{0,2,1,3});// (1)
v.Reshape({v.dims[0], v.dims[1], num_attention_heads, -1});
PermuteSelf(v,{0,2,1,3});
//PermuteSelf(k,{0,1,2,3});// (2)
PermuteSelf(k,{0,2,3,1});// Merged (1) + (2)
MatMul(q,k,attnScores,scale_attn_1);
MulTo(attnScores,attentionMask4D);
AddTo(attnScores,attnScoreAdds);
Softmax(attnScores, attnProbs, -1);
MatMul(attnProbs,v,contextLayer);
PermuteSelf(contextLayer,{0,2,1,3});
contextLayer.Reshape({contextLayer.dims[0],contextLayer.dims[1],embed_dim});
std::string denseWeightName = weightPre + std::to_string(i) + weightMiddle + ".dense.weight";
std::string denseBiasName = weightPre + std::to_string(i) + weightMiddle + ".dense.bias";
Linear(contextLayer, weight[denseWeightName], weight[denseBiasName], attnOutput);
AddTo(hiddenStates,attnOutput);
std::string postLNWeightName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.weight";
std::string postLNBiasName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.bias";
LayerNorm(hiddenStates, weight[postLNWeightName], weight[postLNBiasName], -1, mlpInput);
std::string fcInKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_h_to_4h";
std::string fcOutKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_4h_to_h";
Linear(mlpInput, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
GeluNew(middle, middle);
Linear(middle, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], mlpOutput);
AddTo(hiddenStates,mlpOutput);
if(new_memory_length<=query_length){
Split(toSave,1,0,toSave.dims.at(1),mem);//Copy
Split(hiddenStates,1,hiddenStates.dims.at(1)-new_memory_length,hiddenStates.dims.at(1),toSave);
}else{
Split(mem,1,mem.dims.at(1)-new_memory_length+query_length,mem.dims.at(1),mem2);
Cat(mem2,toSave,1,mem);
Split(hiddenStates,1,0,hiddenStates.dims.at(1),toSave);//Copy
}
}
Data logits, topk;
LayerNorm(hiddenStates, weight["transformer.final_layernorm.weight"],
weight["transformer.final_layernorm.bias"], -1, hiddenStates);
Linear(hiddenStates, weight["word_embeddings.weight"], Data(), logits);
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
(*retLogits)[b]->resize(size);
memcpy((float*)(*retLogits)[b]->data(), ((float*)logits.cpuData) + base * size, size * logits.unitSize);
}
}
if (generationConfig.IsSimpleGreedy()) {
TopK(logits, topk, 1);
topk.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
lastRet.push_back((int) (((float *) topk.cpuData)[base * 2] + 1e-3));
}
} else if (!lastTokens.units.empty()) {
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
lastRet.push_back(LLMSampling(logits, base, generationConfig, lastTokens.units[b]));
}
}
return lastRet;
}
void GLMModel::FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds) {
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
int index = params.find("index")->second;
if (index == 0) {
int mask_pos=-1;
for (auto &ids: inputTokens) {
bool hasMask=false;
for(unsigned int i=0;i<ids.size();i++){
const float &id=ids.at(i);
if(id==mask_token_id||id==smask_token_id||id==gmask_token_id){
hasMask=true;
if(mask_pos<0){
mask_pos=i+1;
}
break;
}
}
ids.insert(ids.begin(),cls_token_id);
if(!hasMask){
if(mask_pos<0){
mask_pos=ids.size();
}
ids.push_back(gmask_token_id);
}
ids.push_back(eot_token_id);
ids.push_back(bos_token_id);
}
int seqLen = inputTokens[0].size();
std::vector<float> vpids=std::vector<float>(seqLen*2,0);//position_ids
for(int i=0;i<seqLen-1;i++){
vpids[i]=i;
}
for(int i=0;i<seqLen-(seqLen-1);i++){
vpids[seqLen-1+i]=mask_pos;
vpids[seqLen+seqLen-1+i]=(i+1);
}
vpids[seqLen-1]=mask_pos;
vpids[seqLen+seqLen-1]=1;
std::vector<float> vmask=std::vector<float>(seqLen*seqLen,1);//attention_mask
for(int i=0;i<seqLen-1;i++){
for(int j=std::max(i+1,seqLen-1);j<seqLen;j++){
vmask[seqLen*i+j]=0;
}
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {seqLen, seqLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {2, seqLen}, vpids));
} else {
const auto &inputToken=inputTokens[0];
unsigned long tokenLen=inputToken.size();
int oldLen=attentionMask.dims.at(1);
int totalLen=oldLen+tokenLen;
float *positionDat=reinterpret_cast<float*>(positionIds.cpuData);
int posLen=positionIds.dims.at(1);
std::vector<float> newAttention(totalLen,1);
std::vector<float> newPosition(tokenLen*2);
for(unsigned int i=0;i<tokenLen;i++){
newPosition[i]=positionDat[posLen-1];
newPosition[tokenLen+i]=positionDat[posLen+posLen-1]+1;
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, static_cast<int>(tokenLen)}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {1, totalLen}, newAttention));
positionIds.CopyFrom(Data(DataType::FLOAT32, {2, static_cast<int>(tokenLen)}, newPosition));
}
}
void GLMModel::InitParams()
{
basellm::InitParams();
head_dim = embed_dim / num_attention_heads;
scale_attn_1 = 1.0f/sqrt(head_dim);
#ifdef USE_SENTENCEPIECE
if (this->weight.dicts.find("tokenizer_serialized") != this->weight.dicts.end()) {
const std::string &hexString=this->weight.dicts["tokenizer_serialized"];
if(hexString.length()%2!=0){
std::cerr << "Invalid hex string\n";
}else{
std::string decoded;
for(unsigned int i=0;i<hexString.length();i+=2){
decoded.push_back(std::stoi(hexString.substr(i,2),nullptr,16));
}
weight.tokenizer.spProcessor=std::make_unique<sentencepiece::SentencePieceProcessor>();
weight.tokenizer.spProcessor->LoadFromSerializedProto(decoded);
}
}
#endif
}
void GLMModel::WarmUp() {
// printf("Warmup...\n");
// Data inputIds = Data(DataType::FLOAT32, {1, 1}, {(float)bos_token_id});
// Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
// Data positionIds = Data(DataType::FLOAT32, {2, 1}, {0, 0});
// std::vector <std::pair <Data, Data> > pastKeyValues;
// for (int i = 0; i < block_cnt; i++) {
// pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
// Data(DataType::FLOAT32)));
// }
// Forward(inputIds, attentionMask, positionIds, pastKeyValues);
// printf("finish.\n");
}
std::string GLMModel::MakeInput(const std::string &history, int round, const std::string &input) {
(void)history;
(void)round;
return input;
}
std::string GLMModel::MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) {
(void)history;
(void)round;
(void)input;
(void)output;
return std::string("");
}
}
......@@ -91,6 +91,7 @@ namespace fastllm {
alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
}
int maxLen = inputIds.dims[1];
Data hiddenStates;
Data attenInput;
Data q, k, v, qkv;
......@@ -143,6 +144,14 @@ namespace fastllm {
PermuteSelf(v, {1, 0, 2});
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
if (GetKVCacheInCPU()) {
pastKey.lockInCPU = true;
pastValue.lockInCPU = true;
} else {
pastKey.ToDevice(DataDevice::CUDA);
pastValue.ToDevice(DataDevice::CUDA);
}
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
......@@ -200,28 +209,34 @@ namespace fastllm {
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
AddTo(hiddenStates, w2);
}
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
logits.ToDevice(DataDevice::CPU);
Data logits, topk;
Data tempHiddenStates;
Data *lastHiddenStates;
if (maxLen > 1) {
Split(hiddenStates, 1, maxLen - 1, maxLen, tempHiddenStates);
lastHiddenStates = &tempHiddenStates;
} else {
lastHiddenStates = &hiddenStates;
}
int lastRet = -1;
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
retLogits->resize(size);
memcpy((float*)retLogits->data(), ((float*)logits.cpuData) + (logits.dims[1] - 1) * size, size * logits.unitSize);
}
if (generationConfig.IsSimpleGreedy()) {
std::pair <float, int> ret = std::make_pair(-1e9, -1);
int base = logits.dims[1] - 1;
for (int i = 0; i < logits.dims.back(); i++) {
ret = max(ret, std::make_pair(((float*)logits.cpuData)[base * logits.dims.back() + i], i));
{
auto &hiddenStates = *lastHiddenStates;
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
retLogits->resize(size);
memcpy((float*)retLogits->data(), ((float*)logits.cpuData) + (logits.dims[1] - 1) * size, size * logits.unitSize);
}
if (generationConfig.IsSimpleGreedy()) {
TopK(logits, topk, 1);
topk.ToDevice(DataDevice::CPU);
lastRet = (int) (((float *) topk.cpuData)[0] + 1e-3);
} else if (!lastTokens.units.empty()) {
lastRet = LLMSampling(logits, logits.dims[1] - 1, generationConfig, lastTokens.units[0]);
}
lastRet = ret.second;
} else if (!lastTokens.units.empty()) {
lastRet = LLMSampling(logits, logits.dims[1] - 1, generationConfig, lastTokens.units[0]);
}
return lastRet;
......@@ -237,6 +252,7 @@ namespace fastllm {
alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
}
int maxLen = inputIds.dims[1];
Data hiddenStates;
Data attenInput;
Data q, k, v, qkv;
......@@ -290,6 +306,14 @@ namespace fastllm {
v.Reshape(qkvSize);
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
if (GetKVCacheInCPU()) {
pastKey.lockInCPU = true;
pastValue.lockInCPU = true;
} else {
pastKey.ToDevice(DataDevice::CUDA);
pastValue.ToDevice(DataDevice::CUDA);
}
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
......@@ -351,25 +375,33 @@ namespace fastllm {
AddTo(hiddenStates, w2);
}
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
logits.ToDevice(DataDevice::CPU);
Data logits, topk;
Data tempHiddenStates;
Data *lastHiddenStates;
if (maxLen > 1) {
Split(hiddenStates, 1, maxLen - 1, maxLen, tempHiddenStates);
lastHiddenStates = &tempHiddenStates;
} else {
lastHiddenStates = &hiddenStates;
}
std::vector <int> lastRet;
if (generationConfig.IsSimpleGreedy()) {
for (int b = 0; b < batch; b++) {
int base = b * logits.dims[1] + logits.dims[1] - 1;
std::pair <float, int> ret = std::make_pair(-1e9, -1);
for (int i = 0; i < logits.dims.back(); i++) {
ret = max(ret, std::make_pair(((float *) logits.cpuData)[base * logits.dims.back() + i], i));
{
auto &hiddenStates = *lastHiddenStates;
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
if (generationConfig.IsSimpleGreedy()) {
TopK(logits, topk, 1);
topk.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = b;
lastRet.push_back((int) (((float *) topk.cpuData)[base * 2] + 1e-3));
}
} else {
for (int b = 0; b < batch; b++) {
int base = b * logits.dims[1] + logits.dims[1] - 1;
lastRet.push_back(LLMSampling(logits, base, generationConfig, lastTokens.units[b]));
}
lastRet.push_back(ret.second);
}
} else {
for (int b = 0; b < batch; b++) {
int base = b * logits.dims[1] + logits.dims[1] - 1;
lastRet.push_back(LLMSampling(logits, base, generationConfig, lastTokens.units[b]));
}
}
......@@ -460,6 +492,14 @@ namespace fastllm {
v.Reshape(qkvSize);
Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt + i].second;
if (GetKVCacheInCPU()) {
pastKey.lockInCPU = true;
pastValue.lockInCPU = true;
} else {
pastKey.ToDevice(DataDevice::CUDA);
pastValue.ToDevice(DataDevice::CUDA);
}
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
......@@ -528,31 +568,27 @@ namespace fastllm {
AddTo(hiddenStates, w2);
}
Data logits, curLogit;
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
logits.ToDevice(DataDevice::CPU);
std::vector <int> lastRet;
int total = 0;
for (int b = 0; b < batch; b++) {
Split(logits, 1, total + seqLens[b] - 1, total + seqLens[b], curLogit);
if (generationConfigs[b].output_logits && retLogits != nullptr && (*retLogits)[b] != nullptr) {
int base = (total + seqLens[b] - 1);
(*retLogits)[b]->resize(logits.dims.back());
memcpy((float*)(*retLogits)[b]->data(), (float*)(logits.cpuData + base * logits.dims.back() * logits.unitSize), logits.dims.back() * logits.unitSize);
curLogit.ToDevice(DataDevice::CPU);
(*retLogits)[b]->resize(curLogit.Count(0));
memcpy((float*)(*retLogits)[b]->data(), (float*)curLogit.cpuData, curLogit.GetBytes());
}
if (generationConfigs[b].IsSimpleGreedy()) {
std::pair<float, int> ret = std::make_pair(-1e9, -1);
int base = (total + seqLens[b] - 1);
total += seqLens[b];
for (int i = 0; i < logits.dims.back(); i++) {
ret = max(ret, std::make_pair(((float *) logits.cpuData)[base * logits.dims.back() + i], i));
}
lastRet.push_back(ret.second);
Data topk;
TopK(curLogit, topk, 1);
topk.ToDevice(DataDevice::CPU);
lastRet.push_back((int) (((float *) topk.cpuData)[0] + 1e-3));
} else {
int base = (total + seqLens[b] - 1);
total += seqLens[b];
lastRet.push_back(LLMSampling(logits, base, generationConfigs[b], lastTokens.units[b]));
lastRet.push_back(LLMSampling(curLogit, 0, generationConfigs[b], lastTokens.units[b]));
}
total += seqLens[b];
}
return lastRet;
}
......@@ -564,8 +600,8 @@ namespace fastllm {
#endif
//auto st = std::chrono::system_clock::now();
#ifdef PY_API
size_t pos = input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos)? input.substr(0, pos-10):input;
size_t pos = input.rfind("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != -1)? input.substr(0, pos):input;
size_t hash_id = std::hash<std::string>{}(input);
Data inputIds = this->weight.tokenizer.Encode(prompt);
#else
......@@ -681,8 +717,8 @@ namespace fastllm {
size_t hash_id = std::hash<std::string>{}(_input);
hash_ids.push_back(hash_id);
size_t pos = _input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos) ? _input.substr(0, pos - 10) : _input;
size_t pos = _input.rfind("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != -1) ? _input.substr(0, pos) : _input;
prompts.push_back(prompt);
}
#else
......@@ -949,13 +985,24 @@ namespace fastllm {
}
if (seqLens.size() > 0) {
//model->dictLocker.unlock();
#ifdef USE_CUDA
FastllmCudaClearBigBuffer();
#endif
Data inputIds = Data(DataType::FLOAT32, {1, (int) ids.size()}, ids);
// auto st = std::chrono::system_clock::now();
std::vector<int> ret = model->ForwardBatch(seqLens.size(), inputIds, attentionMasks,
positionIds, seqLens, pastKeyValues, generationConfigs, tokensManager, &logits);
// static int tot = 0;
// printf("len = %d, spend = %f s.\n", (int)seqLens.size(), GetSpan(st, std::chrono::system_clock::now()));
// tot += (int)seqLens.size();
// printf("tot = %d\n", tot);
int idx = 0;
//model->dictLocker.lock();
model->resultTokenQueueLocker.lock();
for (auto &it: model->responseContextDict.dicts) {
if (it.second->isEnding) {
continue;
......@@ -964,6 +1011,12 @@ namespace fastllm {
if (curRet == model->eos_token_id) {
it.second->isEnding = true;
} else {
auto itStopTk = it.second->generationConfig.stop_token_ids.find(curRet);
if (itStopTk != it.second->generationConfig.stop_token_ids.end()) {
it.second->isEnding = true;
}
}
if (it.second->isEnding == false) {
it.second->currentTokens = std::vector<int>{curRet};
it.second->resultTokenQueue.push(curRet);
it.second->tokens.Push(curRet);
......@@ -973,6 +1026,7 @@ namespace fastllm {
}
}
}
model->resultTokenQueueLocker.unlock();
}
for (int i = 0; i < attentionMasks.size(); i++) {
......@@ -983,7 +1037,14 @@ namespace fastllm {
}
model->dictLocker.unlock();
MySleep(0);
MySleep(0);
// If the sustained high CPU usage is a concern, remove the line above and use the code below instead
/*if (seqLens.size() > 0) {
MySleep(0);
}
else{
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}*/
}
}, this);
}
......@@ -993,7 +1054,11 @@ namespace fastllm {
dictLocker.lock();
int handleId = responseContextDict.CreateHandle();
ResponseContext *context = responseContextDict.GetHandle(handleId);
resultTokenQueueLocker.lock();
context->Init(this->block_cnt);
resultTokenQueueLocker.unlock();
context->currentTokens = inputTokens;
context->generationConfig = generationConfig;
context->tokens = LastTokensUnit(generationConfig.last_n);
......@@ -1002,28 +1067,36 @@ namespace fastllm {
}
int LlamaModel::FetchResponseTokens(int handleId) {
dictLocker.lock();
//dictLocker.lock();
ResponseContext *context = responseContextDict.GetHandle(handleId);
if (context == nullptr) {
dictLocker.unlock();
//dictLocker.unlock();
return -1;
} else {
while (true) {
resultTokenQueueLocker.lock();
if (context->resultTokenQueue.size() > 0) {
int ret = context->resultTokenQueue.front();
context->resultTokenQueue.pop();
dictLocker.unlock();
resultTokenQueueLocker.unlock();
// dictLocker.unlock();
return ret;
} else {
if (context->isEnding) {
resultTokenQueueLocker.unlock();
dictLocker.lock();
responseContextDict.RemoveHandle(handleId);
dictLocker.unlock();
return -1;
}
}
dictLocker.unlock();
resultTokenQueueLocker.unlock();
// dictLocker.unlock();
MySleep(0);
dictLocker.lock();
// If the sustained high CPU usage is a concern, remove the line above and use the code below instead
// std::this_thread::sleep_for(std::chrono::milliseconds(10));
// dictLocker.lock();
}
}
}
......
......@@ -207,8 +207,8 @@ namespace fastllm {
RuntimeResult retCb,
const GenerationConfig &generationConfig) {
#ifdef PY_API
size_t pos = input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos)? input.substr(0, pos-10):input;
size_t pos = input.rfind("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != -1)? input.substr(0, pos):input;
size_t hash_id = std::hash<std::string>{}(input);
Data inputIds = this->weight.tokenizer.Encode(prompt);
#else
......
......@@ -467,24 +467,36 @@ namespace fastllm {
positionIds.ToDevice(DataDevice::CPU);
if (index == 0) {
int seqLen = inputTokens[0].size();
std::vector<float> ids = std::vector<float>(batch * seqLen, 0);
std::vector <float> vmask = std::vector <float> (batch * seqLen * seqLen, 0);
std::vector<int> seqLens;
seqLens.resize(batch);
int maxLen = 0;
for (int i = 0; i < batch; i++) {
maxLen = std::max(maxLen, (int) inputTokens[i].size());
seqLens[i] = (int) inputTokens[i].size();
}
int seqLen = maxLen;
std::vector<float> ids = std::vector<float>(batch * seqLen, 151643);
std::vector<float> vpids = std::vector<float>(batch * seqLen, 0);
std::vector <float> vmask = std::vector <float> (batch * seqLen * seqLen, 0);
for (int b = 0; b < batch; b++) {
for (int i = 0; i < seqLen; i++) {
ids[b * seqLen + i] = inputTokens[b][i];
auto &tokens = inputTokens[b];
int len = tokens.size(), base = maxLen - len;
for (int i = 0; i < len; i++) {
ids[b * seqLen + base + i] = inputTokens[b][i];
vpids[b * seqLen + base + i] = i;
}
}
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
std::fill(vmask.data() + b * maxLen * maxLen,
vmask.data() + b * maxLen * maxLen + (maxLen - len) * maxLen, 1.0);
for (int j = maxLen - len; j < maxLen; j++) {
std::fill(vmask.data() + b * maxLen * maxLen + j * maxLen,
vmask.data() + b * maxLen * maxLen + j * maxLen + maxLen - len, 1.0);
}
for (int j = 0; j < len; j++) {
for (int k = j + 1; k < len; k++) {
vmask[b * maxLen * maxLen + (base + j) * maxLen + base + k] = 1;
}
}
}
for (int b = 1; b < batch; b++) {
memcpy(vmask.data() + b * seqLen * seqLen, vmask.data(), seqLen * seqLen * sizeof(float));
memcpy(vpids.data() + b * seqLen, vpids.data(), seqLen * sizeof(float));
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen}, ids));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen, seqLen}, vmask));
......
#include "model.h"
#include "factoryllm.h"
namespace pyfastllm{
// Adapters for the continually evolving backend interfaces
// TODO: optimize to reduce memory copies
fastllm::Data RMSNorm(const fastllm::Data &input, const fastllm::Data &weight, float eps){
fastllm::Data output;
// std::cout<<"run rms norm"<<std::endl;
fastllm::RMSNorm(input, weight, eps, output);
// output.Print();
// std::cout<<"return val"<<std::endl;
return output;
}
fastllm::Data LayerNorm(fastllm::Data &input, fastllm::Data &gamma, fastllm::Data &beta, int axis){
fastllm::Data output;
fastllm::LayerNorm(input, gamma, beta, axis, output);
return output;
}
fastllm::Data Linear(fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias){
fastllm::Data output;
fastllm::Linear(input, weight, bias, output);
return output;
}
fastllm::Data MatMul(const fastllm::Data &input0, const fastllm::Data &input1, float alpha){
fastllm::Data output;
fastllm::MatMul(input0, input1, output, alpha);
return output;
}
fastllm::Data Attention(const fastllm::Data &q, const fastllm::Data &k, const fastllm::Data &v, const fastllm::Data &mask,
int group, float scale, int attentionType) {
fastllm::Data output;
fastllm::Attention(q, k, v, mask, output, group, scale, attentionType);
return output;
}
fastllm::Data Softmax(const fastllm::Data &input,int axis) {
fastllm::Data output;
fastllm::Softmax(input, output, axis);
return output;
}
fastllm::Data Silu(const fastllm::Data &input) {
fastllm::Data output;
fastllm::Silu(input, output);
return output;
}
fastllm::Data Gelu(const fastllm::Data &input) {
fastllm::Data output;
fastllm::GeluNew(input, output);
return output;
}
fastllm::Data Swiglu(const fastllm::Data &input) {
fastllm::Data output;
fastllm::Swiglu(input, output);
return output;
}
fastllm::Data Mul(const fastllm::Data &input, float v){
fastllm::Data output;
fastllm::Mul(input, v, output);
return output;
}
fastllm::Data Add(fastllm::Data &input0, const fastllm::Data &input1, float alpha) {
// fastllm::Data output;
fastllm::AddTo(input0, input1);
return input0;
}
std::string String(const fastllm::Data &data){
std::string ss;
ss += "[";
int last_dim = data.dims.back();
int n = data.Count(0) / last_dim, m = last_dim;
for (int i = 0; i < n; i++) {
if (i > 0) ss += "\n";
for (int j = 0; j < 10 && j < m; j++) {
if (j>0) ss += " ";
ss += std::to_string(reinterpret_cast<float*>(data.cpuData)[i*m+j]);
}
if (m > 10) {
ss += "... ";
for (int j = 0; j < 10 && j < m; j++) {
if (j>0) ss += " ";
ss += std::to_string(reinterpret_cast<float*>(data.cpuData)[i*m + (m-10+j)]);
}
}
}
ss += "]";
return ss;
}
}
#ifdef PY_API
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
......@@ -48,6 +146,34 @@ PYBIND11_MODULE(pyfastllm, m) {
// low level
m.def("get_llm_type", &fastllm::GetModelTypeFromFile);
m.def("llm_sampling", &fastllm::LLMSampling)
// .def("embedding", &fastllm::Embedding)
.def("rms_norm", &pyfastllm::RMSNorm)
.def("layer_norm", &pyfastllm::LayerNorm)
.def("linear", &pyfastllm::Linear)
// .def("split", &fastllm::Split)
// .def("cat", &fastllm::Cat)
// .def("cat_direct", &fastllm::CatDirect)
.def("matmul", &pyfastllm::MatMul)
// .def("matmul_transB", &fastllm::MatMulTransB)
.def("softmax", &pyfastllm::Softmax)
.def("silu", &pyfastllm::Silu)
.def("gelu", &pyfastllm::Gelu)
.def("swiglu", &pyfastllm::Swiglu)
.def("mul", &pyfastllm::Mul)
.def("attention", &pyfastllm::Attention);
// .def("mul_to", &fastllm::MulTo)
// .def("add_to", &fastllm::AddTo)
// .def("attention_mask", &fastllm::AttentionMask)
// .def("alibi_mask", &fastllm::AlibiMask)
// .def("permute", &fastllm::Permute)
// .def("permute_self", &fastllm::PermuteSelf)
// .def("topk", &fastllm::TopK)
// .def("rotateposition2D", &fastllm::RotatePosition2D)
// .def("nearlyrotateposition2D", &fastllm::NearlyRotatePosition2D)
// .def("llama_rotateposition2D", &fastllm::LlamaRotatePosition2D)
// .def("repeat_penalty", &fastllm::RepeatPenalty);
py::enum_<fastllm::DataType>(m, "Dtype")
.value("float32", fastllm::DataType::FLOAT32)
.value("bfloat16", fastllm::DataType::BFLOAT16)
......@@ -60,13 +186,25 @@ PYBIND11_MODULE(pyfastllm, m) {
.value("int32param", fastllm::DataType::INT32PARAM)
.export_values();
py::class_<fastllm::Data>(m, "Tensor")
py::class_<fastllm::Data>(m, "Tensor", py::buffer_protocol())
.def_buffer([](fastllm::Data &m) -> py::buffer_info {
return py::buffer_info(
m.cpuData, /* Pointer to buffer */
sizeof(float), /* Size of one scalar */
py::format_descriptor<float>::format(), /* Python struct-style format descriptor */
m.dims.size(), /* Number of dimensions */
m.dims, /* Buffer dimensions */
{ sizeof(float) * m.dims[1], /* Strides (in bytes) for each index */
sizeof(float) }
);
})
.def_readonly("dims", &fastllm::Data::dims)
.def(py::init<>())
.def(py::init<fastllm::DataType>())
.def(py::init<fastllm::DataType, const std::vector<int>&>())
.def(py::init<fastllm::DataType, const std::vector<int>&, const std::vector<float>&>())
.def(py::init<fastllm::Data>())
.def_readonly("shape", &fastllm::Data::dims)
.def("copy_from", &fastllm::Data::CopyFrom)
.def("count", &fastllm::Data::Count)
.def("to_list", [](fastllm::Data& data){
......@@ -76,6 +214,7 @@ PYBIND11_MODULE(pyfastllm, m) {
}
return vecData;
})
.def("__str__", &pyfastllm::String)
.def("print", &fastllm::Data::Print)
.def("to", static_cast<void (fastllm::Data::*)(void *device)>(&fastllm::Data::ToDevice));
......@@ -155,6 +294,7 @@ PYBIND11_MODULE(pyfastllm, m) {
.def_readonly("block_cnt", &fastllm::ChatGLMModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::ChatGLMModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::ChatGLMModel::eos_token_id)
.def_readonly("gmask_token_id", &fastllm::ChatGLMModel::gmask_token_id)
.def("load_weights", &fastllm::ChatGLMModel::LoadFromFile)
.def("make_input", &fastllm::ChatGLMModel::MakeInput)
.def("make_history", &fastllm::ChatGLMModel::MakeHistory)
......
#include "fastllm.h"
void callBaseOp(int optype=0){
fastllm::Data inputs = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2}, {1, 5});
fastllm::Data outputs = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2}, {3, 4});
switch (optype)
{
case 0:
fastllm::AddTo(inputs, outputs, 1);
break;
case 1:
fastllm::Cat(inputs, inputs, 0, outputs);
break;
case 2:
fastllm::Mul(inputs, 2, outputs);
break;
case 3:
fastllm::Permute(inputs, {1, 0}, outputs);
break;
case 4:
fastllm::Split(inputs, 0, 0, 1, outputs);
break;
case 5:
fastllm::Permute(inputs, {1, 0}, outputs);
fastllm::MatMul(inputs, outputs, outputs);
break;
default:
break;
}
outputs.Print();
}
void callNormOp(int normType=0){
fastllm::Data inputs = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2}, {1, 5});
fastllm::Data weights = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2}, {1, 2});
fastllm::Data gamma = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2}, {1, 1});
fastllm::Data beta = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2}, {0, 0});
fastllm::Data outputs;
switch (normType)
{
case 0:
fastllm::LayerNorm(inputs, gamma, beta, -1, outputs);
break;
case 1:
fastllm::RMSNorm(inputs, weights, 1e-5, outputs);
break;
default:
break;
}
outputs.Print();
}
void callLinearOp(){
fastllm::Data inputs = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2}, {1, 2});
fastllm::Data weights = fastllm::Data(fastllm::DataType::FLOAT32, {3, 2}, {3, 4, 5, 5, 6, 7});
fastllm::Data bias = fastllm::Data(fastllm::DataType::FLOAT32, {1, 3}, {0, 1, 1});
fastllm::Data outputs;
fastllm::Linear(inputs, weights, bias, outputs);
outputs.Print();
}
void callActivationOp(int activateType=0){
fastllm::Data inputs = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2}, {1, 5});
fastllm::Data outputs;
switch (activateType)
{
case 0:
fastllm::Silu(inputs, outputs);
break;
case 1:
fastllm::Softmax(inputs, outputs, -1);
break;
case 2:
fastllm::GeluNew(inputs, outputs);
break;
case 3:
fastllm::Swiglu(inputs, outputs);
break;
default:
break;
}
outputs.Print();
}
void callAttentionOp(int group=1, int attentionType=0){
const fastllm::Data q = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2, 3}, {1, 2, 3, 4, 5, 6});
const fastllm::Data k = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2, 3}, {5, 6, 7, 8, 9, 10});
const fastllm::Data v = fastllm::Data(fastllm::DataType::FLOAT32, {1, 2, 3}, {1, 1, 1, 2, 1, 3});
const fastllm::Data mask = fastllm::Data();
int dims = q.dims.back();
float scale = 1/sqrt(dims);
fastllm::Data output;
fastllm::Attention(q, k, v, mask, output, group, scale, attentionType);
}
void testBase(){
printf("testing BaseOp...\n");
for (int i=0;i<6;i++){
callBaseOp(i);
}
printf("test BaseOp finished!\n");
}
void testActivation(){
printf("testing ActivationOp...\n");
for (int i=0;i<4;i++){
callActivationOp(i);
}
printf("test ActivationOp finished!\n");
}
void testAttention(){
printf("testing AttentionOp...\n");
callAttentionOp();
printf("test AttentionOp finished!\n");
}
void testLinear(){
printf("testing LinearOp...\n");
callLinearOp();
printf("test LinearOp finished!\n");
}
void testNorm(){
printf("testing NormOp...\n");
for (int i=0;i<2;i++){
callNormOp(i);
}
printf("test NormOp finished!\n");
}
void testAll(){
testBase();
testActivation();
testAttention();
testNorm();
testLinear();
}
int main(){
testAll();
}
\ No newline at end of file
from fastllm_pytools import llm
import torch
import ctypes
import numpy as np
from fastllm_pytools import llm;
import torch;
import ctypes;
import numpy as np;
fastllm_data_type_dict = {
"int4": 8,
......@@ -22,60 +22,71 @@ def create(model,
history_sep = None,
dtype = "float16"):
if (dtype not in fastllm_data_type_dict):
print("dtype should in ", list(fastllm_data_type_dict.keys()))
exit(0)
print("dtype should in ", list(fastllm_data_type_dict.keys()));
exit(0);
# 0.1 model info
# if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
# model.config.model_type = "chatglm3"
# print("model.config.model_type: chatglm3!")
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if (pre_prompt):
modelInfo["pre_prompt"] = pre_prompt
modelInfo["pre_prompt"] = pre_prompt;
if (user_role):
modelInfo["user_role"] = user_role
modelInfo["user_role"] = user_role;
if (bot_role):
modelInfo["bot_role"] = bot_role
modelInfo["bot_role"] = bot_role;
if (history_sep):
modelInfo["history_sep"] = history_sep
modelInfo["history_sep"] = history_sep;
if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
# Baichuan 2 (second generation)
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""
modelInfo["use_alibi"] = "1";
modelInfo["pre_prompt"] = "";
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else "";
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
modelInfo["history_sep"] = "";
if (modelInfo["model_type"] == "qwen"):
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
# chatglm3
modelInfo["pre_prompt"] = "";
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + "> \n");
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
modelInfo["history_sep"] = "";
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
weight_type_dict = {}
module_dict = {}
weight_bits = {}
weight_type_dict = {};
module_dict = {};
weight_bits = {};
for key, m in model.named_modules():
if (str(type(m)).find("QuantizedLinear") != -1):
weight_type_dict[key + ".weight"] = "QuantizedLinear"
weight_bits[key + ".weight"] = m.weight_bit_width
weight_type_dict[key + ".weight"] = "QuantizedLinear";
weight_bits[key + ".weight"] = m.weight_bit_width;
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
weight_type_dict[key + ".weight"] = "linear";
module_dict[key + ".weight"] = m;
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
weight_type_dict[key] = "embedding";
peft_config = {}
active_adapter = ""
if hasattr(model, "peft_config"):
peft_config = model.peft_config
if hasattr(model, "active_adapter"):
if hasattr(model, "active_adapter") and isinstance(model.active_adapter, str):
# in transformers >= 4.33.0, active_adapter is a function on the model, so ignore it for now
active_adapter = model.active_adapter
model = model.cpu()
dict = model.state_dict()
model_type = model.config.__dict__["model_type"]
model = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
model = model.cpu();
dict = model.state_dict();
model_type = model.config.__dict__["model_type"];
model = llm.fastllm_lib.create_empty_llm_model(model_type.encode());
for it in modelInfo.keys():
llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode())
llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode());
for adapter_name in peft_config.keys():
adapter_dict = peft_config[adapter_name].__dict__
......@@ -90,39 +101,39 @@ def create(model,
if modelInfo["model_type"] == "qwen":
pass
else:
tokenizer = tokenizer.tokenizer
tokenizer = tokenizer.tokenizer;
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
piece_size = tokenizer.sp_model.piece_size();
for i in range(piece_size):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
i, ctypes.c_float(tokenizer.sp_model.get_score(i)))
i, ctypes.c_float(tokenizer.sp_model.get_score(i)));
else:
vocab = tokenizer.get_vocab()
vocab = tokenizer.get_vocab();
for v in vocab.keys():
if (modelInfo["model_type"] == "moss"):
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0))
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v];
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0));
elif (modelInfo["model_type"] == "qwen"):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0))
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0));
else:
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0))
tot = 0
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0));
tot = 0;
for key in dict:
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
ori_data_type = 0;
ori_np_data_type = np.float32;
cur_weight_type = 0;
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]];
to_data_type = 0;
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype]
to_data_type = fastllm_data_type_dict[dtype];
if (to_data_type == 7):
ori_data_type = 7
ori_np_data_type = np.float16
ori_data_type = 7;
ori_np_data_type = np.float16;
elif (cur_weight_type == 2):
# TODO bfloat
to_data_type = 0
to_data_type = 0;
weight_name = key
if peft_config is not None:
......@@ -133,19 +144,19 @@ def create(model,
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
weight_bits[key],
dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
dict[key].numpy().ctypes.data_as(ctypes.c_void_p))
dict[key].numpy().ctypes.data_as(ctypes.c_void_p));
else:
llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
to_data_type, cur_weight_type, ori_data_type,
dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
tot += 1
print("convert (", tot, "/", len(dict), end = " )\r")
dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p));
tot += 1;
print("convert (", tot, "/", len(dict), end = " )\r");
print("")
llm.fastllm_lib.init_params_llm_model(model)
llm.fastllm_lib.warmup_llm_model(model)
ret = llm.model("", id = model)
return ret
print("");
llm.fastllm_lib.init_params_llm_model(model);
llm.fastllm_lib.warmup_llm_model(model);
ret = llm.model("", id = model);
return ret;
import ctypes
import os
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
import ctypes;
import math
import os;
import threading
from typing import Optional, Tuple, Union, List, Callable, Dict, Any;
from copy import deepcopy
import json
import platform
if platform.system() == 'Windows':
fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"))
fastllm_lib = ctypes.CDLL(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"), winmode=0)
elif platform.system() == 'Darwin':
fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.dylib"))
else:
fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.so"))
fastllm_lib.create_llm_model.argtypes = [ctypes.c_char_p]
fastllm_lib.create_llm_model.restype = ctypes.c_int
fastllm_lib.token_decode.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_char_p]
fastllm_lib.token_decode.restype = ctypes.c_int
fastllm_lib.token_encode_string.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
fastllm_lib.token_encode_string.restype = ctypes.c_int
fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
ctypes.c_float, ctypes.c_float, ctypes.c_bool,
ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
fastllm_lib.launch_response_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
......@@ -25,127 +38,243 @@ fastllm_lib.fetch_response_logits_llm_model.restype = ctypes.c_int
fastllm_lib.response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_str_llm_model.restype = ctypes.c_char_p
# fastllm_lib.response_str_llm_model.restype = ctypes.c_char_p
fastllm_lib.response_str_llm_model.restype = ctypes.POINTER(ctypes.c_char)
fastllm_lib.launch_response_str_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
ctypes.c_float, ctypes.c_float, ctypes.c_bool,
ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
fastllm_lib.launch_response_str_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_str_llm_model.restype = ctypes.c_char_p
# fastllm_lib.fetch_response_str_llm_model.restype = ctypes.c_char_p
fastllm_lib.fetch_response_str_llm_model.restype = ctypes.POINTER(ctypes.c_char)
fastllm_lib.make_history_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
fastllm_lib.make_history_llm_model.restype = ctypes.c_char_p
# fastllm_lib.make_history_llm_model.restype = ctypes.c_char_p
fastllm_lib.make_history_llm_model.restype = ctypes.POINTER(ctypes.c_char)
fastllm_lib.make_input_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p]
fastllm_lib.make_input_llm_model.restype = ctypes.c_char_p
# fastllm_lib.make_input_llm_model.restype = ctypes.c_char_p
fastllm_lib.make_input_llm_model.restype = ctypes.POINTER(ctypes.c_char)
fastllm_lib.add_tokenizer_word_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_float, ctypes.c_int]
fastllm_lib.set_device_map.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
fastllm_lib.get_llm_model_type.argtypes = [ctypes.c_int]
fastllm_lib.get_llm_model_type.restype = ctypes.POINTER(ctypes.c_char)
fastllm_lib.response_batch_str_llm_model.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_batch_str_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)
fastllm_lib.response_batch_tokens_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_int),
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_batch_tokens_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)
fastllm_lib.freeChars.argtypes = [ctypes.POINTER(ctypes.c_char)]
# fastllm_lib.freeChars.restype = ctypes.c_char_p
fastllm_lib.freeCharArray.argtypes = [ctypes.POINTER(ctypes.c_char_p)]
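# Editorial sketch (the helper name is illustrative, not part of the library): the C side hands back
# heap-allocated char buffers, so every pointer fetched from fastllm_lib should be copied into Python
# memory and then released with freeChars/freeCharArray; the memory-leak fixes applied throughout the
# class below follow this pattern.
def _copy_and_free_chars(ptr) -> bytes:
    # Copy the NUL-terminated C buffer into Python bytes, then free the native allocation.
    data = ctypes.string_at(ptr)
    fastllm_lib.freeChars(ptr)
    return data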
def set_cpu_threads(threads: int):
fastllm_lib.set_cpu_threads(threads)
fastllm_lib.set_cpu_threads(threads);
def get_cpu_threads() -> int:
return fastllm_lib.get_cpu_threads()
return fastllm_lib.get_cpu_threads();
def print_ins_info():
fastllm_lib.print_cpu_ins()
fastllm_lib.print_cpu_ins();
def set_cpu_kvcache(cpu_kvcache):
fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))
fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache));
def get_cpu_kvcache():
return fastllm_lib.get_kvcache_in_cpu()
return fastllm_lib.get_kvcache_in_cpu();
def set_cpu_low_mem(low_mem):
fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))
fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem));
def get_cpu_low_mem():
return fastllm_lib.get_cpu_low_mem()
return fastllm_lib.get_cpu_low_mem();
def set_device_map(device_map):
devices = []
values = []
devices = [];
values = [];
if (isinstance(device_map, str)):
devices.append(device_map)
values.append(1)
devices.append(device_map);
values.append(1);
elif (isinstance(device_map, list)):
devices = [str(x) for x in device_map]
values = [1 for x in device_map]
devices = [str(x) for x in device_map];
values = [1 for x in device_map];
elif (isinstance(device_map, dict)):
devices = [str(x) for x in device_map.keys()]
values = [int(device_map[x]) for x in device_map.keys()]
devices = [str(x) for x in device_map.keys()];
values = [int(device_map[x]) for x in device_map.keys()];
else:
print("set_device_map error.")
return
device_str = ''.join(devices)
device_len = [len(x) for x in devices]
print("set_device_map error.");
return;
device_str = ''.join(devices);
device_len = [len(x) for x in devices];
fastllm_lib.set_device_map(len(device_len),
(ctypes.c_int * len(device_len))(*device_len),
device_str.encode(),
(ctypes.c_int * len(values))(*values))
(ctypes.c_int * len(values))(*values));
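# Usage sketch (device names are illustrative): set_device_map accepts a single device string,
# a list of devices, or a dict mapping device name to a relative weight, for example:
#   set_device_map("cuda:0")
#   set_device_map(["cuda:0", "cuda:1"])
#   set_device_map({"cuda:0": 2, "cuda:1": 1})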
def from_hf(model,
tokenizer = None,
dtype = "float16"):
from fastllm_pytools import hf_model
return hf_model.create(model, tokenizer, dtype = dtype)
from fastllm_pytools import hf_model;
return hf_model.create(model, tokenizer, dtype = dtype);
class model:
def __init__ (self, path : str,
id : int = -99999):
if (id != -99999):
self.model = id
self.model = id;
else:
self.model = fastllm_lib.create_llm_model(path.encode())
self.direct_query = False
self.model = fastllm_lib.create_llm_model(path.encode());
self.direct_query = False;
# Thread-local storage pool used to avoid repeatedly allocating and freeing buffer objects
self.thread_local_obj = threading.local()
self.thread_local_obj.tokenizer_encode_string__output_buffer = None
self.thread_local_obj.tokenizer_decode_token__output_buffer = None
# Static cache for tokenizer_decode_token results, built manually on demand.
# Since the number of tokens is finite and not very large, caching the results to reduce native calls works well.
# It is not an automatic cache, to avoid locking the cache dict in multi-threaded calls and to leave the choice open for different scenarios.
self.tokenizer_decode_token_cache = None
model_type_ptr = fastllm_lib.get_llm_model_type(self.model)
self.model_type = ctypes.string_at(model_type_ptr).decode()
fastllm_lib.freeChars(model_type_ptr)
# print("model_type:", self.model_type)
def get_prompt(self,
query: str,
history: List[Tuple[str, str]] = None) -> str:
if (not(history)):
history = []
prompt = ""
history = [];
prompt = "";
for i, (old_query, response) in enumerate(history):
prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
return prompt
history_ptr = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode())
prompt = ctypes.string_at(history_ptr).decode()
fastllm_lib.freeChars(history_ptr)
input_ptr = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode())
prompt = ctypes.string_at(input_ptr).decode()
fastllm_lib.freeChars(input_ptr)
return prompt;
def save(self, path : str):
fastllm_lib.save_llm_model(self.model, path.encode())
fastllm_lib.save_llm_model(self.model, path.encode());
def eval(self):
pass
pass;
def build_tokenizer_decode_token_cache(self):
if self.tokenizer_decode_token_cache is not None:
return
cache_dict = dict()
vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
for token_id in range(vocab_size):
cache_dict[token_id] = self.tokenizer_decode_token(token_id)
self.tokenizer_decode_token_cache = cache_dict
def tokenizer_encode_string(self, content: str) -> List[int]:
output_buffer_init_len = 1024
if not hasattr(self.thread_local_obj, 'tokenizer_encode_string__output_buffer') or self.thread_local_obj.tokenizer_encode_string__output_buffer is None:
self.thread_local_obj.tokenizer_encode_string__output_buffer = (ctypes.c_int * output_buffer_init_len)()
buffer = self.thread_local_obj.tokenizer_encode_string__output_buffer
buffer_len = len(buffer)
result_len = fastllm_lib.token_encode_string(self.model, content.encode(), buffer_len, buffer)
if result_len > buffer_len:
if result_len > 10240:
# The input is too long; use a one-off buffer
temp_buffer = (ctypes.c_int * result_len)()
ret = fastllm_lib.token_encode_string(self.model, content.encode(), result_len, temp_buffer)
return [i for i in temp_buffer]
else:
# Grow the buffer
new_buffer_len = round(math.ceil(result_len / 1024.0)) * 1024
buffer = (ctypes.c_int * new_buffer_len)()
self.thread_local_obj.tokenizer_encode_string__output_buffer = buffer
result_len = fastllm_lib.token_encode_string(self.model, content.encode(), new_buffer_len, buffer)
return [buffer[i] for i in range(result_len)]
def tokenizer_decode_token(self, token_id: int) -> bytes:
if self.tokenizer_decode_token_cache is not None:
cache_result = self.tokenizer_decode_token_cache.get(token_id)
if cache_result is not None:
return cache_result
output_buffer_init_len = 256
if not hasattr(self.thread_local_obj, 'tokenizer_decode_token__output_buffer') or self.thread_local_obj.tokenizer_decode_token__output_buffer is None:
self.thread_local_obj.tokenizer_decode_token__output_buffer = ctypes.create_string_buffer(output_buffer_init_len)
buffer = self.thread_local_obj.tokenizer_decode_token__output_buffer
ret = fastllm_lib.token_decode(self.model, token_id, len(buffer), buffer)
if ret > 0:
# Buffer too small; grow it and decode again
new_buffer_len = round(math.ceil(ret / 16.0)) * 16
buffer = ctypes.create_string_buffer(new_buffer_len)
self.thread_local_obj.tokenizer_decode_token__output_buffer = buffer
ret = fastllm_lib.token_decode(self.model, token_id, len(buffer), buffer)
assert ret == 0
buffer_bytes = buffer.raw
result_len = len(buffer_bytes)
for i in range(len(buffer_bytes)):
if buffer_bytes[i] == 0:
result_len = i
break
return buffer_bytes[:result_len]
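# Usage sketch (assumes an already constructed instance `m = model(path)`): encode a prompt into
# token ids and decode the ids back to text, optionally warming the per-token cache first:
#   m.build_tokenizer_decode_token_cache()
#   ids = m.tokenizer_encode_string("你好")
#   text = b"".join(m.tokenizer_decode_token(i) for i in ids).decode("utf-8", errors="ignore")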
def stop_token_ctypes(self, stop_token_ids):
if stop_token_ids is None:
return 0, None
else:
return ctypes.c_int(len(stop_token_ids)), (ctypes.c_int * len(stop_token_ids))(*stop_token_ids)
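# stop_token_ctypes converts an optional list of stop token ids into the (length, int array) pair the
# C API expects; (0, None) means no extra stop tokens. Illustrative call on a constructed instance `m`
# (the id 2 is only an example):
#   reply = m.response("你好", stop_token_ids = [2])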
def response_logits(self,
query: str,
history: List[Tuple[str, str]] = None,
tokenizer = None) -> str:
prompt = query if self.direct_query else self.get_prompt(query, history)
tokenizer = None,
stop_token_ids: List[int] = None,
) -> str:
prompt = query if self.direct_query else self.get_prompt(query, history);
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
if (tokenizer == None):
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True),
stop_token_len, stop_token_list);
else:
input = tokenizer.encode(prompt)
input = tokenizer.encode(prompt);
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
1, False, 1, 1, 1, 1, True)
vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
1, False, 1, 1, 1, 1, True, stop_token_len, stop_token_list);
vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model);
logits = list(range(vocab_size))
array = (ctypes.c_float * (vocab_size * 4))(*logits)
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
out = list(array)[:vocab_size]
array = (ctypes.c_float * (vocab_size * 4))(*logits);
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
out = list(array)[:vocab_size];
while (ret != -1):
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
return out
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
return out;
def response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
ret = ""
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
stop_token_ids: List[int] = None) -> str:
ret = "";
for i in self.stream_response(query = query,
history = history,
max_length = max_length,
@@ -153,88 +282,339 @@ class model:
top_p = top_p, top_k = top_k,
temperature = temperature,
repeat_penalty = repeat_penalty,
one_by_one = True):
ret += i
return ret
one_by_one = True,
stop_token_ids = stop_token_ids):
ret += i;
return ret;
def stream_response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
one_by_one = True):
prompt = query if self.direct_query else self.get_prompt(query, history)
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
one_by_one = True, stop_token_ids: List[int] = None):
prompt = query if self.direct_query else self.get_prompt(query, history);
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids);
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
res = ""
ret = b''
fail_cnt = 0
ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False),
stop_token_len, stop_token_list);
res = "";
ret = b'';
fail_cnt = 0;
while True:
ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
cur = ""
# ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle);
ret_chararry = fastllm_lib.fetch_response_str_llm_model(self.model, handle);
ret += ctypes.string_at(ret_chararry)
fastllm_lib.freeChars(ret_chararry)
cur = "";
try:
cur = ret.decode()
ret = b''
ret = b'';
except:
fail_cnt += 1
fail_cnt += 1;
if (fail_cnt == 20):
break
break;
else:
continue
fail_cnt = 0
continue;
fail_cnt = 0;
if (cur == "<flmeos>"):
break
break;
if one_by_one:
yield cur
yield cur;
else:
res += cur
yield res
res += cur;
yield res;
def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
if (not(history)):
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False)
def stream_response_raw(self,
input_tokens: List[int],
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
one_by_one = True,
stop_token_ids: List[int] = None
):
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input_tokens),
(ctypes.c_int * len(input_tokens))(*input_tokens),
ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False),
stop_token_len, stop_token_list)
result = []
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
break
result.append(cur)
response = tokenizer.decode(result)
history = history + [(query, response)]
return response, history
# A long-tail character may take several tokens to generate, so this only returns bytes and leaves the string-decode strategy to the caller,
# which makes it easy to count output tokens and to control how incomplete UTF-8 is decoded (see the usage sketch after this function).
def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
return_past_key_values = False, **kwargs) -> str:
if (not(history)):
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False)
tokens = []
total_bytes = b''
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
cur_token = fastllm_lib.fetch_response_llm_model(self.model, handle)
if cur_token == -1:
break
tokens.append(cur)
response = tokenizer.decode(tokens)
new_history = history + [(query, response)]
if return_past_key_values:
yield response, new_history, None
cur_bytes = self.tokenizer_decode_token(cur_token)
if one_by_one:
yield cur_bytes
else:
yield response, new_history
total_bytes += cur_bytes
yield total_bytes
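# Usage sketch for the byte-level stream above: a caller accumulates the yielded bytes and decodes
# them once a complete UTF-8 sequence is available, for example:
#   pending = b""
#   for piece in m.stream_response_raw(input_tokens = ids, one_by_one = True):
#       pending += piece
#       try:
#           print(pending.decode("utf-8"), end = "", flush = True)
#           pending = b""
#       except UnicodeDecodeError:
#           continue   # wait for the rest of a multi-byte character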
def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, stop_token_ids: List[int] = None, **kwargs):
if self.model_type != "chatglm3":
if (not(history)):
history = [];
prompt = query if self.direct_query else self.get_prompt(query, history);
input = tokenizer.encode(prompt);
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False, stop_token_len, stop_token_list);
result = [];
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
if (cur == -1):
break;
result.append(cur);
response = tokenizer.decode(result);
history = history + [(query, response)];
return response, history;
else:
if history is None:
history = []
role = "user"
input = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
history.append({"role": role, "content": query})
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False, stop_token_len, stop_token_list);
tokens = [];
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
if (cur == -1):
break;
tokens.append(cur);
response = tokenizer.decode(tokens);
new_history = history
if response and response[-1] != "�":
response, new_history = self.process_chatglm3_response(response, history)
return response, new_history
def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
return_past_key_values = False, stop_token_ids: List[int] = None, **kwargs) -> str:
if self.model_type != "chatglm3":
if (not(history)):
history = [];
prompt = query if self.direct_query else self.get_prompt(query, history);
input = tokenizer.encode(prompt);
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False, stop_token_len, stop_token_list);
tokens = [];
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
if (cur == -1):
break;
tokens.append(cur);
response = tokenizer.decode(tokens);
new_history = history + [(query, response)];
if return_past_key_values:
yield response, new_history, None;
else:
yield response, new_history;
else:
if history is None:
history = []
role = "user"
input = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
history.append({"role": role, "content": query})
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False, stop_token_len, stop_token_list);
tokens = [];
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
if (cur == -1):
break;
tokens.append(cur);
response = tokenizer.decode(tokens);
if response and response[-1] != "�":
response, new_history = self.process_chatglm3_response(response, history)
if return_past_key_values:
yield response, new_history, past_key_values
else:
yield response, new_history
def set_adapter(self, name: str):
fastllm_lib.set_adapter(self.model, str(name).encode())
def disable_adapter(self):
fastllm_lib.disable_adapter(self.model)
def process_chatglm3_response(self, output, history):
content = ""
history = deepcopy(history)
for response in output.split("<|assistant|>"):
metadata, content = response.split("\n", maxsplit=1)
if not metadata.strip():
content = content.strip()
history.append({"role": "assistant", "metadata": metadata, "content": content})
content = content.replace("[[训练时间]]", "2023年")
else:
history.append({"role": "assistant", "metadata": metadata, "content": content})
if history[0]["role"] == "system" and "tools" in history[0]:
content = "\n".join(content.split("\n")[1:-1])
def tool_call(**kwargs):
return kwargs
parameters = eval(content)
content = {"name": metadata.strip(), "parameters": parameters}
else:
content = {"name": metadata.strip(), "content": content}
return content, history
def build_chatglm3_input(self, tokenizer, query, history=None, role="user"):
if history is None:
history = []
input_ids = []
for item in history:
content = item["content"]
if item["role"] == "system" and "tools" in item:
content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
input_ids.extend(tokenizer.build_single_message(item["role"], item.get("metadata", ""), content))
input_ids.extend(tokenizer.build_single_message(role, "", query))
input_ids.extend([tokenizer.get_command("<|assistant|>")])
return input_ids
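# For the chatglm3 path, history entries are dicts rather than (query, response) tuples, for example:
#   history = [{"role": "system", "content": "...", "tools": [...]},          # "tools" is optional
#              {"role": "user", "content": "你好"},
#              {"role": "assistant", "metadata": "", "content": "你好!"}]
# build_chatglm3_input relies on the Hugging Face ChatGLM3 tokenizer's build_single_message and
# get_command helpers to turn these entries into token ids.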
def response_batch_raw(self, querys: List[str],
historys: List[List[Tuple[str, str]]] = None,
max_length: int = 1024, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
**kwargs) -> List[str]:
query_size = len(querys)
if (not(historys)):
historys = [[] for _ in range(query_size)]
inputs = (ctypes.c_char_p * query_size)()
for i, query in enumerate(querys):
prompt = query if self.direct_query else self.get_prompt(query, historys[i])
inputs[i] = ctypes.c_char_p(prompt.encode())
outputs = fastllm_lib.response_batch_str_llm_model(self.model, inputs, query_size,
max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False)
responses = []
for i in range(query_size):
response = ctypes.string_at(outputs[i]).decode()
responses.append(response)
historys[i] = historys[i] + [(querys[i], response)]
fastllm_lib.freeCharArray(outputs, query_size)
return responses, historys
def chat_batch_raw(self, tokenizer, querys: List[str], historys: List[List[Tuple[str, str]]] = None, max_length: int = 1024,
do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, **kwargs):
query_size = len(querys)
if (not(historys)):
historys = [[] for _ in range(query_size)]
inputs = []
inputs_len = []
for i, query in enumerate(querys):
prompt = query if self.direct_query else self.get_prompt(query, historys[i])
input = tokenizer.encode(prompt);
inputs.extend(input)
inputs_len.append(len(input))
outputs = fastllm_lib.response_batch_tokens_llm_model(self.model, query_size,
(ctypes.c_int * len(inputs_len))(*inputs_len),
(ctypes.c_int * len(inputs))(*inputs),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False)
responses = []
for i in range(query_size):
response = ctypes.string_at(outputs[i]).decode()
responses.append(response)
historys[i] = historys[i] + [(querys[i], response)]
fastllm_lib.freeCharArray(outputs, query_size)
return responses, historys
def response_batch(self, querys: List[str],
historys: List[List[Tuple[str, str]]] = None,
max_length: int = 1024, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
stop_token_ids: List[int] = None, **kwargs) -> List[str]:
query_size = len(querys)
if (not(historys)):
historys = [[] for _ in range(query_size)]
handles = []
for i, query in enumerate(querys):
prompt = query if self.direct_query else self.get_prompt(query, historys[i])
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids);
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False),
stop_token_len, stop_token_list)
handles.append(handle)
responses = []
for i, handle in enumerate(handles):
res = ""
ret = b''
fail_cnt = 0
while True:
# ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle);
ret_chararry = fastllm_lib.fetch_response_str_llm_model(self.model, handle);
ret += ctypes.string_at(ret_chararry)
fastllm_lib.freeChars(ret_chararry)
cur = ""
try:
cur = ret.decode()
ret = b''
except:
fail_cnt += 1
if (fail_cnt == 20):
break
else:
continue
fail_cnt = 0
if (cur == "<flmeos>"):
break;
res += cur
responses.append(res)
historys[i] = historys[i] + [(querys[i], res)]
return responses, historys
def chat_batch(self, tokenizer, querys: List[str], historys: List[List[Tuple[str, str]]] = None, max_length: int = 1024,
do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, stop_token_ids: List[int] = None, **kwargs):
query_size = len(querys)
if (not(historys)):
historys = [[] for _ in range(query_size)]
handles = []
for i, query in enumerate(querys):
prompt = query if self.direct_query else self.get_prompt(query, historys[i])
input = tokenizer.encode(prompt);
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids);
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False, stop_token_len, stop_token_list);
handles.append(handle)
responses = []
for i, handle in enumerate(handles):
result = [];
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
if (cur == -1):
break;
result.append(cur);
response = tokenizer.decode(result);
responses.append(response)
historys[i] = historys[i] + [(querys[i], response)]
return responses, historys
def release_memory(self):
fastllm_lib.release_memory(self.model)
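# Usage sketch for the new batch interfaces (the path and prompts are illustrative): response_batch
# launches one handle per prompt and then drains them, returning one reply and one updated history per
# input; release_memory releases the model's native memory when you are done, as the batch demo further
# below does.
#   m = model("/path/to/model.flm")
#   replies, histories = m.response_batch(["深圳有什么好玩的", "上海有什么好玩的"])
#   m.release_memory()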
@@ -21,8 +21,8 @@ fastllm_weight_type_dict = {
"embedding": 2
}
v = np.random.randint(-127, 127, [10, 20])
temp = v
v = np.random.randint(-127, 127, [10, 20]);
temp = v;
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
@@ -34,23 +34,31 @@ def write_int8(fo, v):
fo.write(struct.pack('i', 3))
fo.write(struct.pack('i', 0))
for i in range(c_max.shape[0]):
fo.write(struct.pack('f', -c_max[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(struct.pack('f', -c_max[i][0]));
fo.write(struct.pack('f', c_max[i][0]));
fo.write(v.data)
def write_int4(fo, v):
c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 7.0
c_min = c_scale * -8.0
# c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
# c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
# c_scale = c_max / 7.0
# c_min = c_scale * -8.0
c_min = np.expand_dims(v.min(axis = -1), -1)
c_max = np.expand_dims(v.max(axis = -1), -1)
c_scale = (c_max - c_min) / 15.0
c_zero = np.round(0.0 - c_min / c_scale)
c_zero = c_zero.clip(0, 15)
c_min = -c_scale * c_zero
v = (v - c_min) / c_scale
v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
v = v[:, 0::2] * 16 + v[:, 1::2]
fo.write(struct.pack('i', 8))
fo.write(struct.pack('i', 0))
for i in range(c_min.shape[0]):
fo.write(struct.pack('f', c_min[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(struct.pack('f', c_min[i][0]));
fo.write(struct.pack('f', c_max[i][0]));
fo.write(v.data)
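# Editorial note: write_int4 above switched from a symmetric scheme (scale = max|v| / 7 with a fixed
# zero point) to an asymmetric per-row min/max scheme with an explicit zero point, so rows whose values
# are mostly one-sided no longer waste half of the 4-bit range. A minimal sketch of the matching
# per-row dequantization (illustrative helper, not part of this file):
def dequant_int4_row(q, c_min, c_scale):
    # q holds unpacked 4-bit codes in [0, 15]; this reverses v_q = round((v - c_min) / c_scale)
    return c_min + q.astype(np.float32) * c_scale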
def tofile(exportPath,
@@ -72,6 +80,8 @@ def tofile(exportPath,
fo.write(struct.pack('i', 2))
# 0.1 model info
#if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
# model.config.model_type = "chatglm3"
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
@@ -91,13 +101,26 @@ def tofile(exportPath,
# Baichuan 2
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + ">") if hasattr(model.generation_config, "user_token_id") else "";
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
modelInfo["history_sep"] = ""
if (modelInfo["model_type"] == "baichuan" and modelInfo["vocab_size"] == 125696):
# Baichuan 2, 7B
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + ">") if hasattr(model.generation_config, "user_token_id") else "";
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
modelInfo["history_sep"] = ""
if modelInfo["model_type"] == "qwen":
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
print("chatglm3")
# chatglm3
modelInfo["pre_prompt"] = "";
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + "> \n");
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
modelInfo["history_sep"] = "";
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
@@ -140,10 +163,10 @@ def tofile(exportPath,
for v in vocab.keys():
if (modelInfo['model_type'] == "qwen"):
s = v
elif (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
else:
s = v.encode()
if (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
# coding=utf-8
# Implements API for ChatGLM3-6B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python openai_api.py
# Visit http://localhost:8100/docs for documents.
import time
import json
import torch
import uvicorn
import argparse
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import Any, Dict, List, Literal, Optional, Union
#from transformers import AutoTokenizer, AutoModel
from sse_starlette.sse import ServerSentEvent, EventSourceResponse
from fastllm_pytools import llm
@asynccontextmanager
async def lifespan(app: FastAPI): # collects GPU memory
yield
global device_map
if torch.cuda.is_available():
for device in device_map:
with torch.cuda.device(device):
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class ModelCard(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "owner"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = None
class ModelList(BaseModel):
object: str = "list"
data: List[ModelCard] = []
class ChatMessage(BaseModel):
role: Literal["user", "assistant", "system"]
content: str
class Usage(BaseModel):
prompt_tokens: int = None
total_tokens: int = None
completion_tokens: int = None
class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
temperature: Optional[float] = None
top_p: Optional[float] = None
max_length: Optional[int] = None
stream: Optional[bool] = False
class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Literal["stop", "length"]
class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]]
class ChatCompletionResponse(BaseModel):
id: str
object: Literal["chat.completion", "chat.completion.chunk"]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
usage: Usage = None
@app.get("/v1/models", response_model=ModelList)
def list_models():
global model_list
return ModelList(data=[ModelCard(id=name) for name in model_list])
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
def create_chat_completion(request: ChatCompletionRequest):
if request.model not in model_list:
raise HTTPException(status_code=400, detail="Invalid Model Name")
global model
id = "chatcmpl-A"
if request.messages[-1].role != "user":
raise HTTPException(status_code=400, detail="Invalid request")
query = request.messages[-1].content
if request.max_length is not None:
max_length = request.max_length
else:
max_length = 1024
if request.temperature is not None:
temperature = request.temperature
else:
temperature = 0.1
if request.top_p is not None:
top_p = request.top_p
else:
top_p = 0.8
prev_messages = request.messages[:-1]
# print(prev_messages)
if len(prev_messages) > 0 and prev_messages[0].role == "system":
query = prev_messages.pop(0).content + query
history = []
if len(prev_messages) % 2 == 0:
for i in range(0, len(prev_messages), 2):
if prev_messages[i].role == "user" and prev_messages[i+1].role == "assistant":
history.append([prev_messages[i].content, prev_messages[i+1].content])
if request.stream:
generate = predict(id=id, query=query, history=history, max_length=max_length, top_p = top_p, temperature = temperature, model_id = request.model)
return EventSourceResponse(generate, media_type="text/event-stream")
response = model.response(query=query, history=history, max_length=max_length, top_p = top_p, temperature = temperature)
choice_data = ChatCompletionResponseChoice(
index=0,
message=ChatMessage(role="assistant", content=response),
finish_reason="stop"
)
prompt_tokens = len(model.tokenizer_encode_string(query))
completion_tokens = len(model.tokenizer_encode_string(response))
usage = Usage(
prompt_tokens = prompt_tokens,
completion_tokens = completion_tokens,
total_tokens = prompt_tokens+completion_tokens,
)
return ChatCompletionResponse(id=id ,model=request.model, choices=[choice_data], object="chat.completion", usage=usage)
def predict(id: str, query: str, history: List[List[str]], model_id: str, max_length: int, top_p: float, temperature: float):
global model
creat_time = int(time.time())
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id, choices=[choice_data], object="chat.completion.chunk")
#yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False)) //pydantic从1.8.0开始不支持dumps_kwags参数,参考https://github.com/THUDM/ChatGLM2-6B/issues/308
yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)
for new_response in model.stream_response(query=query, history=history, max_length=max_length, top_p = top_p, temperature = temperature):
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(content=new_response),
finish_reason=None
)
chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id, choices=[choice_data], object="chat.completion.chunk")
#yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id, choices=[choice_data], object="chat.completion.chunk")
#yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)
yield '[DONE]'
def args_parser():
parser = argparse.ArgumentParser(description = 'chatglm3_openai_api_demo')
parser.add_argument('-p', '--path', type = str, default = "/model", help = '模型文件的路径')
parser.add_argument('-g', '--gpus', type = str, default = "0", help = '指定运行的gpu卡,例如“0,1”')
args = parser.parse_args()
return args
if __name__ == "__main__":
args = args_parser()
global model_list
model_list = ["chatglm3-6b-fastllm"]
global device_map
device_map = ["cuda:"+num for num in args.gpus.split(',')]
llm.set_device_map(device_map)
model = llm.model(args.path)
uvicorn.run(app, host='127.0.0.1', port=8100)
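# Launch sketch (the model path is illustrative): start the server with the converted model and the
# GPUs to use, then point an OpenAI-compatible client at http://127.0.0.1:8100/v1, as the benchmark
# script below does:
#   python openai_api.py -p /path/to/chatglm3-6b-fastllm.flm -g 0,1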
import openai
import time
import threading
import queue
from concurrent.futures import ThreadPoolExecutor, as_completed
def jls_extract_def(model, messages, temperature, max_length, stream, index):
openai.api_base = "http://127.0.0.1:8100/v1"
openai.api_key = "none"
output_tokens = 0
ret = ""
t0 = time.time()
result = openai.ChatCompletion.create(model=model,messages=messages, temperature=temperature, max_length=max_length, stream=stream)
for chunk in result:
# print(chunk)
output_tokens += 1
if hasattr(chunk.choices[0].delta, "content"):
if (index == 0):
print(chunk.choices[0].delta.content, end="", flush=True)
ret += chunk.choices[0].delta.content
t1 = time.time()
# print("\ntoken/s: {:.2f}, output_tokens: {}".format(output_tokens/(t1-t0),output_tokens))
result = output_tokens, ret, output_tokens/(t1-t0)
return result
if __name__ == "__main__":
prompt = "满江红全文"
concurrencys = [1]
temperature = 0.1
max_length = 4096
stream = True
prompts = [prompt]
model="chatglm3-6b-fastllm"
messages=[{"role": "user", "content": "你好"}]
pool = ThreadPoolExecutor(max_workers=32)
for i in range(len(concurrencys)):
cur_prompts = prompts * concurrencys[i]
token_count = 0
threads = []
t0 = time.time()
for index, prompt in enumerate(cur_prompts):
messages[0]["content"] = prompt
t = pool.submit(jls_extract_def, model, messages, temperature, max_length, stream, index)
t.index = index
threads.append(t)
for future in as_completed(threads):
result = future.result()
print(future.index)
print(result)
print("\n")
token_count += result[0]
t1 = time.time()
print("\n---------------------------------------------\n")
print("\nconcurrency: {}".format(concurrencys[i]))
print("\ntotal use: {:.2f}".format(t1-t0))
print("\ntoken/s: {:.2f}, token_count: {}".format(token_count/(t1-t0),token_count))
print("\n---------------------------------------------\n")
uvicorn==0.23.2
pydantic==2.5.1
fastapi==0.103.1
sse_starlette
openai==0.28
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
from fastllm_pytools import torch2flm
if __name__ == "__main__":
modelpath = "baichuan-inc/Baichuan2-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(modelpath, device_map="auto", torch_dtype=torch.float32, trust_remote_code=True)
# normalize lm_head
state_dict = model.state_dict()
state_dict['lm_head.weight'] = torch.nn.functional.normalize(state_dict['lm_head.weight'])
model.load_state_dict(state_dict)
try:
model.generation_config = GenerationConfig.from_pretrained(modelpath)
except:
pass
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "baichuan2-7b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model.to('cpu'), tokenizer, dtype=dtype)
\ No newline at end of file
@@ -14,5 +14,5 @@ if __name__ == "__main__":
except:
pass
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "baichuan-13b-' + dtype + '.flm"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "baichuan-13b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
import argparse
from fastllm_pytools import llm
import time
def args_parser():
parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
parser.add_argument('-p', '--path', type = str, required = True, default = '', help = '模型文件的路径')
args = parser.parse_args()
return args
if __name__ == "__main__":
args = args_parser()
model_path = args.path
prompts = ["深圳有什么好玩的", "上海有什么好玩的", "晚上睡不着怎么办", "南京有什么好吃的"] * 2
prompts = [
"23. 已知动点 $P 、 Q$ 都在曲线 $C:\\left\\{\\begin{array}{l}x=2 \\cos \\beta \\\\ y=2 \\sin \\beta\\end{array}\\right.$ ( $\\beta$ 为参数) 上, 对应参数分别为 $\\beta=\\alpha$ 与 $\\beta=2 \\alpha(0<\\alpha<2 \\pi), M$ 为 $P Q$ 的中点.\n\n(1) 求 $M$ 的轨迹的参数方程;\n\n(2) 将 $M$ 到坐标原点的距离 $d$ 表示为 $\\alpha$ 的函数, 并判断 $M$ 的轨迹是否过坐标 原点.\n",
"23. 在直角坐标系 $x O y$ 中, 曲线 $C_{1}$ 的参数方程为 $\\left\\{\\begin{array}{l}x=2 \\cos \\alpha \\\\ y=2+2 \\sin \\alpha\\end{array}\\right.$ ( $\\alpha$ 为参数) $M$ 是 $\\mathrm{C}_{1}$ 上的动点, $\\mathrm{P}$ 点满足 $\\overrightarrow{\\mathrm{OP}}=2 \\overrightarrow{\\mathrm{OM}}, \\mathrm{P}$ 点的轨迹为曲线 $\\mathrm{C}_{2}$\n\n(I) 求 $\\mathrm{C}_{2}$ 的方程;\n\n(II ) 在以 $O$ 为极点, $x$ 轴的正半轴为极轴的极坐标系中, 射线 $\\theta=\\frac{\\pi}{3}$ 与 $C_{1}$ 的异 于极点的交点为 $A$, 与 $C_{2}$ 的异于极点的交点为 $B$, 求 $|A B|$.\n",
"阅读下面这首唐诗,完成各题。 \n春日秦国怀古 周朴① \n荒郊一望欲消魂②,泾水萦纡傍远村。 \n牛马放多春草尽,原田耕破古碑存。 \n云和积雪苍山晚,烟伴残阳绿树昏。 \n数里黄沙行客路,不堪回首思秦原。 \n[注]①周朴(?~ 878):字太朴,吴兴(今属浙江)人。②消魂;这里形容极\n其哀愁。③ 泾水:渭水支流,在今陕西省中部,古属秦国。 萦纡:旋绕曲折 。 \n(1)这首诗表现了诗人什么样的感情?请简要分析。 \n(2)你认为这首诗在写作上是如何处理情景关系的?\n",
"阅读下面这首宋诗,完成下列各题。 \n次韵雪后书事二首(其一) \n朱熹 \n 惆怅江头几树梅,杖藜行绕去还来。 \n前时雪压无寻处,昨夜月明依旧开。 \n折寄遥怜人似玉,相思应恨劫成灰。 \n沉吟日落寒鸦起,却望柴荆独自回。 \n(1)这首咏梅诗中,作者用什么手法来表现梅花的?请简要分析 \n \n \n(2)诗的最后一联表达了作者什么样的心情?请简要分析。\n",
"17. (12 分) 已知 $a, b, c$ 分别为 $\\triangle A B C$ 三个内角 $A, B, C$ 的对边, $c=\\sqrt{3} a \\sin C-\\cos A$.\n\n(1) 求 $A$;\n\n(2) 若 $a=2, \\triangle A B C$ 的面积为 $\\sqrt{3}$, 求 $b, c$.\n",
"阅读下面这首宋诗,完成下列各题。 \n礼部贡院阅进士就试 欧阳修 \n紫案焚香暖吹轻,广庭清晓席群英。 \n无哗战士衔枚勇,下笔春蚕食叶声。 \n 乡里献贤先德行,朝廷列爵待公卿。 \n自惭衰病心神耗,赖有群公鉴裁精。 \n(1)下列对这首诗的赏析,不恰当的两项是〔 5分〕   。 \nA.诗的第一句写出了考场肃穆而又怡人的环境,衬托出作者的喜悦心情。 \nB.第三句重点在表现考生奋勇争先、一往无前,所以把他们比作战士。 \nC.参加礼部考试的考生都由各地选送而来,道德品行是选送的首要依据。 \nD.朝廷对考生寄予了殷切的期望,希望他们能够成长为国家的栋梁之材。 \nE.作者承认自己体弱多病的事实,表示选材工作要依靠其他考官来完成。 \n(2)本诗的第四句 “下笔春蚕食叶声 ”广受后世称道 ,请赏析这一句的精妙之处 。\n〔6分〕\n",
"阅读下面这首唐诗,完成下列各题。 \n野歌 李贺 \n鸦翎羽箭山桑弓,仰天射落衔芦鸿。 \n麻衣黑肥冲北风,带酒日晚歌田中。 \n男儿屈穷心不穷,枯荣不等嗔天公。 \n寒风又变为春柳,条条看即烟濛濛。 \n(1)下列对这首诗的赏析,不正确的一项是     \nA.弯弓射鸿、麻衣冲风、饮酒高歌都是诗人排解心头苦闷与抑郁的方式。 \nB.诗人虽不得不接受生活贫穷的命运,但意志并未消沉,气概仍然豪迈。 \nC.诗中形容春柳的方式与韩愈《早春呈水部张十八员外》相同,较为常见。 \nD.本诗前半描写场景,后半感事抒怀,描写与抒情紧密关联,脉络清晰。 \n(2)诗的最后两句有何含意?请简要分析。\n",
"17. (12 分) 已知 $a, b, c$ 分别是 $\\triangle A B C$ 内角 $A, B, C$ 的对边, $\\sin ^{2} B=2 \\sin A \\sin C$\n\n(I ) 若 $a=b$, 求 $\\cos B$;\n\n(II ) 设 $B=90^{\\circ}$, 且 $a=\\sqrt{2}$, 求 $\\triangle A B C$ 的面积.\n",
"17. (12 分) 已知等比数列 $\\left\\{a_{n}\\right\\}$ 中, $a_{1}=\\frac{1}{3}$, 公比 $q=\\frac{1}{3}$.\n\n( I ) $S_{n}$ 为 $\\left\\{a_{n}\\right\\}$ 的前 $n$ 项和, 证明: $S_{n}=\\frac{1-a_{n}}{2}$\n\n(II ) 设 $b_{n}=\\log _{3} a_{1}+\\log _{3} a_{2}+\\ldots+\\log _{3} a_{n}$, 求数列 $\\left\\{b_{n}\\right\\}$ 的通项公式.\n",
"阅读下面这首宋词,完成各题。 \n阮郎归 \n无名氏① \n春风吹雨绕残枝,落花无可飞。小池寒渌欲生漪,雨晴还日西。 \n帘半卷,燕双归。讳愁无奈眉②.翻身整顿着残棋,沉吟应劫迟③. \n[注]①作者一作秦观。②讳愁:隐瞒内心的痛苦。③劫:围棋术语。 \n(1)词上半阕的景物描写对全词的感情抒发起了什么作用?请结合内容分析。 \n \n \n(2)末尾两句表现了词人什么样的情绪,是如何表现的,请简要阐述。\n",
"3.(11 分)古代诗歌阅读 阅读下面的宋诗,完成问题。\n内宴奉诏作 曹翰① 三十年前学六韬②,英名常得预时髦③. 曾因国难披金甲,不为家贫卖宝刀。 臂健尚嫌弓力软,眼明犹识阵云高④. 庭前昨夜秋风起,羞见盘花旧战袍。 【注】①曹翰(923~992),宋初名将,②六韬:古代兵书。 ③时髦: 指当代俊杰。 ④阵云:战争中的云气,这里有站阵之意。 (1)诗的颈联又作“臂弱尚嫌弓力软,眼昏犹识阵云高”,你认为哪一种比较好? 为什么?请简要分析。 (2)这首诗与辛弃疾的《破阵子(醉里挑灯看剑)》题材相似,但情感基调却有 所不同,请指出二者的不同之处。\n",
"阅读下面这首乐府诗,完成问题。 \n 雨雪曲 \n江总① \n雨雪隔榆溪②,从军度陇西③.绕阵看狐迹,依山见马蹄。 \n天寒旗彩坏,地暗鼓声低。漫漫愁云起,苍苍别路迷。 \n 【注】①江总( 518~590):南朝陈文学家,字总持,济阳考城(今河南兰\n考)人。历仕梁、陈、隋三朝。 ②榆溪:指边塞。③陇西:在今甘肃东部。 \n(1)这首诗描写了什么样的环境?末句中的 “别路 ”是什么意思? \n \n \n(2)诗人把 “旗彩坏 ”、“鼓声低 ”分别接在 “天寒 ”、“地暗 ”之后,这样写有什么好\n处?这首诗表现了戍卒什么样的情感?\n",
"阅读下面这首宋词,完成下列各题。 \n鹊 桥 仙 \n陆 游 \n华灯纵博,雕鞍驰射,谁记当年豪举①?酒徒一一取封候,独去作江边渔父。 \n轻舟八尺,低逢三扇,占断苹洲烟雨②.镜湖③元自属闲人,又何必君恩赐与! \n 【注】①这三句是追忆当年军中的生活。博,古代的一种棋戏。 ②占断:\n占尽。苹洲烟雨:指长满苹草、烟雨空濛的风光。③镜湖:即鉴湖,在今浙\n江绍兴。唐天宝初,贺知章请求回家乡会稽当道士,玄宗诏赐他镜湖一角。 \n(1)上阕最后两句是什么意思?他表达了作者什么意思的情感? \n \n \n \n(2)词的结尾借用了贺知章的故事,这有什么用意?请简要分析。\n",
"阅读下面这首宋词,完成下列各题。 \n思远人 \n晏几道红叶黄花秋意晚,千里念行客。飞云过尽,归鸿无信,何处寄书得。 \n泪弹不尽临窗滴,就砚旋研墨。渐写到别来,此情深处,红笺为无色。 \n(1)这首词表达了什么样的感情? “红叶黄花秋意晚 ”一句对表达这种感情有什\n么作用? \n \n \n(2)“就砚旋研墨 ”与“临窗滴 ”有什么关系? “红笺为无色 ”的原因是什么?请简要\n分析。\n",
"24. 设函数 $f(x)=|x-a|+3 x$, 其中 $a>0$.\n\n(I) 当 $a=1$ 时, 求不等式 $f(x) \\geqslant 3 x+2$ 的解集\n\n(II ) 若不等式 $f(x) \\leqslant 0$ 的解集为 $\\{x \\mid x \\leqslant-1\\}$, 求 $a$ 的值.\n",
"19. (10 分) 为调查某地区老年人是否需要志愿者提供帮助, 用简单随机抽样 方法从该地区调查了 500 位老年人,结果如表:\n\n\\begin{tabular}{|c|c|c|}\n\\hline 性别 & 男 & 女 \\\\\n\\hline 是否需要志愿者 & & \\\\\n\\hline 需要 & 40 & 30 \\\\\n\\hline 不需要 & 160 & 270 \\\\\n\\hline\n\\end{tabular}\n\n(1)估计该地区老年人中,需要志愿者提供帮助的比例;\n\n(2) 能否有 $99 \\%$ 的把握认为该地区的老年人是否需要志愿者提供帮助与性别有 关?\n\n(3) 根据(2)的结论, 能否提出更好的调查方法来估计该地区的老年人中需要 志愿者提供帮助的老年人比例? 说明理由. \n\n\\begin{tabular}{|c|c|c|c|}\n\\hline$P\\left(K^{2} \\geqslant k\\right)$ & 0.050 & 0.010 & 0.001 \\\\\n\\hline & 3.841 & 6.635 & 10.828 \\\\\n\\hline\n\\end{tabular}\n\n附: $K^{2}=\\frac{n(a d-b c)^{2}}{(a+b)(c+d)(a+c)(b+d)}$.\n"
] * 2
prompts = ["满江红全文"] * 32
print(prompts)
responses, historys = [], []
use_hf_model = False
if use_hf_model:
# Using Qwen as an example
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True, fp32=True).eval()
model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) # a different generation length, top_p and other related hyper-parameters can be specified here
llm.set_device_map(["cuda:1", "cuda:2"])
model = llm.from_hf(model, tokenizer, dtype = "float16")
t0 = time.time()
responses, historys = model.chat_batch(tokenizer, prompts)
t1 = time.time()
else:
# llm.set_device_map(["cuda:1", "cuda:2"])
model = llm.model(model_path)
t0 = time.time()
responses, historys = model.response_batch(prompts)
t1 = time.time()
token_output_count = 0
word_len = 0
for i, res in enumerate(responses):
tokens = model.tokenizer_encode_string(res)
token_output_count += len(tokens)
word_len += len(res)
print("batch index: ", i)
print(res)
print(historys[i])
print("\n")
print("\ntokens_count: {}, token/s: {:.2f}, character/s: {:.2f}".format(token_output_count, token_output_count/(t1-t0), word_len/(t1-t0)))
@@ -3,10 +3,11 @@ from transformers import AutoTokenizer, AutoModel
from fastllm_pytools import torch2flm
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model_path = "/home/ZhipuAI/chatglm3-6b"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
model = model.eval()
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-' + dtype + '.flm"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
import argparse
from fastllm_pytools import llm
import time
def args_parser():
parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
@@ -21,9 +22,24 @@ if __name__ == "__main__":
history = []
print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
continue
print("AI:", end = "");
curResponse = "";
for response in model.stream_response(query, history = history):
curResponse += response;
print("AI:", end = "")
curResponse = ""
prompt = model.get_prompt(query, history)
tokens = model.tokenizer_encode_string(prompt)
token_input_count = len(tokens)
token_count = 0
t0 = time.time()
for response in model.stream_response(query, history = history, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01):
curResponse += response
print(response, flush = True, end = "")
history.append((query, curResponse))
\ No newline at end of file
token_count += 1
t1 = time.time()
word_len = len(curResponse)
print("\n\ntoken_input_count", token_input_count)
print("\ntoken/s: {:.2f}, character/s: {:.2f}".format(token_count/(t1-t0), word_len/(t1-t0)))
history.append((query, curResponse))
model.release_memory()
\ No newline at end of file
"""
Implements chatglm3 by building the prompt with the Python-side tokenizer. It requires changing the assignment of model.config.model_type for chatglm3 in hf_model.py during model conversion, and is not recommended for external use.
"""
import argparse
from fastllm_pytools import llm
import time
from transformers import AutoTokenizer, AutoModel
def args_parser():
parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
parser.add_argument('-p', '--path', type = str, required = True, default = '', help = '模型文件的路径')
args = parser.parse_args()
return args
if __name__ == "__main__":
args = args_parser()
model_path = args.path
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
# model = llm.model(args.path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = llm.from_hf(model, tokenizer, dtype = "float16")
history = []
print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
while True:
query = input("\n用户:")
if query.strip() == "stop":
break
if query.strip() == "clear":
history = []
print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
continue
print("AI:", end = "")
current_length = 0
token_count = 0
t0 = time.time()
for response, history in model.stream_chat(tokenizer, query, history=history):
print(response[current_length:], end="", flush=True)
token_count += 1
current_length = len(response)
t1 = time.time()
print("\ntoken/s: {:.2f}, character/s: {:.2f}".format(token_count/(t1-t0), current_length/(t1-t0)))
import sys
import struct
import numpy as np
import torch
import binascii
from transformers import AutoTokenizer, AutoModel
from fastllm_pytools import torch2flm
def glmtofile(exportPath,
model,
tokenizer = None,
dtype = "float16"):
if (dtype not in torch2flm.fastllm_data_type_dict):
print("dtype should in ", list(torch2flm.fastllm_data_type_dict.keys()))
exit(0)
dict = model.state_dict()
fo = open(exportPath, "wb")
# 0. version id
fo.write(struct.pack('i', 2))
# 0.1 model info
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if ("model_type" not in modelInfo):
print("unknown model_type.")
exit(0)
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
modelInfo["tokenizer_serialized"]=binascii.hexlify(tokenizer.sp_model.serialized_model_proto()).decode("latin-1") # sentencepiece分词器序列化存储
if hasattr(model, "peft_config"):
adapter_size = len(model.peft_config)
modelInfo["peft_size"] = adapter_size
fo.write(struct.pack('i', len(modelInfo)))
for it in modelInfo.keys():
torch2flm.writeKeyValue(fo, str(it), str(modelInfo[it]))
if hasattr(model, "peft_config"):
for adapter_name in model.peft_config.keys():
adapter_dict = model.peft_config[adapter_name].__dict__
torch2flm.writeString(fo, adapter_name)
fo.write(struct.pack('i', len(adapter_dict)))
for it in adapter_dict.keys():
torch2flm.writeKeyValue(fo, str(it), str(adapter_dict[it]))
# 1. vocab
if (tokenizer):
if (hasattr(tokenizer, "tokenizer")):
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
fo.write(struct.pack('i', piece_size))
for i in range(piece_size):
s = tokenizer.sp_model.id_to_piece(i).encode()
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', i))
fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
else:
vocab = tokenizer.get_vocab()
fo.write(struct.pack('i', len(vocab)))
for v in vocab.keys():
s = v.encode()
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', vocab[v]))
fo.write(struct.pack('f', 1.0))
else:
fo.write(struct.pack('i', 0))
weight_type_dict = {}
module_dict = {}
for key, m in model.named_modules():
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
# 2. weight
fo.write(struct.pack('i', len(dict)))
tot = 0
for key in dict:
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in torch2flm.fastllm_weight_type_dict):
cur_weight_type = torch2flm.fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = torch2flm.fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7
ori_np_data_type = np.float16
cur = dict[key].numpy().astype(ori_np_data_type)
if hasattr(model, "peft_config"):
weight_name = key.replace('base_model.model.', '')
fo.write(struct.pack('i', len(weight_name)))
fo.write(weight_name.encode())
else:
fo.write(struct.pack('i', len(key)))
fo.write(key.encode())
fo.write(struct.pack('i', len(cur.shape)))
for i in cur.shape:
fo.write(struct.pack('i', i))
if (to_data_type == 3):
torch2flm.write_int8(fo, cur)
elif (to_data_type == 8):
torch2flm.write_int4(fo, cur)
else:
fo.write(struct.pack('i', to_data_type))
fo.write(cur.data)
tot += 1
print("output (", tot, "/", len(dict), end = " )\r")
print("\nfinish.")
fo.close()
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-large-chinese", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/glm-large-chinese", trust_remote_code=True)
model = model.eval()
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float32"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "glm-" + dtype + ".flm"
glmtofile(exportPath, model, tokenizer, dtype = dtype)