Commit aefd9f11 authored by zhouxiang

fastllm inference framework for the DCU platform

//
// Created by huangyuyang on 5/11/23.
//
#include "utils.h"
#include "chatglm.h"
#include <cmath>
#include <chrono>
#include <algorithm>
#include <map>
#include <sstream>
#include <unordered_map>
#include <cstring>
#ifdef USE_CUDA
#include "fastllm-cuda.cuh"
#endif
namespace fastllm {
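// UpdateSinCos precomputes the RoPE sin/cos tables: positions are divided by the rope ratio before
// being multiplied by the inverse frequencies, and the flattened tables are cached in sinData/cosData.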
void ChatGLMModel::UpdateSinCos(float rope) {
if (rope == this->rope) {
return;
}
this->rope = rope;
sin.resize(max_positions);
cos.resize(max_positions);
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
invFreq.push_back(1.0 / pow(10000, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
sin[i][j] = ::sin((float)i / rope * invFreq[j]);
cos[i][j] = ::cos((float)i / rope * invFreq[j]);
}
}
std::vector <float> fsin, fcos;
for (int i = 0; i < sin.size(); i++) {
for (int j = 0; j < sin[0].size(); j++) {
fsin.push_back(sin[i][j]);
fcos.push_back(cos[i][j]);
}
}
sinData.CopyFrom(Data(DataType::FLOAT32, {(int)this->sin.size(), (int)this->sin[0].size()}, fsin));
cosData.CopyFrom(Data(DataType::FLOAT32, {(int)this->cos.size(), (int)this->cos[0].size()}, fcos));
}
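// Constructor: sets the ChatGLM-6B (v1) bos/eos token ids and registers the embedding weight names
// used by both the v1 and v2 checkpoint layouts.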
ChatGLMModel::ChatGLMModel() {
this->model_type = "chatglm";
this->bos_token_id = 130004;
this->eos_token_id = 130005;
this->rope = -1.0;
this->UpdateSinCos(1.0f);
weight.embeddingNames.insert("transformer.word_embeddings.weight");
weight.embeddingNames.insert("transformer.embedding.word_embeddings.weight");
}
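// Single-request Forward is a thin wrapper that calls ForwardBatch with batch = 1.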
int ChatGLMModel::Forward(const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<Data, Data>> &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
std::vector <float> *logits) {
std::vector <std::vector <float>*> batchLogits;
batchLogits.push_back(logits);
return ForwardBatch(1, inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, lastTokens, &batchLogits)[0];
}
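// ForwardBatch: one decoder pass over a batch that shares a single inputIds/attentionMask/positionIds
// tensor. It dispatches between the ChatGLM v1 layout (LayerNorm + GELU MLP) and the ChatGLM2 v2
// layout (RMSNorm + SwiGLU MLP) based on GetVersion().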
std::vector <int> ChatGLMModel::ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
if (this->weight.dicts.find("rope_ratio") != this->weight.dicts.end()) {
UpdateSinCos(atof(this->weight.dicts["rope_ratio"].c_str()));
}
int maxLen = inputIds.dims[1];
Data inputEmbeddings;
Data attenInput;
Data qkv, q, k, v;
Data attnProbs;
Data attnOutput;
Data contextLayer;
Data mlpInput;
Data middle, middle2;
Data temp;
std::vector<int> lastRet;
// ChatGLMBlock
int version = GetVersion();
std::string weightPre, weightMiddle;
if (version == 1) {
weightPre = "transformer.layers.";
weightMiddle = ".attention";
} else if (version == 2) {
weightPre = "transformer.encoder.layers.";
weightMiddle = ".self_attention";
}
// ChatGLM2
Data inputIdsPermute;
Permute(inputIds, {1, 0}, inputIdsPermute);
Embedding(inputIdsPermute, this->weight["transformer" + std::string((version == 2 ? ".embedding" : "")) +
".word_embeddings.weight"], inputEmbeddings);
Data &hiddenStates = inputEmbeddings;
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
if (version == 1) {
std::string inputLNWeightName = "transformer.layers." + std::to_string(i) + ".input_layernorm.weight";
std::string inputLNBiasName = "transformer.layers." + std::to_string(i) + ".input_layernorm.bias";
LayerNorm(hiddenStates, weight[inputLNWeightName], weight[inputLNBiasName], -1, attenInput);
} else if (version == 2) {
std::string inputRMSWeightName =
"transformer.encoder.layers." + std::to_string(i) + ".input_layernorm.weight";
RMSNorm(hiddenStates, weight[inputRMSWeightName], 1e-5, attenInput);
}
std::string qkvWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.weight";
std::string qkvBiasName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.bias";
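// When a PEFT adapter is active, the fused QKV projection goes through LoraLayer or IA3Layer instead
// of a plain Linear.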
if (!adapterName.empty()) {
std::string peftType = weight.peftDict[adapterName]["peft_type"];
if (peftType == "LORA") {
std::string loraAWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.lora_A." + adapterName + ".weight";
std::string loraBWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.lora_B." + adapterName + ".weight";
LoraLayer(attenInput, weight[qkvWeightName], weight[loraAWeightName], weight[loraBWeightName], weight[qkvBiasName], qkv, weight.peftDict[adapterName]);
} else if (peftType == "IA3") {
std::string ia3WeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.ia3_l" + adapterName + ".weight";
IA3Layer(attenInput, weight[qkvWeightName], weight[ia3WeightName], weight[qkvBiasName], qkv, weight.peftDict[adapterName]);
}
} else {
Linear(attenInput, weight[qkvWeightName], weight[qkvBiasName], qkv);
}
if (version == 1) {
qkv.Reshape({qkv.dims[0], qkv.dims[1], num_attention_heads, -1});
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
fastllm::RotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
fastllm::RotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
} else if (version == 2) {
int qLen = embed_dim, kvLen = (qkv.dims.back() - embed_dim) / 2;
Split(qkv, -1, 0, qLen, q);
Split(qkv, -1, qLen, qLen + kvLen, k);
Split(qkv, -1, qLen + kvLen, qLen + kvLen + kvLen, v);
q.Reshape({q.dims[0], q.dims[1], -1, embed_dim / num_attention_heads});
k.Reshape({k.dims[0], k.dims[1], -1, embed_dim / num_attention_heads});
v.Reshape({v.dims[0], v.dims[1], -1, embed_dim / num_attention_heads});
fastllm::NearlyRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
fastllm::NearlyRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
}
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
if (GetKVCacheInCPU()) {
pastKey.lockInCPU = true;
pastValue.lockInCPU = true;
} else {
pastKey.ToDevice(DataDevice::CUDA);
pastValue.ToDevice(DataDevice::CUDA);
}
k.Resize({k.dims[0], k.dims[1] * k.dims[2], k.dims[3]});
v.Resize({v.dims[0], v.dims[1] * v.dims[2], v.dims[3]});
PermuteSelf(k, {1, 0, 2});
PermuteSelf(v, {1, 0, 2});
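// The KV cache is grown in chunks of unitLen tokens (128 with CUDA/DCU, 64 on CPU) so that CatDirect
// can append new keys/values in place without reallocating on every decode step.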
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 &&
(pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && (pastKey.expansionDims.size() == 0 ||
pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1]))) {
std::vector<int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector<int>{k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
if (generationConfig.output_token_limit > 0) {
newDims[1] = std::min(newDims[1], k.dims[1] + generationConfig.output_token_limit);
}
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 &&
(pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && (pastValue.expansionDims.size() == 0 ||
pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1]))) {
std::vector<int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector<int>{v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
if (generationConfig.output_token_limit > 0) {
newDims[1] = std::min(newDims[1], v.dims[1] + generationConfig.output_token_limit);
}
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, k, 1);
CatDirect(pastValue, v, 1);
std::vector<int> outputSize = {q.dims[1], q.dims[2], q.dims[0], pastKey.dims[1]};
q.Reshape({q.dims[0], q.dims[1] * q.dims[2], q.dims[3]});
PermuteSelf(q, {1, 0, 2});
//Attention(q, pastKey, pastValue, attentionMask, contextLayer, q.dims[0] / pastKey.dims[0], 1.0 / scale_attn, 1);
// 1.2 Attention
// 1.2.0 q * k^T
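// The scores are first scaled by 1/(scale_attn * (layer + 1)) and later multiplied by (layer + 1)
// before the softmax, which is equivalent to the plain 1/scale_attn scaling but keeps intermediate
// values small (ChatGLM's query/key layer-scaling trick).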
q.Reshape({pastKey.dims[0], -1, q.dims[2]});
MatMulTransB(q, pastKey, attnProbs, 1.0 / (scale_attn * (i + 1)));
attnProbs.Reshape(outputSize);
// 1.2.1 Mask
if (attentionMask.dims.size() != 0) {
AttentionMask(attnProbs, attentionMask, -10000);
}
// 1.2.2 softmax
Mul(attnProbs, i + 1, attnProbs);
Softmax(attnProbs, attnProbs, -1);
outputSize = {1, pastValue.dims[0], q.dims[1], pastValue.dims[1]};
attnProbs.Reshape({outputSize[0] * outputSize[1], outputSize[2], -1});
// 1.2.3 prob * v
attnProbs.Reshape({pastValue.dims[0], -1, attnProbs.dims[2]});
MatMul(attnProbs, pastValue, contextLayer);
contextLayer.Reshape({batch, num_attention_heads, maxLen, -1});
PermuteSelf(contextLayer, {2, 0, 1, 3});
contextLayer.Reshape({contextLayer.dims[0], contextLayer.dims[1], embed_dim});
// 1.2.4 dense
std::string denseWeightName = weightPre + std::to_string(i) + weightMiddle + ".dense.weight";
std::string denseBiasName = weightPre + std::to_string(i) + weightMiddle + ".dense.bias";
Linear(contextLayer, weight[denseWeightName], weight[denseBiasName], attnOutput);
// 1.3
if (GetVersion() == 1) {
float alpha = sqrt(2 * block_cnt);
Mul(attenInput, alpha, hiddenStates);
AddTo(hiddenStates, attnOutput);
std::string postLNWeightName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.weight";
std::string postLNBiasName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.bias";
LayerNorm(hiddenStates, weight[postLNWeightName], weight[postLNBiasName], -1, mlpInput);
// 1.4 MLP
std::string fcInKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_h_to_4h";
std::string fcOutKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_4h_to_h";
Linear(mlpInput, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
GeluNew(middle, middle);
Linear(middle, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, mlpInput, alpha);
} else {
AddTo(hiddenStates, attnOutput);
std::string postRMSWeightName =
"transformer.encoder.layers." + std::to_string(i) + ".post_attention_layernorm.weight";
Mul(hiddenStates, 1.0, temp);
RMSNorm(hiddenStates, weight[postRMSWeightName], 1e-5, mlpInput);
// 1.4 MLP
std::string fcInKeyName = "transformer.encoder.layers." + std::to_string(i) + ".mlp.dense_h_to_4h";
std::string fcOutKeyName = "transformer.encoder.layers." + std::to_string(i) + ".mlp.dense_4h_to_h";
Linear(mlpInput, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
Swiglu(middle, middle2);
Linear(middle2, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, temp);
}
}
Data logits, topk;
if (version == 1) {
LayerNorm(hiddenStates, weight["transformer.final_layernorm.weight"],
weight["transformer.final_layernorm.bias"], -1, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
} else {
RMSNorm(hiddenStates, weight["transformer.encoder.final_layernorm.weight"], 1e-5, hiddenStates);
Linear(hiddenStates, weight["transformer.output_layer.weight"], Data(), logits);
}
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
(*retLogits)[b]->resize(size);
memcpy((float*)(*retLogits)[b]->data(), ((float*)logits.cpuData) + base * size, size * logits.unitSize);
}
}
if (generationConfig.IsSimpleGreedy()) {
TopK(logits, topk, 1);
topk.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
lastRet.push_back((int) (((float *) topk.cpuData)[base * 2] + 1e-3));
}
} else if (!lastTokens.units.empty()) {
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
lastRet.push_back(LLMSampling(logits, base, generationConfig, lastTokens.units[b]));
}
}
return lastRet;
}
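// Batched overload: handles several independent requests with different sequence lengths. Inputs are
// packed along the sequence dimension, each request keeps its own KV cache in pastKeyValues, and the
// per-request results are split back out at the end.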
std::vector <int> ChatGLMModel::ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
if (this->weight.dicts.find("rope_ratio") != this->weight.dicts.end()) {
UpdateSinCos(atof(this->weight.dicts["rope_ratio"].c_str()));
}
int seqLen = inputIds.dims[1];
sinData.ToDevice(DataDevice::CUDA);
cosData.ToDevice(DataDevice::CUDA);
int version = GetVersion();
std::string weightPre, weightMiddle;
if (version == 1) {
weightPre = "transformer.layers.";
weightMiddle = ".attention";
} else if (version == 2) {
weightPre = "transformer.encoder.layers.";
weightMiddle = ".self_attention";
}
Data inputEmbeddings;
Data inputIdsPermute;
Permute(inputIds, {1, 0}, inputIdsPermute);
Embedding(inputIdsPermute, this->weight["transformer" + std::string((version == 2 ? ".embedding" : "")) +
".word_embeddings.weight"], inputEmbeddings);
Data &hiddenStates = inputEmbeddings;
hiddenStates.ToDevice(DataDevice::CUDA);
Data attenInput;
Data qkv, q, k, v;
Data attnOutput;
Data mlpInput, middle, middle2;
std::vector <Data> attnProbs;
std::vector <Data> curContextLayer;
std::vector <Data> curKs, curVs, curQs;
attnProbs.resize(batch);
curContextLayer.resize(batch);
curKs.resize(batch);
curVs.resize(batch);
curQs.resize(batch);
bool all1 = true;
for (int i = 0; i < batch; i++) {
all1 &= (seqLens[i] == 1);
}
if (batch > 1) {
positionIds[0]->Expansion({2, seqLen});
for (int i = 1; i < batch; i++) {
CatDirect(*(Data*)positionIds[0], *(Data*)positionIds[i], 1);
}
}
std::vector <Data*> keys, values, qs, attns, contexts;
keys.resize(batch);
values.resize(batch);
qs.resize(batch);
attns.resize(batch);
contexts.resize(batch);
std::vector <Data*> pointersK, pointersV, pointersQ;
pointersK.resize(batch);
pointersV.resize(batch);
pointersQ.resize(batch);
std::vector <std::vector <int> > outputSizes;
outputSizes.resize(batch);
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
if (version == 1) {
std::string inputLNWeightName = "transformer.layers." + std::to_string(i) + ".input_layernorm.weight";
std::string inputLNBiasName = "transformer.layers." + std::to_string(i) + ".input_layernorm.bias";
LayerNorm(hiddenStates, weight[inputLNWeightName], weight[inputLNBiasName], -1, attenInput);
} else if (version == 2) {
std::string inputRMSWeightName =
"transformer.encoder.layers." + std::to_string(i) + ".input_layernorm.weight";
RMSNorm(hiddenStates, weight[inputRMSWeightName], 1e-5, attenInput);
}
std::string qkvWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.weight";
std::string qkvBiasName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.bias";
if (!adapterName.empty()) {
std::string peftType = weight.peftDict[adapterName]["peft_type"];
if (peftType == "LORA") {
std::string loraAWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.lora_A." + adapterName + ".weight";
std::string loraBWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.lora_B." + adapterName + ".weight";
LoraLayer(attenInput, weight[qkvWeightName], weight[loraAWeightName], weight[loraBWeightName], weight[qkvBiasName], qkv, weight.peftDict[adapterName]);
} else if (peftType == "IA3") {
std::string ia3WeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.ia3_l" + adapterName + ".weight";
IA3Layer(attenInput, weight[qkvWeightName], weight[ia3WeightName], weight[qkvBiasName], qkv, weight.peftDict[adapterName]);
}
} else {
Linear(attenInput, weight[qkvWeightName], weight[qkvBiasName], qkv);
}
if (version == 1) {
qkv.Reshape({qkv.dims[0], qkv.dims[1], num_attention_heads, -1});
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
} else if (version == 2) {
int qLen = embed_dim, kvLen = (qkv.dims.back() - embed_dim) / 2;
Split(qkv, -1, 0, qLen, q);
Split(qkv, -1, qLen, qLen + kvLen, k);
Split(qkv, -1, qLen + kvLen, qLen + kvLen + kvLen, v);
q.Reshape({q.dims[0], q.dims[1], -1, embed_dim / num_attention_heads});
k.Reshape({k.dims[0], k.dims[1], -1, embed_dim / num_attention_heads});
v.Reshape({v.dims[0], v.dims[1], -1, embed_dim / num_attention_heads});
}
if (version == 1) {
fastllm::RotatePosition2D(q, *positionIds[0], sinData, cosData, rotary_dim);
fastllm::RotatePosition2D(k, *positionIds[0], sinData, cosData, rotary_dim);
} else if (version == 2) {
fastllm::NearlyRotatePosition2D(q, *positionIds[0], sinData, cosData, rotary_dim);
fastllm::NearlyRotatePosition2D(k, *positionIds[0], sinData, cosData, rotary_dim);
}
k.Resize({k.dims[0], k.dims[1] * k.dims[2], k.dims[3]});
v.Resize({v.dims[0], v.dims[1] * v.dims[2], v.dims[3]});
q.Resize({q.dims[0], q.dims[1] * q.dims[2], q.dims[3]});
Data contextLayer = Data(DataType::FLOAT32);
int total = 0;
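// Fast path: when every request contributes exactly one token (pure decode), the batched kernels
// (SplitBatch / MatMulTransBBatch / MatMulBatch) process all requests at once instead of looping.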
if (all1 && batch > 1) {
for (int b = 0; b < batch; b++) {
pointersK[b] = (&curKs[b]);
pointersV[b] = (&curVs[b]);
pointersQ[b] = (&curQs[b]);
}
SplitBatch(k, 0, batch, pointersK);
SplitBatch(v, 0, batch, pointersV);
SplitBatch(q, 0, batch, pointersQ);
total = batch;
for (int b = 0; b < batch; b++) {
auto &q = curQs[b], &k = curKs[b], &v = curVs[b];
std::swap(k.dims[0], k.dims[1]);
k.strides[0] = k.dims[1] * k.dims[2]; k.strides[1] = k.dims[2];
std::swap(v.dims[0], v.dims[1]);
v.strides[0] = v.dims[1] * v.dims[2]; v.strides[1] = v.dims[2];
std::swap(q.dims[0], q.dims[1]);
q.strides[0] = q.dims[1] * q.dims[2]; q.strides[1] = q.dims[2];
}
} else {
PermuteSelf(k, {1, 0, 2});
PermuteSelf(v, {1, 0, 2});
PermuteSelf(q, {1, 0, 2});
for (int b = 0; b < batch; b++) {
Split(k, 1, total, total + seqLens[b], curKs[b]);
Split(v, 1, total, total + seqLens[b], curVs[b]);
Split(q, 1, total, total + seqLens[b], curQs[b]);
total += seqLens[b];
}
}
for (int b = 0; b < batch; b++) {
auto &q = curQs[b], &k = curKs[b], &v = curVs[b];
Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt +
i].second;
pastKey.ToDevice(DataDevice::CUDA);
pastValue.ToDevice(DataDevice::CUDA);
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 &&
(pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1])) {
std::vector<int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector<int>{k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
if (generationConfigs[b].output_token_limit > 0) {
newDims[1] = std::min(newDims[1], k.dims[1] + generationConfigs[b].output_token_limit);
}
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 &&
(pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1])) {
std::vector<int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector<int>{v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
if (generationConfigs[b].output_token_limit > 0) {
newDims[1] = std::min(newDims[1], v.dims[1] + generationConfigs[b].output_token_limit);
}
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
}
for (int b = 0; b < batch; b++) {
keys[b] = (pastKeyValues[b * block_cnt + i].first);
values[b] = (pastKeyValues[b * block_cnt + i].second);
pointersK[b] = (&curKs[b]);
pointersV[b] = (&curVs[b]);
}
CatDirectBatch(keys, pointersK, 1);
CatDirectBatch(values, pointersV, 1);
for (int b = 0; b < batch; b++) {
auto &q = curQs[b];
Data &pastKey = *pastKeyValues[b * block_cnt + i].first;
outputSizes[b] = {1, q.dims[0], q.dims[1], pastKey.dims[1]};
q.Reshape({pastKey.dims[0], -1, q.dims[2]});
}
// 1.2 Attention
// 1.2.0 q * k^T
if (all1 && batch > 1) {
for (int b = 0; b < batch; b++) {
qs[b] = (&curQs[b]);
keys[b] = (pastKeyValues[b * block_cnt + i].first);
attns[b] = (&attnProbs[b]);
}
MatMulTransBBatch(qs, keys, attns, 1.0 / (scale_attn * (i + 1)));
} else {
for (int b = 0; b < batch; b++) {
auto &q = curQs[b];
Data &pastKey = *pastKeyValues[b * block_cnt + i].first;
MatMulTransB(q, pastKey, attnProbs[b], 1.0 / (scale_attn * (i + 1)));
}
}
for (int b = 0; b < batch; b++) {
attnProbs[b].Reshape(outputSizes[b]);
// 1.2.1 Mask
if (attentionMask[b] != nullptr) {
AttentionMask(attnProbs[b], *attentionMask[b], -10000);
}
}
// 1.2.2 softmax
for (int i = 0; i < attnProbs.size(); i++) {
attns[i] = (&attnProbs[i]);
}
MulBatch(attns, i + 1, attns);
SoftmaxBatch(attns, attns, -1);
for (int b = 0; b < batch; b++) {
Data &pastValue = *pastKeyValues[b * block_cnt + i].second;
outputSizes[b] = {1, num_attention_heads, -1, pastValue.dims[2]};
attnProbs[b].Reshape({pastValue.dims[0], -1, attnProbs[b].dims[3]});
}
// 1.2.3 prob * v
if (all1 && batch > 1) {
for (int b = 0; b < batch; b++) {
attns[b] = (&attnProbs[b]);
values[b] = (pastKeyValues[b * block_cnt + i].second);
contexts[b] = (&curContextLayer[b]);
}
MatMulBatch(attns, values, contexts);
} else {
for (int b = 0; b < batch; b++) {
Data &pastValue = *pastKeyValues[b * block_cnt + i].second;
MatMul(attnProbs[b], pastValue, curContextLayer[b]);
}
}
if (all1) {
for (int b = 0; b < batch; b++) {
curContextLayer[b].dims[0] = outputSizes[b][2];
curContextLayer[b].dims[1] = outputSizes[b][0];
curContextLayer[b].dims[2] = embed_dim;
curContextLayer[b].strides[0] = curContextLayer[b].dims[1] * curContextLayer[b].dims[2];
curContextLayer[b].strides[1] = curContextLayer[b].dims[2];
curContextLayer[b].strides[2] = 1;
}
} else {
for (int b = 0; b < batch; b++) {
curContextLayer[b].Reshape(outputSizes[b]);
PermuteSelf(curContextLayer[b], {2, 0, 1, 3});
curContextLayer[b].Reshape({curContextLayer[b].dims[0], curContextLayer[b].dims[1], embed_dim});
}
}
if (all1 && batch > 1) {
for (int b = 0; b < batch; b++) {
contexts[b] = (&curContextLayer[b]);
}
CatBatch(contexts, 0, contextLayer);
} else {
for (int b = 0; b < batch; b++) {
if (contextLayer.dims.size() == 0) {
std::vector<int> dims = curContextLayer[b].dims;
dims[0] = total;
contextLayer.Expansion(dims);
}
contextLayer.ToDevice(DataDevice::CUDA);
CatDirect(contextLayer, curContextLayer[b], 0);
}
}
// 1.2.4 dense
std::string denseWeightName = weightPre + std::to_string(i) + weightMiddle + ".dense.weight";
std::string denseBiasName = weightPre + std::to_string(i) + weightMiddle + ".dense.bias";
Linear(contextLayer, weight[denseWeightName], weight[denseBiasName], attnOutput);
if (GetVersion() == 1) {
float alpha = sqrt(2 * block_cnt);
Mul(attenInput, alpha, hiddenStates);
AddTo(hiddenStates, attnOutput);
std::string postLNWeightName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.weight";
std::string postLNBiasName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.bias";
LayerNorm(hiddenStates, weight[postLNWeightName], weight[postLNBiasName], -1, mlpInput);
// 1.4 MLP
std::string fcInKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_h_to_4h";
std::string fcOutKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_4h_to_h";
Linear(mlpInput, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
GeluNew(middle, middle);
Linear(middle, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, mlpInput, alpha);
} else {
AddTo(hiddenStates, attnOutput);
std::string postRMSWeightName =
"transformer.encoder.layers." + std::to_string(i) + ".post_attention_layernorm.weight";
Data temp;
Mul(hiddenStates, 1.0, temp);
RMSNorm(hiddenStates, weight[postRMSWeightName], 1e-5, mlpInput);
// 1.4 MLP
std::string fcInKeyName = "transformer.encoder.layers." + std::to_string(i) + ".mlp.dense_h_to_4h";
std::string fcOutKeyName = "transformer.encoder.layers." + std::to_string(i) + ".mlp.dense_4h_to_h";
Linear(mlpInput, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
Swiglu(middle, middle2);
Linear(middle2, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, temp);
}
}
Data logits;
if (version == 1) {
LayerNorm(hiddenStates, weight["transformer.final_layernorm.weight"],
weight["transformer.final_layernorm.bias"], -1, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
} else {
RMSNorm(hiddenStates, weight["transformer.encoder.final_layernorm.weight"], 1e-5, hiddenStates);
Linear(hiddenStates, weight["transformer.output_layer.weight"], Data(), logits);
}
std::vector <int> lastRet;
int total = 0;
Data curLogit;
for (int b = 0; b < batch; b++) {
Split(logits, 0, total + seqLens[b] - 1, total + seqLens[b], curLogit);
if (generationConfigs[b].output_logits && retLogits != nullptr && (*retLogits)[b] != nullptr) {
curLogit.ToDevice(DataDevice::CPU);
(*retLogits)[b]->resize(curLogit.Count(0));
memcpy((float*)(*retLogits)[b]->data(), (float*)curLogit.cpuData, curLogit.GetBytes());
}
if (generationConfigs[b].IsSimpleGreedy()) {
Data topk;
TopK(curLogit, topk, 1);
topk.ToDevice(DataDevice::CPU);
lastRet.push_back((int) (((float *) topk.cpuData)[0] + 1e-3));
} else {
lastRet.push_back(LLMSampling(curLogit, 0, generationConfigs[b], lastTokens.units[b]));
}
total += seqLens[b];
}
return lastRet;
}
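// FillLLMInputs builds the inputs for a single request. On the first call (index == 0) it appends or
// prepends the special tokens and builds a full attention mask plus 2D position ids; on later calls it
// feeds one token with incremental position ids and an empty mask.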
void ChatGLMModel::FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds) {
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
int gmask_token_id = this->weight.dicts.find("gmask_token_id") != this->weight.dicts.end() ?
atoi(this->weight.dicts["gmask_token_id"].c_str()) : 130001;
int index = params.find("index")->second;
int promptLen = params.find("promptLen")->second;
if (index == 0) {
for (auto &ids: inputTokens) {
if (GetVersion() == 1) {
ids.push_back(gmask_token_id);
ids.push_back(bos_token_id);
} else if (GetVersion() == 2) {
if (ids.size() < 2 || ids[0] != 64790 || ids[1] != 64792) {
ids.insert(ids.begin(), 64792);
ids.insert(ids.begin(), 64790);
}
}
}
int seqLen = inputTokens[0].size();
std::vector<float> vmask = std::vector<float>(seqLen * seqLen, 0);
std::vector<float> vpids = std::vector<float>(seqLen * 2, 0);
for (int i = 0; i < seqLen - 1; i++) {
vmask[i * seqLen + seqLen - 1] = 1;
vpids[i] = i;
}
vpids[seqLen - 1] = seqLen - 2;
vpids[seqLen * 2 - 1] = 1;
if (GetVersion() == 2) {
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {seqLen, seqLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {2, seqLen}, vpids));
} else {
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, inputTokens[0]));
attentionMask = Data();
if (GetVersion() == 1) {
positionIds.CopyFrom(Data(DataType::FLOAT32, {2, 1}, {(float) promptLen, (float) (index + 1)}));
} else {
positionIds.CopyFrom(Data(DataType::FLOAT32, {2, 1}, {(float) promptLen + index + 1, (float) (index + 1)}));
}
}
}
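// Batched variant: sequences are left-padded to a common maxLen, the mask blocks attention to the
// padding, and position ids are laid out as two rows per request.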
void ChatGLMModel::FillLLMInputsBatch(std::vector<std::vector<float>> &inputTokens,
const std::vector<std::map<std::string, int>> &params,
fastllm::Data &inputIds, fastllm::Data &attentionMask,
fastllm::Data &positionIds) {
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
int batch = inputTokens.size();
int index = params[0].find("index")->second;
if (index == 0) {
int gmask_token_id = this->weight.dicts.find("gmask_token_id") != this->weight.dicts.end() ?
atoi(this->weight.dicts["gmask_token_id"].c_str()) : 130001;
std::vector<int> seqLens;
seqLens.resize(batch);
int maxLen = 0;
for (int i = 0; i < batch; i++) {
maxLen = std::max(maxLen, (int) inputTokens[i].size() + 2);
seqLens[i] = (int) inputTokens[i].size();
}
std::vector<float> ids = std::vector<float>(batch * maxLen, 0);
std::vector<float> vpids = std::vector<float>(batch * 2 * maxLen, 0);
std::vector<float> vmask = std::vector<float>(batch * maxLen * maxLen, 0);
for (int i = 0; i < batch; i++) {
if (GetVersion() == 1) {
auto &tokens = inputTokens[i];
int len = tokens.size(), base = maxLen - 2 - len;
for (int j = 0; j < len; j++) {
ids[i * maxLen + base + j] = tokens[j];
}
ids[i * maxLen + base + len] = gmask_token_id;
ids[i * maxLen + base + len + 1] = bos_token_id;
len += 2;
for (int j = 0; j < len - 1; j++) {
vpids[i * 2 * maxLen + base + j] = j;
}
vpids[i * 2 * maxLen + base + len - 1] = len - 2;
vpids[i * 2 * maxLen + maxLen + base + len - 1] = 1;
std::fill(vmask.data() + i * maxLen * maxLen,
vmask.data() + i * maxLen * maxLen + (maxLen - len) * maxLen, 1.0);
for (int j = maxLen - len; j < maxLen; j++) {
std::fill(vmask.data() + i * maxLen * maxLen + j * maxLen,
vmask.data() + i * maxLen * maxLen + j * maxLen + maxLen - len, 1.0);
}
for (int j = 0; j < len - 1; j++) {
vmask[i * maxLen * maxLen + (base + j) * maxLen + base + len - 1] = 1;
}
} else {
auto &tokens = inputTokens[i];
int len = tokens.size(), base = maxLen - 2 - len;
ids[i * maxLen + base] = 64790;
ids[i * maxLen + base + 1] = 64792;
for (int j = 0; j < len; j++) {
ids[i * maxLen + base + 2 + j] = tokens[j];
}
len += 2;
for (int j = 0; j < len; j++) {
vpids[i * 2 * maxLen + base + j] = j;
}
std::fill(vmask.data() + i * maxLen * maxLen,
vmask.data() + i * maxLen * maxLen + (maxLen - len) * maxLen, 1.0);
for (int j = maxLen - len; j < maxLen; j++) {
std::fill(vmask.data() + i * maxLen * maxLen + j * maxLen,
vmask.data() + i * maxLen * maxLen + j * maxLen + maxLen - len, 1.0);
}
for (int j = 0; j < len; j++) {
for (int k = j + 1; k < len; k++) {
vmask[i * maxLen * maxLen + (base + j) * maxLen + base + k] = 1;
}
}
}
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, maxLen}, ids));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, maxLen, maxLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch * 2, maxLen}, vpids));
} else {
std::vector <float> fret;
for (int i = 0; i < batch; i++) {
fret.push_back(inputTokens[i][0]);
}
std::vector <float> pids = std::vector<float>(batch * 2);
int maxLen = 0;
for (int i = 0; i < batch; i++) {
int promptLen = params[i].find("promptLen")->second;
maxLen = std::max(promptLen + 2, maxLen);
pids[i * 2 + 1] = index + 1;
if (GetVersion() == 1) {
pids[i * 2] = promptLen;
} else {
pids[i * 2] = promptLen + index + 1;
}
}
maxLen += index;
std::vector<float> vmasks = std::vector<float>(batch * maxLen, 0.0f);
for (int i = 0; i < batch; i++) {
int promptLen = params[i].find("promptLen")->second;
for (int j = 0; j < maxLen - index - promptLen - 2; j++) {
vmasks[i * maxLen + j] = 1.0f;
}
}
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, 1, maxLen}, vmasks));
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, fret));
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch * 2, 1}, pids));
}
}
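// WarmUp pushes a single bos token through Forward so weights are transferred and device buffers are
// allocated before the first real request.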
void ChatGLMModel::WarmUp() {
printf("Warmup...\n");
Data inputIds = Data(DataType::FLOAT32, {1, 1}, {(float)bos_token_id});
Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
Data positionIds = Data(DataType::FLOAT32, {2, 1}, {0, 0});
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
Forward(inputIds, attentionMask, positionIds, pastKeyValues);
printf("finish.\n");
}
std::string ChatGLMModel::MakeInput(const std::string &history, int round, const std::string &input) {
if (round == 0 && GetVersion() == 1) {
return input;
} else {
#if defined(_WIN32) or defined(_WIN64)
std::vector <uint8_t> vask = {233, 151, 174, 239, 188, 154, 0};
std::vector <uint8_t> vans = {231, 173, 148, 239, 188, 154, 0};
std::string sask = (char*)vask.data();
std::string sans = (char*)vans.data();
return (history + ("[Round " + std::to_string(round) + "]\n\n" + sask + input + "\n\n" + sans));
#else
return history + ("[Round " + std::to_string(round) + "]\n\n问:" + input + "\n\n答:");
#endif
}
}
std::string ChatGLMModel::MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) {
#if defined(_WIN32) or defined(_WIN64)
std::vector <uint8_t> vask = {233, 151, 174, 239, 188, 154, 0};
std::vector <uint8_t> vans = {231, 173, 148, 239, 188, 154, 0};
std::string sask = (char*)vask.data();
std::string sans = (char*)vans.data();
return (history + ("[Round " + std::to_string(round) + "]\n\n" + sask + input + "\n\n" + sans + output + "\n"));
#else
return (history + ("[Round " + std::to_string(round) + "]\n\n问:" + input + "\n\n答:" + output + "\n\n"));
#endif
}
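// GetVersion distinguishes ChatGLM (v1) from ChatGLM2 (v2) checkpoints by which embedding weight name
// is present in the loaded weights.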
int ChatGLMModel::GetVersion() {
if (this->weight.weight.find("transformer.embedding.word_embeddings.weight") != this->weight.weight.end()) {
return 2;
} else {
return 1;
}
}
}
//
// Created by huangyuyang on 6/1/23.
//
#include "utils.h"
#include "llama.h"
#include <sstream>
#include <unordered_map>
#include <cstring>
#ifdef USE_CUDA
#include "fastllm-cuda.cuh"
#endif
namespace fastllm {
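// ALiBi slopes: GetInterleave reproduces the interleaved power-of-two slope schedule used when the
// head count is not a power of two.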
std::vector <float> GetInterLeavePowerOf2(int n) {
float start = powf(2, -powf(2, -(log2f(n) - 3)));
float ratio = start;
std::vector <float> ret;
for (int i = 0; i < n; i++) {
ret.push_back(start * powf(ratio, i));
}
return ret;
}
std::vector <float> GetInterleave(int n) {
int base = 1;
while (base < n) {
base <<= 1;
}
if (base == n) {
return GetInterLeavePowerOf2(n);
} else {
std::vector <float> ret = GetInterLeavePowerOf2(base / 2);
std::vector <float> part2 = GetInterLeavePowerOf2(base);
for (int i = 0; i < n - base / 2; i++) {
ret.push_back(part2[i * 2]);
}
return ret;
}
}
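// Constructor: sets the default Alpaca-style prompt template and precomputes plain (unscaled) RoPE
// sin/cos tables.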
LlamaModel::LlamaModel() {
this->model_type = "llama";
// Use the Alpaca prompt and instruction format by default
this->pre_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n";
this->user_role = "### Instruction:\n";
this->bot_role = "\n\n### Response:";
this->history_sep = "</s>";
block_cnt = 32;
rotary_dim = 128;
sin.resize(max_positions);
cos.resize(max_positions);
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
invFreq.push_back(1.0 / pow(10000, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
sin[i][j] = ::sin((float)i * invFreq[j]);
cos[i][j] = ::cos((float)i * invFreq[j]);
}
}
std::vector <float> fsin, fcos;
for (int i = 0; i < sin.size(); i++) {
for (int j = 0; j < sin[0].size(); j++) {
fsin.push_back(sin[i][j]);
fcos.push_back(cos[i][j]);
}
}
sinData.CopyFrom(Data(DataType::FLOAT32, {(int)this->sin.size(), (int)this->sin[0].size()}, fsin));
cosData.CopyFrom(Data(DataType::FLOAT32, {(int)this->cos.size(), (int)this->cos[0].size()}, fcos));
weight.embeddingNames.insert("model.embed_tokens.weight");
}
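// Single-request forward. Supports either a fused W_pack QKV weight (Baichuan-style checkpoints) or
// separate q/k/v projections, and switches to an ALiBi position bias instead of RoPE when the weight
// dict sets use_alibi.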
int LlamaModel::Forward(const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<Data, Data>> &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
std::vector <float> *retLogits) {
Data alibiData;
if (this->weight.dicts["use_alibi"] == "1") {
std::vector<float> alibi = GetInterleave(num_attention_heads);
alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
}
Data hiddenStates;
Data attenInput;
Data q, k, v, qkv;
Data attenWeights, attenOutput;
Data attenLastOutput;
Data w1, w2, w3;
Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates);
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"],
1e-6, attenInput);
std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight";
std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight";
std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight";
std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight";
std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight";
// 1.1 Get q, k, v
int bsz = attenInput.dims[0], seqlen = attenInput.dims[1];
if (weight.weight.find(qkvWeightName) != weight.weight.end()) {
Linear(attenInput, weight[qkvWeightName], Data(), qkv);
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
} else {
Linear(attenInput, weight[qWeightName], Data(), q);
Linear(attenInput, weight[kWeightName], Data(), k);
Linear(attenInput, weight[vWeightName], Data(), v);
}
std::vector <int> qkvSize = {bsz, seqlen, num_attention_heads, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
if (alibiData.dims.size() == 0) {
fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
}
qkvSize = {bsz * seqlen, num_attention_heads, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
PermuteSelf(q, {1, 0, 2});
PermuteSelf(k, {1, 0, 2});
PermuteSelf(v, {1, 0, 2});
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 && (pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1])) {
std::vector <int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector <int> {k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 && (pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1])) {
std::vector <int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector <int> {v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, k, 1);
CatDirect(pastValue, v, 1);
// 1.2 Attention
// 1.2.0 q * k^T
MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim));
attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]});
if (alibiData.dims.size() != 0) {
AlibiMask(attenWeights, alibiData, -10000);
} else if (attentionMask.dims.size() != 0) {
AttentionMask(attenWeights, attentionMask, -10000);
}
Softmax(attenWeights, attenWeights, -1);
MatMul(attenWeights, pastValue, attenOutput);
attenOutput.Reshape({attenOutput.dims[1], attenOutput.dims[2], attenOutput.dims[3]});
PermuteSelf(attenOutput, {1, 0, 2});
attenOutput.Reshape({bsz, seqlen, -1});
Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
AddTo(hiddenStates, attenLastOutput);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.up_proj.weight"], Data(), w3);
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
AddTo(hiddenStates, w2);
}
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
logits.ToDevice(DataDevice::CPU);
int lastRet = -1;
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
retLogits->resize(size);
memcpy((float*)retLogits->data(), ((float*)logits.cpuData) + (logits.dims[1] - 1) * size, size * logits.unitSize);
}
if (generationConfig.IsSimpleGreedy()) {
std::pair <float, int> ret = std::make_pair(-1e9, -1);
int base = logits.dims[1] - 1;
for (int i = 0; i < logits.dims.back(); i++) {
ret = max(ret, std::make_pair(((float*)logits.cpuData)[base * logits.dims.back() + i], i));
}
lastRet = ret.second;
} else if (!lastTokens.units.empty()) {
lastRet = LLMSampling(logits, logits.dims[1] - 1, generationConfig, lastTokens.units[0]);
}
return lastRet;
}
std::vector <int> LlamaModel::ForwardBatch(int batch, const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<Data, Data>> &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
Data alibiData;
if (this->weight.dicts["use_alibi"] == "1") {
std::vector<float> alibi = GetInterleave(num_attention_heads);
alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
}
Data hiddenStates;
Data attenInput;
Data q, k, v, qkv;
Data attenWeights, attenOutput;
Data attenLastOutput;
Data w1, w2, w3;
Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates);
int seqlen = hiddenStates.dims[1];
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"],
1e-6, attenInput);
std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight";
std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight";
std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight";
std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight";
std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight";
// 1.1 Get q, k, v
int bsz = attenInput.dims[0], seqlen = attenInput.dims[1];
if (weight.weight.find(qkvWeightName) != weight.weight.end()) {
Linear(attenInput, weight[qkvWeightName], Data(), qkv);
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
} else {
Linear(attenInput, weight[qWeightName], Data(), q);
Linear(attenInput, weight[kWeightName], Data(), k);
Linear(attenInput, weight[vWeightName], Data(), v);
}
std::vector <int> qkvSize = {bsz, seqlen, num_attention_heads, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
if (alibiData.dims.size() == 0) {
fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
}
PermuteSelf(q, {0, 2, 1, 3});
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
qkvSize = {bsz * num_attention_heads, seqlen, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 && (pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1])) {
std::vector <int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector <int> {k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 && (pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1])) {
std::vector <int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector <int> {v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, k, 1);
CatDirect(pastValue, v, 1);
// 1.2 Attention
// 1.2.0 q * k^T
MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim));
attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]});
if (alibiData.dims.size() != 0) {
attenWeights.Reshape({-1, num_attention_heads, attenWeights.dims[2], attenWeights.dims[3]});
AlibiMask(attenWeights, alibiData, -10000);
attenWeights.Reshape({1, -1, attenWeights.dims[2], attenWeights.dims[3]});
} else if (attentionMask.dims.size() != 0) {
AttentionMask(attenWeights, attentionMask, -10000);
}
Softmax(attenWeights, attenWeights, -1);
MatMul(attenWeights, pastValue, attenOutput);
attenOutput.Reshape({attenOutput.dims[1], attenOutput.dims[2], attenOutput.dims[3]});
PermuteSelf(attenOutput, {1, 0, 2});
attenOutput.Reshape({seqlen, bsz, -1});
PermuteSelf(attenOutput, {1, 0, 2});
Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
AddTo(hiddenStates, attenLastOutput);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.up_proj.weight"], Data(), w3);
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
AddTo(hiddenStates, w2);
}
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
logits.ToDevice(DataDevice::CPU);
std::vector <int> lastRet;
if (generationConfig.IsSimpleGreedy()) {
for (int b = 0; b < batch; b++) {
int base = b * logits.dims[1] + logits.dims[1] - 1;
std::pair <float, int> ret = std::make_pair(-1e9, -1);
for (int i = 0; i < logits.dims.back(); i++) {
ret = max(ret, std::make_pair(((float *) logits.cpuData)[base * logits.dims.back() + i], i));
}
lastRet.push_back(ret.second);
}
} else {
for (int b = 0; b < batch; b++) {
int base = b * logits.dims[1] + logits.dims[1] - 1;
lastRet.push_back(LLMSampling(logits, base, generationConfig, lastTokens.units[b]));
}
}
return lastRet;
}
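// Batched LLaMA forward with per-request sequence lengths and KV caches: attention is computed per
// request inside the layer loop and the outputs are concatenated back along the packed sequence axis.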
std::vector <int> LlamaModel::ForwardBatch(int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
Data alibiData;
if (this->weight.dicts["use_alibi"] == "1") {
std::vector<float> alibi = GetInterleave(num_attention_heads);
alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
}
Data hiddenStates;
Data attenInput;
Data q, k, v, qkv;
Data attenWeights, curAttenOutput;
Data attenLastOutput;
Data w1, w2, w3;
Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates);
int seqlen = hiddenStates.dims[1];
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"],
1e-6, attenInput);
std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight";
std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight";
std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight";
std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight";
std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight";
// 1.1 Get q, k, v
int bsz = attenInput.dims[0], seqlen = attenInput.dims[1];
if (weight.weight.find(qkvWeightName) != weight.weight.end()) {
Linear(attenInput, weight[qkvWeightName], Data(), qkv);
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
} else {
Linear(attenInput, weight[qWeightName], Data(), q);
Linear(attenInput, weight[kWeightName], Data(), k);
Linear(attenInput, weight[vWeightName], Data(), v);
}
Data attenOutput = Data(DataType::FLOAT32);
int total = 0;
std::vector <Data> curKs, curVs, curQs;
curKs.resize(batch);
curVs.resize(batch);
curQs.resize(batch);
for (int b = 0; b < batch; b++) {
Split(k, 1, total, total + seqLens[b], curKs[b]);
Split(v, 1, total, total + seqLens[b], curVs[b]);
Split(q, 1, total, total + seqLens[b], curQs[b]);
total += seqLens[b];
}
for (int b = 0; b < batch; b++) {
auto &q = curQs[b], &k = curKs[b], &v = curVs[b];
std::vector<int> qkvSize = {bsz, seqLens[b], num_attention_heads, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
if (alibiData.dims.size() == 0) {
fastllm::LlamaRotatePosition2D(q, *positionIds[b], sinData, cosData, rotary_dim);
fastllm::LlamaRotatePosition2D(k, *positionIds[b], sinData, cosData, rotary_dim);
}
PermuteSelf(q, {0, 2, 1, 3});
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
qkvSize = {bsz * num_attention_heads, seqLens[b], -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt + i].second;
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 &&
(pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1])) {
std::vector<int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector<int>{k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 &&
(pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1])) {
std::vector<int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector<int>{v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, k, 1);
CatDirect(pastValue, v, 1);
// 1.2 Attention
// 1.2.0 q * k^T
MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim));
attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]});
if (alibiData.dims.size() != 0) {
AlibiMask(attenWeights, alibiData, -10000);
} else if (attentionMask[b] != nullptr) {
AttentionMask(attenWeights, *attentionMask[b], -10000);
}
Softmax(attenWeights, attenWeights, -1);
MatMul(attenWeights, pastValue, curAttenOutput);
curAttenOutput.Reshape({curAttenOutput.dims[1], curAttenOutput.dims[2], curAttenOutput.dims[3]});
PermuteSelf(curAttenOutput, {1, 0, 2});
curAttenOutput.Reshape({seqLens[b], bsz, -1});
PermuteSelf(curAttenOutput, {1, 0, 2});
if (attenOutput.dims.size() == 0) {
std::vector <int> dims = curAttenOutput.dims;
dims[1] = total;
attenOutput.Expansion(dims);
}
CatDirect(attenOutput, curAttenOutput, 1);
}
Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
AddTo(hiddenStates, attenLastOutput);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.up_proj.weight"], Data(), w3);
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
AddTo(hiddenStates, w2);
}
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
logits.ToDevice(DataDevice::CPU);
std::vector <int> lastRet;
int total = 0;
for (int b = 0; b < batch; b++) {
if (generationConfigs[b].output_logits && retLogits != nullptr && (*retLogits)[b] != nullptr) {
int base = (total + seqLens[b] - 1);
(*retLogits)[b]->resize(logits.dims.back());
memcpy((float*)(*retLogits)[b]->data(), (float*)(logits.cpuData + base * logits.dims.back() * logits.unitSize), logits.dims.back() * logits.unitSize);
}
if (generationConfigs[b].IsSimpleGreedy()) {
std::pair<float, int> ret = std::make_pair(-1e9, -1);
int base = (total + seqLens[b] - 1);
total += seqLens[b];
for (int i = 0; i < logits.dims.back(); i++) {
ret = max(ret, std::make_pair(((float *) logits.cpuData)[base * logits.dims.back() + i], i));
}
lastRet.push_back(ret.second);
} else {
int base = (total + seqLens[b] - 1);
total += seqLens[b];
lastRet.push_back(LLMSampling(logits, base, generationConfigs[b], lastTokens.units[b]));
}
}
return lastRet;
}
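// Response: streaming generation for one prompt. Encode, prefill with a full causal mask, then decode
// one token at a time, invoking retCb with each incremental piece until eos or the output token limit.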
std::string LlamaModel::Response(const std::string& input, RuntimeResult retCb,
const GenerationConfig &generationConfig) {
#ifdef USE_CUDA
FastllmCudaClearBigBuffer();
#endif
//auto st = std::chrono::system_clock::now();
#ifdef PY_API
size_t pos = input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos)? input.substr(0, pos-10):input;
size_t hash_id = std::hash<std::string>{}(input);
Data inputIds = this->weight.tokenizer.Encode(prompt);
#else
Data inputIds = this->weight.tokenizer.Encode(input);
#endif
std::vector <float> ids;
for (int i = 0; i < inputIds.Count(0); i++) {
ids.push_back(((float*)inputIds.cpuData)[i]);
}
int seqLen = ids.size();
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, ids));
std::vector <float> vmask = std::vector <float> (seqLen * seqLen, 0);
std::vector <float> vpids = std::vector <float> (seqLen, 0);
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
Data attentionMask = Data(DataType::FLOAT32, {seqLen, seqLen}, vmask);
Data positionIds = Data(DataType::FLOAT32, {1, seqLen}, vpids);
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
std::string retString = "";
int len = seqLen;
std::vector <float> results;
int index = 0;
LastTokensManager tokens (1, generationConfig.last_n);
while (true) {
auto st = std::chrono::system_clock::now();
int ret = Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
tokens.units[0].Push(ret);
if (ret == eos_token_id) {
break;
}
results.push_back(ret);
std::string curString = weight.tokenizer.Decode(Data(DataType::FLOAT32, {(int)results.size()}, results)).c_str();
retString += curString;
if (retCb)
#ifdef PY_API
{
if(generationConfig.enable_hash_id){
std::stringstream ss;
ss << retString << "hash_id:"<<hash_id;
retCb(index, pybind11::bytes(ss.str()));
}else{
retCb(index, pybind11::bytes(retString));
}
}
#else
retCb(index, curString.c_str());
#endif
index++;
if (index == generationConfig.output_token_limit) {
break;
}
results.clear();
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float)ret}));
attentionMask = Data();
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float)len}));
//if (do_sample) {
// tokenPenaltyManager.InsertToken(ret);
//}
len++;
if (index == generationConfig.output_token_limit) {
break;
}
//printf("spend %f s.\n", GetSpan(st, std::chrono::system_clock::now()));
}
if (retCb)
#ifdef PY_API
{
if(generationConfig.enable_hash_id){
std::stringstream ss;
ss << retString << "hash_id:"<<hash_id;
retCb(-1, pybind11::bytes(ss.str()));
}else{
retCb(-1, pybind11::bytes(retString));
}
}
#else
retCb(-1, retString.c_str());
#endif
return retString;
}
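// ResponseBatch: generation for several prompts at once. Prompts are left-padded to a common length,
// a request is marked finished once it emits eos, and the loop ends when every request has finished.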
void LlamaModel::ResponseBatch(const std::vector<std::string> &inputs, std::vector<std::string> &outputs,
RuntimeResultBatch retCb,
const GenerationConfig &generationConfig) {
#ifdef USE_CUDA
FastllmCudaClearBigBuffer();
#endif
#ifdef PY_API
std::vector<std::string> prompts;
std::vector < size_t > hash_ids;
for (auto _input: inputs){
size_t hash_id = std::hash<std::string>{}(_input);
hash_ids.push_back(hash_id);
size_t pos = _input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos) ? _input.substr(0, pos - 10) : _input;
prompts.push_back(prompt);
}
#else
std::vector<std::string> prompts = inputs;
#endif
int batch = prompts.size();
outputs.clear();
outputs.resize(batch, "");
std::vector <Data> inputTokens;
std::vector <int> seqLens;
inputTokens.resize(batch);
seqLens.resize(batch);
int maxLen = 0;
for (int i = 0; i < batch; i++) {
inputTokens[i].CopyFrom(this->weight.tokenizer.Encode(prompts[i]));
maxLen = std::max(maxLen, (int)inputTokens[i].Count(0));
seqLens[i] = (int)inputTokens[i].Count(0);
}
std::vector <float> ids = std::vector <float> (batch * maxLen, 0);
std::vector <float> vpids = std::vector <float> (batch * maxLen, 0);
std::vector <float> vmask = std::vector <float> (batch * maxLen * maxLen, 0);
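// Left-pad shorter prompts: token ids are right-aligned within maxLen, position ids restart at 0 per sequence, and the mask marks padding rows/columns as well as future tokens with 1.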
for (int i = 0; i < batch; i++) {
Data &tokens = inputTokens[i];
int len = tokens.Count(0), base = maxLen - len;
for (int j = 0; j < len; j++) {
ids[i * maxLen + base + j] = ((float*)tokens.cpuData)[j];
}
for (int j = 0; j < len; j++) {
vpids[i * maxLen + base + j] = j;
}
std::fill(vmask.data() + i * maxLen * maxLen,
vmask.data() + i * maxLen * maxLen + (maxLen - len) * maxLen, 1.0);
for (int j = maxLen - len; j < maxLen; j++) {
std::fill(vmask.data() + i * maxLen * maxLen + j * maxLen,
vmask.data() + i * maxLen * maxLen + j * maxLen + maxLen - len, 1.0);
}
for (int j = 0; j < len; j++) {
for (int k = j + 1; k < len; k++) {
vmask[i * maxLen * maxLen + (base + j) * maxLen + base + k] = 1;
}
}
}
Data inputIds = Data(DataType::FLOAT32, {batch, maxLen}, ids);
Data attentionMask = Data(DataType::FLOAT32, {batch, maxLen, maxLen}, vmask);
Data positionIds = Data(DataType::FLOAT32, {batch, maxLen}, vpids);
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
std::string retString = "";
std::vector <int> lens = seqLens;
std::vector <bool> isEnding = std::vector <bool> (batch, false);
std::vector <float> results;
int index = 0;
LastTokensManager tokensManager (batch, generationConfig.last_n);
while (true) {
auto st = std::chrono::system_clock::now();
std::vector <int> ret = ForwardBatch(batch, inputIds, attentionMask, positionIds, pastKeyValues,
generationConfig, tokensManager);
for (int i = 0; i < batch; i++) {
tokensManager.units[i].Push(ret[i]);
}
std::vector <float> fret;
std::vector <float> results;
int endingCount = 0;
std::vector <std::string> curStrings;
for (int i = 0; i < batch; i++) {
fret.push_back(ret[i]);
if (ret[i] == eos_token_id) {
isEnding[i] = true;
}
if (isEnding[i]) {
curStrings.push_back("");
endingCount++;
continue;
}
results.push_back(ret[i]);
std::string curString = weight.tokenizer.Decode(
Data(DataType::FLOAT32, {(int) results.size()}, results)).c_str();
outputs[i] += curString;
curStrings.push_back(curString);
results.clear();
}
if (endingCount == batch) {
break;
}
if (retCb)
#ifdef PY_API
{
if (generationConfig.enable_hash_id) {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << curStrings[i] << "hash_id:" << hash_ids[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(index, rtnStrings);
} else {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << curStrings[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(index, rtnStrings);
}
}
#else
retCb(index, curStrings);
#endif
index++;
maxLen++;
std::vector <float> pids = std::vector <float> (batch);
std::vector <float> vmasks = std::vector <float> (batch * maxLen, 0.0f);
for (int i = 0; i < batch; i++) {
pids[i] = lens[i];
lens[i]++;
for (int j = 0; j < maxLen - lens[i]; j++) {
vmasks[i * maxLen + j] = 1.0f;
}
}
positionIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, 1, maxLen}, vmasks));
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, fret));
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, pids));
if (index == generationConfig.output_token_limit) {
break;
}
//printf("spend %f s.\n", GetSpan(st, std::chrono::system_clock::now()));
}
if (retCb)
#ifdef PY_API
{
if (generationConfig.enable_hash_id) {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << outputs[i] << "hash_id:" << hash_ids[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(-1, rtnStrings);
} else {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << outputs[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(-1, rtnStrings);
}
}
#else
retCb(-1, outputs);
#endif
}
std::string LlamaModel::MakeInput(const std::string &history, int round, const std::string &input) {
if(is_nsql){
return input;
}
else{
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role;
}
}
std::string LlamaModel::MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) {
if(is_nsql){
return input;
}
else {
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role + output + history_sep;
}
}
void LlamaModel::WarmUp() {
printf("Warmup...\n");
Data inputIds = Data(DataType::FLOAT32, {1, 1}, {1});
Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
Data positionIds = Data(DataType::FLOAT32, {1, 1}, {0, 0});
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
Forward(inputIds, attentionMask, positionIds, pastKeyValues);
printf("finish.\n");
}
int LlamaModel::LaunchResponseTokens(const std::vector<int> &inputTokens,
const GenerationConfig &generationConfig) {
mainLoopLocker.lock();
if (mainLoop == nullptr) {
if (mainLoop == nullptr) {
mainLoop = new std::thread([](LlamaModel *model) {
while (true) {
std::vector <Data*> attentionMasks;
std::vector <Data*> positionIds;
std::vector <std::pair <Data*, Data*> > pastKeyValues;
std::vector <float> ids;
std::vector <int> seqLens;
std::vector <GenerationConfig> generationConfigs;
LastTokensManager tokensManager;
std::vector <std::vector <float>* > logits;
model->dictLocker.lock();
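// Under dictLocker, batch every unfinished request: a request seen for the first time (preTokens == 0) contributes its full prompt plus a causal mask, while a running request contributes a single token and reuses its cached keys/values.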
for (auto &it: model->responseContextDict.dicts) {
if (it.second->isEnding) {
continue;
}
generationConfigs.push_back(it.second->generationConfig);
if (it.second->generationConfig.output_logits) {
it.second->resultLogits.push(new std::vector <float> ());
logits.push_back(it.second->resultLogits.back());
} else {
logits.push_back(nullptr);
}
tokensManager.units.push_back(it.second->tokens);
if (it.second->preTokens == 0) {
int seqLen = it.second->currentTokens.size();
for (int i = 0; i < it.second->currentTokens.size(); i++) {
ids.push_back(it.second->currentTokens[i]);
}
seqLens.push_back(seqLen);
std::vector <float> vmask = std::vector <float> (seqLen * seqLen, 0);
std::vector <float> vpids = std::vector <float> (seqLen, 0);
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
it.second->intParams["len"] = seqLen;
attentionMasks.push_back(new Data(DataType::FLOAT32, {seqLen, seqLen}, vmask));
positionIds.push_back(new Data(DataType::FLOAT32, {1, seqLen}, vpids));
} else {
int ret = it.second->currentTokens[0];
seqLens.push_back(1);
ids.push_back(ret);
attentionMasks.push_back(nullptr);
positionIds.push_back(new Data(DataType::FLOAT32, {1, 1}, {(float)it.second->intParams["len"]}));
it.second->intParams["len"]++;
}
it.second->preTokens += seqLens.back();
for (int i = 0; i < model->block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(&it.second->pastKeyValues[i].first,
&it.second->pastKeyValues[i].second));
}
}
if (seqLens.size() > 0) {
#ifdef USE_CUDA
FastllmCudaClearBigBuffer();
#endif
Data inputIds = Data(DataType::FLOAT32, {1, (int) ids.size()}, ids);
std::vector<int> ret = model->ForwardBatch(seqLens.size(), inputIds, attentionMasks,
positionIds, seqLens, pastKeyValues, generationConfigs, tokensManager, &logits);
int idx = 0;
for (auto &it: model->responseContextDict.dicts) {
if (it.second->isEnding) {
continue;
}
int curRet = ret[idx++];
if (curRet == model->eos_token_id) {
it.second->isEnding = true;
} else {
it.second->currentTokens = std::vector<int>{curRet};
it.second->resultTokenQueue.push(curRet);
it.second->tokens.Push(curRet);
it.second->curTokens++;
if (it.second->curTokens == it.second->generationConfig.output_token_limit) {
it.second->isEnding = true;
}
}
}
}
for (int i = 0; i < attentionMasks.size(); i++) {
delete attentionMasks[i];
}
for (int i = 0; i < positionIds.size(); i++) {
delete positionIds[i];
}
model->dictLocker.unlock();
MySleep(0);
}
}, this);
}
}
mainLoopLocker.unlock();
dictLocker.lock();
int handleId = responseContextDict.CreateHandle();
ResponseContext *context = responseContextDict.GetHandle(handleId);
context->Init(this->block_cnt);
context->currentTokens = inputTokens;
context->generationConfig = generationConfig;
context->tokens = LastTokensUnit(generationConfig.last_n);
dictLocker.unlock();
return handleId;
}
int LlamaModel::FetchResponseTokens(int handleId) {
dictLocker.lock();
ResponseContext *context = responseContextDict.GetHandle(handleId);
if (context == nullptr) {
dictLocker.unlock();
return -1;
} else {
while (true) {
if (context->resultTokenQueue.size() > 0) {
int ret = context->resultTokenQueue.front();
context->resultTokenQueue.pop();
dictLocker.unlock();
return ret;
} else {
if (context->isEnding) {
responseContextDict.RemoveHandle(handleId);
dictLocker.unlock();
return -1;
}
}
dictLocker.unlock();
MySleep(0);
dictLocker.lock();
}
}
}
}
//
// Created by huangyuyang on 5/12/23.
//
#include "utils.h"
#include "moss.h"
#include <cmath>
#include <chrono>
#include <algorithm>
#include <sstream>
#include <unordered_map>
namespace fastllm {
extern double GetSpan(std::chrono::system_clock::time_point time1, std::chrono::system_clock::time_point time2);
MOSSModel::MOSSModel() {
this->model_type = "moss";
this->pre_prompt = "You are an AI assistant whose name is MOSS. ";
this->user_role = "<|Human|>: ";
this->bot_role = "<eoh>";
this->history_sep = "";
// Initialize the sin/cos tables for rotary position embedding
embed_dim = 6144;
num_attention_heads = 24;
head_dim = embed_dim / num_attention_heads;
block_cnt = 34;
sin.resize(max_positions);
cos.resize(max_positions);
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
invFreq.push_back(1.0 / pow(10000, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
sin[i][j] = ::sin((float)i * invFreq[j]);
cos[i][j] = ::cos((float)i * invFreq[j]);
}
}
this->weight.embeddingNames.insert("transformer.wte.weight");
}
void MOSSModel::CausalMask(Data &data, int start) {
int outer = data.dims[0] * data.dims[1];
int spatial = data.Count(2);
int n = data.dims[2], m = data.dims[3];
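// Mask attention to future tokens: every entry past column (start + i) is set to the lowest float value so it vanishes after softmax.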
for (int o = 0; o < outer; o++) {
float *d = (float*)data.cpuData + o * spatial;
for (int i = 0; i < n; i++) {
if (i + start + 1 < m) {
std::fill(d + i * m + i + start + 1, d + (i + 1) * m, -std::numeric_limits<float>::max());
}
}
}
}
void MOSSModel::RotatePosition2D(Data &data, const Data &positionIds) {
int outer = data.dims[0] * data.dims[1];
int spatial = data.Count(2);
int n = data.dims[2], m = data.dims[3];
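// Apply rotary position embedding pairwise over the first rotary_dim channels, using the precomputed sin/cos tables indexed by each token's position id.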
for (int o = 0; o < outer; o++) {
int index = (int)((float*)positionIds.cpuData)[o];
std::vector <float> &sin = this->sin[index];
std::vector <float> &cos = this->cos[index];
float *d = (float*)data.cpuData + o * spatial;
for (int i = 0; i < n; i++) {
for (int j = 0; j + 1 < rotary_dim && j + 1 < m; j += 2) {
float a = d[j], b = d[j + 1];
d[j] = a * cos[j / 2] - b * sin[j / 2];
d[j + 1] = a * sin[j / 2] + b * cos[j / 2];
}
d += m;
}
}
}
int MOSSModel::Forward(const Data &inputIds, const Data &attentionMask,
const Data &positionIds, std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
std::vector <float> *retLogits) {
auto st = std::chrono::system_clock::now();
Data inputEmbeddings;
Embedding(inputIds, this->weight["transformer.wte.weight"], inputEmbeddings);
Data hiddenStates = inputEmbeddings;
// MossBlock
for (int i = 0; i < block_cnt; i++) {
// 1.0 LayerNorm
Data residual;
Mul(hiddenStates, 1.0, residual);
std::string lnWeightName = "transformer.h." + std::to_string(i) + ".ln_1.weight";
std::string lnBiasName = "transformer.h." + std::to_string(i) + ".ln_1.bias";
LayerNorm(residual, weight[lnWeightName], weight[lnBiasName], -1, hiddenStates);
// 1.1 Get query, key, value
std::string qkvProjName = "transformer.h." + std::to_string(i) + ".attn.qkv_proj.weight";
Data qkv, q, k, v;
Linear(hiddenStates, weight[qkvProjName], Data(), qkv);
qkv.Reshape({qkv.dims[0], qkv.dims[1], 4, -1});
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, v);
Split(qkv, -1, per * 2, per * 3, k);
q.Reshape({q.dims[0], q.dims[1], -1, head_dim});
k.Reshape({k.dims[0], k.dims[1], -1, head_dim});
v.Reshape({v.dims[0], v.dims[1], -1, head_dim});
q.ToDevice(DataDevice::CPU);
k.ToDevice(DataDevice::CPU);
RotatePosition2D(q, positionIds);
RotatePosition2D(k, positionIds);
q.ToDevice(DataDevice::CUDA);
k.ToDevice(DataDevice::CUDA);
PermuteSelf(q, {0, 2, 1, 3});
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
Data pastKey = pastKeyValues[i].first, pastValue = pastKeyValues[i].second;
Cat(pastKey, k, -2, pastKeyValues[i].first);
Cat(pastValue, v, -2, pastKeyValues[i].second);
k.CopyFrom(pastKeyValues[i].first);
v.CopyFrom(pastKeyValues[i].second);
// 1.2 Attention
// 1.2.0 q * k^T
Data attnWeights;
MatMulTransB(q, k, attnWeights, 1.0 / scale_attn);
// 1.2.1 causal_mask
attnWeights.ToDevice(DataDevice::CPU);
CausalMask(attnWeights, k.dims[2] - q.dims[2]);
attnWeights.ToDevice(DataDevice::CUDA);
// 1.2.2 attentionMask
// TODO: attentionMask appears to be all ones here, so it is skipped for now
// 1.2.3 softmax
Softmax(attnWeights, attnWeights, -1);
// 1.2.4 headMask
// TODO: headMask appears to be None here, so it is skipped for now
// 1.2.5 attention_weights * v
Data attnOutput;
PermuteSelf(v, {0, 1, 3, 2});
MatMulTransB(attnWeights, v, attnOutput);
// 1.3
PermuteSelf(attnOutput, {0, 2, 1, 3});
attnOutput.Reshape({attnOutput.dims[0], attnOutput.dims[1], -1});
std::string outProjName = "transformer.h." + std::to_string(i) + ".attn.out_proj.weight";
Data realOutput;
Linear(attnOutput, weight[outProjName], Data(), realOutput);
// 1.4 MLP
std::string fcInKeyName = "transformer.h." + std::to_string(i) + ".mlp.fc_in";
std::string fcOutKeyName = "transformer.h." + std::to_string(i) + ".mlp.fc_out";
Data middle;
Linear(hiddenStates, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
GeluNew(middle, middle);
Linear(middle, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, residual);
AddTo(hiddenStates, realOutput);
}
LayerNorm(hiddenStates, weight["transformer.ln_f.weight"], weight["transformer.ln_f.bias"], -1, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], weight["lm_head.bias"], logits);
logits.ToDevice(DataDevice::CPU);
int ret = -1;
if (generationConfig.IsSimpleGreedy()) {
std::vector<std::pair<float, int> > v;
int base = logits.dims[logits.dims.size() - 2] - 1;
for (int i = 0; i < logits.dims.back(); i++) {
v.push_back(std::make_pair(((float *) logits.cpuData)[base * logits.dims.back() + i], i));
}
std::sort(v.begin(), v.end());
std::reverse(v.begin(), v.end());
ret = v[0].second;
} else if (!lastTokens.units.empty()) {
ret = LLMSampling(logits, logits.dims[logits.dims.size() - 2] - 1, generationConfig, lastTokens.units[0]);
}
float spend = GetSpan(st, std::chrono::system_clock::now());
//printf("forward spend %f s.\n", spend);
return ret;
}
std::string MOSSModel::Response(const std::string &input,
RuntimeResult retCb,
const GenerationConfig &generationConfig) {
#ifdef PY_API
size_t pos = input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos)? input.substr(0, pos-10):input;
size_t hash_id = std::hash<std::string>{}(input);
Data inputIds = this->weight.tokenizer.Encode(prompt);
#else
Data inputIds = this->weight.tokenizer.Encode(input);
#endif
Data attentionMask = inputIds;
Data positionIds = inputIds;
std::vector<std::pair<Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(), Data()));
}
int len = inputIds.dims[1];
for (int i = 0; i < len; i++) {
((float *) attentionMask.cpuData)[i] = 1;
((float *) positionIds.cpuData)[i] = i;
}
std::vector<float> results;
std::string retString = "";
int index = 0;
LastTokensManager tokens (1, generationConfig.last_n);
while (true) {
int ret = Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
tokens.units[0].Push(ret);
if (ret == 106068) {
break;
}
results.push_back(ret);
std::string current = weight.tokenizer.Decode(
Data(DataType::FLOAT32, {(int) results.size()}, results)).c_str();
retString += current;
if (retCb)
#ifdef PY_API
{
if(generationConfig.enable_hash_id){
std::stringstream ss;
ss << retString << "hash_id:"<<hash_id;
retCb(index, pybind11::bytes(ss.str()));
}else{
retCb(index, pybind11::bytes(retString));
}
}
#else
retCb(index, current.c_str());
#endif
index++;
fflush(stdout);
results.clear();
len++;
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float) ret}));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {1, len}, std::vector<float>(len, 1.0f)));
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float) (len - 1)}));
if (index == generationConfig.output_token_limit) {
break;
}
}
if (retCb)
#ifdef PY_API
{
if(generationConfig.enable_hash_id){
std::stringstream ss;
ss << retString << "hash_id:"<<hash_id;
retCb(-1, pybind11::bytes(ss.str()));
}else{
retCb(-1, pybind11::bytes(retString));
}
}
#else
retCb(-1, retString.c_str());
#endif
return retString;
}
std::string MOSSModel::MakeInput(const std::string &history, int round, const std::string &input) {
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role;
}
std::string MOSSModel::MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) {
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role + output + history_sep;
}
void MOSSModel::WarmUp() {
printf("Warmup...\n");
Data inputIds = Data(DataType::FLOAT32, {1, 1}, {(float)bos_token_id});
Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
Data positionIds = Data(DataType::FLOAT32, {1, 1}, {0});
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
Forward(inputIds, attentionMask, positionIds, pastKeyValues);
printf("finish.\n");
}
void
MOSSModel::FillLLMInputs(std::vector<std::vector<float>> &inputTokens, const std::map<std::string, int> &params,
fastllm::Data &inputIds, fastllm::Data &attentionMask, fastllm::Data &positionIds) {
int index = params.find("index")->second;
int promptLen = params.find("promptLen")->second;
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
if (index == 0) {
int seqLen = inputTokens[0].size();
std::vector<float> vmask = std::vector<float>(seqLen, 1);
std::vector<float> vpids = std::vector<float>(seqLen, 0);
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, vpids));
} else {
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {1, promptLen + index}, std::vector<float>(promptLen + index, 1.0f)));
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float) (promptLen + index - 1)}));
}
}
}
//
// Created by siemon on 8/9/23.
//
#include "utils.h"
#include "qwen.h"
#include <cmath>
#include <chrono>
#include <algorithm>
#include <sstream>
#include <unordered_map>
#include <cstring>
#ifdef USE_CUDA
#include "fastllm-cuda.cuh"
#endif
namespace fastllm {
extern double GetSpan(std::chrono::system_clock::time_point time1, std::chrono::system_clock::time_point time2);
QWenModel::QWenModel() {
this->model_type = "qwen";
this->pre_prompt = "You are a helpful assistant.";
this->user_role = "user";
this->bot_role = "assistant";
embed_dim = 4096;
num_attention_heads = 32;
head_dim = embed_dim / num_attention_heads;
block_cnt = 32;
rotary_dim = 128;
seq_length = 2048;
use_log_attn = true;
ntk_alpha = 1.f;
UpdateRotaryPosEmb(ntk_alpha);
if (use_log_attn) {
logn_list = Data(DataType::FLOAT32);
logn_list.Resize({1, max_positions, 1, 1});
logn_list.Allocate();
float *logn = (float *) logn_list.cpuData;
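// Logn attention scaling: factor 1 inside the trained context window (seq_length), log_{seq_length}(i) beyond it, to temper attention scores on long inputs.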
for (int i = 0; i < seq_length; i++) {
logn[i] = 1;
}
for (int i = seq_length; i < max_positions; i++) {
logn[i] = std::log(i) / std::log(seq_length);
}
}
weight.embeddingNames.insert("transformer.wte.weight");
}
int QWenModel::Forward(const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig,
const LastTokensManager &lastTokens,
std::vector <float> *logits) {
std::vector <std::vector <float>*> batchLogits;
batchLogits.push_back(logits);
return ForwardBatch(1, inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, lastTokens, &batchLogits)[0];
}
std::vector <int> QWenModel::ForwardBatch(int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
int maxLen = inputIds.dims[1];
Data hiddenStates;
Data attnInput, attnOutput;
Data query, key, value;
Data attnWeights, attnLastOutput;
Data a1, a2, mlpOutput;
// printf("input id: ");
// for (int i = 0; i < inputIds.Count(0); i++) {
// printf("%d ", (int )((float *) inputIds.cpuData)[i]);
// }
// printf("\n");
Embedding(inputIds, this->weight["transformer.wte.weight"], hiddenStates);
for (int i = 0; i < this->block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
int seqlen = hiddenStates.dims[1];
std::string ln_1_name = "transformer.h." + std::to_string(i) + ".ln_1.weight";
std::string attn_weight_name = "transformer.h." + std::to_string(i) + ".attn.c_attn.weight";
std::string attn_bias_name = "transformer.h." + std::to_string(i) + ".attn.c_attn.bias";
RMSNorm(hiddenStates, weight[ln_1_name], 1e-6, attnInput);
Linear(attnInput, weight[attn_weight_name], weight[attn_bias_name], attnOutput); // attnOutput [batch, seqlen, embed_dim * 3]
Split(attnOutput, 2, 0, embed_dim, query);
Split(attnOutput, 2, embed_dim, 2 * embed_dim, key);
Split(attnOutput, 2, embed_dim * 2, embed_dim * 3, value);
query.Reshape({query.dims[0], query.dims[1], num_attention_heads, head_dim});
key.Reshape({key.dims[0], key.dims[1], num_attention_heads, head_dim});
value.Reshape({value.dims[0], value.dims[1], num_attention_heads, head_dim});
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
if (pastKey.dims.empty()) {
// Compute new_ntk_alpha for this prompt length
float context_value = std::log2((float) seqlen / seq_length) + 1;
float new_ntk_alpha = std::max(std::pow(2, std::ceil(context_value) - 1), 1.);
if (new_ntk_alpha != ntk_alpha) {
UpdateRotaryPosEmb(new_ntk_alpha);
}
}
LlamaRotatePosition2D(query, positionIds, sinData, cosData, rotary_dim);
LlamaRotatePosition2D(key, positionIds, sinData, cosData, rotary_dim);
if (use_log_attn) {
ApplyLognAttn(query, logn_list, positionIds);
}
PermuteSelf(query, {0, 2, 1, 3});
PermuteSelf(key, {0, 2, 1, 3});
PermuteSelf(value, {0, 2, 1, 3});
std::vector<int> qkvSize = {batch * num_attention_heads, seqlen, -1};
query.Reshape(qkvSize);
key.Reshape(qkvSize);
value.Reshape(qkvSize);
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
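// Grow the KV cache (pastKey/pastValue) in unitLen-aligned chunks so CatDirect below can append new entries in place without reallocating on every step.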
while ((pastKey.dims.size() == 0 && (pastKey.expansionDims.size() == 0 || key.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + key.dims[1] > pastKey.expansionDims[1])) {
std::vector <int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector <int> {key.dims[0], ((key.dims[1] - 1) / unitLen + 1) * unitLen, key.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((key.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 && (pastValue.expansionDims.size() == 0 || value.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + value.dims[1] > pastValue.expansionDims[1])) {
std::vector <int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector <int> {value.dims[0], ((value.dims[1] - 1) / unitLen + 1) * unitLen, value.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((value.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, key, 1);
CatDirect(pastValue, value, 1);
// Attention
MatMulTransB(query, pastKey, attnWeights, 1.0 / sqrt(head_dim));
attnWeights.Reshape({1, attnWeights.dims[0], attnWeights.dims[1], attnWeights.dims[2]});
if (!attentionMask.dims.empty()) {
AttentionMask(attnWeights, attentionMask, -10000);
}
Softmax(attnWeights, attnWeights, -1);
MatMul(attnWeights, pastValue, attnOutput);
attnOutput.Reshape({attnOutput.dims[1], attnOutput.dims[2], attnOutput.dims[3]});
PermuteSelf(attnOutput, {1, 0, 2});
attnOutput.Reshape({seqlen, batch, -1});
PermuteSelf(attnOutput, {1, 0, 2});
std::string proj_weight_name = "transformer.h." + std::to_string(i) + ".attn.c_proj.weight";
Linear(attnOutput, weight[proj_weight_name], Data(), attnLastOutput);
AddTo(hiddenStates, attnLastOutput);
std::string ln_2_name = "transformer.h." + std::to_string(i) + ".ln_2.weight";
RMSNorm(hiddenStates, weight[ln_2_name], 1e-6, attnInput);
std::string mlp_w1_weight_name = "transformer.h." + std::to_string(i) + ".mlp.w1.weight";
std::string mlp_w2_weight_name = "transformer.h." + std::to_string(i) + ".mlp.w2.weight";
std::string mlp_proj_weight_name = "transformer.h." + std::to_string(i) + ".mlp.c_proj.weight";
Linear(attnInput, weight[mlp_w1_weight_name], Data(), a1);
Linear(attnInput, weight[mlp_w2_weight_name], Data(), a2);
Silu(a2, a2);
MulTo(a1, a2);
Linear(a1, weight[mlp_proj_weight_name], Data(), mlpOutput);
AddTo(hiddenStates, mlpOutput);
}
RMSNorm(hiddenStates, weight["transformer.ln_f.weight"], 1e-6, hiddenStates);
Data logits, topk;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
std::vector <int> lastRet;
int total = 0;
Data curLogitTemp, curLogit;
for (int b = 0; b < batch; b++) {
Split(logits, 0, b, b + 1, curLogitTemp);
Split(curLogitTemp, 1, maxLen - 1, maxLen, curLogit);
if (generationConfig.output_logits && retLogits != nullptr && (*retLogits)[b] != nullptr) {
curLogit.ToDevice(DataDevice::CPU);
(*retLogits)[b]->resize(curLogit.Count(0));
memcpy((float*)(*retLogits)[b]->data(), (float*)curLogit.cpuData, curLogit.GetBytes());
}
if (generationConfig.IsSimpleGreedy()) {
Data topk;
TopK(curLogit, topk, 1);
topk.ToDevice(DataDevice::CPU);
lastRet.push_back((int) (((float *) topk.cpuData)[0] + 1e-3));
} else {
lastRet.push_back(LLMSampling(curLogit, 0, generationConfig, lastTokens.units[b]));
}
total += maxLen;
}
return lastRet;
}
std::vector <int> QWenModel::ForwardBatch(int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
int maxLen = inputIds.dims[1];
Data hiddenStates;
Data attnInput, attnOutput;
Data query, key, value;
Data attnWeights, attnLastOutput;
Data a1, a2, mlpOutput;
Embedding(inputIds, this->weight["transformer.wte.weight"], hiddenStates);
for (int i = 0; i < this->block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
std::string ln_1_name = "transformer.h." + std::to_string(i) + ".ln_1.weight";
std::string attn_weight_name = "transformer.h." + std::to_string(i) + ".attn.c_attn.weight";
std::string attn_bias_name = "transformer.h." + std::to_string(i) + ".attn.c_attn.bias";
RMSNorm(hiddenStates, weight[ln_1_name], 1e-6, attnInput);
Linear(attnInput, weight[attn_weight_name], weight[attn_bias_name], attnOutput); // attnOutput [batch, seqlen, embed_dim * 3]
Split(attnOutput, 2, 0, embed_dim, query);
Split(attnOutput, 2, embed_dim, 2 * embed_dim, key);
Split(attnOutput, 2, embed_dim * 2, embed_dim * 3, value);
std::vector<Data> curKs, curVs, curQs;
curKs.resize(batch);
curVs.resize(batch);
curQs.resize(batch);
int total = 0;
for (int b = 0; b < batch; b++) {
Split(query, 1, total, total + seqLens[b], curQs[b]);
Split(key, 1, total, total + seqLens[b], curKs[b]);
Split(value, 1, total, total + seqLens[b], curVs[b]);
total += seqLens[b];
}
Data attnOutputAll = Data(DataType::FLOAT32);
for (int b = 0; b < batch; b++) {
// in this loop, batch = 1
auto &query = curQs[b];
auto &key = curKs[b];
auto &value = curVs[b];
query.Reshape({1, seqLens[b], num_attention_heads, head_dim});
key.Reshape({1, seqLens[b], num_attention_heads, head_dim});
value.Reshape({1, seqLens[b], num_attention_heads, head_dim});
Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt + i].second;
if (pastKey.dims.empty()) {
// Compute new_ntk_alpha for this prompt length
float context_value = std::log2((float) seqLens[b] / seq_length) + 1;
float new_ntk_alpha = std::max(std::pow(2, std::ceil(context_value) - 1), 1.);
if (new_ntk_alpha != ntk_alpha) {
UpdateRotaryPosEmb(new_ntk_alpha);
}
}
LlamaRotatePosition2D(query, *positionIds[b], sinData, cosData, rotary_dim);
LlamaRotatePosition2D(key, *positionIds[b], sinData, cosData, rotary_dim);
if (use_log_attn) {
ApplyLognAttn(query, logn_list, *positionIds[b]);
}
PermuteSelf(query, {0, 2, 1, 3});
PermuteSelf(key, {0, 2, 1, 3});
PermuteSelf(value, {0, 2, 1, 3});
std::vector<int> qkvSize = {num_attention_heads, seqLens[b], -1};
query.Reshape(qkvSize);
key.Reshape(qkvSize);
value.Reshape(qkvSize);
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 && (pastKey.expansionDims.size() == 0 || key.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + key.dims[1] > pastKey.expansionDims[1])) {
std::vector <int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector <int> {key.dims[0], ((key.dims[1] - 1) / unitLen + 1) * unitLen, key.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((key.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 && (pastValue.expansionDims.size() == 0 || value.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + value.dims[1] > pastValue.expansionDims[1])) {
std::vector <int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector <int> {value.dims[0], ((value.dims[1] - 1) / unitLen + 1) * unitLen, value.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((value.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, key, 1);
CatDirect(pastValue, value, 1);
MatMulTransB(query, pastKey, attnWeights, 1.0 / sqrt(head_dim));
attnWeights.Reshape({1, attnWeights.dims[0], attnWeights.dims[1], attnWeights.dims[2]});
if (attentionMask[b]) {
AttentionMask(attnWeights, *attentionMask[b], -10000);
}
Softmax(attnWeights, attnWeights, -1);
MatMul(attnWeights, pastValue, attnOutput);
attnOutput.Reshape({attnOutput.dims[1], attnOutput.dims[2], attnOutput.dims[3]});
PermuteSelf(attnOutput, {1, 0, 2});
attnOutput.Reshape({seqLens[b], 1, -1});
PermuteSelf(attnOutput, {1, 0, 2});
if (attnOutputAll.dims.size() == 0) {
std::vector <int> dims = attnOutput.dims;
dims[1] = total;
attnOutputAll.Expansion(dims);
}
CatDirect(attnOutputAll, attnOutput, 1);
}
std::string proj_weight_name = "transformer.h." + std::to_string(i) + ".attn.c_proj.weight";
Linear(attnOutputAll, weight[proj_weight_name], Data(), attnLastOutput);
AddTo(hiddenStates, attnLastOutput);
std::string ln_2_name = "transformer.h." + std::to_string(i) + ".ln_2.weight";
RMSNorm(hiddenStates, weight[ln_2_name], 1e-6, attnInput);
std::string mlp_w1_weight_name = "transformer.h." + std::to_string(i) + ".mlp.w1.weight";
std::string mlp_w2_weight_name = "transformer.h." + std::to_string(i) + ".mlp.w2.weight";
std::string mlp_proj_weight_name = "transformer.h." + std::to_string(i) + ".mlp.c_proj.weight";
Linear(attnInput, weight[mlp_w1_weight_name], Data(), a1);
Linear(attnInput, weight[mlp_w2_weight_name], Data(), a2);
Silu(a2, a2);
MulTo(a1, a2);
Linear(a1, weight[mlp_proj_weight_name], Data(), mlpOutput);
AddTo(hiddenStates, mlpOutput);
}
RMSNorm(hiddenStates, weight["transformer.ln_f.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
std::vector <int> lastRet;
int total = 0;
Data curLogit;
for (int b = 0; b < batch; b++) {
Split(logits, 1, total + seqLens[b] - 1, total + seqLens[b], curLogit);
if (generationConfigs[b].output_logits && retLogits != nullptr && (*retLogits)[b] != nullptr) {
curLogit.ToDevice(DataDevice::CPU);
(*retLogits)[b]->resize(curLogit.Count(0));
memcpy((float*)(*retLogits)[b]->data(), (float*)curLogit.cpuData, curLogit.GetBytes());
}
if (generationConfigs[b].IsSimpleGreedy()) {
Data topk;
TopK(curLogit, topk, 1);
topk.ToDevice(DataDevice::CPU);
lastRet.push_back((int) (((float *) topk.cpuData)[0] + 1e-3));
} else {
lastRet.push_back(LLMSampling(curLogit, 0, generationConfigs[b], lastTokens.units[b]));
}
total += seqLens[b];
}
return lastRet;
}
std::string QWenModel::MakeInput(const std::string &history, int round, const std::string &input) {
if (weight.dicts["chat_format"] == "chatml") {
return (round == 0 ? im_start + "system" + "\n" + pre_prompt + im_end : history) +
"\n" + im_start + user_role + "\n" + input + im_end + "\n" + im_start + bot_role + "\n";
} else if (weight.dicts["chat_format"] == "raw") {
return history + input;
} else {
ErrorInFastLLM("Unknown char_format for QWen: " + weight.dicts["chat_format"]);
return "";
}
}
std::string QWenModel::MakeHistory(const std::string &history, int round,
const std::string &input, const std::string &output) {
if (weight.dicts["chat_format"] == "chatml") {
return (round == 0 ? im_start + "system" + "\n" + pre_prompt + im_end : history) +
"\n" + im_start + user_role + "\n" + input + im_end + "\n" + im_start + bot_role + "\n" + output + im_end;
} else if (weight.dicts["chat_format"] == "raw") {
return history + input + output;
} else {
ErrorInFastLLM("Unknown char_format for QWen: " + weight.dicts["chat_format"]);
return "";
}
}
void QWenModel::FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds) {
int index = params.find("index")->second;
int promptLen = params.find("promptLen")->second;
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
if (index == 0) {
int seqLen = inputTokens[0].size();
std::vector <float> vmask = std::vector <float> (seqLen * seqLen, 0);
std::vector<float> vpids = std::vector<float>(seqLen, 0);
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {seqLen, seqLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, vpids));
} else {
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, inputTokens[0]));
attentionMask.CopyFrom(Data());
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float) (promptLen + index - 1)}));
}
}
void QWenModel::FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds) {
int batch = inputTokens.size();
int index = params[0].find("index")->second;
int promptLen = params[0].find("promptLen")->second;
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
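// index == 0 is the prefill step: build full-prompt inputs with a causal mask; later steps feed one token per sequence with an empty mask, relying on the KV cache.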
if (index == 0) {
int seqLen = inputTokens[0].size();
std::vector<float> ids = std::vector<float>(batch * seqLen, 0);
std::vector <float> vmask = std::vector <float> (batch * seqLen * seqLen, 0);
std::vector<float> vpids = std::vector<float>(batch * seqLen, 0);
for (int b = 0; b < batch; b++) {
for (int i = 0; i < seqLen; i++) {
ids[b * seqLen + i] = inputTokens[b][i];
}
}
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
for (int b = 1; b < batch; b++) {
memcpy(vmask.data() + b * seqLen * seqLen, vmask.data(), seqLen * seqLen * sizeof(float));
memcpy(vpids.data() + b * seqLen, vpids.data(), seqLen * sizeof(float));
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen}, ids));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen, seqLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen}, vpids));
} else {
std::vector<float> ids = std::vector<float>(batch * 1, 0);
std::vector<float> vpids = std::vector<float>(batch * 1, 0);
for (int b = 0; b < batch; b++) {
ids[b] = inputTokens[b][0];
vpids[b] = (float) (promptLen + index - 1);
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, ids));
attentionMask.CopyFrom(Data());
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, vpids));
}
}
void QWenModel::WarmUp() {
printf("Warmup...\n");
Data inputIds = Data(DataType::FLOAT32, {1, 1}, {1});
Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
Data positionIds = Data(DataType::FLOAT32, {1, 1}, {0, 0});
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
Forward(inputIds, attentionMask, positionIds, pastKeyValues);
#ifdef USE_TFACC40T
FastllmTfaccReleaseTempMemory();
#endif
printf("finish.\n");
}
void QWenModel::UpdateRotaryPosEmb(float ntk_alpha) {
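// NTK-aware scaling: raising the RoPE base by ntk_alpha^(rotary_dim / (rotary_dim - 2)) stretches the rotary period so longer contexts stay within the trained frequency range.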
float base = 10000 * pow(ntk_alpha, (float) rotary_dim / (rotary_dim - 2));
if (sin.empty() || cos.empty()) {
sin.resize(max_positions);
cos.resize(max_positions);
}
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
invFreq.push_back(1.0 / pow(base, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
sin[i][j] = ::sin((float)i * invFreq[j]);
cos[i][j] = ::cos((float)i * invFreq[j]);
}
}
std::vector <float> fsin, fcos;
for (int i = 0; i < sin.size(); i++) {
for (int j = 0; j < sin[0].size(); j++) {
fsin.push_back(sin[i][j]);
fcos.push_back(cos[i][j]);
}
}
sinData.ToDevice(DataDevice::CPU);
cosData.ToDevice(DataDevice::CPU);
sinData.CopyFrom(Data(DataType::FLOAT32, {(int)this->sin.size(), (int)this->sin[0].size()}, fsin));
cosData.CopyFrom(Data(DataType::FLOAT32, {(int)this->cos.size(), (int)this->cos[0].size()}, fcos));
}
}
#include "model.h"
#include "factoryllm.h"
#ifdef PY_API
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/chrono.h>
#include <pybind11/functional.h>
#include <unordered_map>
namespace py = pybind11;
using namespace pybind11::literals;
// template <typename... Args>
// using overload_cast_ = pybind11::detail::overload_cast_impl<Args...>;
using pastKV = std::vector<std::pair<fastllm::Data,fastllm::Data>>;
// PYBIND11_MAKE_OPAQUE(std::vector<std::pair<fastllm::Data,fastllm::Data>>);
PYBIND11_MAKE_OPAQUE(fastllm::Data);
PYBIND11_MODULE(pyfastllm, m) {
m.doc() = "fastllm python bindings";
py::class_<fastllm::GenerationConfig>(m, "GenerationConfig")
.def(py::init<>())
.def_readwrite("max_length", &fastllm::GenerationConfig::output_token_limit)
.def_readwrite("last_n", &fastllm::GenerationConfig::last_n)
.def_readwrite("repeat_penalty", &fastllm::GenerationConfig::repeat_penalty)
.def_readwrite("top_k", &fastllm::GenerationConfig::top_k)
.def_readwrite("top_p", &fastllm::GenerationConfig::top_p)
.def_readwrite("temperature", &fastllm::GenerationConfig::temperature)
.def_readwrite("enable_hash_id", &fastllm::GenerationConfig::enable_hash_id)
.def("is_simple_greedy", &fastllm::GenerationConfig::IsSimpleGreedy);
// high level
m.def("set_threads", &fastllm::SetThreads)
.def("get_threads", &fastllm::GetThreads)
.def("set_low_memory", &fastllm::SetLowMemMode)
.def("get_low_memory", &fastllm::GetLowMemMode)
.def("set_kv_cache", &fastllm::SetKVCacheInCPU)
.def("get_kv_cache", &fastllm::GetKVCacheInCPU)
.def("set_device_map", &fastllm::SetDeviceMap)
.def("create_llm", &fastllm::CreateLLMModelFromFile);
m.def("std_hash", [](std::string input) -> size_t {
return std::hash<std::string>{}(input);
});
// low level
m.def("get_llm_type", &fastllm::GetModelTypeFromFile);
py::enum_<fastllm::DataType>(m, "Dtype")
.value("float32", fastllm::DataType::FLOAT32)
.value("bfloat16", fastllm::DataType::BFLOAT16)
.value("int16", fastllm::DataType::INT16)
.value("int8", fastllm::DataType::INT8)
.value("int4", fastllm::DataType::INT4)
.value("int2", fastllm::DataType::INT2)
.value("float16", fastllm::DataType::FLOAT16)
.value("bit", fastllm::DataType::BIT)
.value("int32param", fastllm::DataType::INT32PARAM)
.export_values();
py::class_<fastllm::Data>(m, "Tensor")
.def_readonly("dims", &fastllm::Data::dims)
.def(py::init<>())
.def(py::init<fastllm::DataType>())
.def(py::init<fastllm::DataType, const std::vector<int>&>())
.def(py::init<fastllm::DataType, const std::vector<int>&, const std::vector<float>&>())
.def(py::init<fastllm::Data>())
.def("copy_from", &fastllm::Data::CopyFrom)
.def("count", &fastllm::Data::Count)
.def("to_list", [](fastllm::Data& data){
std::vector <float> vecData;
for (int i = 0; i < data.Count(0); i++) {
vecData.push_back(((float*)data.cpuData)[i]);
}
return vecData;
})
.def("print", &fastllm::Data::Print)
.def("to", static_cast<void (fastllm::Data::*)(void *device)>(&fastllm::Data::ToDevice));
m.def("zeros", [](const std::vector<int> &dims, fastllm::DataType dtype)->fastllm::Data {
int nums = 1;
for (auto dim:dims){nums *= dim; }
std::vector<float>zero_data(nums, 0);
auto data = fastllm::Data(dtype, dims, zero_data);
return data;
}, py::arg("dims"), py::arg("dtype"));
m.def("cat", [](std::vector<fastllm::Data> datas, int dim)->fastllm::Data {
// int pos_dim = 0;
// // dim check
// for (int i=0;i<datas[0].dims.size();i++){
// int cur_dim = datas[0].dims[i];
// for (auto data:datas){
// if (i == dim){
// pos_dim += data.dims[i];
// continue;
// }
// if (data.dims[i] != cur_dim){
// std::cout<<"dim not the same!!!"<<std::endl;
// return fastllm::Data();
// }
// }
// }
// auto newDims = datas[0].dims;
// newDims[dim] = pos_dim;
// TODO use memcpy cp data
// TODO add different dim cat
std::vector <float> vecData;
for (auto data:datas){
for (int i = 0; i < data.Count(0); i++) {
vecData.push_back(((float*)data.cpuData)[i]);
}
}
int seqLen = vecData.size();
return fastllm::Data(fastllm::DataType::FLOAT32, {1, seqLen}, vecData);
});
py::class_<fastllm::Tokenizer>(m, "Tokenizer")
.def("encode", &fastllm::Tokenizer::Encode)
// .def("decode", &fastllm::Tokenizer::Decode)
.def("decode", &fastllm::Tokenizer::Decode, "Decode from Tensor")
.def("decode", &fastllm::Tokenizer::DecodeTokens, "Decode from Vector")
.def("decode_byte", [](fastllm::Tokenizer &tokenizer, const fastllm::Data &data){
std::string ret = tokenizer.Decode(data);
return py::bytes(ret);
})
.def("decode_byte", [](fastllm::Tokenizer &tokenizer, const std::vector<int>& data){
std::string ret = tokenizer.DecodeTokens(data);
return py::bytes(ret);
})
.def("clear", &fastllm::Tokenizer::Clear)
.def("insert", &fastllm::Tokenizer::Insert);
py::class_<fastllm::WeightMap>(m, "WeightMap")
.def_readonly("tokenizer", &fastllm::WeightMap::tokenizer)
.def("save_lowbit", &fastllm::WeightMap::SaveLowBitModel)
.def("set_kv", &fastllm::WeightMap::AddDict)
.def("set_weight", &fastllm::WeightMap::AddWeight)
.def("__getitem__", [](fastllm::WeightMap &weight, std::string key){
return weight[key]; });
// model classes
py::class_<fastllm::basellm>(m, "basellm");
py::class_<fastllm::ChatGLMModel, fastllm::basellm>(m, "ChatGLMModel")
.def(py::init<>())
.def_readonly("model_type", &fastllm::ChatGLMModel::model_type)
.def_readonly("weight", &fastllm::ChatGLMModel::weight)
.def_readonly("block_cnt", &fastllm::ChatGLMModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::ChatGLMModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::ChatGLMModel::eos_token_id)
.def("load_weights", &fastllm::ChatGLMModel::LoadFromFile)
.def("make_input", &fastllm::ChatGLMModel::MakeInput)
.def("make_history", &fastllm::ChatGLMModel::MakeHistory)
.def("response", &fastllm::ChatGLMModel::Response)
.def("batch_response", [](fastllm::ChatGLMModel &model,
const std::vector <std::string> &inputs,
RuntimeResultBatch retCb,
fastllm::GenerationConfig config)->std::vector<std::string> {
std::vector <std::string> outputs;
model.ResponseBatch(inputs, outputs, retCb, config);
return outputs;
})
.def("warmup", &fastllm::ChatGLMModel::WarmUp)
.def("forward",
[](fastllm::ChatGLMModel &model,
const fastllm::Data &inputIds,
const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<fastllm::Data, fastllm::Data>> &pastKeyValues,
const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) {
int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
return std::make_tuple(retV, pastKeyValues);
})
.def("launch_response", &fastllm::ChatGLMModel::LaunchResponseTokens)
.def("fetch_response", &fastllm::ChatGLMModel::FetchResponseTokens)
.def("save_lowbit_model", &fastllm::ChatGLMModel::SaveLowBitModel)
.def("make_input", &fastllm::ChatGLMModel::MakeInput);
py::class_<fastllm::MOSSModel, fastllm::basellm>(m, "MOSSModel")
.def(py::init<>())
.def_readonly("model_type", &fastllm::MOSSModel::model_type)
.def_readonly("weight", &fastllm::MOSSModel::weight)
.def_readonly("block_cnt", &fastllm::MOSSModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::MOSSModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::MOSSModel::eos_token_id)
.def("load_weights", &fastllm::MOSSModel::LoadFromFile)
.def("make_input", &fastllm::MOSSModel::MakeInput)
.def("make_history", &fastllm::MOSSModel::MakeHistory)
.def("response", &fastllm::MOSSModel::Response)
.def("batch_response", [](fastllm::MOSSModel &model,
const std::vector <std::string> &inputs,
RuntimeResultBatch retCb,
fastllm::GenerationConfig config)->std::vector<std::string> {
std::vector <std::string> outputs;
model.ResponseBatch(inputs, outputs, retCb, config);
return outputs;
})
.def("forward",
[](fastllm::MOSSModel &model,
const fastllm::Data &inputIds,
const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<fastllm::Data, fastllm::Data>> &pastKeyValues,
const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) {
int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
return std::make_tuple(retV, pastKeyValues);
})
.def("launch_response", &fastllm::MOSSModel::LaunchResponseTokens)
.def("fetch_response", &fastllm::MOSSModel::FetchResponseTokens)
.def("save_lowbit_model", &fastllm::MOSSModel::SaveLowBitModel)
.def("make_input", &fastllm::MOSSModel::MakeInput);
py::class_<fastllm::LlamaModel, fastllm::basellm>(m, "LlamaModel")
.def(py::init<>())
.def_readonly("model_type", &fastllm::LlamaModel::model_type)
.def_readonly("weight", &fastllm::LlamaModel::weight)
.def_readonly("block_cnt", &fastllm::LlamaModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::LlamaModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::LlamaModel::eos_token_id)
.def("load_weights", &fastllm::LlamaModel::LoadFromFile)
.def("make_input", &fastllm::LlamaModel::MakeInput)
.def("make_history", &fastllm::LlamaModel::MakeHistory)
.def("response", &fastllm::LlamaModel::Response)
.def("batch_response", [](fastllm::LlamaModel &model,
const std::vector <std::string> &inputs,
RuntimeResultBatch retCb,
fastllm::GenerationConfig config)->std::vector<std::string> {
std::vector <std::string> outputs;
model.ResponseBatch(inputs, outputs, retCb, config);
return outputs;
})
.def("warmup", &fastllm::LlamaModel::WarmUp)
.def("forward",
[](fastllm::LlamaModel &model,
const fastllm::Data &inputIds,
const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<fastllm::Data, fastllm::Data>> &pastKeyValues,
const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) {
int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
return std::make_tuple(retV, pastKeyValues);
})
.def("launch_response", &fastllm::LlamaModel::LaunchResponseTokens)
.def("fetch_response", &fastllm::LlamaModel::FetchResponseTokens)
.def("save_lowbit_model", &fastllm::LlamaModel::SaveLowBitModel)
.def("make_input", &fastllm::LlamaModel::MakeInput);
py::class_<fastllm::QWenModel, fastllm::basellm>(m, "QWenModel")
.def(py::init<>())
.def_readonly("model_type", &fastllm::QWenModel::model_type)
.def_readonly("weight", &fastllm::QWenModel::weight)
.def_readonly("block_cnt", &fastllm::QWenModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::QWenModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::QWenModel::eos_token_id)
.def("load_weights", &fastllm::QWenModel::LoadFromFile)
.def("make_input", &fastllm::QWenModel::MakeInput)
.def("make_history", &fastllm::QWenModel::MakeHistory)
.def("response", &fastllm::QWenModel::Response)
.def("batch_response", [](fastllm::QWenModel &model,
const std::vector <std::string> &inputs,
RuntimeResultBatch retCb,
fastllm::GenerationConfig config)->std::vector<std::string> {
std::vector <std::string> outputs;
model.ResponseBatch(inputs, outputs, retCb, config);
return outputs;
})
.def("warmup", &fastllm::QWenModel::WarmUp)
.def("forward",
[](fastllm::QWenModel &model,
const fastllm::Data &inputIds,
const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<fastllm::Data, fastllm::Data>> &pastKeyValues,
const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) {
int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
return std::make_tuple(retV, pastKeyValues);
})
.def("launch_response", &fastllm::QWenModel::LaunchResponseTokens)
.def("fetch_response", &fastllm::QWenModel::FetchResponseTokens)
.def("save_lowbit_model", &fastllm::QWenModel::SaveLowBitModel)
.def("make_input", &fastllm::QWenModel::MakeInput);
#ifdef VERSION_INFO
m.attr("__version__") = VERSION_INFO;
#else
m.attr("__version__") = "dev";
#endif
}
#endif
CMMLU is a comprehensive Chinese evaluation benchmark designed to measure a language model's knowledge and reasoning ability in Chinese contexts.
Project homepage: https://github.com/haonan-li/CMMLU
The chatglm.py script in this directory runs the benchmark through the fastllm framework.
The test steps are as follows:
- 1. Clone the CMMLU repository
``` sh
git clone https://github.com/haonan-li/CMMLU
```
- 2. Run the tests
```
# ChatGLM test script
# model_name_or_path can point to the official ChatGLM2-6b model or its int4 variant; dtype supports float16, int8, int4
python3 chatglm.py --model_name_or_path <path to model> --save_dir <path to save results> --dtype float16
# Baichuan-13B test script
# model_name_or_path can point to the official Baichuan13B-Base or Baichuan13B-Chat model; dtype supports float16, int8, int4
python3 baichuan.py --model_name_or_path <path to model> --save_dir <path to save results> --dtype float16
```
The dataset is large and a full run takes a long time; partial scores can be checked at any point with:
```
python3 eval.py <path to save results>
```
- 3. Reference results
| Model | Data precision | Shot | CMMLU score |
|-----------------------: |-------- |----------|-----------|
| ChatGLM2-6b-fp16 | float32 |0 | 50.16 |
| ChatGLM2-6b-int8 | float32 |0 | 50.14 |
| ChatGLM2-6b-int4 | float32 |0 | 49.63 |
| QWen-7b-Base-fp16 | float32 |0 | 57.43 |
| QWen-7b-Chat-fp16 | float32 |0 | 54.82 |
| Baichuan-13b-Base-int8 | float32 |5 | 55.12 |
| Baichuan-13b-Base-int4 | float32 |5 | 52.22 |
import os
import torch
import numpy as np
import argparse
from CMMLU.src.mp_utils import choices, format_example, gen_prompt, softmax, run_eval
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
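# Score one CMMLU subject: for each question, build the few-shot prompt, read the model's logits for the four option tokens, and predict the option with the highest logit.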
def eval(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot):
choice_ids = [tokenizer.convert_tokens_to_ids(choice) for choice in choices]
cors = []
all_conf = []
all_preds = []
answers = choices[: test_df.shape[1] - 2]
for i in range(test_df.shape[0]):
prompt_end = format_example(test_df, i, subject, include_answer=False)
prompt = gen_prompt(dev_df=dev_df,
subject=subject,
prompt_end=prompt_end,
num_few_shot=num_few_shot,
tokenizer=tokenizer,
max_length=max_length)
label = test_df.iloc[i, test_df.shape[1] - 1]
logits = model.response_logits(prompt, tokenizer=tokenizer)
sel = 0
for j in range(4):
    if logits[choice_ids[j]] > logits[choice_ids[sel]]:
        sel = j
pred = choices[sel]
conf = [logits[choice_ids[j]] for j in range(4)]
all_preds += pred
all_conf.append(conf)
cors.append(pred == label)
print(i, np.mean(cors))
acc = np.mean(cors)
print("Average accuracy {:.3f} - {}".format(acc, subject))
return acc, all_preds, all_conf
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default="")
parser.add_argument("--lora_weights", type=str, default="")
parser.add_argument("--data_dir", type=str, default="./CMMLU/data")
parser.add_argument("--save_dir", type=str, default="../results/not_specified")
parser.add_argument("--num_few_shot", type=int, default=0)
parser.add_argument("--max_length", type=int, default=2048)
parser.add_argument("--load_in_8bit", action='store_true')
parser.add_argument("--dtype", type=str, default="float16")
parser.add_argument("--with_conf", action='store_true')
parser.add_argument("--cot", action='store_true')
args = parser.parse_args()
# TODO: better handle
tokenizer_class = LlamaTokenizer if 'llama' in args.model_name_or_path else AutoTokenizer
model_class = LlamaForCausalLM if 'llama' in args.model_name_or_path else AutoModelForCausalLM
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, trust_remote_code=True)
model = model_class.from_pretrained(args.model_name_or_path,
trust_remote_code=True,
load_in_8bit=args.load_in_8bit,
torch_dtype=torch.float16,
device_map="cpu"
)
if args.lora_weights != "":
model = PeftModel.from_pretrained(
model,
args.lora_weights,
torch_dtype=torch.float16,
)
from fastllm_pytools import llm
model = llm.from_hf(model, tokenizer, dtype=args.dtype)
model.direct_query = True
run_eval(model, tokenizer, eval, args)
name_en2zh = {
"agronomy": "农学",
"anatomy": "解剖学",
"ancient_chinese": "古汉语",
"arts": "艺术学",
"astronomy": "天文学",
"business_ethics": "商业伦理",
"chinese_civil_service_exam": "中国公务员考试",
"chinese_driving_rule": "中国驾驶规则",
"chinese_food_culture": "中国饮食文化",
"chinese_foreign_policy": "中国外交政策",
"chinese_history":"中国历史",
"chinese_literature": "中国文学",
"chinese_teacher_qualification": "中国教师资格",
"clinical_knowledge": "临床知识",
"college_actuarial_science":"大学精算学",
"college_education":"大学教育学",
"college_engineering_hydrology": "大学工程水文学",
"college_law": "大学法律",
"college_mathematics": "大学数学",
"college_medical_statistics":"大学医学统计",
"college_medicine": "大学医学",
"computer_science": "计算机科学",
"computer_security": "计算机安全",
"conceptual_physics": "概念物理学",
"construction_project_management": "建设工程管理",
"economics": "经济学",
"education": "教育学",
"electrical_engineering": "电气工程",
"elementary_chinese":"小学语文",
"elementary_commonsense":"小学常识",
"elementary_information_and_technology": "小学信息技术",
"elementary_mathematics": "初等数学",
"ethnology": "民族学",
"food_science": "食品科学",
"genetics": "遗传学",
"global_facts": "全球事实",
"high_school_biology": "高中生物",
"high_school_chemistry": "高中化学",
"high_school_geography": "高中地理",
"high_school_mathematics": "高中数学",
"high_school_physics": "高中物理学",
"high_school_politics": "高中政治",
"human_sexuality": "人类性行为",
"international_law": "国际法学",
"journalism": "新闻学",
"jurisprudence": "法理学",
"legal_and_moral_basis": "法律与道德基础",
"logical": "逻辑学",
"machine_learning": "机器学习",
"management": "管理学",
"marketing": "市场营销",
"marxist_theory": "马克思主义理论",
"modern_chinese": "现代汉语",
"nutrition": "营养学",
"philosophy": "哲学",
"professional_accounting": "专业会计",
"professional_law": "专业法学",
"professional_medicine": "专业医学",
"professional_psychology": "专业心理学",
"public_relations": "公共关系",
"security_study":"安全研究",
"sociology": "社会学",
"sports_science": "体育学",
"traditional_chinese_medicine": "中医中药",
"virology": "病毒学",
"world_history":"世界历史",
"world_religions": "世界宗教",
}
subcategories = {
"agronomy": ['other'],
"anatomy": ['biology'],
"ancient_chinese": ['linguistics','china specific'],
"arts": ['arts'],
"astronomy": ['physics'],
"business_ethics": ['business'],
"chinese_civil_service_exam": ['politics','china specific'],
"chinese_driving_rule": ['other','china specific'],
"chinese_food_culture": ['culture','china specific'],
"chinese_foreign_policy": ['politics','china specific'],
"chinese_history":['history','china specific'],
"chinese_literature": ['literature','china specific'],
"chinese_teacher_qualification": ['education','china specific'],
"college_actuarial_science":['math'],
"college_education":['education'],
"college_engineering_hydrology": ['engineering'],
"college_law": ['law'],
"college_mathematics": ['math'],
"college_medical_statistics":['statistics'],
"clinical_knowledge": ['other'],
"college_medicine": ['other'],
"computer_science": ['computer science'],
"computer_security": ['other'],
"conceptual_physics": ['physics'],
"construction_project_management": ['other','china specific'],
"economics": ['economics'],
"education": ['education'],
"elementary_chinese":['linguistics','china specific'],
"elementary_commonsense":['other','china specific'],
"elementary_information_and_technology": ['other'],
"electrical_engineering": ['engineering'],
"elementary_mathematics": ['math'],
"ethnology": ['culture','china specific'],
"food_science": ['other'],
"genetics": ['biology'],
"global_facts": ['global'],
"high_school_biology": ['biology'],
"high_school_chemistry": ['chemistry'],
"high_school_geography": ['geography'],
"high_school_mathematics": ['math'],
"high_school_physics": ['physics'],
"high_school_politics": ['politics','china specific'],
"human_sexuality": ['other'],
"international_law": ['law'],
"journalism": ['sociology'],
"jurisprudence": ['law'],
"legal_and_moral_basis": ['other'],
"logical": ['philosophy'],
"machine_learning": ['computer science'],
"management": ['business'],
"marketing": ['business'],
"marxist_theory": ['philosophy'],
"modern_chinese": ['linguistics','china specific'],
"nutrition": ['other'],
"philosophy": ['philosophy'],
"professional_accounting": ['business'],
"professional_law": ['law'],
"professional_medicine": ['other'],
"professional_psychology": ['psychology'],
"public_relations": ['politics'],
"security_study": ['politics'],
"sociology": ['culture'],
"sports_science": ['other'],
"traditional_chinese_medicine": ['other','china specific'],
"virology": ['biology'],
"world_history":['history'],
"world_religions": ['global'],
}
categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
"Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
"Social Science": ['linguistics',"business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
"Other":["other"],
"China specific": ["china specific"],
}
import os
import torch
import numpy as np
import argparse
from CMMLU.src.mp_utils import choices, format_example, gen_prompt, softmax, run_eval
from transformers import AutoModel, AutoTokenizer
import threading
def chat(model, tokenizer, prompt, output_list, idx):
pred, history = model.chat(tokenizer, prompt, history=[], max_length = 5)
if not pred or pred[0] not in choices:
pred, history = model.chat(tokenizer, prompt, history=[], max_length = 1000)
output_list[idx] = pred
def eval_chat_multithread(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot):
cors = []
all_preds = []
answers = choices[: test_df.shape[1] - 2]
batch_num = 64
output_list = ["" for i in range(test_df.shape[0])]
ths = [None for i in range(test_df.shape[0])]
for j in range(0, test_df.shape[0], batch_num):
cur_len = min(test_df.shape[0] - j, batch_num)
for i in range(j, j + cur_len):
prompt_end = format_example(test_df, i, subject, include_answer=False, cot=cot)
prompt = gen_prompt(dev_df=dev_df,
subject=subject,
prompt_end=prompt_end,
num_few_shot=num_few_shot,
tokenizer=tokenizer,
max_length=max_length,
cot=cot)
ths[i] = threading.Thread(target = chat, args=(model, tokenizer, prompt, output_list, i))
ths[i].start()
for i in range(j, j + cur_len):
ths[i].join()
pred = output_list[i]
label = test_df.iloc[i, test_df.shape[1] - 1]
if pred and pred[0] in choices:
cors.append(pred[0] == label)
all_preds.append(pred.replace("\n", ""))
print(i, test_df.shape[0], np.mean(cors))
acc = np.mean(cors)
print("Average accuracy {:.3f} - {}".format(acc, subject))
print("{} results, {} inappropriate formated answers.".format(len(cors), len(all_preds)-len(cors)))
return acc, all_preds, None
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default="")
parser.add_argument("--lora_weights", type=str, default="")
parser.add_argument("--data_dir", type=str, default="./CMMLU/data")
parser.add_argument("--save_dir", type=str, default="./results/ChatGLM2-6B")
parser.add_argument("--num_few_shot", type=int, default=0)
parser.add_argument("--max_length", type=int, default=2048)
parser.add_argument("--dtype", type=str, default="float16")
parser.add_argument("--cot", action='store_true')
args = parser.parse_args()
# Initialize models
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True,)
model = AutoModel.from_pretrained(args.model_name_or_path, trust_remote_code=True).cpu()
from fastllm_pytools import llm;
model = llm.from_hf(model, tokenizer, dtype = args.dtype);
# model.save("/root/test.flm");
# Always use Chat-style evaluation
run_eval(model, tokenizer, eval_chat_multithread, args)
import CMMLU.src.mp_utils as mp
import sys
print(mp.get_results(sys.argv[1]))
import os
import torch
import numpy as np
import argparse
import threading
from CMMLU.src.mp_utils import choices, format_example, gen_prompt, softmax, run_eval
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
def chat(model, tokenizer, prompt, output_list, idx):
pred, history = model.chat(tokenizer, prompt, history=[], max_length = 5)
if not pred or pred[0] not in choices:
pred, history = model.chat(tokenizer, prompt, history=[], max_length = 1000)
output_list[idx] = pred
def eval_chat_multithread(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot):
cors = []
all_preds = []
answers = choices[: test_df.shape[1] - 2]
batch_num = 1
output_list = ["" for i in range(test_df.shape[0])]
ths = [None for i in range(test_df.shape[0])]
for j in range(0, test_df.shape[0], batch_num):
cur_len = min(test_df.shape[0] - j, batch_num)
for i in range(j, j + cur_len):
prompt_end = format_example(test_df, i, subject, include_answer=False, cot=cot)
prompt = gen_prompt(dev_df=dev_df,
subject=subject,
prompt_end=prompt_end,
num_few_shot=num_few_shot,
tokenizer=tokenizer,
max_length=max_length,
cot=cot)
ths[i] = threading.Thread(target = chat, args=(model, tokenizer, prompt, output_list, i))
ths[i].start()
for i in range(j, j + cur_len):
ths[i].join()
pred = output_list[i]
label = test_df.iloc[i, test_df.shape[1] - 1]
if pred and pred[0] in choices:
cors.append(pred[0] == label)
all_preds.append(pred.replace("\n", ""))
print(i, test_df.shape[0], np.mean(cors))
acc = np.mean(cors)
print("Average accuracy {:.3f} - {}".format(acc, subject))
print("{} results, {} inappropriate formated answers.".format(len(cors), len(all_preds)-len(cors)))
return acc, all_preds, None
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default="")
parser.add_argument("--lora_weights", type=str, default="")
parser.add_argument("--data_dir", type=str, default="./CMMLU/data")
parser.add_argument("--save_dir", type=str, default="../results/not_specified")
parser.add_argument("--num_few_shot", type=int, default=0)
parser.add_argument("--max_length", type=int, default=2048)
parser.add_argument("--load_in_8bit", action='store_true')
parser.add_argument("--dtype", type=str, default="float16")
parser.add_argument("--with_conf", action='store_true')
parser.add_argument("--cot", action='store_true')
args = parser.parse_args()
# TODO: handle model and tokenizer loading more robustly
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, device_map="cpu", trust_remote_code=True, fp16=True).eval()
model.generation_config = GenerationConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
if args.lora_weights != "":
model = PeftModel.from_pretrained(
model,
args.lora_weights,
torch_dtype=torch.float16,
)
from fastllm_pytools import llm
model = llm.from_hf(model, tokenizer, dtype = args.dtype)
model.direct_query = True
run_eval(model, tokenizer, eval_chat_multithread, args)
__all__ = ["llm"]
from fastllm_pytools import llm
import torch
import ctypes
import numpy as np
fastllm_data_type_dict = {
"int4": 8,
"int8": 3,
"float16": 7
}
fastllm_weight_type_dict = {
"linear": 1,
"embedding": 2,
"QuantizedLinear": 111
}
def create(model,
tokenizer = None,
pre_prompt = None,
user_role = None,
bot_role = None,
history_sep = None,
dtype = "float16"):
if (dtype not in fastllm_data_type_dict):
print("dtype should in ", list(fastllm_data_type_dict.keys()))
exit(0)
# 0.1 model info
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if (pre_prompt):
modelInfo["pre_prompt"] = pre_prompt
if (user_role):
modelInfo["user_role"] = user_role
if (bot_role):
modelInfo["bot_role"] = bot_role
if (history_sep):
modelInfo["history_sep"] = history_sep
if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
# Baichuan 2 (second generation)
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""
if (modelInfo["model_type"] == "qwen"):
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
weight_type_dict = {}
module_dict = {}
weight_bits = {}
for key, m in model.named_modules():
if (str(type(m)).find("QuantizedLinear") != -1):
weight_type_dict[key + ".weight"] = "QuantizedLinear"
weight_bits[key + ".weight"] = m.weight_bit_width
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
peft_config = {}
active_adapter = ""
if hasattr(model, "peft_config"):
peft_config = model.peft_config
if hasattr(model, "active_adapter"):
active_adapter = model.active_adapter
model = model.cpu()
dict = model.state_dict()
model_type = model.config.__dict__["model_type"]
model = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
for it in modelInfo.keys():
llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode())
for adapter_name in peft_config.keys():
adapter_dict = peft_config[adapter_name].__dict__
for it in adapter_dict.keys():
llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode())
if len(active_adapter) != 0:
llm.fastllm_lib.set_adapter(model, str(active_adapter).encode())
# 1. vocab
if (tokenizer):
if (hasattr(tokenizer, "tokenizer")):
if modelInfo["model_type"] == "qwen":
pass
else:
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
for i in range(piece_size):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
i, ctypes.c_float(tokenizer.sp_model.get_score(i)))
else:
vocab = tokenizer.get_vocab()
for v in vocab.keys():
if (modelInfo["model_type"] == "moss"):
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0))
elif (modelInfo["model_type"] == "qwen"):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0))
else:
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0))
tot = 0
for key in dict:
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7
ori_np_data_type = np.float16
elif (cur_weight_type == 2):
# TODO bfloat
to_data_type = 0
weight_name = key
if peft_config is not None:
weight_name = weight_name.replace('base_model.model.', '')
if (cur_weight_type == 111):
llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
weight_bits[key],
dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
dict[key].numpy().ctypes.data_as(ctypes.c_void_p))
else:
llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
to_data_type, cur_weight_type, ori_data_type,
dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
tot += 1
print("convert (", tot, "/", len(dict), end = " )\r")
print("")
llm.fastllm_lib.init_params_llm_model(model)
llm.fastllm_lib.warmup_llm_model(model)
ret = llm.model("", id = model)
return ret
import ctypes
import os
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
import platform
if platform.system() == 'Windows':
fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"))
else:
fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.so"))
fastllm_lib.create_llm_model.argtypes = [ctypes.c_char_p]
fastllm_lib.create_llm_model.restype = ctypes.c_int
fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_logits_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_float)]
fastllm_lib.fetch_response_logits_llm_model.restype = ctypes.c_int
fastllm_lib.response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_str_llm_model.restype = ctypes.c_char_p
fastllm_lib.launch_response_str_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_str_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_str_llm_model.restype = ctypes.c_char_p
fastllm_lib.make_history_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
fastllm_lib.make_history_llm_model.restype = ctypes.c_char_p
fastllm_lib.make_input_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p]
fastllm_lib.make_input_llm_model.restype = ctypes.c_char_p
fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_float, ctypes.c_int]
fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
def set_cpu_threads(threads: int):
fastllm_lib.set_cpu_threads(threads)
def get_cpu_threads() -> int:
return fastllm_lib.get_cpu_threads()
def print_ins_info():
fastllm_lib.print_cpu_ins()
def set_cpu_kvcache(cpu_kvcache):
fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))
def get_cpu_kvcache():
return fastllm_lib.get_kvcache_in_cpu()
def set_cpu_low_mem(low_mem):
fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))
def get_cpu_low_mem():
return fastllm_lib.get_cpu_low_mem()
def set_device_map(device_map):
devices = []
values = []
if (isinstance(device_map, str)):
devices.append(device_map)
values.append(1)
elif (isinstance(device_map, list)):
devices = [str(x) for x in device_map]
values = [1 for x in device_map]
elif (isinstance(device_map, dict)):
devices = [str(x) for x in device_map.keys()]
values = [int(device_map[x]) for x in device_map.keys()]
else:
print("set_device_map error.")
return
device_str = ''.join(devices)
device_len = [len(x) for x in devices]
fastllm_lib.set_device_map(len(device_len),
(ctypes.c_int * len(device_len))(*device_len),
device_str.encode(),
(ctypes.c_int * len(values))(*values))
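# Usage sketch for set_device_map (hedged: the device names below are assumptions,
# following the "cpu" / "cuda:N" convention used elsewhere in this repo):
#   set_device_map("cuda:0")                  # single device
#   set_device_map(["cuda:0", "cuda:1"])      # even split across listed devices
#   set_device_map({"cuda:0": 2, "cpu": 1})   # weighted split, 2:1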
def from_hf(model,
tokenizer = None,
dtype = "float16"):
from fastllm_pytools import hf_model
return hf_model.create(model, tokenizer, dtype = dtype)
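# Minimal conversion sketch (model name and dtype are only examples; this mirrors the
# evaluation scripts above):
#   from transformers import AutoModel, AutoTokenizer
#   from fastllm_pytools import llm
#   tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
#   hf_model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).cpu()
#   flm_model = llm.from_hf(hf_model, tokenizer, dtype="float16")   # "float16", "int8" or "int4"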
class model:
def __init__ (self, path : str,
id : int = -99999):
if (id != -99999):
self.model = id
else:
self.model = fastllm_lib.create_llm_model(path.encode())
self.direct_query = False
def get_prompt(self,
query: str,
history: List[Tuple[str, str]] = None) -> str:
if (not(history)):
history = []
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
return prompt
def save(self, path : str):
fastllm_lib.save_llm_model(self.model, path.encode())
def eval(self):
pass
def response_logits(self,
query: str,
history: List[Tuple[str, str]] = None,
tokenizer = None) -> str:
prompt = query if self.direct_query else self.get_prompt(query, history)
if (tokenizer == None):
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
else:
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
1, False, 1, 1, 1, 1, True)
vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
logits = list(range(vocab_size))
# the buffer is over-allocated (vocab_size * 4 floats); only the first vocab_size entries are read back
array = (ctypes.c_float * (vocab_size * 4))(*logits)
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
out = list(array)[:vocab_size]
# keep fetching until the handle reports completion (-1) so the next request starts clean
while (ret != -1):
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
return out
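# Usage sketch (this is how the CMMLU evaluation above scores multiple-choice answers;
# building choice_ids from the last encoded token is an assumption about the tokenizer):
#   logits = model.response_logits(prompt, tokenizer=tokenizer)
#   choice_ids = [tokenizer.encode(c)[-1] for c in "ABCD"]
#   pred = "ABCD"[max(range(4), key=lambda j: logits[choice_ids[j]])]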
def response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
ret = ""
for i in self.stream_response(query = query,
history = history,
max_length = max_length,
do_sample = do_sample,
top_p = top_p, top_k = top_k,
temperature = temperature,
repeat_penalty = repeat_penalty,
one_by_one = True):
ret += i
return ret
def stream_response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
one_by_one = True):
prompt = query if self.direct_query else self.get_prompt(query, history)
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
res = ""
ret = b''
fail_cnt = 0
while True:
ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
cur = ""
try:
cur = ret.decode()
ret = b''
except:
fail_cnt += 1
if (fail_cnt == 20):
break
else:
continue
fail_cnt = 0
if (cur == "<flmeos>"):
break
if one_by_one:
yield cur
else:
res += cur
yield res
def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
if (not(history)):
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False)
result = []
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
break
result.append(cur)
response = tokenizer.decode(result)
history = history + [(query, response)]
return response, history
def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
return_past_key_values = False, **kwargs) -> str:
if (not(history)):
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False)
tokens = []
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
break
tokens.append(cur)
response = tokenizer.decode(tokens)
new_history = history + [(query, response)]
if return_past_key_values:
yield response, new_history, None
else:
yield response, new_history
def set_adapter(self, name: str):
fastllm_lib.set_adapter(self.model, str(name).encode())
def disable_adapter(self):
fastllm_lib.disable_adapter(self.model)
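# Minimal usage sketch for an already-converted .flm file (the path is hypothetical):
#   from fastllm_pytools import llm
#   m = llm.model("chatglm2-6b-int8.flm")
#   for piece in m.stream_response("你好"):
#       print(piece, end="", flush=True)
#   # or, when the original HF tokenizer is available:
#   # response, history = m.chat(tokenizer, "你好")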
import struct
import numpy as np
import torch
def writeString(fo, s):
fo.write(struct.pack('i', len(s)))
fo.write(s.encode())
def writeKeyValue(fo, key, value):
writeString(fo, key)
writeString(fo, value)
fastllm_data_type_dict = {
"int4": 8,
"int8": 3,
"float16": 7,
"float32": 0,
}
fastllm_weight_type_dict = {
"linear": 1,
"embedding": 2
}
# leftover scratch illustrating the int8 rounding used in write_int8 below; not used by tofile()
v = np.random.randint(-127, 127, [10, 20])
temp = v
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
def write_int8(fo, v):
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
fo.write(struct.pack('i', 3))
fo.write(struct.pack('i', 0))
for i in range(c_max.shape[0]):
fo.write(struct.pack('f', -c_max[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
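# On-disk layout produced by write_int8 (read from the code above): type marker 3 (int8),
# a second int written as 0, then one (-max, +max) float pair per weight row, then the
# uint8 payload. Example: in a row whose largest |w| is 2.0, the value 1.0 maps to
# 1.0 / (2.0 / 127) + 128.5 = 192.0 and is stored as the byte 192.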
def write_int4(fo, v):
c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 7.0
c_min = c_scale * -8.0
v = (v - c_min) / c_scale
v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
v = v[:, 0::2] * 16 + v[:, 1::2]
fo.write(struct.pack('i', 8))
fo.write(struct.pack('i', 0))
for i in range(c_min.shape[0]):
fo.write(struct.pack('f', c_min[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
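# On-disk layout produced by write_int4 (read from the code above): type marker 8 (int4),
# a second int written as 0, then one (min, max) float pair per row, then codes packed two
# per byte (even column in the high nibble, odd column in the low nibble), which assumes
# an even number of columns per row.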
def tofile(exportPath,
model,
tokenizer = None,
pre_prompt = None,
user_role = None,
bot_role = None,
history_sep = None,
dtype = "float16"):
if (dtype not in fastllm_data_type_dict):
print("dtype should in ", list(fastllm_data_type_dict.keys()))
exit(0)
dict = model.state_dict()
fo = open(exportPath, "wb")
# 0. version id
fo.write(struct.pack('i', 2))
# 0.1 model info
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if ("model_type" not in modelInfo):
print("unknown model_type.")
exit(0)
if (pre_prompt):
modelInfo["pre_prompt"] = pre_prompt
if (user_role):
modelInfo["user_role"] = user_role
if (bot_role):
modelInfo["bot_role"] = bot_role
if (history_sep):
modelInfo["history_sep"] = history_sep
if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
# Baichuan 2 (second generation)
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""
if modelInfo["model_type"] == "qwen":
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
if hasattr(model, "peft_config"):
adapter_size = len(model.peft_config)
modelInfo["peft_size"] = adapter_size
fo.write(struct.pack('i', len(modelInfo)))
for it in modelInfo.keys():
writeKeyValue(fo, str(it), str(modelInfo[it]))
if hasattr(model, "peft_config"):
for adapter_name in model.peft_config.keys():
adapter_dict = model.peft_config[adapter_name].__dict__
writeString(fo, adapter_name)
fo.write(struct.pack('i', len(adapter_dict)))
for it in adapter_dict.keys():
writeKeyValue(fo, str(it), str(adapter_dict[it]))
# 1. vocab
if (tokenizer):
if (hasattr(tokenizer, "tokenizer")):
if (modelInfo['model_type'] == "qwen"):
pass
else:
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
fo.write(struct.pack('i', piece_size))
for i in range(piece_size):
s = tokenizer.sp_model.id_to_piece(i).encode()
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', i))
fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
else:
vocab = tokenizer.get_vocab()
fo.write(struct.pack('i', len(vocab)))
for v in vocab.keys():
if (modelInfo['model_type'] == "qwen"):
s = v
else:
s = v.encode()
if (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', vocab[v]))
fo.write(struct.pack('f', 1.0))
else:
fo.write(struct.pack('i', 0))
weight_type_dict = {}
module_dict = {}
for key, m in model.named_modules():
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
# 2. weight
fo.write(struct.pack('i', len(dict)))
tot = 0
for key in dict:
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7
ori_np_data_type = np.float16
cur = dict[key].numpy().astype(ori_np_data_type)
if hasattr(model, "peft_config"):
weight_name = key.replace('base_model.model.', '')
fo.write(struct.pack('i', len(weight_name)))
fo.write(weight_name.encode())
else:
fo.write(struct.pack('i', len(key)))
fo.write(key.encode())
fo.write(struct.pack('i', len(cur.shape)))
for i in cur.shape:
fo.write(struct.pack('i', i))
if (to_data_type == 3):
write_int8(fo, cur)
elif (to_data_type == 8):
write_int4(fo, cur)
else:
fo.write(struct.pack('i', to_data_type))
fo.write(cur.data)
tot += 1
print("output (", tot, "/", len(dict), end = " )\r")
print("\nfinish.")
fo.close()
import sys
from transformers import LlamaTokenizer, LlamaForCausalLM
from fastllm_pytools import torch2flm
if __name__ == "__main__":
tokenizer = LlamaTokenizer.from_pretrained('minlik/chinese-alpaca-33b-merged')
model = LlamaForCausalLM.from_pretrained('minlik/chinese-alpaca-33b-merged').float()
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "alpaca-33b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
from fastllm_pytools import torch2flm
if __name__ == "__main__":
modelpath = "baichuan-inc/baichuan-13B-Chat"
tokenizer = AutoTokenizer.from_pretrained(modelpath, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(modelpath, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
model.to("cpu")
try:
model.generation_config = GenerationConfig.from_pretrained(modelpath)
except:
pass
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "baichuan-13b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
import sys
from transformers import AutoTokenizer, AutoModel
from fastllm_pytools import torch2flm
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = model.eval()
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
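# Invocation sketch (arguments are positional: output path, then dtype; the script and
# output file names are only examples):
#   python3 chatglm_export.py chatglm2-6b-float16.flm float16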
import argparse
from fastllm_pytools import llm
def args_parser():
parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
parser.add_argument('-p', '--path', type = str, required = True, default = '', help = 'path to the model file')
args = parser.parse_args()
return args
if __name__ == "__main__":
args = args_parser()
model = llm.model(args.path)
history = []
print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
while True:
query = input("\n用户:")
if query.strip() == "stop":
break
if query.strip() == "clear":
history = []
print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
continue
print("AI:", end = "");
curResponse = "";
for response in model.stream_response(query, history = history):
curResponse += response;
print(response, flush = True, end = "")
history.append((query, curResponse))
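# Invocation sketch (script and model file names are only examples):
#   python3 cli_demo.py -p chatglm2-6b-int8.flm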
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM
from fastllm_pytools import torch2flm
tokenizer = AutoTokenizer.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True);
model = AutoModelForCausalLM.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True).float();
model = model.eval();
if __name__ == "__main__":
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "moss-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)