Commit aefd9f11 authored by zhouxiang

fastllm inference framework for the DCU platform

//
// Created by huangyuyang on 5/11/23.
//
#include "utils.h"
#include "chatglm.h"
#include <cmath>
#include <chrono>
#include <algorithm>
#include <map>
#include <sstream>
#include <unordered_map>
#include <cstring>
#ifdef USE_CUDA
#include "fastllm-cuda.cuh"
#endif
namespace fastllm {
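// UpdateSinCos precomputes the RoPE sin/cos tables: positions are divided by the rope ratio before
// being multiplied by the inverse frequencies, and the flattened tables are cached in sinData/cosData.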
void ChatGLMModel::UpdateSinCos(float rope) {
if (rope == this->rope) {
return;
}
this->rope = rope;
sin.resize(max_positions);
cos.resize(max_positions);
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
invFreq.push_back(1.0 / pow(10000, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
sin[i][j] = ::sin((float)i / rope * invFreq[j]);
cos[i][j] = ::cos((float)i / rope * invFreq[j]);
}
}
std::vector <float> fsin, fcos;
for (int i = 0; i < sin.size(); i++) {
for (int j = 0; j < sin[0].size(); j++) {
fsin.push_back(sin[i][j]);
fcos.push_back(cos[i][j]);
}
}
sinData.CopyFrom(Data(DataType::FLOAT32, {(int)this->sin.size(), (int)this->sin[0].size()}, fsin));
cosData.CopyFrom(Data(DataType::FLOAT32, {(int)this->cos.size(), (int)this->cos[0].size()}, fcos));
}
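// Constructor: sets the ChatGLM-6B (v1) bos/eos token ids and registers the embedding weight names
// used by both the v1 and v2 checkpoint layouts.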
ChatGLMModel::ChatGLMModel() {
this->model_type = "chatglm";
this->bos_token_id = 130004;
this->eos_token_id = 130005;
this->rope = -1.0;
this->UpdateSinCos(1.0f);
weight.embeddingNames.insert("transformer.word_embeddings.weight");
weight.embeddingNames.insert("transformer.embedding.word_embeddings.weight");
}
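// Single-request Forward is a thin wrapper that calls ForwardBatch with batch = 1.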
int ChatGLMModel::Forward(const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<Data, Data>> &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
std::vector <float> *logits) {
std::vector <std::vector <float>*> batchLogits;
batchLogits.push_back(logits);
return ForwardBatch(1, inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, lastTokens, &batchLogits)[0];
}
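// ForwardBatch: one decoder pass over a batch that shares a single inputIds/attentionMask/positionIds
// tensor. It dispatches between the ChatGLM v1 layout (LayerNorm + GELU MLP) and the ChatGLM2 v2
// layout (RMSNorm + SwiGLU MLP) based on GetVersion().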
std::vector <int> ChatGLMModel::ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
if (this->weight.dicts.find("rope_ratio") != this->weight.dicts.end()) {
UpdateSinCos(atof(this->weight.dicts["rope_ratio"].c_str()));
}
int maxLen = inputIds.dims[1];
Data inputEmbeddings;
Data attenInput;
Data qkv, q, k, v;
Data attnProbs;
Data attnOutput;
Data contextLayer;
Data mlpInput;
Data middle, middle2;
Data temp;
std::vector<int> lastRet;
// ChatGLMBlock
int version = GetVersion();
std::string weightPre, weightMiddle;
if (version == 1) {
weightPre = "transformer.layers.";
weightMiddle = ".attention";
} else if (version == 2) {
weightPre = "transformer.encoder.layers.";
weightMiddle = ".self_attention";
}
// ChatGLM2
Data inputIdsPermute;
Permute(inputIds, {1, 0}, inputIdsPermute);
Embedding(inputIdsPermute, this->weight["transformer" + std::string((version == 2 ? ".embedding" : "")) +
".word_embeddings.weight"], inputEmbeddings);
Data &hiddenStates = inputEmbeddings;
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
if (version == 1) {
std::string inputLNWeightName = "transformer.layers." + std::to_string(i) + ".input_layernorm.weight";
std::string inputLNBiasName = "transformer.layers." + std::to_string(i) + ".input_layernorm.bias";
LayerNorm(hiddenStates, weight[inputLNWeightName], weight[inputLNBiasName], -1, attenInput);
} else if (version == 2) {
std::string inputRMSWeightName =
"transformer.encoder.layers." + std::to_string(i) + ".input_layernorm.weight";
RMSNorm(hiddenStates, weight[inputRMSWeightName], 1e-5, attenInput);
}
std::string qkvWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.weight";
std::string qkvBiasName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.bias";
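// When a PEFT adapter is active, the fused QKV projection goes through LoraLayer or IA3Layer instead
// of a plain Linear.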
if (!adapterName.empty()) {
std::string peftType = weight.peftDict[adapterName]["peft_type"];
if (peftType == "LORA") {
std::string loraAWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.lora_A." + adapterName + ".weight";
std::string loraBWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.lora_B." + adapterName + ".weight";
LoraLayer(attenInput, weight[qkvWeightName], weight[loraAWeightName], weight[loraBWeightName], weight[qkvBiasName], qkv, weight.peftDict[adapterName]);
} else if (peftType == "IA3") {
std::string ia3WeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.ia3_l" + adapterName + ".weight";
IA3Layer(attenInput, weight[qkvWeightName], weight[ia3WeightName], weight[qkvBiasName], qkv, weight.peftDict[adapterName]);
}
} else {
Linear(attenInput, weight[qkvWeightName], weight[qkvBiasName], qkv);
}
if (version == 1) {
qkv.Reshape({qkv.dims[0], qkv.dims[1], num_attention_heads, -1});
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
fastllm::RotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
fastllm::RotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
} else if (version == 2) {
int qLen = embed_dim, kvLen = (qkv.dims.back() - embed_dim) / 2;
Split(qkv, -1, 0, qLen, q);
Split(qkv, -1, qLen, qLen + kvLen, k);
Split(qkv, -1, qLen + kvLen, qLen + kvLen + kvLen, v);
q.Reshape({q.dims[0], q.dims[1], -1, embed_dim / num_attention_heads});
k.Reshape({k.dims[0], k.dims[1], -1, embed_dim / num_attention_heads});
v.Reshape({v.dims[0], v.dims[1], -1, embed_dim / num_attention_heads});
fastllm::NearlyRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
fastllm::NearlyRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
}
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
if (GetKVCacheInCPU()) {
pastKey.lockInCPU = true;
pastValue.lockInCPU = true;
} else {
pastKey.ToDevice(DataDevice::CUDA);
pastValue.ToDevice(DataDevice::CUDA);
}
k.Resize({k.dims[0], k.dims[1] * k.dims[2], k.dims[3]});
v.Resize({v.dims[0], v.dims[1] * v.dims[2], v.dims[3]});
PermuteSelf(k, {1, 0, 2});
PermuteSelf(v, {1, 0, 2});
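// The KV cache is grown in chunks of unitLen tokens (128 with CUDA/DCU, 64 on CPU) so that CatDirect
// can append new keys/values in place without reallocating on every decode step.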
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 &&
(pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && (pastKey.expansionDims.size() == 0 ||
pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1]))) {
std::vector<int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector<int>{k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
if (generationConfig.output_token_limit > 0) {
newDims[1] = std::min(newDims[1], k.dims[1] + generationConfig.output_token_limit);
}
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 &&
(pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && (pastValue.expansionDims.size() == 0 ||
pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1]))) {
std::vector<int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector<int>{v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
if (generationConfig.output_token_limit > 0) {
newDims[1] = std::min(newDims[1], v.dims[1] + generationConfig.output_token_limit);
}
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, k, 1);
CatDirect(pastValue, v, 1);
std::vector<int> outputSize = {q.dims[1], q.dims[2], q.dims[0], pastKey.dims[1]};
q.Reshape({q.dims[0], q.dims[1] * q.dims[2], q.dims[3]});
PermuteSelf(q, {1, 0, 2});
//Attention(q, pastKey, pastValue, attentionMask, contextLayer, q.dims[0] / pastKey.dims[0], 1.0 / scale_attn, 1);
// 1.2 Attention
// 1.2.0 q * k^T
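// The scores are first scaled by 1/(scale_attn * (layer + 1)) and later multiplied by (layer + 1)
// before the softmax, which is equivalent to the plain 1/scale_attn scaling but keeps intermediate
// values small (ChatGLM's query/key layer-scaling trick).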
q.Reshape({pastKey.dims[0], -1, q.dims[2]});
MatMulTransB(q, pastKey, attnProbs, 1.0 / (scale_attn * (i + 1)));
attnProbs.Reshape(outputSize);
// 1.2.1 Mask
if (attentionMask.dims.size() != 0) {
AttentionMask(attnProbs, attentionMask, -10000);
}
// 1.2.2 softmax
Mul(attnProbs, i + 1, attnProbs);
Softmax(attnProbs, attnProbs, -1);
outputSize = {1, pastValue.dims[0], q.dims[1], pastValue.dims[1]};
attnProbs.Reshape({outputSize[0] * outputSize[1], outputSize[2], -1});
// 1.2.3 prob * v
attnProbs.Reshape({pastValue.dims[0], -1, attnProbs.dims[2]});
MatMul(attnProbs, pastValue, contextLayer);
contextLayer.Reshape({batch, num_attention_heads, maxLen, -1});
PermuteSelf(contextLayer, {2, 0, 1, 3});
contextLayer.Reshape({contextLayer.dims[0], contextLayer.dims[1], embed_dim});
// 1.2.4 dense
std::string denseWeightName = weightPre + std::to_string(i) + weightMiddle + ".dense.weight";
std::string denseBiasName = weightPre + std::to_string(i) + weightMiddle + ".dense.bias";
Linear(contextLayer, weight[denseWeightName], weight[denseBiasName], attnOutput);
// 1.3
if (GetVersion() == 1) {
float alpha = sqrt(2 * block_cnt);
Mul(attenInput, alpha, hiddenStates);
AddTo(hiddenStates, attnOutput);
std::string postLNWeightName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.weight";
std::string postLNBiasName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.bias";
LayerNorm(hiddenStates, weight[postLNWeightName], weight[postLNBiasName], -1, mlpInput);
// 1.4 MLP
std::string fcInKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_h_to_4h";
std::string fcOutKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_4h_to_h";
Linear(mlpInput, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
GeluNew(middle, middle);
Linear(middle, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, mlpInput, alpha);
} else {
AddTo(hiddenStates, attnOutput);
std::string postRMSWeightName =
"transformer.encoder.layers." + std::to_string(i) + ".post_attention_layernorm.weight";
Mul(hiddenStates, 1.0, temp);
RMSNorm(hiddenStates, weight[postRMSWeightName], 1e-5, mlpInput);
// 1.4 MLP
std::string fcInKeyName = "transformer.encoder.layers." + std::to_string(i) + ".mlp.dense_h_to_4h";
std::string fcOutKeyName = "transformer.encoder.layers." + std::to_string(i) + ".mlp.dense_4h_to_h";
Linear(mlpInput, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
Swiglu(middle, middle2);
Linear(middle2, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, temp);
}
}
Data logits, topk;
if (version == 1) {
LayerNorm(hiddenStates, weight["transformer.final_layernorm.weight"],
weight["transformer.final_layernorm.bias"], -1, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
} else {
RMSNorm(hiddenStates, weight["transformer.encoder.final_layernorm.weight"], 1e-5, hiddenStates);
Linear(hiddenStates, weight["transformer.output_layer.weight"], Data(), logits);
}
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
(*retLogits)[b]->resize(size);
memcpy((float*)(*retLogits)[b]->data(), ((float*)logits.cpuData) + base * size, size * logits.unitSize);
}
}
if (generationConfig.IsSimpleGreedy()) {
TopK(logits, topk, 1);
topk.ToDevice(DataDevice::CPU);
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
lastRet.push_back((int) (((float *) topk.cpuData)[base * 2] + 1e-3));
}
} else if (!lastTokens.units.empty()) {
for (int b = 0; b < batch; b++) {
int base = (maxLen - 1) * batch + b;
lastRet.push_back(LLMSampling(logits, base, generationConfig, lastTokens.units[b]));
}
}
return lastRet;
}
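// Batched overload: handles several independent requests with different sequence lengths. Inputs are
// packed along the sequence dimension, each request keeps its own KV cache in pastKeyValues, and the
// per-request results are split back out at the end.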
std::vector <int> ChatGLMModel::ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
if (this->weight.dicts.find("rope_ratio") != this->weight.dicts.end()) {
UpdateSinCos(atof(this->weight.dicts["rope_ratio"].c_str()));
}
int seqLen = inputIds.dims[1];
sinData.ToDevice(DataDevice::CUDA);
cosData.ToDevice(DataDevice::CUDA);
int version = GetVersion();
std::string weightPre, weightMiddle;
if (version == 1) {
weightPre = "transformer.layers.";
weightMiddle = ".attention";
} else if (version == 2) {
weightPre = "transformer.encoder.layers.";
weightMiddle = ".self_attention";
}
Data inputEmbeddings;
Data inputIdsPermute;
Permute(inputIds, {1, 0}, inputIdsPermute);
Embedding(inputIdsPermute, this->weight["transformer" + std::string((version == 2 ? ".embedding" : "")) +
".word_embeddings.weight"], inputEmbeddings);
Data &hiddenStates = inputEmbeddings;
hiddenStates.ToDevice(DataDevice::CUDA);
Data attenInput;
Data qkv, q, k, v;
Data attnOutput;
Data mlpInput, middle, middle2;
std::vector <Data> attnProbs;
std::vector <Data> curContextLayer;
std::vector <Data> curKs, curVs, curQs;
attnProbs.resize(batch);
curContextLayer.resize(batch);
curKs.resize(batch);
curVs.resize(batch);
curQs.resize(batch);
bool all1 = true;
for (int i = 0; i < batch; i++) {
all1 &= (seqLens[i] == 1);
}
if (batch > 1) {
positionIds[0]->Expansion({2, seqLen});
for (int i = 1; i < batch; i++) {
CatDirect(*(Data*)positionIds[0], *(Data*)positionIds[i], 1);
}
}
std::vector <Data*> keys, values, qs, attns, contexts;
keys.resize(batch);
values.resize(batch);
qs.resize(batch);
attns.resize(batch);
contexts.resize(batch);
std::vector <Data*> pointersK, pointersV, pointersQ;
pointersK.resize(batch);
pointersV.resize(batch);
pointersQ.resize(batch);
std::vector <std::vector <int> > outputSizes;
outputSizes.resize(batch);
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
if (version == 1) {
std::string inputLNWeightName = "transformer.layers." + std::to_string(i) + ".input_layernorm.weight";
std::string inputLNBiasName = "transformer.layers." + std::to_string(i) + ".input_layernorm.bias";
LayerNorm(hiddenStates, weight[inputLNWeightName], weight[inputLNBiasName], -1, attenInput);
} else if (version == 2) {
std::string inputRMSWeightName =
"transformer.encoder.layers." + std::to_string(i) + ".input_layernorm.weight";
RMSNorm(hiddenStates, weight[inputRMSWeightName], 1e-5, attenInput);
}
std::string qkvWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.weight";
std::string qkvBiasName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.bias";
if (!adapterName.empty()) {
std::string peftType = weight.peftDict[adapterName]["peft_type"];
if (peftType == "LORA") {
std::string loraAWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.lora_A." + adapterName + ".weight";
std::string loraBWeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.lora_B." + adapterName + ".weight";
LoraLayer(attenInput, weight[qkvWeightName], weight[loraAWeightName], weight[loraBWeightName], weight[qkvBiasName], qkv, weight.peftDict[adapterName]);
} else if (peftType == "IA3") {
std::string ia3WeightName = weightPre + std::to_string(i) + weightMiddle + ".query_key_value.ia3_l" + adapterName + ".weight";
IA3Layer(attenInput, weight[qkvWeightName], weight[ia3WeightName], weight[qkvBiasName], qkv, weight.peftDict[adapterName]);
}
} else {
Linear(attenInput, weight[qkvWeightName], weight[qkvBiasName], qkv);
}
if (version == 1) {
qkv.Reshape({qkv.dims[0], qkv.dims[1], num_attention_heads, -1});
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
} else if (version == 2) {
int qLen = embed_dim, kvLen = (qkv.dims.back() - embed_dim) / 2;
Split(qkv, -1, 0, qLen, q);
Split(qkv, -1, qLen, qLen + kvLen, k);
Split(qkv, -1, qLen + kvLen, qLen + kvLen + kvLen, v);
q.Reshape({q.dims[0], q.dims[1], -1, embed_dim / num_attention_heads});
k.Reshape({k.dims[0], k.dims[1], -1, embed_dim / num_attention_heads});
v.Reshape({v.dims[0], v.dims[1], -1, embed_dim / num_attention_heads});
}
if (version == 1) {
fastllm::RotatePosition2D(q, *positionIds[0], sinData, cosData, rotary_dim);
fastllm::RotatePosition2D(k, *positionIds[0], sinData, cosData, rotary_dim);
} else if (version == 2) {
fastllm::NearlyRotatePosition2D(q, *positionIds[0], sinData, cosData, rotary_dim);
fastllm::NearlyRotatePosition2D(k, *positionIds[0], sinData, cosData, rotary_dim);
}
k.Resize({k.dims[0], k.dims[1] * k.dims[2], k.dims[3]});
v.Resize({v.dims[0], v.dims[1] * v.dims[2], v.dims[3]});
q.Resize({q.dims[0], q.dims[1] * q.dims[2], q.dims[3]});
Data contextLayer = Data(DataType::FLOAT32);
int total = 0;
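// Fast path: when every request contributes exactly one token (pure decode), the batched kernels
// (SplitBatch / MatMulTransBBatch / MatMulBatch) process all requests at once instead of looping.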
if (all1 && batch > 1) {
for (int b = 0; b < batch; b++) {
pointersK[b] = (&curKs[b]);
pointersV[b] = (&curVs[b]);
pointersQ[b] = (&curQs[b]);
}
SplitBatch(k, 0, batch, pointersK);
SplitBatch(v, 0, batch, pointersV);
SplitBatch(q, 0, batch, pointersQ);
total = batch;
for (int b = 0; b < batch; b++) {
auto &q = curQs[b], &k = curKs[b], &v = curVs[b];
std::swap(k.dims[0], k.dims[1]);
k.strides[0] = k.dims[1] * k.dims[2]; k.strides[1] = k.dims[2];
std::swap(v.dims[0], v.dims[1]);
v.strides[0] = v.dims[1] * v.dims[2]; v.strides[1] = v.dims[2];
std::swap(q.dims[0], q.dims[1]);
q.strides[0] = q.dims[1] * q.dims[2]; q.strides[1] = q.dims[2];
}
} else {
PermuteSelf(k, {1, 0, 2});
PermuteSelf(v, {1, 0, 2});
PermuteSelf(q, {1, 0, 2});
for (int b = 0; b < batch; b++) {
Split(k, 1, total, total + seqLens[b], curKs[b]);
Split(v, 1, total, total + seqLens[b], curVs[b]);
Split(q, 1, total, total + seqLens[b], curQs[b]);
total += seqLens[b];
}
}
for (int b = 0; b < batch; b++) {
auto &q = curQs[b], &k = curKs[b], &v = curVs[b];
Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt +
i].second;
pastKey.ToDevice(DataDevice::CUDA);
pastValue.ToDevice(DataDevice::CUDA);
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 &&
(pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1])) {
std::vector<int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector<int>{k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
if (generationConfigs[b].output_token_limit > 0) {
newDims[1] = std::min(newDims[1], k.dims[1] + generationConfigs[b].output_token_limit);
}
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 &&
(pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1])) {
std::vector<int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector<int>{v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
if (generationConfigs[b].output_token_limit > 0) {
newDims[1] = std::min(newDims[1], v.dims[1] + generationConfigs[b].output_token_limit);
}
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
}
for (int b = 0; b < batch; b++) {
keys[b] = (pastKeyValues[b * block_cnt + i].first);
values[b] = (pastKeyValues[b * block_cnt + i].second);
pointersK[b] = (&curKs[b]);
pointersV[b] = (&curVs[b]);
}
CatDirectBatch(keys, pointersK, 1);
CatDirectBatch(values, pointersV, 1);
for (int b = 0; b < batch; b++) {
auto &q = curQs[b];
Data &pastKey = *pastKeyValues[b * block_cnt + i].first;
outputSizes[b] = {1, q.dims[0], q.dims[1], pastKey.dims[1]};
q.Reshape({pastKey.dims[0], -1, q.dims[2]});
}
// 1.2 Attention
// 1.2.0 q * k^T
if (all1 && batch > 1) {
for (int b = 0; b < batch; b++) {
qs[b] = (&curQs[b]);
keys[b] = (pastKeyValues[b * block_cnt + i].first);
attns[b] = (&attnProbs[b]);
}
MatMulTransBBatch(qs, keys, attns, 1.0 / (scale_attn * (i + 1)));
} else {
for (int b = 0; b < batch; b++) {
auto &q = curQs[b];
Data &pastKey = *pastKeyValues[b * block_cnt + i].first;
MatMulTransB(q, pastKey, attnProbs[b], 1.0 / (scale_attn * (i + 1)));
}
}
for (int b = 0; b < batch; b++) {
attnProbs[b].Reshape(outputSizes[b]);
// 1.2.1 Mask
if (attentionMask[b] != nullptr) {
AttentionMask(attnProbs[b], *attentionMask[b], -10000);
}
}
// 1.2.2 softmax
for (int i = 0; i < attnProbs.size(); i++) {
attns[i] = (&attnProbs[i]);
}
MulBatch(attns, i + 1, attns);
SoftmaxBatch(attns, attns, -1);
for (int b = 0; b < batch; b++) {
Data &pastValue = *pastKeyValues[b * block_cnt + i].second;
outputSizes[b] = {1, num_attention_heads, -1, pastValue.dims[2]};
attnProbs[b].Reshape({pastValue.dims[0], -1, attnProbs[b].dims[3]});
}
// 1.2.3 prob * v
if (all1 && batch > 1) {
for (int b = 0; b < batch; b++) {
attns[b] = (&attnProbs[b]);
values[b] = (pastKeyValues[b * block_cnt + i].second);
contexts[b] = (&curContextLayer[b]);
}
MatMulBatch(attns, values, contexts);
} else {
for (int b = 0; b < batch; b++) {
Data &pastValue = *pastKeyValues[b * block_cnt + i].second;
MatMul(attnProbs[b], pastValue, curContextLayer[b]);
}
}
if (all1) {
for (int b = 0; b < batch; b++) {
curContextLayer[b].dims[0] = outputSizes[b][2];
curContextLayer[b].dims[1] = outputSizes[b][0];
curContextLayer[b].dims[2] = embed_dim;
curContextLayer[b].strides[0] = curContextLayer[b].dims[1] * curContextLayer[b].dims[2];
curContextLayer[b].strides[1] = curContextLayer[b].dims[2];
curContextLayer[b].strides[2] = 1;
}
} else {
for (int b = 0; b < batch; b++) {
curContextLayer[b].Reshape(outputSizes[b]);
PermuteSelf(curContextLayer[b], {2, 0, 1, 3});
curContextLayer[b].Reshape({curContextLayer[b].dims[0], curContextLayer[b].dims[1], embed_dim});
}
}
if (all1 && batch > 1) {
for (int b = 0; b < batch; b++) {
contexts[b] = (&curContextLayer[b]);
}
CatBatch(contexts, 0, contextLayer);
} else {
for (int b = 0; b < batch; b++) {
if (contextLayer.dims.size() == 0) {
std::vector<int> dims = curContextLayer[b].dims;
dims[0] = total;
contextLayer.Expansion(dims);
}
contextLayer.ToDevice(DataDevice::CUDA);
CatDirect(contextLayer, curContextLayer[b], 0);
}
}
// 1.2.4 dense
std::string denseWeightName = weightPre + std::to_string(i) + weightMiddle + ".dense.weight";
std::string denseBiasName = weightPre + std::to_string(i) + weightMiddle + ".dense.bias";
Linear(contextLayer, weight[denseWeightName], weight[denseBiasName], attnOutput);
if (GetVersion() == 1) {
float alpha = sqrt(2 * block_cnt);
Mul(attenInput, alpha, hiddenStates);
AddTo(hiddenStates, attnOutput);
std::string postLNWeightName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.weight";
std::string postLNBiasName =
"transformer.layers." + std::to_string(i) + ".post_attention_layernorm.bias";
LayerNorm(hiddenStates, weight[postLNWeightName], weight[postLNBiasName], -1, mlpInput);
// 1.4 MLP
std::string fcInKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_h_to_4h";
std::string fcOutKeyName = "transformer.layers." + std::to_string(i) + ".mlp.dense_4h_to_h";
Linear(mlpInput, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
GeluNew(middle, middle);
Linear(middle, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, mlpInput, alpha);
} else {
AddTo(hiddenStates, attnOutput);
std::string postRMSWeightName =
"transformer.encoder.layers." + std::to_string(i) + ".post_attention_layernorm.weight";
Data temp;
Mul(hiddenStates, 1.0, temp);
RMSNorm(hiddenStates, weight[postRMSWeightName], 1e-5, mlpInput);
// 1.4 MLP
std::string fcInKeyName = "transformer.encoder.layers." + std::to_string(i) + ".mlp.dense_h_to_4h";
std::string fcOutKeyName = "transformer.encoder.layers." + std::to_string(i) + ".mlp.dense_4h_to_h";
Linear(mlpInput, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
Swiglu(middle, middle2);
Linear(middle2, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, temp);
}
}
Data logits;
if (version == 1) {
LayerNorm(hiddenStates, weight["transformer.final_layernorm.weight"],
weight["transformer.final_layernorm.bias"], -1, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
} else {
RMSNorm(hiddenStates, weight["transformer.encoder.final_layernorm.weight"], 1e-5, hiddenStates);
Linear(hiddenStates, weight["transformer.output_layer.weight"], Data(), logits);
}
std::vector <int> lastRet;
int total = 0;
Data curLogit;
for (int b = 0; b < batch; b++) {
Split(logits, 0, total + seqLens[b] - 1, total + seqLens[b], curLogit);
if (generationConfigs[b].output_logits && retLogits != nullptr && (*retLogits)[b] != nullptr) {
curLogit.ToDevice(DataDevice::CPU);
(*retLogits)[b]->resize(curLogit.Count(0));
memcpy((float*)(*retLogits)[b]->data(), (float*)curLogit.cpuData, curLogit.GetBytes());
}
if (generationConfigs[b].IsSimpleGreedy()) {
Data topk;
TopK(curLogit, topk, 1);
topk.ToDevice(DataDevice::CPU);
lastRet.push_back((int) (((float *) topk.cpuData)[0] + 1e-3));
} else {
lastRet.push_back(LLMSampling(curLogit, 0, generationConfigs[b], lastTokens.units[b]));
}
total += seqLens[b];
}
return lastRet;
}
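// FillLLMInputs builds the inputs for a single request. On the first call (index == 0) it appends or
// prepends the special tokens and builds a full attention mask plus 2D position ids; on later calls it
// feeds one token with incremental position ids and an empty mask.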
void ChatGLMModel::FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds) {
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
int gmask_token_id = this->weight.dicts.find("gmask_token_id") != this->weight.dicts.end() ?
atoi(this->weight.dicts["gmask_token_id"].c_str()) : 130001;
int index = params.find("index")->second;
int promptLen = params.find("promptLen")->second;
if (index == 0) {
for (auto &ids: inputTokens) {
if (GetVersion() == 1) {
ids.push_back(gmask_token_id);
ids.push_back(bos_token_id);
} else if (GetVersion() == 2) {
if (ids.size() < 2 || ids[0] != 64790 || ids[1] != 64792) {
ids.insert(ids.begin(), 64792);
ids.insert(ids.begin(), 64790);
}
}
}
int seqLen = inputTokens[0].size();
std::vector<float> vmask = std::vector<float>(seqLen * seqLen, 0);
std::vector<float> vpids = std::vector<float>(seqLen * 2, 0);
for (int i = 0; i < seqLen - 1; i++) {
vmask[i * seqLen + seqLen - 1] = 1;
vpids[i] = i;
}
vpids[seqLen - 1] = seqLen - 2;
vpids[seqLen * 2 - 1] = 1;
if (GetVersion() == 2) {
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {seqLen, seqLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {2, seqLen}, vpids));
} else {
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, inputTokens[0]));
attentionMask = Data();
if (GetVersion() == 1) {
positionIds.CopyFrom(Data(DataType::FLOAT32, {2, 1}, {(float) promptLen, (float) (index + 1)}));
} else {
positionIds.CopyFrom(Data(DataType::FLOAT32, {2, 1}, {(float) promptLen + index + 1, (float) (index + 1)}));
}
}
}
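// Batched variant: sequences are left-padded to a common maxLen, the mask blocks attention to the
// padding, and position ids are laid out as two rows per request.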
void ChatGLMModel::FillLLMInputsBatch(std::vector<std::vector<float>> &inputTokens,
const std::vector<std::map<std::string, int>> &params,
fastllm::Data &inputIds, fastllm::Data &attentionMask,
fastllm::Data &positionIds) {
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
int batch = inputTokens.size();
int index = params[0].find("index")->second;
if (index == 0) {
int gmask_token_id = this->weight.dicts.find("gmask_token_id") != this->weight.dicts.end() ?
atoi(this->weight.dicts["gmask_token_id"].c_str()) : 130001;
std::vector<int> seqLens;
seqLens.resize(batch);
int maxLen = 0;
for (int i = 0; i < batch; i++) {
maxLen = std::max(maxLen, (int) inputTokens[i].size() + 2);
seqLens[i] = (int) inputTokens[i].size();
}
std::vector<float> ids = std::vector<float>(batch * maxLen, 0);
std::vector<float> vpids = std::vector<float>(batch * 2 * maxLen, 0);
std::vector<float> vmask = std::vector<float>(batch * maxLen * maxLen, 0);
for (int i = 0; i < batch; i++) {
if (GetVersion() == 1) {
auto &tokens = inputTokens[i];
int len = tokens.size(), base = maxLen - 2 - len;
for (int j = 0; j < len; j++) {
ids[i * maxLen + base + j] = tokens[j];
}
ids[i * maxLen + base + len] = gmask_token_id;
ids[i * maxLen + base + len + 1] = bos_token_id;
len += 2;
for (int j = 0; j < len - 1; j++) {
vpids[i * 2 * maxLen + base + j] = j;
}
vpids[i * 2 * maxLen + base + len - 1] = len - 2;
vpids[i * 2 * maxLen + maxLen + base + len - 1] = 1;
std::fill(vmask.data() + i * maxLen * maxLen,
vmask.data() + i * maxLen * maxLen + (maxLen - len) * maxLen, 1.0);
for (int j = maxLen - len; j < maxLen; j++) {
std::fill(vmask.data() + i * maxLen * maxLen + j * maxLen,
vmask.data() + i * maxLen * maxLen + j * maxLen + maxLen - len, 1.0);
}
for (int j = 0; j < len - 1; j++) {
vmask[i * maxLen * maxLen + (base + j) * maxLen + base + len - 1] = 1;
}
} else {
auto &tokens = inputTokens[i];
int len = tokens.size(), base = maxLen - 2 - len;
ids[i * maxLen + base] = 64790;
ids[i * maxLen + base + 1] = 64792;
for (int j = 0; j < len; j++) {
ids[i * maxLen + base + 2 + j] = tokens[j];
}
len += 2;
for (int j = 0; j < len; j++) {
vpids[i * 2 * maxLen + base + j] = j;
}
std::fill(vmask.data() + i * maxLen * maxLen,
vmask.data() + i * maxLen * maxLen + (maxLen - len) * maxLen, 1.0);
for (int j = maxLen - len; j < maxLen; j++) {
std::fill(vmask.data() + i * maxLen * maxLen + j * maxLen,
vmask.data() + i * maxLen * maxLen + j * maxLen + maxLen - len, 1.0);
}
for (int j = 0; j < len; j++) {
for (int k = j + 1; k < len; k++) {
vmask[i * maxLen * maxLen + (base + j) * maxLen + base + k] = 1;
}
}
}
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, maxLen}, ids));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, maxLen, maxLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch * 2, maxLen}, vpids));
} else {
std::vector <float> fret;
for (int i = 0; i < batch; i++) {
fret.push_back(inputTokens[i][0]);
}
std::vector <float> pids = std::vector<float>(batch * 2);
int maxLen = 0;
for (int i = 0; i < batch; i++) {
int promptLen = params[i].find("promptLen")->second;
maxLen = std::max(promptLen + 2, maxLen);
pids[i * 2 + 1] = index + 1;
if (GetVersion() == 1) {
pids[i * 2] = promptLen;
} else {
pids[i * 2] = promptLen + index + 1;
}
}
maxLen += index;
std::vector<float> vmasks = std::vector<float>(batch * maxLen, 0.0f);
for (int i = 0; i < batch; i++) {
int promptLen = params[i].find("promptLen")->second;
for (int j = 0; j < maxLen - index - promptLen - 2; j++) {
vmasks[i * maxLen + j] = 1.0f;
}
}
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, 1, maxLen}, vmasks));
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, fret));
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch * 2, 1}, pids));
}
}
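// WarmUp pushes a single bos token through Forward so weights are transferred and device buffers are
// allocated before the first real request.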
void ChatGLMModel::WarmUp() {
printf("Warmup...\n");
Data inputIds = Data(DataType::FLOAT32, {1, 1}, {(float)bos_token_id});
Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
Data positionIds = Data(DataType::FLOAT32, {2, 1}, {0, 0});
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
Forward(inputIds, attentionMask, positionIds, pastKeyValues);
printf("finish.\n");
}
std::string ChatGLMModel::MakeInput(const std::string &history, int round, const std::string &input) {
if (round == 0 && GetVersion() == 1) {
return input;
} else {
#if defined(_WIN32) or defined(_WIN64)
std::vector <uint8_t> vask = {233, 151, 174, 239, 188, 154, 0};
std::vector <uint8_t> vans = {231, 173, 148, 239, 188, 154, 0};
std::string sask = (char*)vask.data();
std::string sans = (char*)vans.data();
return (history + ("[Round " + std::to_string(round) + "]\n\n" + sask + input + "\n\n" + sans));
#else
return history + ("[Round " + std::to_string(round) + "]\n\n问:" + input + "\n\n答:");
#endif
}
}
std::string ChatGLMModel::MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) {
#if defined(_WIN32) or defined(_WIN64)
std::vector <uint8_t> vask = {233, 151, 174, 239, 188, 154, 0};
std::vector <uint8_t> vans = {231, 173, 148, 239, 188, 154, 0};
std::string sask = (char*)vask.data();
std::string sans = (char*)vans.data();
return (history + ("[Round " + std::to_string(round) + "]\n\n" + sask + input + "\n\n" + sans + output + "\n"));
#else
return (history + ("[Round " + std::to_string(round) + "]\n\n问:" + input + "\n\n答:" + output + "\n\n"));
#endif
}
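// GetVersion distinguishes ChatGLM (v1) from ChatGLM2 (v2) checkpoints by which embedding weight name
// is present in the loaded weights.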
int ChatGLMModel::GetVersion() {
if (this->weight.weight.find("transformer.embedding.word_embeddings.weight") != this->weight.weight.end()) {
return 2;
} else {
return 1;
}
}
}
//
// Created by huangyuyang on 6/1/23.
//
#include "utils.h"
#include "llama.h"
#include <sstream>
#include <unordered_map>
#include <cstring>
#ifdef USE_CUDA
#include "fastllm-cuda.cuh"
#endif
namespace fastllm {
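// ALiBi slopes: GetInterleave reproduces the interleaved power-of-two slope schedule used when the
// head count is not a power of two.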
std::vector <float> GetInterLeavePowerOf2(int n) {
float start = powf(2, -powf(2, -(log2f(n) - 3)));
float ratio = start;
std::vector <float> ret;
for (int i = 0; i < n; i++) {
ret.push_back(start * powf(ratio, i));
}
return ret;
}
std::vector <float> GetInterleave(int n) {
int base = 1;
while (base < n) {
base <<= 1;
}
if (base == n) {
return GetInterLeavePowerOf2(n);
} else {
std::vector <float> ret = GetInterLeavePowerOf2(base / 2);
std::vector <float> part2 = GetInterLeavePowerOf2(base);
for (int i = 0; i < n - base / 2; i++) {
ret.push_back(part2[i * 2]);
}
return ret;
}
}
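// Constructor: sets the default Alpaca-style prompt template and precomputes plain (unscaled) RoPE
// sin/cos tables.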
LlamaModel::LlamaModel() {
this->model_type = "llama";
// Use the Alpaca prompt and instruction format by default
this->pre_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n";
this->user_role = "### Instruction:\n";
this->bot_role = "\n\n### Response:";
this->history_sep = "</s>";
block_cnt = 32;
rotary_dim = 128;
sin.resize(max_positions);
cos.resize(max_positions);
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
invFreq.push_back(1.0 / pow(10000, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
sin[i][j] = ::sin((float)i * invFreq[j]);
cos[i][j] = ::cos((float)i * invFreq[j]);
}
}
std::vector <float> fsin, fcos;
for (int i = 0; i < sin.size(); i++) {
for (int j = 0; j < sin[0].size(); j++) {
fsin.push_back(sin[i][j]);
fcos.push_back(cos[i][j]);
}
}
sinData.CopyFrom(Data(DataType::FLOAT32, {(int)this->sin.size(), (int)this->sin[0].size()}, fsin));
cosData.CopyFrom(Data(DataType::FLOAT32, {(int)this->cos.size(), (int)this->cos[0].size()}, fcos));
weight.embeddingNames.insert("model.embed_tokens.weight");
}
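// Single-request forward. Supports either a fused W_pack QKV weight (Baichuan-style checkpoints) or
// separate q/k/v projections, and switches to an ALiBi position bias instead of RoPE when the weight
// dict sets use_alibi.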
int LlamaModel::Forward(const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<Data, Data>> &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
std::vector <float> *retLogits) {
Data alibiData;
if (this->weight.dicts["use_alibi"] == "1") {
std::vector<float> alibi = GetInterleave(num_attention_heads);
alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
}
Data hiddenStates;
Data attenInput;
Data q, k, v, qkv;
Data attenWeights, attenOutput;
Data attenLastOutput;
Data w1, w2, w3;
Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates);
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"],
1e-6, attenInput);
std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight";
std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight";
std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight";
std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight";
std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight";
// 1.1 Get q, k, v
int bsz = attenInput.dims[0], seqlen = attenInput.dims[1];
if (weight.weight.find(qkvWeightName) != weight.weight.end()) {
Linear(attenInput, weight[qkvWeightName], Data(), qkv);
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
} else {
Linear(attenInput, weight[qWeightName], Data(), q);
Linear(attenInput, weight[kWeightName], Data(), k);
Linear(attenInput, weight[vWeightName], Data(), v);
}
std::vector <int> qkvSize = {bsz, seqlen, num_attention_heads, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
if (alibiData.dims.size() == 0) {
fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
}
qkvSize = {bsz * seqlen, num_attention_heads, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
PermuteSelf(q, {1, 0, 2});
PermuteSelf(k, {1, 0, 2});
PermuteSelf(v, {1, 0, 2});
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 && (pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1])) {
std::vector <int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector <int> {k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 && (pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1])) {
std::vector <int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector <int> {v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, k, 1);
CatDirect(pastValue, v, 1);
// 1.2 Attention
// 1.2.0 q * k^T
MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim));
attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]});
if (alibiData.dims.size() != 0) {
AlibiMask(attenWeights, alibiData, -10000);
} else if (attentionMask.dims.size() != 0) {
AttentionMask(attenWeights, attentionMask, -10000);
}
Softmax(attenWeights, attenWeights, -1);
MatMul(attenWeights, pastValue, attenOutput);
attenOutput.Reshape({attenOutput.dims[1], attenOutput.dims[2], attenOutput.dims[3]});
PermuteSelf(attenOutput, {1, 0, 2});
attenOutput.Reshape({bsz, seqlen, -1});
Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
AddTo(hiddenStates, attenLastOutput);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.up_proj.weight"], Data(), w3);
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
AddTo(hiddenStates, w2);
}
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
logits.ToDevice(DataDevice::CPU);
int lastRet = -1;
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
retLogits->resize(size);
memcpy((float*)retLogits->data(), ((float*)logits.cpuData) + (logits.dims[1] - 1) * size, size * logits.unitSize);
}
if (generationConfig.IsSimpleGreedy()) {
std::pair <float, int> ret = std::make_pair(-1e9, -1);
int base = logits.dims[1] - 1;
for (int i = 0; i < logits.dims.back(); i++) {
ret = max(ret, std::make_pair(((float*)logits.cpuData)[base * logits.dims.back() + i], i));
}
lastRet = ret.second;
} else if (!lastTokens.units.empty()) {
lastRet = LLMSampling(logits, logits.dims[1] - 1, generationConfig, lastTokens.units[0]);
}
return lastRet;
}
std::vector <int> LlamaModel::ForwardBatch(int batch, const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<Data, Data>> &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
Data alibiData;
if (this->weight.dicts["use_alibi"] == "1") {
std::vector<float> alibi = GetInterleave(num_attention_heads);
alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
}
Data hiddenStates;
Data attenInput;
Data q, k, v, qkv;
Data attenWeights, attenOutput;
Data attenLastOutput;
Data w1, w2, w3;
Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates);
int seqlen = hiddenStates.dims[1];
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"],
1e-6, attenInput);
std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight";
std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight";
std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight";
std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight";
std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight";
// 1.1 Get q, k, v
int bsz = attenInput.dims[0], seqlen = attenInput.dims[1];
if (weight.weight.find(qkvWeightName) != weight.weight.end()) {
Linear(attenInput, weight[qkvWeightName], Data(), qkv);
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
} else {
Linear(attenInput, weight[qWeightName], Data(), q);
Linear(attenInput, weight[kWeightName], Data(), k);
Linear(attenInput, weight[vWeightName], Data(), v);
}
std::vector <int> qkvSize = {bsz, seqlen, num_attention_heads, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
if (alibiData.dims.size() == 0) {
fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
}
PermuteSelf(q, {0, 2, 1, 3});
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
qkvSize = {bsz * num_attention_heads, seqlen, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 && (pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1])) {
std::vector <int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector <int> {k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 && (pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1])) {
std::vector <int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector <int> {v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, k, 1);
CatDirect(pastValue, v, 1);
// 1.2 Attention
// 1.2.0 q * k^T
MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim));
attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]});
if (alibiData.dims.size() != 0) {
attenWeights.Reshape({-1, num_attention_heads, attenWeights.dims[2], attenWeights.dims[3]});
AlibiMask(attenWeights, alibiData, -10000);
attenWeights.Reshape({1, -1, attenWeights.dims[2], attenWeights.dims[3]});
} else if (attentionMask.dims.size() != 0) {
AttentionMask(attenWeights, attentionMask, -10000);
}
Softmax(attenWeights, attenWeights, -1);
MatMul(attenWeights, pastValue, attenOutput);
attenOutput.Reshape({attenOutput.dims[1], attenOutput.dims[2], attenOutput.dims[3]});
PermuteSelf(attenOutput, {1, 0, 2});
attenOutput.Reshape({seqlen, bsz, -1});
PermuteSelf(attenOutput, {1, 0, 2});
Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
AddTo(hiddenStates, attenLastOutput);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.up_proj.weight"], Data(), w3);
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
AddTo(hiddenStates, w2);
}
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
logits.ToDevice(DataDevice::CPU);
std::vector <int> lastRet;
if (generationConfig.IsSimpleGreedy()) {
for (int b = 0; b < batch; b++) {
int base = b * logits.dims[1] + logits.dims[1] - 1;
std::pair <float, int> ret = std::make_pair(-1e9, -1);
for (int i = 0; i < logits.dims.back(); i++) {
ret = max(ret, std::make_pair(((float *) logits.cpuData)[base * logits.dims.back() + i], i));
}
lastRet.push_back(ret.second);
}
} else {
for (int b = 0; b < batch; b++) {
int base = b * logits.dims[1] + logits.dims[1] - 1;
lastRet.push_back(LLMSampling(logits, base, generationConfig, lastTokens.units[b]));
}
}
return lastRet;
}
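// Batched LLaMA forward with per-request sequence lengths and KV caches: attention is computed per
// request inside the layer loop and the outputs are concatenated back along the packed sequence axis.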
std::vector <int> LlamaModel::ForwardBatch(int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
Data alibiData;
if (this->weight.dicts["use_alibi"] == "1") {
std::vector<float> alibi = GetInterleave(num_attention_heads);
alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
}
Data hiddenStates;
Data attenInput;
Data q, k, v, qkv;
Data attenWeights, curAttenOutput;
Data attenLastOutput;
Data w1, w2, w3;
Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates);
int seqlen = hiddenStates.dims[1];
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"],
1e-6, attenInput);
std::string qWeightName = "model.layers." + std::to_string(i) + ".self_attn.q_proj.weight";
std::string kWeightName = "model.layers." + std::to_string(i) + ".self_attn.k_proj.weight";
std::string vWeightName = "model.layers." + std::to_string(i) + ".self_attn.v_proj.weight";
std::string qkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.W_pack.weight";
std::string oWeightName = "model.layers." + std::to_string(i) + ".self_attn.o_proj.weight";
// 1.1 Get q, k, v
int bsz = attenInput.dims[0], seqlen = attenInput.dims[1];
if (weight.weight.find(qkvWeightName) != weight.weight.end()) {
Linear(attenInput, weight[qkvWeightName], Data(), qkv);
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, k);
Split(qkv, -1, per * 2, per * 3, v);
} else {
Linear(attenInput, weight[qWeightName], Data(), q);
Linear(attenInput, weight[kWeightName], Data(), k);
Linear(attenInput, weight[vWeightName], Data(), v);
}
Data attenOutput = Data(DataType::FLOAT32);
int total = 0;
std::vector <Data> curKs, curVs, curQs;
curKs.resize(batch);
curVs.resize(batch);
curQs.resize(batch);
for (int b = 0; b < batch; b++) {
Split(k, 1, total, total + seqLens[b], curKs[b]);
Split(v, 1, total, total + seqLens[b], curVs[b]);
Split(q, 1, total, total + seqLens[b], curQs[b]);
total += seqLens[b];
}
for (int b = 0; b < batch; b++) {
auto &q = curQs[b], &k = curKs[b], &v = curVs[b];
std::vector<int> qkvSize = {bsz, seqLens[b], num_attention_heads, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
if (alibiData.dims.size() == 0) {
fastllm::LlamaRotatePosition2D(q, *positionIds[b], sinData, cosData, rotary_dim);
fastllm::LlamaRotatePosition2D(k, *positionIds[b], sinData, cosData, rotary_dim);
}
PermuteSelf(q, {0, 2, 1, 3});
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
qkvSize = {bsz * num_attention_heads, seqLens[b], -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt + i].second;
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 &&
(pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1])) {
std::vector<int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector<int>{k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((k.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 &&
(pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1])) {
std::vector<int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector<int>{v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((v.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, k, 1);
CatDirect(pastValue, v, 1);
// 1.2 Attention
// 1.2.0 q * k^T
MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim));
attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]});
if (alibiData.dims.size() != 0) {
AlibiMask(attenWeights, alibiData, -10000);
} else if (attentionMask[b] != nullptr) {
AttentionMask(attenWeights, *attentionMask[b], -10000);
}
Softmax(attenWeights, attenWeights, -1);
MatMul(attenWeights, pastValue, curAttenOutput);
curAttenOutput.Reshape({curAttenOutput.dims[1], curAttenOutput.dims[2], curAttenOutput.dims[3]});
PermuteSelf(curAttenOutput, {1, 0, 2});
curAttenOutput.Reshape({seqLens[b], bsz, -1});
PermuteSelf(curAttenOutput, {1, 0, 2});
if (attenOutput.dims.size() == 0) {
std::vector <int> dims = curAttenOutput.dims;
dims[1] = total;
attenOutput.Expansion(dims);
}
CatDirect(attenOutput, curAttenOutput, 1);
}
Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
AddTo(hiddenStates, attenLastOutput);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-6, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.up_proj.weight"], Data(), w3);
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
AddTo(hiddenStates, w2);
}
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
logits.ToDevice(DataDevice::CPU);
std::vector <int> lastRet;
int total = 0;
for (int b = 0; b < batch; b++) {
if (generationConfigs[b].output_logits && retLogits != nullptr && (*retLogits)[b] != nullptr) {
int base = (total + seqLens[b] - 1);
(*retLogits)[b]->resize(logits.dims.back());
memcpy((float*)(*retLogits)[b]->data(), (float*)(logits.cpuData + base * logits.dims.back() * logits.unitSize), logits.dims.back() * logits.unitSize);
}
if (generationConfigs[b].IsSimpleGreedy()) {
std::pair<float, int> ret = std::make_pair(-1e9, -1);
int base = (total + seqLens[b] - 1);
total += seqLens[b];
for (int i = 0; i < logits.dims.back(); i++) {
ret = max(ret, std::make_pair(((float *) logits.cpuData)[base * logits.dims.back() + i], i));
}
lastRet.push_back(ret.second);
} else {
int base = (total + seqLens[b] - 1);
total += seqLens[b];
lastRet.push_back(LLMSampling(logits, base, generationConfigs[b], lastTokens.units[b]));
}
}
return lastRet;
}
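// Response: streaming generation for one prompt. Encode, prefill with a full causal mask, then decode
// one token at a time, invoking retCb with each incremental piece until eos or the output token limit.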
std::string LlamaModel::Response(const std::string& input, RuntimeResult retCb,
const GenerationConfig &generationConfig) {
#ifdef USE_CUDA
FastllmCudaClearBigBuffer();
#endif
//auto st = std::chrono::system_clock::now();
#ifdef PY_API
size_t pos = input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos)? input.substr(0, pos-10):input;
size_t hash_id = std::hash<std::string>{}(input);
Data inputIds = this->weight.tokenizer.Encode(prompt);
#else
Data inputIds = this->weight.tokenizer.Encode(input);
#endif
std::vector <float> ids;
for (int i = 0; i < inputIds.Count(0); i++) {
ids.push_back(((float*)inputIds.cpuData)[i]);
}
int seqLen = ids.size();
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, ids));
std::vector <float> vmask = std::vector <float> (seqLen * seqLen, 0);
std::vector <float> vpids = std::vector <float> (seqLen, 0);
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
Data attentionMask = Data(DataType::FLOAT32, {seqLen, seqLen}, vmask);
Data positionIds = Data(DataType::FLOAT32, {1, seqLen}, vpids);
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
std::string retString = "";
int len = seqLen;
std::vector <float> results;
int index = 0;
LastTokensManager tokens (1, generationConfig.last_n);
while (true) {
auto st = std::chrono::system_clock::now();
int ret = Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
tokens.units[0].Push(ret);
if (ret == eos_token_id) {
break;
}
results.push_back(ret);
std::string curString = weight.tokenizer.Decode(Data(DataType::FLOAT32, {(int)results.size()}, results)).c_str();
retString += curString;
if (retCb)
#ifdef PY_API
{
if(generationConfig.enable_hash_id){
std::stringstream ss;
ss << retString << "hash_id:"<<hash_id;
retCb(index, pybind11::bytes(ss.str()));
}else{
retCb(index, pybind11::bytes(retString));
}
}
#else
retCb(index, curString.c_str());
#endif
index++;
if (index == generationConfig.output_token_limit) {
break;
}
results.clear();
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float)ret}));
attentionMask = Data();
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float)len}));
//if (do_sample) {
// tokenPenaltyManager.InsertToken(ret);
//}
len++;
if (index == generationConfig.output_token_limit) {
break;
}
//printf("spend %f s.\n", GetSpan(st, std::chrono::system_clock::now()));
}
if (retCb)
#ifdef PY_API
{
if(generationConfig.enable_hash_id){
std::stringstream ss;
ss << retString << "hash_id:"<<hash_id;
retCb(-1, pybind11::bytes(ss.str()));
}else{
retCb(-1, pybind11::bytes(retString));
}
}
#else
retCb(-1, retString.c_str());
#endif
return retString;
}
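// ResponseBatch: generation for several prompts at once. Prompts are left-padded to a common length,
// a request is marked finished once it emits eos, and the loop ends when every request has finished.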
void LlamaModel::ResponseBatch(const std::vector<std::string> &inputs, std::vector<std::string> &outputs,
RuntimeResultBatch retCb,
const GenerationConfig &generationConfig) {
#ifdef USE_CUDA
FastllmCudaClearBigBuffer();
#endif
#ifdef PY_API
std::vector<std::string> prompts;
std::vector < size_t > hash_ids;
for (auto _input: inputs){
size_t hash_id = std::hash<std::string>{}(_input);
hash_ids.push_back(hash_id);
size_t pos = _input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos) ? _input.substr(0, pos - 10) : _input;
prompts.push_back(prompt);
}
#else
std::vector<std::string> prompts = inputs;
#endif
int batch = prompts.size();
outputs.clear();
outputs.resize(batch, "");
std::vector <Data> inputTokens;
std::vector <int> seqLens;
inputTokens.resize(batch);
seqLens.resize(batch);
int maxLen = 0;
for (int i = 0; i < batch; i++) {
inputTokens[i].CopyFrom(this->weight.tokenizer.Encode(prompts[i]));
maxLen = std::max(maxLen, (int)inputTokens[i].Count(0));
seqLens[i] = (int)inputTokens[i].Count(0);
}
std::vector <float> ids = std::vector <float> (batch * maxLen, 0);
std::vector <float> vpids = std::vector <float> (batch * maxLen, 0);
std::vector <float> vmask = std::vector <float> (batch * maxLen * maxLen, 0);
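// Left-pad shorter prompts: token ids are right-aligned within maxLen, position ids restart at 0 per sequence, and the mask marks padding rows/columns as well as future tokens with 1.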
for (int i = 0; i < batch; i++) {
Data &tokens = inputTokens[i];
int len = tokens.Count(0), base = maxLen - len;
for (int j = 0; j < len; j++) {
ids[i * maxLen + base + j] = ((float*)tokens.cpuData)[j];
}
for (int j = 0; j < len; j++) {
vpids[i * maxLen + base + j] = j;
}
std::fill(vmask.data() + i * maxLen * maxLen,
vmask.data() + i * maxLen * maxLen + (maxLen - len) * maxLen, 1.0);
for (int j = maxLen - len; j < maxLen; j++) {
std::fill(vmask.data() + i * maxLen * maxLen + j * maxLen,
vmask.data() + i * maxLen * maxLen + j * maxLen + maxLen - len, 1.0);
}
for (int j = 0; j < len; j++) {
for (int k = j + 1; k < len; k++) {
vmask[i * maxLen * maxLen + (base + j) * maxLen + base + k] = 1;
}
}
}
Data inputIds = Data(DataType::FLOAT32, {batch, maxLen}, ids);
Data attentionMask = Data(DataType::FLOAT32, {batch, maxLen, maxLen}, vmask);
Data positionIds = Data(DataType::FLOAT32, {batch, maxLen}, vpids);
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
std::string retString = "";
std::vector <int> lens = seqLens;
std::vector <bool> isEnding = std::vector <bool> (batch, false);
std::vector <float> results;
int index = 0;
LastTokensManager tokensManager (batch, generationConfig.last_n);
while (true) {
auto st = std::chrono::system_clock::now();
std::vector <int> ret = ForwardBatch(batch, inputIds, attentionMask, positionIds, pastKeyValues,
generationConfig, tokensManager);
for (int i = 0; i < batch; i++) {
tokensManager.units[i].Push(ret[i]);
}
std::vector <float> fret;
std::vector <float> results;
int endingCount = 0;
std::vector <std::string> curStrings;
for (int i = 0; i < batch; i++) {
fret.push_back(ret[i]);
if (ret[i] == eos_token_id) {
isEnding[i] = true;
}
if (isEnding[i]) {
curStrings.push_back("");
endingCount++;
continue;
}
results.push_back(ret[i]);
std::string curString = weight.tokenizer.Decode(
Data(DataType::FLOAT32, {(int) results.size()}, results)).c_str();
outputs[i] += curString;
curStrings.push_back(curString);
results.clear();
}
if (endingCount == batch) {
break;
}
if (retCb)
#ifdef PY_API
{
if (generationConfig.enable_hash_id) {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << curStrings[i] << "hash_id:" << hash_ids[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(index, rtnStrings);
} else {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << curStrings[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(index, rtnStrings);
}
}
#else
retCb(index, curStrings);
#endif
index++;
maxLen++;
std::vector <float> pids = std::vector <float> (batch);
std::vector <float> vmasks = std::vector <float> (batch * maxLen, 0.0f);
for (int i = 0; i < batch; i++) {
pids[i] = lens[i];
lens[i]++;
for (int j = 0; j < maxLen - lens[i]; j++) {
vmasks[i * maxLen + j] = 1.0f;
}
}
positionIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, 1, maxLen}, vmasks));
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, fret));
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, pids));
if (index == generationConfig.output_token_limit) {
break;
}
//printf("spend %f s.\n", GetSpan(st, std::chrono::system_clock::now()));
}
if (retCb)
#ifdef PY_API
{
if (generationConfig.enable_hash_id) {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << outputs[i] << "hash_id:" << hash_ids[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(-1, rtnStrings);
} else {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << outputs[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(-1, rtnStrings);
}
}
#else
retCb(-1, outputs);
#endif
}
std::string LlamaModel::MakeInput(const std::string &history, int round, const std::string &input) {
if(is_nsql){
return input;
}
else{
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role;
}
}
std::string LlamaModel::MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) {
if(is_nsql){
return input;
}
else {
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role + output + history_sep;
}
}
void LlamaModel::WarmUp() {
printf("Warmup...\n");
Data inputIds = Data(DataType::FLOAT32, {1, 1}, {1});
Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
Data positionIds = Data(DataType::FLOAT32, {1, 1}, {0, 0});
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
Forward(inputIds, attentionMask, positionIds, pastKeyValues);
printf("finish.\n");
}
int LlamaModel::LaunchResponseTokens(const std::vector<int> &inputTokens,
const GenerationConfig &generationConfig) {
mainLoopLocker.lock();
if (mainLoop == nullptr) {
if (mainLoop == nullptr) {
mainLoop = new std::thread([](LlamaModel *model) {
while (true) {
std::vector <Data*> attentionMasks;
std::vector <Data*> positionIds;
std::vector <std::pair <Data*, Data*> > pastKeyValues;
std::vector <float> ids;
std::vector <int> seqLens;
std::vector <GenerationConfig> generationConfigs;
LastTokensManager tokensManager;
std::vector <std::vector <float>* > logits;
model->dictLocker.lock();
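// Under dictLocker, batch every unfinished request: a request seen for the first time (preTokens == 0) contributes its full prompt plus a causal mask, while a running request contributes a single token and reuses its cached keys/values.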
for (auto &it: model->responseContextDict.dicts) {
if (it.second->isEnding) {
continue;
}
generationConfigs.push_back(it.second->generationConfig);
if (it.second->generationConfig.output_logits) {
it.second->resultLogits.push(new std::vector <float> ());
logits.push_back(it.second->resultLogits.back());
} else {
logits.push_back(nullptr);
}
tokensManager.units.push_back(it.second->tokens);
if (it.second->preTokens == 0) {
int seqLen = it.second->currentTokens.size();
for (int i = 0; i < it.second->currentTokens.size(); i++) {
ids.push_back(it.second->currentTokens[i]);
}
seqLens.push_back(seqLen);
std::vector <float> vmask = std::vector <float> (seqLen * seqLen, 0);
std::vector <float> vpids = std::vector <float> (seqLen, 0);
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
it.second->intParams["len"] = seqLen;
attentionMasks.push_back(new Data(DataType::FLOAT32, {seqLen, seqLen}, vmask));
positionIds.push_back(new Data(DataType::FLOAT32, {1, seqLen}, vpids));
} else {
int ret = it.second->currentTokens[0];
seqLens.push_back(1);
ids.push_back(ret);
attentionMasks.push_back(nullptr);
positionIds.push_back(new Data(DataType::FLOAT32, {1, 1}, {(float)it.second->intParams["len"]}));
it.second->intParams["len"]++;
}
it.second->preTokens += seqLens.back();
for (int i = 0; i < model->block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(&it.second->pastKeyValues[i].first,
&it.second->pastKeyValues[i].second));
}
}
if (seqLens.size() > 0) {
#ifdef USE_CUDA
FastllmCudaClearBigBuffer();
#endif
Data inputIds = Data(DataType::FLOAT32, {1, (int) ids.size()}, ids);
std::vector<int> ret = model->ForwardBatch(seqLens.size(), inputIds, attentionMasks,
positionIds, seqLens, pastKeyValues, generationConfigs, tokensManager, &logits);
int idx = 0;
for (auto &it: model->responseContextDict.dicts) {
if (it.second->isEnding) {
continue;
}
int curRet = ret[idx++];
if (curRet == model->eos_token_id) {
it.second->isEnding = true;
} else {
it.second->currentTokens = std::vector<int>{curRet};
it.second->resultTokenQueue.push(curRet);
it.second->tokens.Push(curRet);
it.second->curTokens++;
if (it.second->curTokens == it.second->generationConfig.output_token_limit) {
it.second->isEnding = true;
}
}
}
}
for (int i = 0; i < attentionMasks.size(); i++) {
delete attentionMasks[i];
}
for (int i = 0; i < positionIds.size(); i++) {
delete positionIds[i];
}
model->dictLocker.unlock();
MySleep(0);
}
}, this);
}
}
mainLoopLocker.unlock();
dictLocker.lock();
int handleId = responseContextDict.CreateHandle();
ResponseContext *context = responseContextDict.GetHandle(handleId);
context->Init(this->block_cnt);
context->currentTokens = inputTokens;
context->generationConfig = generationConfig;
context->tokens = LastTokensUnit(generationConfig.last_n);
dictLocker.unlock();
return handleId;
}
int LlamaModel::FetchResponseTokens(int handleId) {
dictLocker.lock();
ResponseContext *context = responseContextDict.GetHandle(handleId);
if (context == nullptr) {
dictLocker.unlock();
return -1;
} else {
while (true) {
if (context->resultTokenQueue.size() > 0) {
int ret = context->resultTokenQueue.front();
context->resultTokenQueue.pop();
dictLocker.unlock();
return ret;
} else {
if (context->isEnding) {
responseContextDict.RemoveHandle(handleId);
dictLocker.unlock();
return -1;
}
}
dictLocker.unlock();
MySleep(0);
dictLocker.lock();
}
}
}
}
//
// Created by huangyuyang on 5/12/23.
//
#include "utils.h"
#include "moss.h"
#include <cmath>
#include <chrono>
#include <algorithm>
#include <sstream>
#include <unordered_map>
namespace fastllm {
extern double GetSpan(std::chrono::system_clock::time_point time1, std::chrono::system_clock::time_point time2);
MOSSModel::MOSSModel() {
this->model_type = "moss";
this->pre_prompt = "You are an AI assistant whose name is MOSS. ";
this->user_role = "<|Human|>: ";
this->bot_role = "<eoh>";
this->history_sep = "";
// Initialize the sin/cos tables for rotary position embedding
embed_dim = 6144;
num_attention_heads = 24;
head_dim = embed_dim / num_attention_heads;
block_cnt = 34;
sin.resize(max_positions);
cos.resize(max_positions);
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
invFreq.push_back(1.0 / pow(10000, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
sin[i][j] = ::sin((float)i * invFreq[j]);
cos[i][j] = ::cos((float)i * invFreq[j]);
}
}
this->weight.embeddingNames.insert("transformer.wte.weight");
}
void MOSSModel::CausalMask(Data &data, int start) {
int outer = data.dims[0] * data.dims[1];
int spatial = data.Count(2);
int n = data.dims[2], m = data.dims[3];
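// Mask attention to future tokens: every entry past column (start + i) is set to the lowest float value so it vanishes after softmax.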
for (int o = 0; o < outer; o++) {
float *d = (float*)data.cpuData + o * spatial;
for (int i = 0; i < n; i++) {
if (i + start + 1 < m) {
std::fill(d + i * m + i + start + 1, d + (i + 1) * m, -std::numeric_limits<float>::max());
}
}
}
}
void MOSSModel::RotatePosition2D(Data &data, const Data &positionIds) {
int outer = data.dims[0] * data.dims[1];
int spatial = data.Count(2);
int n = data.dims[2], m = data.dims[3];
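// Apply rotary position embedding pairwise over the first rotary_dim channels, using the precomputed sin/cos tables indexed by each token's position id.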
for (int o = 0; o < outer; o++) {
int index = (int)((float*)positionIds.cpuData)[o];
std::vector <float> &sin = this->sin[index];
std::vector <float> &cos = this->cos[index];
float *d = (float*)data.cpuData + o * spatial;
for (int i = 0; i < n; i++) {
for (int j = 0; j + 1 < rotary_dim && j + 1 < m; j += 2) {
float a = d[j], b = d[j + 1];
d[j] = a * cos[j / 2] - b * sin[j / 2];
d[j + 1] = a * sin[j / 2] + b * cos[j / 2];
}
d += m;
}
}
}
int MOSSModel::Forward(const Data &inputIds, const Data &attentionMask,
const Data &positionIds, std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
std::vector <float> *retLogits) {
auto st = std::chrono::system_clock::now();
Data inputEmbeddings;
Embedding(inputIds, this->weight["transformer.wte.weight"], inputEmbeddings);
Data hiddenStates = inputEmbeddings;
// MossBlock
for (int i = 0; i < block_cnt; i++) {
// 1.0 LayerNorm
Data residual;
Mul(hiddenStates, 1.0, residual);
std::string lnWeightName = "transformer.h." + std::to_string(i) + ".ln_1.weight";
std::string lnBiasName = "transformer.h." + std::to_string(i) + ".ln_1.bias";
LayerNorm(residual, weight[lnWeightName], weight[lnBiasName], -1, hiddenStates);
// 1.1 Get query, key, value
std::string qkvProjName = "transformer.h." + std::to_string(i) + ".attn.qkv_proj.weight";
Data qkv, q, k, v;
Linear(hiddenStates, weight[qkvProjName], Data(), qkv);
qkv.Reshape({qkv.dims[0], qkv.dims[1], 4, -1});
int per = qkv.dims.back() / 3;
Split(qkv, -1, 0, per, q);
Split(qkv, -1, per, per * 2, v);
Split(qkv, -1, per * 2, per * 3, k);
q.Reshape({q.dims[0], q.dims[1], -1, head_dim});
k.Reshape({k.dims[0], k.dims[1], -1, head_dim});
v.Reshape({v.dims[0], v.dims[1], -1, head_dim});
q.ToDevice(DataDevice::CPU);
k.ToDevice(DataDevice::CPU);
RotatePosition2D(q, positionIds);
RotatePosition2D(k, positionIds);
q.ToDevice(DataDevice::CUDA);
k.ToDevice(DataDevice::CUDA);
PermuteSelf(q, {0, 2, 1, 3});
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
Data pastKey = pastKeyValues[i].first, pastValue = pastKeyValues[i].second;
Cat(pastKey, k, -2, pastKeyValues[i].first);
Cat(pastValue, v, -2, pastKeyValues[i].second);
k.CopyFrom(pastKeyValues[i].first);
v.CopyFrom(pastKeyValues[i].second);
// 1.2 Attention
// 1.2.0 q * k^T
Data attnWeights;
MatMulTransB(q, k, attnWeights, 1.0 / scale_attn);
// 1.2.1 causal_mask
attnWeights.ToDevice(DataDevice::CPU);
CausalMask(attnWeights, k.dims[2] - q.dims[2]);
attnWeights.ToDevice(DataDevice::CUDA);
// 1.2.2 attentionMask
// TODO: attentionMask appears to be all ones here, so it is skipped for now
// 1.2.3 softmax
Softmax(attnWeights, attnWeights, -1);
// 1.2.4 headMask
// TODO: headMask appears to be None here, so it is skipped for now
// 1.2.5 attention_weights * v
Data attnOutput;
PermuteSelf(v, {0, 1, 3, 2});
MatMulTransB(attnWeights, v, attnOutput);
// 1.3
PermuteSelf(attnOutput, {0, 2, 1, 3});
attnOutput.Reshape({attnOutput.dims[0], attnOutput.dims[1], -1});
std::string outProjName = "transformer.h." + std::to_string(i) + ".attn.out_proj.weight";
Data realOutput;
Linear(attnOutput, weight[outProjName], Data(), realOutput);
// 1.4 MLP
std::string fcInKeyName = "transformer.h." + std::to_string(i) + ".mlp.fc_in";
std::string fcOutKeyName = "transformer.h." + std::to_string(i) + ".mlp.fc_out";
Data middle;
Linear(hiddenStates, weight[fcInKeyName + ".weight"], weight[fcInKeyName + ".bias"], middle);
GeluNew(middle, middle);
Linear(middle, weight[fcOutKeyName + ".weight"], weight[fcOutKeyName + ".bias"], hiddenStates);
AddTo(hiddenStates, residual);
AddTo(hiddenStates, realOutput);
}
LayerNorm(hiddenStates, weight["transformer.ln_f.weight"], weight["transformer.ln_f.bias"], -1, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], weight["lm_head.bias"], logits);
logits.ToDevice(DataDevice::CPU);
int ret = -1;
if (generationConfig.IsSimpleGreedy()) {
std::vector<std::pair<float, int> > v;
int base = logits.dims[logits.dims.size() - 2] - 1;
for (int i = 0; i < logits.dims.back(); i++) {
v.push_back(std::make_pair(((float *) logits.cpuData)[base * logits.dims.back() + i], i));
}
std::sort(v.begin(), v.end());
std::reverse(v.begin(), v.end());
ret = v[0].second;
} else if (!lastTokens.units.empty()) {
ret = LLMSampling(logits, logits.dims[logits.dims.size() - 2] - 1, generationConfig, lastTokens.units[0]);
}
float spend = GetSpan(st, std::chrono::system_clock::now());
//printf("forward spend %f s.\n", spend);
return ret;
}
std::string MOSSModel::Response(const std::string &input,
RuntimeResult retCb,
const GenerationConfig &generationConfig) {
#ifdef PY_API
size_t pos = input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos)? input.substr(0, pos-10):input;
size_t hash_id = std::hash<std::string>{}(input);
Data inputIds = this->weight.tokenizer.Encode(prompt);
#else
Data inputIds = this->weight.tokenizer.Encode(input);
#endif
Data attentionMask = inputIds;
Data positionIds = inputIds;
std::vector<std::pair<Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(), Data()));
}
int len = inputIds.dims[1];
for (int i = 0; i < len; i++) {
((float *) attentionMask.cpuData)[i] = 1;
((float *) positionIds.cpuData)[i] = i;
}
std::vector<float> results;
std::string retString = "";
int index = 0;
LastTokensManager tokens (1, generationConfig.last_n);
while (true) {
int ret = Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
tokens.units[0].Push(ret);
if (ret == 106068) {
break;
}
results.push_back(ret);
std::string current = weight.tokenizer.Decode(
Data(DataType::FLOAT32, {(int) results.size()}, results)).c_str();
retString += current;
if (retCb)
#ifdef PY_API
{
if(generationConfig.enable_hash_id){
std::stringstream ss;
ss << retString << "hash_id:"<<hash_id;
retCb(index, pybind11::bytes(ss.str()));
}else{
retCb(index, pybind11::bytes(retString));
}
}
#else
retCb(index, current.c_str());
#endif
index++;
fflush(stdout);
results.clear();
len++;
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float) ret}));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {1, len}, std::vector<float>(len, 1.0f)));
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float) (len - 1)}));
if (index == generationConfig.output_token_limit) {
break;
}
}
if (retCb)
#ifdef PY_API
{
if(generationConfig.enable_hash_id){
std::stringstream ss;
ss << retString << "hash_id:"<<hash_id;
retCb(-1, pybind11::bytes(ss.str()));
}else{
retCb(-1, pybind11::bytes(retString));
}
}
#else
retCb(-1, retString.c_str());
#endif
return retString;
}
std::string MOSSModel::MakeInput(const std::string &history, int round, const std::string &input) {
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role;
}
std::string MOSSModel::MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) {
return (round == 0 ? pre_prompt : history) + user_role + input + bot_role + output + history_sep;
}
void MOSSModel::WarmUp() {
printf("Warmup...\n");
Data inputIds = Data(DataType::FLOAT32, {1, 1}, {(float)bos_token_id});
Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
Data positionIds = Data(DataType::FLOAT32, {1, 1}, {0});
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
Forward(inputIds, attentionMask, positionIds, pastKeyValues);
printf("finish.\n");
}
void
MOSSModel::FillLLMInputs(std::vector<std::vector<float>> &inputTokens, const std::map<std::string, int> &params,
fastllm::Data &inputIds, fastllm::Data &attentionMask, fastllm::Data &positionIds) {
int index = params.find("index")->second;
int promptLen = params.find("promptLen")->second;
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
if (index == 0) {
int seqLen = inputTokens[0].size();
std::vector<float> vmask = std::vector<float>(seqLen, 1);
std::vector<float> vpids = std::vector<float>(seqLen, 0);
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, vpids));
} else {
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {1, promptLen + index}, std::vector<float>(promptLen + index, 1.0f)));
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float) (promptLen + index - 1)}));
}
}
}
//
// Created by siemon on 8/9/23.
//
#include "utils.h"
#include "qwen.h"
#include <cmath>
#include <chrono>
#include <algorithm>
#include <sstream>
#include <unordered_map>
#include <cstring>
#ifdef USE_CUDA
#include "fastllm-cuda.cuh"
#endif
namespace fastllm {
extern double GetSpan(std::chrono::system_clock::time_point time1, std::chrono::system_clock::time_point time2);
QWenModel::QWenModel() {
this->model_type = "qwen";
this->pre_prompt = "You are a helpful assistant.";
this->user_role = "user";
this->bot_role = "assistant";
embed_dim = 4096;
num_attention_heads = 32;
head_dim = embed_dim / num_attention_heads;
block_cnt = 32;
rotary_dim = 128;
seq_length = 2048;
use_log_attn = true;
ntk_alpha = 1.f;
UpdateRotaryPosEmb(ntk_alpha);
if (use_log_attn) {
logn_list = Data(DataType::FLOAT32);
logn_list.Resize({1, max_positions, 1, 1});
logn_list.Allocate();
float *logn = (float *) logn_list.cpuData;
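// Logn attention scaling: factor 1 inside the trained context window (seq_length), log_{seq_length}(i) beyond it, to temper attention scores on long inputs.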
for (int i = 0; i < seq_length; i++) {
logn[i] = 1;
}
for (int i = seq_length; i < max_positions; i++) {
logn[i] = std::log(i) / std::log(seq_length);
}
}
weight.embeddingNames.insert("transformer.wte.weight");
}
int QWenModel::Forward(const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig,
const LastTokensManager &lastTokens,
std::vector <float> *logits) {
std::vector <std::vector <float>*> batchLogits;
batchLogits.push_back(logits);
return ForwardBatch(1, inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, lastTokens, &batchLogits)[0];
}
std::vector <int> QWenModel::ForwardBatch(int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
int maxLen = inputIds.dims[1];
Data hiddenStates;
Data attnInput, attnOutput;
Data query, key, value;
Data attnWeights, attnLastOutput;
Data a1, a2, mlpOutput;
// printf("input id: ");
// for (int i = 0; i < inputIds.Count(0); i++) {
// printf("%d ", (int )((float *) inputIds.cpuData)[i]);
// }
// printf("\n");
Embedding(inputIds, this->weight["transformer.wte.weight"], hiddenStates);
for (int i = 0; i < this->block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
int seqlen = hiddenStates.dims[1];
std::string ln_1_name = "transformer.h." + std::to_string(i) + ".ln_1.weight";
std::string attn_weight_name = "transformer.h." + std::to_string(i) + ".attn.c_attn.weight";
std::string attn_bias_name = "transformer.h." + std::to_string(i) + ".attn.c_attn.bias";
RMSNorm(hiddenStates, weight[ln_1_name], 1e-6, attnInput);
Linear(attnInput, weight[attn_weight_name], weight[attn_bias_name], attnOutput); // attnOutput [batch, seqlen, embed_dim * 3]
Split(attnOutput, 2, 0, embed_dim, query);
Split(attnOutput, 2, embed_dim, 2 * embed_dim, key);
Split(attnOutput, 2, embed_dim * 2, embed_dim * 3, value);
query.Reshape({query.dims[0], query.dims[1], num_attention_heads, head_dim});
key.Reshape({key.dims[0], key.dims[1], num_attention_heads, head_dim});
value.Reshape({value.dims[0], value.dims[1], num_attention_heads, head_dim});
Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
if (pastKey.dims.empty()) {
// Compute new_ntk_alpha for this prompt length
float context_value = std::log2((float) seqlen / seq_length) + 1;
float new_ntk_alpha = std::max(std::pow(2, std::ceil(context_value) - 1), 1.);
if (new_ntk_alpha != ntk_alpha) {
UpdateRotaryPosEmb(new_ntk_alpha);
}
}
LlamaRotatePosition2D(query, positionIds, sinData, cosData, rotary_dim);
LlamaRotatePosition2D(key, positionIds, sinData, cosData, rotary_dim);
if (use_log_attn) {
ApplyLognAttn(query, logn_list, positionIds);
}
PermuteSelf(query, {0, 2, 1, 3});
PermuteSelf(key, {0, 2, 1, 3});
PermuteSelf(value, {0, 2, 1, 3});
std::vector<int> qkvSize = {batch * num_attention_heads, seqlen, -1};
query.Reshape(qkvSize);
key.Reshape(qkvSize);
value.Reshape(qkvSize);
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
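// Grow the KV cache (pastKey/pastValue) in unitLen-aligned chunks so CatDirect below can append new entries in place without reallocating on every step.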
while ((pastKey.dims.size() == 0 && (pastKey.expansionDims.size() == 0 || key.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + key.dims[1] > pastKey.expansionDims[1])) {
std::vector <int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector <int> {key.dims[0], ((key.dims[1] - 1) / unitLen + 1) * unitLen, key.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((key.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 && (pastValue.expansionDims.size() == 0 || value.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + value.dims[1] > pastValue.expansionDims[1])) {
std::vector <int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector <int> {value.dims[0], ((value.dims[1] - 1) / unitLen + 1) * unitLen, value.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((value.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, key, 1);
CatDirect(pastValue, value, 1);
// Attention
MatMulTransB(query, pastKey, attnWeights, 1.0 / sqrt(head_dim));
attnWeights.Reshape({1, attnWeights.dims[0], attnWeights.dims[1], attnWeights.dims[2]});
if (!attentionMask.dims.empty()) {
AttentionMask(attnWeights, attentionMask, -10000);
}
Softmax(attnWeights, attnWeights, -1);
MatMul(attnWeights, pastValue, attnOutput);
attnOutput.Reshape({attnOutput.dims[1], attnOutput.dims[2], attnOutput.dims[3]});
PermuteSelf(attnOutput, {1, 0, 2});
attnOutput.Reshape({seqlen, batch, -1});
PermuteSelf(attnOutput, {1, 0, 2});
std::string proj_weight_name = "transformer.h." + std::to_string(i) + ".attn.c_proj.weight";
Linear(attnOutput, weight[proj_weight_name], Data(), attnLastOutput);
AddTo(hiddenStates, attnLastOutput);
std::string ln_2_name = "transformer.h." + std::to_string(i) + ".ln_2.weight";
RMSNorm(hiddenStates, weight[ln_2_name], 1e-6, attnInput);
std::string mlp_w1_weight_name = "transformer.h." + std::to_string(i) + ".mlp.w1.weight";
std::string mlp_w2_weight_name = "transformer.h." + std::to_string(i) + ".mlp.w2.weight";
std::string mlp_proj_weight_name = "transformer.h." + std::to_string(i) + ".mlp.c_proj.weight";
Linear(attnInput, weight[mlp_w1_weight_name], Data(), a1);
Linear(attnInput, weight[mlp_w2_weight_name], Data(), a2);
Silu(a2, a2);
MulTo(a1, a2);
Linear(a1, weight[mlp_proj_weight_name], Data(), mlpOutput);
AddTo(hiddenStates, mlpOutput);
}
RMSNorm(hiddenStates, weight["transformer.ln_f.weight"], 1e-6, hiddenStates);
Data logits, topk;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
std::vector <int> lastRet;
int total = 0;
Data curLogitTemp, curLogit;
for (int b = 0; b < batch; b++) {
Split(logits, 0, b, b + 1, curLogitTemp);
Split(curLogitTemp, 1, maxLen - 1, maxLen, curLogit);
if (generationConfig.output_logits && retLogits != nullptr && (*retLogits)[b] != nullptr) {
curLogit.ToDevice(DataDevice::CPU);
(*retLogits)[b]->resize(curLogit.Count(0));
memcpy((float*)(*retLogits)[b]->data(), (float*)curLogit.cpuData, curLogit.GetBytes());
}
if (generationConfig.IsSimpleGreedy()) {
Data topk;
TopK(curLogit, topk, 1);
topk.ToDevice(DataDevice::CPU);
lastRet.push_back((int) (((float *) topk.cpuData)[0] + 1e-3));
} else {
lastRet.push_back(LLMSampling(curLogit, 0, generationConfig, lastTokens.units[b]));
}
total += maxLen;
}
return lastRet;
}
std::vector <int> QWenModel::ForwardBatch(int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens,
std::vector <std::vector <float>*> *retLogits) {
int maxLen = inputIds.dims[1];
Data hiddenStates;
Data attnInput, attnOutput;
Data query, key, value;
Data attnWeights, attnLastOutput;
Data a1, a2, mlpOutput;
Embedding(inputIds, this->weight["transformer.wte.weight"], hiddenStates);
for (int i = 0; i < this->block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
std::string ln_1_name = "transformer.h." + std::to_string(i) + ".ln_1.weight";
std::string attn_weight_name = "transformer.h." + std::to_string(i) + ".attn.c_attn.weight";
std::string attn_bias_name = "transformer.h." + std::to_string(i) + ".attn.c_attn.bias";
RMSNorm(hiddenStates, weight[ln_1_name], 1e-6, attnInput);
Linear(attnInput, weight[attn_weight_name], weight[attn_bias_name], attnOutput); // attnOutput [batch, seqlen, embed_dim * 3]
Split(attnOutput, 2, 0, embed_dim, query);
Split(attnOutput, 2, embed_dim, 2 * embed_dim, key);
Split(attnOutput, 2, embed_dim * 2, embed_dim * 3, value);
std::vector<Data> curKs, curVs, curQs;
curKs.resize(batch);
curVs.resize(batch);
curQs.resize(batch);
int total = 0;
for (int b = 0; b < batch; b++) {
Split(query, 1, total, total + seqLens[b], curQs[b]);
Split(key, 1, total, total + seqLens[b], curKs[b]);
Split(value, 1, total, total + seqLens[b], curVs[b]);
total += seqLens[b];
}
Data attnOutputAll = Data(DataType::FLOAT32);
for (int b = 0; b < batch; b++) {
// in this loop, batch = 1
auto &query = curQs[b];
auto &key = curKs[b];
auto &value = curVs[b];
query.Reshape({1, seqLens[b], num_attention_heads, head_dim});
key.Reshape({1, seqLens[b], num_attention_heads, head_dim});
value.Reshape({1, seqLens[b], num_attention_heads, head_dim});
Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt + i].second;
if (pastKey.dims.empty()) {
// Compute new_ntk_alpha for this prompt length
float context_value = std::log2((float) seqLens[b] / seq_length) + 1;
float new_ntk_alpha = std::max(std::pow(2, std::ceil(context_value) - 1), 1.);
if (new_ntk_alpha != ntk_alpha) {
UpdateRotaryPosEmb(new_ntk_alpha);
}
}
LlamaRotatePosition2D(query, *positionIds[b], sinData, cosData, rotary_dim);
LlamaRotatePosition2D(key, *positionIds[b], sinData, cosData, rotary_dim);
if (use_log_attn) {
ApplyLognAttn(query, logn_list, *positionIds[b]);
}
PermuteSelf(query, {0, 2, 1, 3});
PermuteSelf(key, {0, 2, 1, 3});
PermuteSelf(value, {0, 2, 1, 3});
std::vector<int> qkvSize = {num_attention_heads, seqLens[b], -1};
query.Reshape(qkvSize);
key.Reshape(qkvSize);
value.Reshape(qkvSize);
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 && (pastKey.expansionDims.size() == 0 || key.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + key.dims[1] > pastKey.expansionDims[1])) {
std::vector <int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector <int> {key.dims[0], ((key.dims[1] - 1) / unitLen + 1) * unitLen, key.dims[2]};
} else {
newDims = pastKey.dims;
newDims[1] += ((key.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastKey.Expansion(newDims);
}
while ((pastValue.dims.size() == 0 && (pastValue.expansionDims.size() == 0 || value.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + value.dims[1] > pastValue.expansionDims[1])) {
std::vector <int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector <int> {value.dims[0], ((value.dims[1] - 1) / unitLen + 1) * unitLen, value.dims[2]};
} else {
newDims = pastValue.dims;
newDims[1] += ((value.dims[1] - 1) / unitLen + 1) * unitLen;
}
pastValue.Expansion(newDims);
}
CatDirect(pastKey, key, 1);
CatDirect(pastValue, value, 1);
MatMulTransB(query, pastKey, attnWeights, 1.0 / sqrt(head_dim));
attnWeights.Reshape({1, attnWeights.dims[0], attnWeights.dims[1], attnWeights.dims[2]});
if (attentionMask[b]) {
AttentionMask(attnWeights, *attentionMask[b], -10000);
}
Softmax(attnWeights, attnWeights, -1);
MatMul(attnWeights, pastValue, attnOutput);
attnOutput.Reshape({attnOutput.dims[1], attnOutput.dims[2], attnOutput.dims[3]});
PermuteSelf(attnOutput, {1, 0, 2});
attnOutput.Reshape({seqLens[b], 1, -1});
PermuteSelf(attnOutput, {1, 0, 2});
if (attnOutputAll.dims.size() == 0) {
std::vector <int> dims = attnOutput.dims;
dims[1] = total;
attnOutputAll.Expansion(dims);
}
CatDirect(attnOutputAll, attnOutput, 1);
}
std::string proj_weight_name = "transformer.h." + std::to_string(i) + ".attn.c_proj.weight";
Linear(attnOutputAll, weight[proj_weight_name], Data(), attnLastOutput);
AddTo(hiddenStates, attnLastOutput);
std::string ln_2_name = "transformer.h." + std::to_string(i) + ".ln_2.weight";
RMSNorm(hiddenStates, weight[ln_2_name], 1e-6, attnInput);
std::string mlp_w1_weight_name = "transformer.h." + std::to_string(i) + ".mlp.w1.weight";
std::string mlp_w2_weight_name = "transformer.h." + std::to_string(i) + ".mlp.w2.weight";
std::string mlp_proj_weight_name = "transformer.h." + std::to_string(i) + ".mlp.c_proj.weight";
Linear(attnInput, weight[mlp_w1_weight_name], Data(), a1);
Linear(attnInput, weight[mlp_w2_weight_name], Data(), a2);
Silu(a2, a2);
MulTo(a1, a2);
Linear(a1, weight[mlp_proj_weight_name], Data(), mlpOutput);
AddTo(hiddenStates, mlpOutput);
}
RMSNorm(hiddenStates, weight["transformer.ln_f.weight"], 1e-6, hiddenStates);
Data logits;
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
std::vector <int> lastRet;
int total = 0;
Data curLogit;
for (int b = 0; b < batch; b++) {
Split(logits, 1, total + seqLens[b] - 1, total + seqLens[b], curLogit);
if (generationConfigs[b].output_logits && retLogits != nullptr && (*retLogits)[b] != nullptr) {
curLogit.ToDevice(DataDevice::CPU);
(*retLogits)[b]->resize(curLogit.Count(0));
memcpy((float*)(*retLogits)[b]->data(), (float*)curLogit.cpuData, curLogit.GetBytes());
}
if (generationConfigs[b].IsSimpleGreedy()) {
Data topk;
TopK(curLogit, topk, 1);
topk.ToDevice(DataDevice::CPU);
lastRet.push_back((int) (((float *) topk.cpuData)[0] + 1e-3));
} else {
lastRet.push_back(LLMSampling(curLogit, 0, generationConfigs[b], lastTokens.units[b]));
}
total += seqLens[b];
}
return lastRet;
}
std::string QWenModel::MakeInput(const std::string &history, int round, const std::string &input) {
if (weight.dicts["chat_format"] == "chatml") {
return (round == 0 ? im_start + "system" + "\n" + pre_prompt + im_end : history) +
"\n" + im_start + user_role + "\n" + input + im_end + "\n" + im_start + bot_role + "\n";
} else if (weight.dicts["chat_format"] == "raw") {
return history + input;
} else {
ErrorInFastLLM("Unknown char_format for QWen: " + weight.dicts["chat_format"]);
return "";
}
}
std::string QWenModel::MakeHistory(const std::string &history, int round,
const std::string &input, const std::string &output) {
if (weight.dicts["chat_format"] == "chatml") {
return (round == 0 ? im_start + "system" + "\n" + pre_prompt + im_end : history) +
"\n" + im_start + user_role + "\n" + input + im_end + "\n" + im_start + bot_role + "\n" + output + im_end;
} else if (weight.dicts["chat_format"] == "raw") {
return history + input + output;
} else {
ErrorInFastLLM("Unknown char_format for QWen: " + weight.dicts["chat_format"]);
return "";
}
}
void QWenModel::FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds) {
int index = params.find("index")->second;
int promptLen = params.find("promptLen")->second;
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
if (index == 0) {
int seqLen = inputTokens[0].size();
std::vector <float> vmask = std::vector <float> (seqLen * seqLen, 0);
std::vector<float> vpids = std::vector<float>(seqLen, 0);
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, inputTokens[0]));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {seqLen, seqLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, vpids));
} else {
inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, inputTokens[0]));
attentionMask.CopyFrom(Data());
positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float) (promptLen + index - 1)}));
}
}
void QWenModel::FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds) {
int batch = inputTokens.size();
int index = params[0].find("index")->second;
int promptLen = params[0].find("promptLen")->second;
inputIds.ToDevice(DataDevice::CPU);
attentionMask.ToDevice(DataDevice::CPU);
positionIds.ToDevice(DataDevice::CPU);
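// index == 0 is the prefill step: build full-prompt inputs with a causal mask; later steps feed one token per sequence with an empty mask, relying on the KV cache.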
if (index == 0) {
int seqLen = inputTokens[0].size();
std::vector<float> ids = std::vector<float>(batch * seqLen, 0);
std::vector <float> vmask = std::vector <float> (batch * seqLen * seqLen, 0);
std::vector<float> vpids = std::vector<float>(batch * seqLen, 0);
for (int b = 0; b < batch; b++) {
for (int i = 0; i < seqLen; i++) {
ids[b * seqLen + i] = inputTokens[b][i];
}
}
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
}
}
for (int b = 1; b < batch; b++) {
memcpy(vmask.data() + b * seqLen * seqLen, vmask.data(), seqLen * seqLen * sizeof(float));
memcpy(vpids.data() + b * seqLen, vpids.data(), seqLen * sizeof(float));
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen}, ids));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen, seqLen}, vmask));
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen}, vpids));
} else {
std::vector<float> ids = std::vector<float>(batch * 1, 0);
std::vector<float> vpids = std::vector<float>(batch * 1, 0);
for (int b = 0; b < batch; b++) {
ids[b] = inputTokens[b][0];
vpids[b] = (float) (promptLen + index - 1);
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, ids));
attentionMask.CopyFrom(Data());
positionIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, vpids));
}
}
void QWenModel::WarmUp() {
printf("Warmup...\n");
Data inputIds = Data(DataType::FLOAT32, {1, 1}, {1});
Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
Data positionIds = Data(DataType::FLOAT32, {1, 1}, {0, 0});
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
Forward(inputIds, attentionMask, positionIds, pastKeyValues);
#ifdef USE_TFACC40T
FastllmTfaccReleaseTempMemory();
#endif
printf("finish.\n");
}
void QWenModel::UpdateRotaryPosEmb(float ntk_alpha) {
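// NTK-aware scaling: raising the RoPE base by ntk_alpha^(rotary_dim / (rotary_dim - 2)) stretches the rotary period so longer contexts stay within the trained frequency range.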
float base = 10000 * pow(ntk_alpha, (float) rotary_dim / (rotary_dim - 2));
if (sin.empty() || cos.empty()) {
sin.resize(max_positions);
cos.resize(max_positions);
}
std::vector <float> invFreq;
for (int i = 0; i < rotary_dim; i += 2) {
invFreq.push_back(1.0 / pow(base, (float)i / rotary_dim));
}
for (int i = 0; i < max_positions; i++) {
sin[i].resize(rotary_dim);
cos[i].resize(rotary_dim);
for (int j = 0; j < invFreq.size(); j++) {
sin[i][j] = ::sin((float)i * invFreq[j]);
cos[i][j] = ::cos((float)i * invFreq[j]);
}
}
std::vector <float> fsin, fcos;
for (int i = 0; i < sin.size(); i++) {
for (int j = 0; j < sin[0].size(); j++) {
fsin.push_back(sin[i][j]);
fcos.push_back(cos[i][j]);
}
}
sinData.ToDevice(DataDevice::CPU);
cosData.ToDevice(DataDevice::CPU);
sinData.CopyFrom(Data(DataType::FLOAT32, {(int)this->sin.size(), (int)this->sin[0].size()}, fsin));
cosData.CopyFrom(Data(DataType::FLOAT32, {(int)this->cos.size(), (int)this->cos[0].size()}, fcos));
}
}
#include "model.h"
#include "factoryllm.h"
#ifdef PY_API
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/chrono.h>
#include <pybind11/functional.h>
#include <unordered_map>
namespace py = pybind11;
using namespace pybind11::literals;
// template <typename... Args>
// using overload_cast_ = pybind11::detail::overload_cast_impl<Args...>;
using pastKV = std::vector<std::pair<fastllm::Data,fastllm::Data>>;
// PYBIND11_MAKE_OPAQUE(std::vector<std::pair<fastllm::Data,fastllm::Data>>);
PYBIND11_MAKE_OPAQUE(fastllm::Data);
PYBIND11_MODULE(pyfastllm, m) {
m.doc() = "fastllm python bindings";
py::class_<fastllm::GenerationConfig>(m, "GenerationConfig")
.def(py::init<>())
.def_readwrite("max_length", &fastllm::GenerationConfig::output_token_limit)
.def_readwrite("last_n", &fastllm::GenerationConfig::last_n)
.def_readwrite("repeat_penalty", &fastllm::GenerationConfig::repeat_penalty)
.def_readwrite("top_k", &fastllm::GenerationConfig::top_k)
.def_readwrite("top_p", &fastllm::GenerationConfig::top_p)
.def_readwrite("temperature", &fastllm::GenerationConfig::temperature)
.def_readwrite("enable_hash_id", &fastllm::GenerationConfig::enable_hash_id)
.def("is_simple_greedy", &fastllm::GenerationConfig::IsSimpleGreedy);
// high level
m.def("set_threads", &fastllm::SetThreads)
.def("get_threads", &fastllm::GetThreads)
.def("set_low_memory", &fastllm::SetLowMemMode)
.def("get_low_memory", &fastllm::GetLowMemMode)
.def("set_kv_cache", &fastllm::SetKVCacheInCPU)
.def("get_kv_cache", &fastllm::GetKVCacheInCPU)
.def("set_device_map", &fastllm::SetDeviceMap)
.def("create_llm", &fastllm::CreateLLMModelFromFile);
m.def("std_hash", [](std::string input) -> size_t {
return std::hash<std::string>{}(input);
});
// low level
m.def("get_llm_type", &fastllm::GetModelTypeFromFile);
py::enum_<fastllm::DataType>(m, "Dtype")
.value("float32", fastllm::DataType::FLOAT32)
.value("bfloat16", fastllm::DataType::BFLOAT16)
.value("int16", fastllm::DataType::INT16)
.value("int8", fastllm::DataType::INT8)
.value("int4", fastllm::DataType::INT4)
.value("int2", fastllm::DataType::INT2)
.value("float16", fastllm::DataType::FLOAT16)
.value("bit", fastllm::DataType::BIT)
.value("int32param", fastllm::DataType::INT32PARAM)
.export_values();
py::class_<fastllm::Data>(m, "Tensor")
.def_readonly("dims", &fastllm::Data::dims)
.def(py::init<>())
.def(py::init<fastllm::DataType>())
.def(py::init<fastllm::DataType, const std::vector<int>&>())
.def(py::init<fastllm::DataType, const std::vector<int>&, const std::vector<float>&>())
.def(py::init<fastllm::Data>())
.def("copy_from", &fastllm::Data::CopyFrom)
.def("count", &fastllm::Data::Count)
.def("to_list", [](fastllm::Data& data){
std::vector <float> vecData;
for (int i = 0; i < data.Count(0); i++) {
vecData.push_back(((float*)data.cpuData)[i]);
}
return vecData;
})
.def("print", &fastllm::Data::Print)
.def("to", static_cast<void (fastllm::Data::*)(void *device)>(&fastllm::Data::ToDevice));
m.def("zeros", [](const std::vector<int> &dims, fastllm::DataType dtype)->fastllm::Data {
int nums = 1;
for (auto dim:dims){nums *= dim; }
std::vector<float>zero_data(nums, 0);
auto data = fastllm::Data(dtype, dims, zero_data);
return data;
}, py::arg("dims"), py::arg("dtype"));
m.def("cat", [](std::vector<fastllm::Data> datas, int dim)->fastllm::Data {
// int pos_dim = 0;
// // dim check
// for (int i=0;i<datas[0].dims.size();i++){
// int cur_dim = datas[0].dims[i];
// for (auto data:datas){
// if (i == dim){
// pos_dim += data.dims[i];
// continue;
// }
// if (data.dims[i] != cur_dim){
// std::cout<<"dim not the same!!!"<<std::endl;
// return fastllm::Data();
// }
// }
// }
// auto newDims = datas[0].dims;
// newDims[dim] = pos_dim;
// TODO use memcpy cp data
// TODO add different dim cat
std::vector <float> vecData;
for (auto data:datas){
for (int i = 0; i < data.Count(0); i++) {
vecData.push_back(((float*)data.cpuData)[i]);
}
}
int seqLen = vecData.size();
return fastllm::Data(fastllm::DataType::FLOAT32, {1, seqLen}, vecData);
});
py::class_<fastllm::Tokenizer>(m, "Tokenizer")
.def("encode", &fastllm::Tokenizer::Encode)
// .def("decode", &fastllm::Tokenizer::Decode)
.def("decode", &fastllm::Tokenizer::Decode, "Decode from Tensor")
.def("decode", &fastllm::Tokenizer::DecodeTokens, "Decode from Vector")
.def("decode_byte", [](fastllm::Tokenizer &tokenizer, const fastllm::Data &data){
std::string ret = tokenizer.Decode(data);
return py::bytes(ret);
})
.def("decode_byte", [](fastllm::Tokenizer &tokenizer, const std::vector<int>& data){
std::string ret = tokenizer.DecodeTokens(data);
return py::bytes(ret);
})
.def("clear", &fastllm::Tokenizer::Clear)
.def("insert", &fastllm::Tokenizer::Insert);
py::class_<fastllm::WeightMap>(m, "WeightMap")
.def_readonly("tokenizer", &fastllm::WeightMap::tokenizer)
.def("save_lowbit", &fastllm::WeightMap::SaveLowBitModel)
.def("set_kv", &fastllm::WeightMap::AddDict)
.def("set_weight", &fastllm::WeightMap::AddWeight)
.def("__getitem__", [](fastllm::WeightMap &weight, std::string key){
return weight[key]; });
// model classes
py::class_<fastllm::basellm>(m, "basellm");
py::class_<fastllm::ChatGLMModel, fastllm::basellm>(m, "ChatGLMModel")
.def(py::init<>())
.def_readonly("model_type", &fastllm::ChatGLMModel::model_type)
.def_readonly("weight", &fastllm::ChatGLMModel::weight)
.def_readonly("block_cnt", &fastllm::ChatGLMModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::ChatGLMModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::ChatGLMModel::eos_token_id)
.def("load_weights", &fastllm::ChatGLMModel::LoadFromFile)
.def("make_input", &fastllm::ChatGLMModel::MakeInput)
.def("make_history", &fastllm::ChatGLMModel::MakeHistory)
.def("response", &fastllm::ChatGLMModel::Response)
.def("batch_response", [](fastllm::ChatGLMModel &model,
const std::vector <std::string> &inputs,
RuntimeResultBatch retCb,
fastllm::GenerationConfig config)->std::vector<std::string> {
std::vector <std::string> outputs;
model.ResponseBatch(inputs, outputs, retCb, config);
return outputs;
})
.def("warmup", &fastllm::ChatGLMModel::WarmUp)
.def("forward",
[](fastllm::ChatGLMModel &model,
const fastllm::Data &inputIds,
const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<fastllm::Data, fastllm::Data>> &pastKeyValues,
const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) {
int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
return std::make_tuple(retV, pastKeyValues);
})
.def("launch_response", &fastllm::ChatGLMModel::LaunchResponseTokens)
.def("fetch_response", &fastllm::ChatGLMModel::FetchResponseTokens)
.def("save_lowbit_model", &fastllm::ChatGLMModel::SaveLowBitModel)
.def("make_input", &fastllm::ChatGLMModel::MakeInput);
py::class_<fastllm::MOSSModel, fastllm::basellm>(m, "MOSSModel")
.def(py::init<>())
.def_readonly("model_type", &fastllm::MOSSModel::model_type)
.def_readonly("weight", &fastllm::MOSSModel::weight)
.def_readonly("block_cnt", &fastllm::MOSSModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::MOSSModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::MOSSModel::eos_token_id)
.def("load_weights", &fastllm::MOSSModel::LoadFromFile)
.def("make_input", &fastllm::MOSSModel::MakeInput)
.def("make_history", &fastllm::MOSSModel::MakeHistory)
.def("response", &fastllm::MOSSModel::Response)
.def("batch_response", [](fastllm::MOSSModel &model,
const std::vector <std::string> &inputs,
RuntimeResultBatch retCb,
fastllm::GenerationConfig config)->std::vector<std::string> {
std::vector <std::string> outputs;
model.ResponseBatch(inputs, outputs, retCb, config);
return outputs;
})
.def("forward",
[](fastllm::MOSSModel &model,
const fastllm::Data &inputIds,
const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<fastllm::Data, fastllm::Data>> &pastKeyValues,
const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) {
int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
return std::make_tuple(retV, pastKeyValues);
})
.def("launch_response", &fastllm::MOSSModel::LaunchResponseTokens)
.def("fetch_response", &fastllm::MOSSModel::FetchResponseTokens)
.def("save_lowbit_model", &fastllm::MOSSModel::SaveLowBitModel)
.def("make_input", &fastllm::MOSSModel::MakeInput);
py::class_<fastllm::LlamaModel, fastllm::basellm>(m, "LlamaModel")
.def(py::init<>())
.def_readonly("model_type", &fastllm::LlamaModel::model_type)
.def_readonly("weight", &fastllm::LlamaModel::weight)
.def_readonly("block_cnt", &fastllm::LlamaModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::LlamaModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::LlamaModel::eos_token_id)
.def("load_weights", &fastllm::LlamaModel::LoadFromFile)
.def("make_input", &fastllm::LlamaModel::MakeInput)
.def("make_history", &fastllm::LlamaModel::MakeHistory)
.def("response", &fastllm::LlamaModel::Response)
.def("batch_response", [](fastllm::LlamaModel &model,
const std::vector <std::string> &inputs,
RuntimeResultBatch retCb,
fastllm::GenerationConfig config)->std::vector<std::string> {
std::vector <std::string> outputs;
model.ResponseBatch(inputs, outputs, retCb, config);
return outputs;
})
.def("warmup", &fastllm::LlamaModel::WarmUp)
.def("forward",
[](fastllm::LlamaModel &model,
const fastllm::Data &inputIds,
const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<fastllm::Data, fastllm::Data>> &pastKeyValues,
const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) {
int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
return std::make_tuple(retV, pastKeyValues);
})
.def("launch_response", &fastllm::LlamaModel::LaunchResponseTokens)
.def("fetch_response", &fastllm::LlamaModel::FetchResponseTokens)
.def("save_lowbit_model", &fastllm::LlamaModel::SaveLowBitModel)
.def("make_input", &fastllm::LlamaModel::MakeInput);
py::class_<fastllm::QWenModel, fastllm::basellm>(m, "QWenModel")
.def(py::init<>())
.def_readonly("model_type", &fastllm::QWenModel::model_type)
.def_readonly("weight", &fastllm::QWenModel::weight)
.def_readonly("block_cnt", &fastllm::QWenModel::block_cnt)
.def_readonly("bos_token_id", &fastllm::QWenModel::bos_token_id)
.def_readonly("eos_token_id", &fastllm::QWenModel::eos_token_id)
.def("load_weights", &fastllm::QWenModel::LoadFromFile)
.def("make_input", &fastllm::QWenModel::MakeInput)
.def("make_history", &fastllm::QWenModel::MakeHistory)
.def("response", &fastllm::QWenModel::Response)
.def("batch_response", [](fastllm::QWenModel &model,
const std::vector <std::string> &inputs,
RuntimeResultBatch retCb,
fastllm::GenerationConfig config)->std::vector<std::string> {
std::vector <std::string> outputs;
model.ResponseBatch(inputs, outputs, retCb, config);
return outputs;
})
.def("warmup", &fastllm::QWenModel::WarmUp)
.def("forward",
[](fastllm::QWenModel &model,
const fastllm::Data &inputIds,
const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<fastllm::Data, fastllm::Data>> &pastKeyValues,
const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) {
int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
return std::make_tuple(retV, pastKeyValues);
})
.def("launch_response", &fastllm::QWenModel::LaunchResponseTokens)
.def("fetch_response", &fastllm::QWenModel::FetchResponseTokens)
.def("save_lowbit_model", &fastllm::QWenModel::SaveLowBitModel)
.def("make_input", &fastllm::QWenModel::MakeInput);
#ifdef VERSION_INFO
m.attr("__version__") = VERSION_INFO;
#else
m.attr("__version__") = "dev";
#endif
}
#endif
CMMLU is a comprehensive Chinese evaluation benchmark designed to measure a language model's knowledge and reasoning ability in Chinese contexts.
Project homepage: https://github.com/haonan-li/CMMLU
The chatglm.py script in this directory runs the benchmark through the fastllm framework.
The test steps are as follows:
- 1. Clone the CMMLU repository
``` sh
git clone https://github.com/haonan-li/CMMLU
```
- 2. Run the tests
```
# ChatGLM test script
# model_name_or_path can point to the official ChatGLM2-6b model or its int4 variant; dtype supports float16, int8, int4
python3 chatglm.py --model_name_or_path <path to model> --save_dir <path to save results> --dtype float16
# Baichuan-13B test script
# model_name_or_path can point to the official Baichuan13B-Base or Baichuan13B-Chat model; dtype supports float16, int8, int4
python3 baichuan.py --model_name_or_path <path to model> --save_dir <path to save results> --dtype float16
```
The dataset is large and a full run takes a long time; partial scores can be checked at any point with:
```
python3 eval.py <path to save results>
```
- 3. Reference results
| Model | Data precision | Shot | CMMLU score |
|-----------------------: |-------- |----------|-----------|
| ChatGLM2-6b-fp16 | float32 |0 | 50.16 |
| ChatGLM2-6b-int8 | float32 |0 | 50.14 |
| ChatGLM2-6b-int4 | float32 |0 | 49.63 |
| QWen-7b-Base-fp16 | float32 |0 | 57.43 |
| QWen-7b-Chat-fp16 | float32 |0 | 54.82 |
| Baichuan-13b-Base-int8 | float32 |5 | 55.12 |
| Baichuan-13b-Base-int4 | float32 |5 | 52.22 |
import os
import torch
import numpy as np
import argparse
from CMMLU.src.mp_utils import choices, format_example, gen_prompt, softmax, run_eval
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
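# Score one CMMLU subject: for each question, build the few-shot prompt, read the model's logits for the four option tokens, and predict the option with the highest logit.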
def eval(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot):
choice_ids = [tokenizer.convert_tokens_to_ids(choice) for choice in choices]
cors = []
all_conf = []
all_preds = []
answers = choices[: test_df.shape[1] - 2]
for i in range(test_df.shape[0]):
prompt_end = format_example(test_df, i, subject, include_answer=False)
prompt = gen_prompt(dev_df=dev_df,
subject=subject,
prompt_end=prompt_end,
num_few_shot=num_few_shot,
tokenizer=tokenizer,
max_length=max_length)
label = test_df.iloc[i, test_df.shape[1] - 1]
logits = model.response_logits(prompt, tokenizer=tokenizer)
sel = 0
for j in range(4):
    if logits[choice_ids[j]] > logits[choice_ids[sel]]:
        sel = j
pred = choices[sel]
conf = [logits[choice_ids[j]] for j in range(4)]
all_preds += pred
all_conf.append(conf)
cors.append(pred == label)
print(i, np.mean(cors))
acc = np.mean(cors)
print("Average accuracy {:.3f} - {}".format(acc, subject))
return acc, all_preds, all_conf
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default="")
parser.add_argument("--lora_weights", type=str, default="")
parser.add_argument("--data_dir", type=str, default="./CMMLU/data")
parser.add_argument("--save_dir", type=str, default="../results/not_specified")
parser.add_argument("--num_few_shot", type=int, default=0)
parser.add_argument("--max_length", type=int, default=2048)
parser.add_argument("--load_in_8bit", action='store_true')
parser.add_argument("--dtype", type=str, default="float16")
parser.add_argument("--with_conf", action='store_true')
parser.add_argument("--cot", action='store_true')
args = parser.parse_args()
# TODO: better handle
tokenizer_class = LlamaTokenizer if 'llama' in args.model_name_or_path else AutoTokenizer
model_class = LlamaForCausalLM if 'llama' in args.model_name_or_path else AutoModelForCausalLM
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, trust_remote_code=True)
model = model_class.from_pretrained(args.model_name_or_path,
trust_remote_code=True,
load_in_8bit=args.load_in_8bit,
torch_dtype=torch.float16,
device_map="cpu"
)
if args.lora_weights != "":
model = PeftModel.from_pretrained(
model,
args.lora_weights,
torch_dtype=torch.float16,
)
from fastllm_pytools import llm
model = llm.from_hf(model, tokenizer, dtype=args.dtype)
model.direct_query = True
run_eval(model, tokenizer, eval, args)
name_en2zh = {
"agronomy": "农学",
"anatomy": "解剖学",
"ancient_chinese": "古汉语",
"arts": "艺术学",
"astronomy": "天文学",
"business_ethics": "商业伦理",
"chinese_civil_service_exam": "中国公务员考试",
"chinese_driving_rule": "中国驾驶规则",
"chinese_food_culture": "中国饮食文化",
"chinese_foreign_policy": "中国外交政策",
"chinese_history":"中国历史",
"chinese_literature": "中国文学",
"chinese_teacher_qualification": "中国教师资格",
"clinical_knowledge": "临床知识",
"college_actuarial_science":"大学精算学",
"college_education":"大学教育学",
"college_engineering_hydrology": "大学工程水文学",
"college_law": "大学法律",
"college_mathematics": "大学数学",
"college_medical_statistics":"大学医学统计",
"college_medicine": "大学医学",
"computer_science": "计算机科学",
"computer_security": "计算机安全",
"conceptual_physics": "概念物理学",
"construction_project_management": "建设工程管理",
"economics": "经济学",
"education": "教育学",
"electrical_engineering": "电气工程",
"elementary_chinese":"小学语文",
"elementary_commonsense":"小学常识",
"elementary_information_and_technology": "小学信息技术",
"elementary_mathematics": "初等数学",
"ethnology": "民族学",
"food_science": "食品科学",
"genetics": "遗传学",
"global_facts": "全球事实",
"high_school_biology": "高中生物",
"high_school_chemistry": "高中化学",
"high_school_geography": "高中地理",
"high_school_mathematics": "高中数学",
"high_school_physics": "高中物理学",
"high_school_politics": "高中政治",
"human_sexuality": "人类性行为",
"international_law": "国际法学",
"journalism": "新闻学",
"jurisprudence": "法理学",
"legal_and_moral_basis": "法律与道德基础",
"logical": "逻辑学",
"machine_learning": "机器学习",
"management": "管理学",
"marketing": "市场营销",
"marxist_theory": "马克思主义理论",
"modern_chinese": "现代汉语",
"nutrition": "营养学",
"philosophy": "哲学",
"professional_accounting": "专业会计",
"professional_law": "专业法学",
"professional_medicine": "专业医学",
"professional_psychology": "专业心理学",
"public_relations": "公共关系",
"security_study":"安全研究",
"sociology": "社会学",
"sports_science": "体育学",
"traditional_chinese_medicine": "中医中药",
"virology": "病毒学",
"world_history":"世界历史",
"world_religions": "世界宗教",
}
subcategories = {
"agronomy": ['other'],
"anatomy": ['biology'],
"ancient_chinese": ['linguistics','china specific'],
"arts": ['arts'],
"astronomy": ['physics'],
"business_ethics": ['business'],
"chinese_civil_service_exam": ['politics','china specific'],
"chinese_driving_rule": ['other','china specific'],
"chinese_food_culture": ['culture','china specific'],
"chinese_foreign_policy": ['politics','china specific'],
"chinese_history":['history','china specific'],
"chinese_literature": ['literature','china specific'],
"chinese_teacher_qualification": ['education','china specific'],
"college_actuarial_science":['math'],
"college_education":['education'],
"college_engineering_hydrology": ['engineering'],
"college_law": ['law'],
"college_mathematics": ['math'],
"college_medical_statistics":['statistics'],
"clinical_knowledge": ['other'],
"college_medicine": ['other'],
"computer_science": ['computer science'],
"computer_security": ['other'],
"conceptual_physics": ['physics'],
"construction_project_management": ['other','china specific'],
"economics": ['economics'],
"education": ['education'],
"elementary_chinese":['linguistics','china specific'],
"elementary_commonsense":['other','china specific'],
"elementary_information_and_technology": ['other'],
"electrical_engineering": ['engineering'],
"elementary_mathematics": ['math'],
"ethnology": ['culture','china specific'],
"food_science": ['other'],
"genetics": ['biology'],
"global_facts": ['global'],
"high_school_biology": ['biology'],
"high_school_chemistry": ['chemistry'],
"high_school_geography": ['geography'],
"high_school_mathematics": ['math'],
"high_school_physics": ['physics'],
"high_school_politics": ['politics','china specific'],
"human_sexuality": ['other'],
"international_law": ['law'],
"journalism": ['sociology'],
"jurisprudence": ['law'],
"legal_and_moral_basis": ['other'],
"logical": ['philosophy'],
"machine_learning": ['computer science'],
"management": ['business'],
"marketing": ['business'],
"marxist_theory": ['philosophy'],
"modern_chinese": ['linguistics','china specific'],
"nutrition": ['other'],
"philosophy": ['philosophy'],
"professional_accounting": ['business'],
"professional_law": ['law'],
"professional_medicine": ['other'],
"professional_psychology": ['psychology'],
"public_relations": ['politics'],
"security_study": ['politics'],
"sociology": ['culture'],
"sports_science": ['other'],
"traditional_chinese_medicine": ['other','china specific'],
"virology": ['biology'],
"world_history":['history'],
"world_religions": ['global'],
}
categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
"Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
"Social Science": ['linguistics',"business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
"Other":["other"],
"China specific": ["china specific"],
}
import os
import torch
import numpy as np
import argparse
from CMMLU.src.mp_utils import choices, format_example, gen_prompt, softmax, run_eval
from transformers import AutoModel, AutoTokenizer
import threading
def chat(model, tokenizer, prompt, output_list, idx):
pred, history = model.chat(tokenizer, prompt, history=[], max_length = 5)
if not pred or pred[0] not in choices:
pred, history = model.chat(tokenizer, prompt, history=[], max_length = 1000)
output_list[idx] = pred
def eval_chat_multithread(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot):
cors = []
all_preds = []
answers = choices[: test_df.shape[1] - 2]
batch_num = 64
output_list = ["" for i in range(test_df.shape[0])]
ths = [None for i in range(test_df.shape[0])]
for j in range(0, test_df.shape[0], batch_num):
cur_len = min(test_df.shape[0] - j, batch_num)
for i in range(j, j + cur_len):
prompt_end = format_example(test_df, i, subject, include_answer=False, cot=cot)
prompt = gen_prompt(dev_df=dev_df,
subject=subject,
prompt_end=prompt_end,
num_few_shot=num_few_shot,
tokenizer=tokenizer,
max_length=max_length,
cot=cot)
ths[i] = threading.Thread(target = chat, args=(model, tokenizer, prompt, output_list, i))
ths[i].start()
for i in range(j, j + cur_len):
ths[i].join()
pred = output_list[i]
label = test_df.iloc[i, test_df.shape[1] - 1]
if pred and pred[0] in choices:
cors.append(pred[0] == label)
all_preds.append(pred.replace("\n", ""))
print(i, test_df.shape[0], np.mean(cors))
acc = np.mean(cors)
print("Average accuracy {:.3f} - {}".format(acc, subject))
print("{} results, {} inappropriate formated answers.".format(len(cors), len(all_preds)-len(cors)))
return acc, all_preds, None
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default="")
parser.add_argument("--lora_weights", type=str, default="")
parser.add_argument("--data_dir", type=str, default="./CMMLU/data")
parser.add_argument("--save_dir", type=str, default="./results/ChatGLM2-6B")
parser.add_argument("--num_few_shot", type=int, default=0)
parser.add_argument("--max_length", type=int, default=2048)
parser.add_argument("--dtype", type=str, default="float16")
parser.add_argument("--cot", action='store_true')
args = parser.parse_args()
# Initialize models
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True,)
model = AutoModel.from_pretrained(args.model_name_or_path, trust_remote_code=True).cpu()
from fastllm_pytools import llm;
model = llm.from_hf(model, tokenizer, dtype = args.dtype);
# model.save("/root/test.flm");
# Always use Chat-style evaluation
run_eval(model, tokenizer, eval_chat_multithread, args)
import CMMLU.src.mp_utils as mp
import sys
print(mp.get_results(sys.argv[1]))
import os
import torch
import numpy as np
import argparse
import threading
from CMMLU.src.mp_utils import choices, format_example, gen_prompt, softmax, run_eval
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
def chat(model, tokenizer, prompt, output_list, idx):
pred, history = model.chat(tokenizer, prompt, history=[], max_length = 5)
if not pred or pred[0] not in choices:
pred, history = model.chat(tokenizer, prompt, history=[], max_length = 1000)
output_list[idx] = pred
def eval_chat_multithread(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot):
cors = []
all_preds = []
answers = choices[: test_df.shape[1] - 2]
batch_num = 1
output_list = ["" for i in range(test_df.shape[0])]
ths = [None for i in range(test_df.shape[0])]
for j in range(0, test_df.shape[0], batch_num):
cur_len = min(test_df.shape[0] - j, batch_num)
for i in range(j, j + cur_len):
prompt_end = format_example(test_df, i, subject, include_answer=False, cot=cot)
prompt = gen_prompt(dev_df=dev_df,
subject=subject,
prompt_end=prompt_end,
num_few_shot=num_few_shot,
tokenizer=tokenizer,
max_length=max_length,
cot=cot)
ths[i] = threading.Thread(target = chat, args=(model, tokenizer, prompt, output_list, i))
ths[i].start()
for i in range(j, j + cur_len):
ths[i].join()
pred = output_list[i]
label = test_df.iloc[i, test_df.shape[1] - 1]
if pred and pred[0] in choices:
cors.append(pred[0] == label)
all_preds.append(pred.replace("\n", ""))
print(i, test_df.shape[0], np.mean(cors))
acc = np.mean(cors)
print("Average accuracy {:.3f} - {}".format(acc, subject))
print("{} results, {} inappropriate formated answers.".format(len(cors), len(all_preds)-len(cors)))
return acc, all_preds, None
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default="")
parser.add_argument("--lora_weights", type=str, default="")
parser.add_argument("--data_dir", type=str, default="./CMMLU/data")
parser.add_argument("--save_dir", type=str, default="../results/not_specified")
parser.add_argument("--num_few_shot", type=int, default=0)
parser.add_argument("--max_length", type=int, default=2048)
parser.add_argument("--load_in_8bit", action='store_true')
parser.add_argument("--dtype", type=str, default="float16")
parser.add_argument("--with_conf", action='store_true')
parser.add_argument("--cot", action='store_true')
args = parser.parse_args()
# TODO: handle model and tokenizer loading more robustly
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, device_map="cpu", trust_remote_code=True, fp16=True).eval()
model.generation_config = GenerationConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
if args.lora_weights != "":
model = PeftModel.from_pretrained(
model,
args.lora_weights,
torch_dtype=torch.float16,
)
from fastllm_pytools import llm
model = llm.from_hf(model, tokenizer, dtype = args.dtype)
model.direct_query = True
run_eval(model, tokenizer, eval_chat_multithread, args)
__all__ = ["llm"]
from fastllm_pytools import llm
import torch
import ctypes
import numpy as np
fastllm_data_type_dict = {
"int4": 8,
"int8": 3,
"float16": 7
}
fastllm_weight_type_dict = {
"linear": 1,
"embedding": 2,
"QuantizedLinear": 111
}
def create(model,
tokenizer = None,
pre_prompt = None,
user_role = None,
bot_role = None,
history_sep = None,
dtype = "float16"):
if (dtype not in fastllm_data_type_dict):
print("dtype should in ", list(fastllm_data_type_dict.keys()))
exit(0)
# 0.1 model info
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if (pre_prompt):
modelInfo["pre_prompt"] = pre_prompt
if (user_role):
modelInfo["user_role"] = user_role
if (bot_role):
modelInfo["bot_role"] = bot_role
if (history_sep):
modelInfo["history_sep"] = history_sep
if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
# Baichuan 2 (second generation)
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""
if (modelInfo["model_type"] == "qwen"):
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
weight_type_dict = {}
module_dict = {}
weight_bits = {}
for key, m in model.named_modules():
if (str(type(m)).find("QuantizedLinear") != -1):
weight_type_dict[key + ".weight"] = "QuantizedLinear"
weight_bits[key + ".weight"] = m.weight_bit_width
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
peft_config = {}
active_adapter = ""
if hasattr(model, "peft_config"):
peft_config = model.peft_config
if hasattr(model, "active_adapter"):
active_adapter = model.active_adapter
model = model.cpu()
dict = model.state_dict()
model_type = model.config.__dict__["model_type"]
model = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
for it in modelInfo.keys():
llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode())
for adapter_name in peft_config.keys():
adapter_dict = peft_config[adapter_name].__dict__
for it in adapter_dict.keys():
llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode())
if len(active_adapter) != 0:
llm.fastllm_lib.set_adapter(model, str(active_adapter).encode())
# 1. vocab
if (tokenizer):
if (hasattr(tokenizer, "tokenizer")):
if modelInfo["model_type"] == "qwen":
pass
else:
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
for i in range(piece_size):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
i, ctypes.c_float(tokenizer.sp_model.get_score(i)))
else:
vocab = tokenizer.get_vocab()
for v in vocab.keys():
if (modelInfo["model_type"] == "moss"):
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0))
elif (modelInfo["model_type"] == "qwen"):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0))
else:
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0))
tot = 0
for key in dict:
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7
ori_np_data_type = np.float16
elif (cur_weight_type == 2):
# TODO bfloat
to_data_type = 0
weight_name = key
if peft_config is not None:
weight_name = weight_name.replace('base_model.model.', '')
if (cur_weight_type == 111):
llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
weight_bits[key],
dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
dict[key].numpy().ctypes.data_as(ctypes.c_void_p))
else:
llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
to_data_type, cur_weight_type, ori_data_type,
dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
tot += 1
print("convert (", tot, "/", len(dict), end = " )\r")
print("")
llm.fastllm_lib.init_params_llm_model(model)
llm.fastllm_lib.warmup_llm_model(model)
ret = llm.model("", id = model)
return ret
import ctypes
import os
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
import platform
if platform.system() == 'Windows':
fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"))
else:
fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.so"))
fastllm_lib.create_llm_model.argtypes = [ctypes.c_char_p]
fastllm_lib.create_llm_model.restype = ctypes.c_int
fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_logits_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_float)]
fastllm_lib.fetch_response_logits_llm_model.restype = ctypes.c_int
fastllm_lib.response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_str_llm_model.restype = ctypes.c_char_p
fastllm_lib.launch_response_str_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_str_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_str_llm_model.restype = ctypes.c_char_p
fastllm_lib.make_history_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
fastllm_lib.make_history_llm_model.restype = ctypes.c_char_p
fastllm_lib.make_input_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p]
fastllm_lib.make_input_llm_model.restype = ctypes.c_char_p
fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_float, ctypes.c_int]
fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
def set_cpu_threads(threads: int):
fastllm_lib.set_cpu_threads(threads)
def get_cpu_threads() -> int:
return fastllm_lib.get_cpu_threads()
def print_ins_info():
fastllm_lib.print_cpu_ins()
def set_cpu_kvcache(cpu_kvcache):
fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))
def get_cpu_kvcache():
return fastllm_lib.get_kvcache_in_cpu()
def set_cpu_low_mem(low_mem):
fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))
def get_cpu_low_mem():
return fastllm_lib.get_cpu_low_mem()
def set_device_map(device_map):
devices = []
values = []
if (isinstance(device_map, str)):
devices.append(device_map)
values.append(1)
elif (isinstance(device_map, list)):
devices = [str(x) for x in device_map]
values = [1 for x in device_map]
elif (isinstance(device_map, dict)):
devices = [str(x) for x in device_map.keys()]
values = [int(device_map[x]) for x in device_map.keys()]
else:
print("set_device_map error.")
return
device_str = ''.join(devices)
device_len = [len(x) for x in devices]
fastllm_lib.set_device_map(len(device_len),
(ctypes.c_int * len(device_len))(*device_len),
device_str.encode(),
(ctypes.c_int * len(values))(*values))
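# Usage sketch for set_device_map (hedged: the device names below are assumptions,
# following the "cpu" / "cuda:N" convention used elsewhere in this repo):
#   set_device_map("cuda:0")                  # single device
#   set_device_map(["cuda:0", "cuda:1"])      # even split across listed devices
#   set_device_map({"cuda:0": 2, "cpu": 1})   # weighted split, 2:1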
def from_hf(model,
tokenizer = None,
dtype = "float16"):
from fastllm_pytools import hf_model
return hf_model.create(model, tokenizer, dtype = dtype)
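# Minimal conversion sketch (model name and dtype are only examples; this mirrors the
# evaluation scripts above):
#   from transformers import AutoModel, AutoTokenizer
#   from fastllm_pytools import llm
#   tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
#   hf_model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).cpu()
#   flm_model = llm.from_hf(hf_model, tokenizer, dtype="float16")   # "float16", "int8" or "int4"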
class model:
def __init__ (self, path : str,
id : int = -99999):
if (id != -99999):
self.model = id
else:
self.model = fastllm_lib.create_llm_model(path.encode())
self.direct_query = False
def get_prompt(self,
query: str,
history: List[Tuple[str, str]] = None) -> str:
if (not(history)):
history = []
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
return prompt
def save(self, path : str):
fastllm_lib.save_llm_model(self.model, path.encode())
def eval(self):
pass
def response_logits(self,
query: str,
history: List[Tuple[str, str]] = None,
tokenizer = None) -> str:
prompt = query if self.direct_query else self.get_prompt(query, history)
if (tokenizer == None):
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
else:
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
1, False, 1, 1, 1, 1, True)
vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
logits = list(range(vocab_size))
# the buffer is over-allocated (vocab_size * 4 floats); only the first vocab_size entries are read back
array = (ctypes.c_float * (vocab_size * 4))(*logits)
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
out = list(array)[:vocab_size]
# keep fetching until the handle reports completion (-1) so the next request starts clean
while (ret != -1):
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
return out
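# Usage sketch (this is how the CMMLU evaluation above scores multiple-choice answers;
# building choice_ids from the last encoded token is an assumption about the tokenizer):
#   logits = model.response_logits(prompt, tokenizer=tokenizer)
#   choice_ids = [tokenizer.encode(c)[-1] for c in "ABCD"]
#   pred = "ABCD"[max(range(4), key=lambda j: logits[choice_ids[j]])]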
def response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
ret = ""
for i in self.stream_response(query = query,
history = history,
max_length = max_length,
do_sample = do_sample,
top_p = top_p, top_k = top_k,
temperature = temperature,
repeat_penalty = repeat_penalty,
one_by_one = True):
ret += i
return ret
def stream_response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
one_by_one = True):
prompt = query if self.direct_query else self.get_prompt(query, history)
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
res = ""
ret = b''
fail_cnt = 0
while True:
ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
cur = ""
try:
cur = ret.decode()
ret = b''
except:
fail_cnt += 1
if (fail_cnt == 20):
break
else:
continue
fail_cnt = 0
if (cur == "<flmeos>"):
break
if one_by_one:
yield cur
else:
res += cur
yield res
def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
if (not(history)):
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False)
result = []
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
break
result.append(cur)
response = tokenizer.decode(result)
history = history + [(query, response)]
return response, history
def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
return_past_key_values = False, **kwargs) -> str:
if (not(history)):
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False)
tokens = []
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
break
tokens.append(cur)
response = tokenizer.decode(tokens)
new_history = history + [(query, response)]
if return_past_key_values:
yield response, new_history, None
else:
yield response, new_history
def set_adapter(self, name: str):
fastllm_lib.set_adapter(self.model, str(name).encode())
def disable_adapter(self):
fastllm_lib.disable_adapter(self.model)
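# Minimal usage sketch for an already-converted .flm file (the path is hypothetical):
#   from fastllm_pytools import llm
#   m = llm.model("chatglm2-6b-int8.flm")
#   for piece in m.stream_response("你好"):
#       print(piece, end="", flush=True)
#   # or, when the original HF tokenizer is available:
#   # response, history = m.chat(tokenizer, "你好")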
import struct
import numpy as np
import torch
def writeString(fo, s):
fo.write(struct.pack('i', len(s)))
fo.write(s.encode())
def writeKeyValue(fo, key, value):
writeString(fo, key)
writeString(fo, value)
fastllm_data_type_dict = {
"int4": 8,
"int8": 3,
"float16": 7,
"float32": 0,
}
fastllm_weight_type_dict = {
"linear": 1,
"embedding": 2
}
# leftover scratch illustrating the int8 rounding used in write_int8 below; not used by tofile()
v = np.random.randint(-127, 127, [10, 20])
temp = v
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
def write_int8(fo, v):
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
fo.write(struct.pack('i', 3))
fo.write(struct.pack('i', 0))
for i in range(c_max.shape[0]):
fo.write(struct.pack('f', -c_max[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
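# On-disk layout produced by write_int8 (read from the code above): type marker 3 (int8),
# a second int written as 0, then one (-max, +max) float pair per weight row, then the
# uint8 payload. Example: in a row whose largest |w| is 2.0, the value 1.0 maps to
# 1.0 / (2.0 / 127) + 128.5 = 192.0 and is stored as the byte 192.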
def write_int4(fo, v):
c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 7.0
c_min = c_scale * -8.0
v = (v - c_min) / c_scale
v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
v = v[:, 0::2] * 16 + v[:, 1::2]
fo.write(struct.pack('i', 8))
fo.write(struct.pack('i', 0))
for i in range(c_min.shape[0]):
fo.write(struct.pack('f', c_min[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
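# On-disk layout produced by write_int4 (read from the code above): type marker 8 (int4),
# a second int written as 0, then one (min, max) float pair per row, then codes packed two
# per byte (even column in the high nibble, odd column in the low nibble), which assumes
# an even number of columns per row.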
def tofile(exportPath,
model,
tokenizer = None,
pre_prompt = None,
user_role = None,
bot_role = None,
history_sep = None,
dtype = "float16"):
if (dtype not in fastllm_data_type_dict):
print("dtype should in ", list(fastllm_data_type_dict.keys()))
exit(0)
dict = model.state_dict()
fo = open(exportPath, "wb")
# 0. version id
fo.write(struct.pack('i', 2))
# 0.1 model info
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if ("model_type" not in modelInfo):
print("unknown model_type.")
exit(0)
if (pre_prompt):
modelInfo["pre_prompt"] = pre_prompt
if (user_role):
modelInfo["user_role"] = user_role
if (bot_role):
modelInfo["bot_role"] = bot_role
if (history_sep):
modelInfo["history_sep"] = history_sep
if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
# Baichuan 2 (second generation)
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""
if modelInfo["model_type"] == "qwen":
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
if hasattr(model, "peft_config"):
adapter_size = len(model.peft_config)
modelInfo["peft_size"] = adapter_size
fo.write(struct.pack('i', len(modelInfo)))
for it in modelInfo.keys():
writeKeyValue(fo, str(it), str(modelInfo[it]))
if hasattr(model, "peft_config"):
for adapter_name in model.peft_config.keys():
adapter_dict = model.peft_config[adapter_name].__dict__
writeString(fo, adapter_name)
fo.write(struct.pack('i', len(adapter_dict)))
for it in adapter_dict.keys():
writeKeyValue(fo, str(it), str(adapter_dict[it]))
# 1. vocab
if (tokenizer):
if (hasattr(tokenizer, "tokenizer")):
if (modelInfo['model_type'] == "qwen"):
pass
else:
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
fo.write(struct.pack('i', piece_size))
for i in range(piece_size):
s = tokenizer.sp_model.id_to_piece(i).encode()
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', i))
fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
else:
vocab = tokenizer.get_vocab()
fo.write(struct.pack('i', len(vocab)))
for v in vocab.keys():
if (modelInfo['model_type'] == "qwen"):
s = v
else:
s = v.encode()
if (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', vocab[v]))
fo.write(struct.pack('f', 1.0))
else:
fo.write(struct.pack('i', 0))
weight_type_dict = {}
module_dict = {}
for key, m in model.named_modules():
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
# 2. weight
fo.write(struct.pack('i', len(dict)))
tot = 0
for key in dict:
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7
ori_np_data_type = np.float16
cur = dict[key].numpy().astype(ori_np_data_type)
if hasattr(model, "peft_config"):
weight_name = key.replace('base_model.model.', '')
fo.write(struct.pack('i', len(weight_name)))
fo.write(weight_name.encode())
else:
fo.write(struct.pack('i', len(key)))
fo.write(key.encode())
fo.write(struct.pack('i', len(cur.shape)))
for i in cur.shape:
fo.write(struct.pack('i', i))
if (to_data_type == 3):
write_int8(fo, cur)
elif (to_data_type == 8):
write_int4(fo, cur)
else:
fo.write(struct.pack('i', to_data_type))
fo.write(cur.data)
tot += 1
print("output (", tot, "/", len(dict), end = " )\r")
print("\nfinish.")
fo.close()
import sys
from transformers import LlamaTokenizer, LlamaForCausalLM
from fastllm_pytools import torch2flm
if __name__ == "__main__":
tokenizer = LlamaTokenizer.from_pretrained('minlik/chinese-alpaca-33b-merged')
model = LlamaForCausalLM.from_pretrained('minlik/chinese-alpaca-33b-merged').float()
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "alpaca-33b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
from fastllm_pytools import torch2flm
if __name__ == "__main__":
modelpath = "baichuan-inc/baichuan-13B-Chat"
tokenizer = AutoTokenizer.from_pretrained(modelpath, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(modelpath, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
model.to("cpu")
try:
model.generation_config = GenerationConfig.from_pretrained(modelpath)
except:
pass
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "baichuan-13b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
import sys
from transformers import AutoTokenizer, AutoModel
from fastllm_pytools import torch2flm
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = model.eval()
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
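# Invocation sketch (arguments are positional: output path, then dtype; the script and
# output file names are only examples):
#   python3 chatglm_export.py chatglm2-6b-float16.flm float16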
import argparse
from fastllm_pytools import llm
def args_parser():
parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
parser.add_argument('-p', '--path', type = str, required = True, default = '', help = 'path to the model file')
args = parser.parse_args()
return args
if __name__ == "__main__":
args = args_parser()
model = llm.model(args.path)
history = []
print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
while True:
query = input("\n用户:")
if query.strip() == "stop":
break
if query.strip() == "clear":
history = []
print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
continue
print("AI:", end = "");
curResponse = "";
for response in model.stream_response(query, history = history):
curResponse += response;
print(response, flush = True, end = "")
history.append((query, curResponse))
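# Invocation sketch (script and model file names are only examples):
#   python3 cli_demo.py -p chatglm2-6b-int8.flm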
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM
from fastllm_pytools import torch2flm
tokenizer = AutoTokenizer.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True);
model = AutoModelForCausalLM.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True).float();
model = model.eval();
if __name__ == "__main__":
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "moss-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)