Commit aefd9f11 authored by zhouxiang

fastllm inference framework for the DCU platform

//
// Created by huangyuyang on 6/13/23.
//
#ifndef FASTLLM_DEVICE_H
#define FASTLLM_DEVICE_H
#include "fastllm.h"
namespace fastllm {
typedef std::map <std::string, Data*> DataDict;
typedef std::map <std::string, float> FloatDict;
typedef std::map <std::string, int> IntDict;
class BaseOperator {
public:
// whether this operator can run the given op
virtual bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
// shape inference for the given op
virtual void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
// execute the given op
virtual void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams) = 0;
};
class BaseBatchOperator : BaseOperator {
public:
// shape inference for the given op
virtual void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams,
const IntDict &intParams);
};
class BaseDevice {
public:
virtual bool Malloc (void **ret, size_t size) = 0; // allocate a buffer of size bytes
virtual bool Malloc (void **ret, Data &data); // allocate space matching data's shape
virtual bool Free(void *ret) = 0; // free ret
virtual bool CopyDataToCPU(void *dst, void *src, size_t size) = 0; // copy src on this device to dst on the CPU
virtual bool CopyDataToCPU(Data &data); // move data from this device to the CPU
virtual bool CopyDataFromCPU(void *dst, void *src, size_t size) = 0; // copy src on the CPU to dst on this device
virtual bool CopyDataFromCPU(Data &data); // move data from the CPU to this device
// whether this device can run the given op
virtual bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
// shape inference for the given op
virtual void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
// execute the given op
virtual void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
std::string deviceType;
std::string deviceName;
std::vector <int> deviceIds;
std::map <std::string, BaseOperator*> ops;
};
}
#endif //FASTLLM_DEVICE_H
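A minimal sketch of how an operator plugs into the interface above (illustrative, not part of this commit; MyNoopOp is a hypothetical name). Run is the only pure-virtual member, so CanRun and Reshape fall back to the base defaults; real operators are registered into a device's ops map inside its constructor.

#include "device.h"

class MyNoopOp : public fastllm::BaseOperator {
    void Run(const std::string &opType, const fastllm::DataDict &datas,
             const fastllm::FloatDict &floatParams, const fastllm::IntDict &intParams) {
        // inputs and outputs both arrive through `datas`, keyed by name
    }
};

// Registration inside a BaseDevice constructor would look like:
//     this->ops["Noop"] = (fastllm::BaseOperator*) new MyNoopOp();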
//
// Created by huangyuyang on 6/13/23.
//
#ifndef FASTLLM_CPUDEVICE_H
#define FASTLLM_CPUDEVICE_H
#include "device.h"
#include "cputhreadpool.h"
namespace fastllm {
class CpuDevice : BaseDevice {
public:
CpuDevice ();
bool Malloc (void **ret, size_t size); // allocate a buffer of size bytes
bool Free(void *ret); // free ret
bool CopyDataToCPU(void *dst, void *src, size_t size); // unused; the CPU device never performs this operation
bool CopyDataFromCPU(void *dst, void *src, size_t size); // unused; the CPU device never performs this operation
int threads = 4;
ThreadPool *threadPool = nullptr;
};
class CpuToFloat16 : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuToFloat32 : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuAttention : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuEmbedding : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuLayerNormOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuRMSNormOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuLinearOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSplitOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuCatOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuCatDirectOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMatMulOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMatMulTransBOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSoftMaxOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSiluOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuGeluNewOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSwigluOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMulOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMulToOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuAddToOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuAttentionMaskOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuAlibiMaskOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuTopKOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuPermuteOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuPermuteSelfOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuNearlyRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuLlamaRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuRepeatPenaltyOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuApplyLognAttnOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSplitBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuCatBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMulBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMatMulBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMatMulTransBBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSoftmaxBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuCatDirectBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
}
#endif //FASTLLM_CPUDEVICE_H
//
// Created by huangyuyang on 7/5/23.
//
#ifndef FASTLLCPUTHREADPOOL_H
#define FASTLLCPUTHREADPOOL_H
#include <condition_variable>
#include <mutex>
#include <queue>
#include <functional>
#include <future>
#include <thread>
#include <utility>
#include <vector>
namespace fastllm {
template <typename T>
class TaskQueue {
private:
std::queue <T> q;
std::mutex locker;
public:
TaskQueue() {}
~TaskQueue() {}
bool Empty() {
std::unique_lock<std::mutex> lock(locker);
return q.empty();
}
int Size() {
std::unique_lock<std::mutex> lock(locker);
return q.size();
}
void Push(T &t) {
std::unique_lock<std::mutex> lock(locker);
q.emplace(t);
}
bool Pop(T &t) {
std::unique_lock<std::mutex> lock(locker);
if (q.empty()) {
return false;
}
t = std::move(q.front());
q.pop();
return true;
}
};
class ThreadPool {
private:
class ThreadWorker
{
private:
int id;
ThreadPool *pool;
public:
ThreadWorker(ThreadPool *pool, const int id) : pool(pool), id(id) {}
void operator()() {
std::function<void()> func;
while (true) {
{
std::unique_lock<std::mutex> lock(pool->locker);
// the predicate re-checks the queue under the pool lock, guarding
// against spurious wakeups and wakeups that arrive before the wait
pool->cv.wait(lock, [this] { return pool->shutdown || !pool->queue.Empty(); });
if (pool->shutdown) {
return;
}
}
if (pool->queue.Pop(func)) {
func();
}
}
}
};
bool shutdown = false;
TaskQueue<std::function<void()>> queue;
std::vector<std::thread> threads;
std::mutex locker;
std::condition_variable cv;
public:
ThreadPool(const int t = 4) : threads(std::vector<std::thread>(t)) {
for (int i = 0; i < threads.size(); ++i) {
threads[i] = std::thread(ThreadWorker(this, i));
}
}
void Shutdown() {
{
// set the flag under the lock so waiting workers cannot miss it
std::unique_lock<std::mutex> lock(locker);
shutdown = true;
}
cv.notify_all();
for (int i = 0; i < threads.size(); ++i) {
if (threads[i].joinable()) {
threads[i].join();
}
}
}
template<typename F, typename... Args>
auto Submit(F &&f, Args &&...args) -> std::future<decltype(f(args...))> {
std::function<decltype(f(args...))()> func = std::bind(std::forward<F>(f), std::forward<Args>(args)...);
auto task_ptr = std::make_shared<std::packaged_task<decltype(f(args...))()>>(func);
std::function<void()> wrapper_func = [task_ptr]() {
(*task_ptr)();
};
{
// push under the pool lock so a worker between its predicate check
// and its wait cannot miss the notification
std::unique_lock<std::mutex> lock(locker);
queue.Push(wrapper_func);
}
cv.notify_one();
return task_ptr->get_future();
}
};
}
#endif //FASTLLCPUTHREADPOOL_H
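A usage sketch for the pool above (illustrative): Submit wraps the callable in a std::packaged_task, so the returned std::future carries the result. Note that Shutdown must be called explicitly; the pool has no destructor that joins the workers.

#include "cputhreadpool.h"
#include <cstdio>

int main() {
    fastllm::ThreadPool pool(4);
    auto sum = pool.Submit([](int a, int b) { return a + b; }, 1, 2);
    printf("%d\n", sum.get()); // blocks until a worker has run the task, then prints 3
    pool.Shutdown();           // joins all worker threads
    return 0;
}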
//
// Created by huangyuyang on 6/14/23.
//
#ifndef FASTLLM_CUDADEVICE_H
#define FASTLLM_CUDADEVICE_H
#include "device.h"
namespace fastllm {
class CudaDevice : BaseDevice {
public:
CudaDevice ();
bool Malloc (void **ret, size_t size); // allocate a buffer of size bytes
bool Free(void *ret); // free ret
bool CopyDataToCPU(void *dst, void *src, size_t size);
bool CopyDataFromCPU(void *dst, void *src, size_t size);
};
class CudaAttention : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaLayerNormOp : BaseOperator {
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaRMSNormOp : BaseOperator {
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaLinearOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSplitOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaCatDirectOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMatMulOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMatMulTransBOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSoftMaxOp : BaseOperator {
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaGeluNewOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSiluOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSwigluOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMulOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaAddToOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMulToOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaAttentionMaskOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaAlibiMaskOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaTopKOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaPermuteSelfOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaLlamaRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaNearlyRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaApplyLognAttnOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSplitBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaCatBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMulBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMatMulBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMatMulTransBBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSoftmaxBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaCatDirectBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
}
#endif //FASTLLM_CUDADEVICE_H
#include "fastllm.h"
#ifdef __cplusplus
extern "C" {
#endif
void FastllmInitCublas(void);
void FastllmCudaMallocBigBuffer(size_t size);
void FastllmCudaClearBigBuffer();
void *FastllmCudaMalloc(size_t size);
void FastllmCudaFree(void *ret);
void FastllmCudaCopyFromHostToDevice(void *dst, void *src, size_t size);
void FastllmCudaCopyFromDeviceToHost(void *dst, void *src, size_t size);
void FastllmCudaCopyFromDeviceToDevice(void *dst, void *src, size_t size);
void FastllmCudaMemcpy2DDeviceToDevice(void * dst, size_t dpitch, const void * src,
size_t spitch, size_t width, size_t height);
void FastllmCudaMemcpy2DDeviceToDeviceBatch(void ** dsts, size_t * dpitchs, void ** srcs,
size_t * spitchs, size_t *widths, size_t * heights,
int batch);
bool FastllmCudaAttention(const fastllm::Data &q, const fastllm::Data &k, const fastllm::Data &v,
const fastllm::Data &mask, const fastllm::Data &output, int group, float scale);
bool FastllmCudaGeluNew(const fastllm::Data &input, fastllm::Data &output);
bool FastllmCudaSilu(const fastllm::Data &input, fastllm::Data &output);
bool FastllmCudaSwiglu(const fastllm::Data &input, fastllm::Data &output);
bool FastllmCudaMul(const fastllm::Data &input, float v, fastllm::Data &output);
bool FastllmCudaSoftmax(const fastllm::Data &input, fastllm::Data &output, int axis);
bool FastllmCudaAddTo(fastllm::Data &input0, const fastllm::Data &input1, float alpha);
bool FastllmCudaMulTo(fastllm::Data &input0, const fastllm::Data &input1, float alpha);
bool FastllmCudaAttentionMask(fastllm::Data &input, const fastllm::Data &mask, float maskValue);
bool FastllmCudaAlibiMask(fastllm::Data &input, const fastllm::Data &mask, float maskValue);
bool FastllmCudaRMSNorm(const fastllm::Data &input, fastllm::Data &weight, fastllm::Data &output, float eps);
bool FastllmCudaLayerNorm(const fastllm::Data &input, fastllm::Data &gamma, fastllm::Data &beta, fastllm::Data &output, int axis);
bool FastllmCudaTopK(const fastllm::Data &input, fastllm::Data &output, int topk);
bool FastllmCudaPermute(fastllm::Data &input, const std::vector<int> &axis);
bool FastllmCudaMatMulFloatInt8(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloatInt4(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloatInt4NoZero(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloat32(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloat16(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaBatchMatMul(const fastllm::Data &input0, const fastllm::Data &input1, fastllm::Data &output,
int input0Spatial, int input1Spatial, int outputSpatial,
int input0Stride, int input1Stride,
int batch, int n, int m, int k, float alpha);
bool FastllmCudaBatchMatMulTransB(const fastllm::Data &input0, const fastllm::Data &input1, fastllm::Data &output,
int input0Spatial, int input1Spatial, int outputSpatial,
int input0Stride, int input1Stride,
int batch, int n, int m, int k, float alpha);
bool FastllmCudaRotatePosition2D(fastllm::Data &data, const fastllm::Data &positionIds,
const fastllm::Data &sinData, const fastllm::Data &cosData, int rotaryDim);
bool FastllmCudaNearlyRotatePosition2D(fastllm::Data &data, const fastllm::Data &positionIds,
const fastllm::Data &sinData, const fastllm::Data &cosData, int rotaryDim);
bool FastllmCudaLlamaRotatePosition2D(fastllm::Data &data, const fastllm::Data &positionIds,
const fastllm::Data &sinData, const fastllm::Data &cosData, int rotaryDim);
bool FastllmCudaApplyLognAttn (fastllm::Data &input, fastllm::Data &lognAttn, fastllm::Data &positionIds);
bool FastllmCudaSplitBatch(fastllm::Data &input, fastllm::Data **outputs, int axis);
bool FastllmCudaCatBatch(fastllm::Data **inputs, fastllm::Data &output, int axis);
bool FastllmCudaMulBatch(fastllm::Data **inputs, float v, int batch, fastllm::Data **outputs);
bool FastllmCudaSoftmaxBatch(fastllm::Data **inputs, fastllm::Data **outputs, int axis, int batch);
bool FastllmCudaBatchMatMulTransBBatch(void **i0s, void **i1s, void **os,
int *ns, int *ms, int *ks,
int *i0Strides, int *i1Strides, float alpha, int batch);
bool FastllmCudaBatchMatMulBatch(void **i0s, void **i1s, void **os,
int *ns, int *ms, int *ks,
int *i0Strides, int *i1Strides, float alpha, int batch);
void FastllmCudaSetDevice(int gpu_id);
#ifdef __cplusplus
}
#endif
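An illustrative round trip through the memory helpers declared above, assuming a CUDA/DCU build of the library (error handling omitted):

#include <cstring>

void CudaCopyExample() {
    float host[16];
    memset(host, 0, sizeof(host));
    void *dev = FastllmCudaMalloc(sizeof(host));              // device-side buffer
    FastllmCudaCopyFromHostToDevice(dev, host, sizeof(host)); // upload
    FastllmCudaCopyFromDeviceToHost(host, dev, sizeof(host)); // download
    FastllmCudaFree(dev);                                     // release the device buffer
}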
//
// Created by huangyuyang on 6/13/23.
//
#ifndef FASTLLM_EXECUTOR_H
#define FASTLLM_EXECUTOR_H
#include "device.h"
namespace fastllm {
class Executor {
private:
std::vector <BaseDevice*> devices;
std::map <std::string, float> profiler;
public:
Executor (); // build the default executor
~Executor(); // destructor
void ClearDevices(); // clear the device list
void AddDevice(BaseDevice *device); // add a device
void SetFirstDevice(const std::string &device); // set the preferred device
std::vector <int> GetDeviceIds(const std::string &device); // get the deviceIds of the given device
// run a single op
void Run(const std::string &opType, const fastllm::DataDict &datas, const fastllm::FloatDict &floatParams,
const fastllm::IntDict &intParams);
void ClearProfiler();
void PrintProfiler();
};
}
#endif //FASTLLM_EXECUTOR_H
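A dispatch sketch (illustrative): Run asks each registered device whether it CanRun the op and executes on the first one that accepts. The dictionary keys below ("input", "output", "v") follow the naming pattern used by the op wrappers in fastllm.h, but are an assumption here.

#include "executor.h"

void MulByTwo(fastllm::Executor &exe, fastllm::Data &input, fastllm::Data &output) {
    exe.Run("Mul",
            {{"input", &input}, {"output", &output}}, // DataDict: name -> Data*
            {{"v", 2.0f}},                            // FloatDict: scalar parameters
            {});                                      // IntDict: none needed for Mul
}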
//
// Created by huangyuyang on 5/11/23.
//
#ifndef TEST_FASTLLM_H
#define TEST_FASTLLM_H
#include <vector>
#include <cstdint>
#include <string>
#include <map>
#include <set>
#include <queue>
#include <unordered_map>
#include <cmath>
#include <algorithm>
#include <iostream>
#include <functional>
#include <memory>
#include "devices/cpu/cputhreadpool.h"
namespace fastllm {
void SetDeviceMap(const std::map <std::string, int> &deviceMap);
std::map <std::string, int> GetDeviceMap();
void PrintInstructionInfo();
void SetThreads(int t);
void SetLowMemMode(bool m);
void SetKVCacheInCPU(bool kvCacheInCPU);
bool GetLowMemMode();
int GetThreads();
bool GetKVCacheInCPU();
ThreadPool *GetPool();
struct GenerationConfig {
int output_token_limit = -1; // maximum number of output tokens; <= 0 means unlimited
int last_n = 64; // the last last_n tokens count toward the repetition penalty
float repeat_penalty = 1.0f; // repetition penalty factor; 1.0 means no penalty
int top_k = 1; // top-k sampling
float top_p = 1.0; // top-p sampling
float temperature = 1.0; // temperature, typically 0.1 ~ 1.0; larger values give more diverse results
bool output_logits = false; // whether to return logits
bool enable_hash_id = false; // attach a hash id to the session
bool IsSimpleGreedy() const {
if (fabs(repeat_penalty - 1) > 1e-8) {
return false;
}
if (top_k > 1) {
return false;
}
return true;
}
};
struct LastTokensUnit {
int tot = 0;
std::multiset <int> tokenSet;
std::queue <int> tokenQueue;
LastTokensUnit () {}
LastTokensUnit (int tot) {
Init(tot);
}
void Init(int tot) {
this->tot = tot;
tokenSet.clear();
while (tokenQueue.size() > 0) {
tokenQueue.pop();
}
}
void Push(int id) {
if (tokenQueue.size() == tot) {
tokenSet.erase(tokenSet.find(tokenQueue.front()));
tokenQueue.pop();
}
tokenQueue.push(id);
tokenSet.insert(id);
}
};
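// Worked example (illustrative): with tot = 3, pushing 5, 6, 7, 8 leaves
// tokenQueue = {6, 7, 8}; tokenSet is a multiset, so repeated ids are counted
// individually, which is what the repetition penalty consumes.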
struct LastTokensManager {
std::vector <LastTokensUnit> units;
LastTokensManager () {}
LastTokensManager (int batch, int lastN) {
units.resize(batch);
for (int i = 0; i < batch; i++) {
units[i].Init(lastN);
}
}
};
struct LowBitConfig {
int bit;
float min, max;
uint8_t zeroPoint;
float scale;
int type; // 0: with zero point, 1: without zero point
LowBitConfig(float min, float max, int bit, int type) {
this->min = min;
this->max = max;
this->bit = bit;
this->type = type;
Reset();
}
LowBitConfig () {
}
void Reset() {
/*if (type == 1) {
this->scale = (max - min) / 15.0;
return;
}*/
/*if (type == 1) {
this->scale = std::max(fabs(max), fabs(min)) / 7.0;
this->min = this->scale * (-7.0);
return;
}*/
min = std::min(min, 0.f);
max = std::max(max, 0.f);
const float qmin = 0;
const float qmax = (1 << bit) - 1;
scale = (max - min) / (qmax - qmin);
const float initial_zero_point = qmin - min / scale;
zeroPoint = 0;
if (initial_zero_point < qmin) {
zeroPoint = qmin;
} else if (initial_zero_point > qmax) {
zeroPoint = qmax;
} else {
zeroPoint = static_cast<uint8_t>(std::round(initial_zero_point));
}
if (type == 1) {
this->min = -this->scale * zeroPoint;
return;
}
}
uint8_t quantization(const float &realNumber) const {
if (type == 0) {
return (uint8_t) (std::min((double) ((1 << bit) - 1),
std::max(realNumber / scale + zeroPoint + 0.5, 0.0)));
} else {
return (uint8_t) (std::max(0.f, std::min(15.f, (realNumber - min) / scale + 0.5f)));
}
}
float invQuantization(const uint8_t &qNumber) const {
if (type == 0) {
return (scale * ((float) qNumber - (float) zeroPoint));
} else {
return min + scale * qNumber;
}
}
};
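// Worked example of the asymmetric (type == 0) scheme above, illustrative only.
// With bit = 8, min = -2.0f, max = 6.0f:
//     scale = (6 - (-2)) / 255 ≈ 0.031373
//     initial_zero_point = 0 - (-2 / 0.031373) = 63.75  ->  zeroPoint = 64
//     quantization(1.0f)  = (uint8_t)(1.0 / 0.031373 + 64 + 0.5) = 96
//     invQuantization(96) = 0.031373 * (96 - 64) ≈ 1.004 (round-trip error ≈ 0.4%)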
enum DataType {
FLOAT32 = 0, BFLOAT16 = 1, INT16 = 2, INT8 = 3, INT4 = 4, INT2 = 5, BIT = 6, FLOAT16 = 7,
INT4_NOZERO = 8, // int4 without zeroPoint: floatValue = min + uint4Value * scale
INT32PARAM = 100 // int32 parameters; data of this type always stays on the CPU
};
enum DataDevice {
CPU = 0, CUDA = 1
};
enum WeightType {
NONE = 0, LINEAR = 1, EMBEDDING = 2
};
struct FileMmap {
public:
FileMmap(const std::string &path);
~FileMmap();
char *data;
size_t size;
};
struct ModelLoader {
ModelLoader(const char *buffer, size_t size) : data(buffer), size(size), ptr(buffer) {}
int64_t tell() const { return ptr - data; }
void seek(int64_t offset, int whence);
template <typename T>
T read_basic() {
T obj = *(T *)ptr;
ptr += sizeof(T);
return obj;
}
std::string ReadString();
int ReadInt();
float ReadFloat();
uint8_t* ReadBytes(uint64_t bytes);
const char *const data;
size_t size;
const char *ptr;
};
class Data {
public:
bool lockInCPU = false; // if locked on the CPU, the data may not be moved to other devices
WeightType weightType = WeightType::NONE; // weight type; NONE means not a weight (or unknown)
DataType dataType = DataType::FLOAT32; // data type
int unitSize, unitSizeDiv = 1; // bytes per element = unitSize / unitSizeDiv
std::vector <int> dims; // shape
std::vector <uint64_t> strides; // strides
uint64_t expansionSize = 0; // element count after expansion
uint64_t expansionBytes = 0; // byte count after expansion
std::vector <int> expansionDims; // pre-expanded shape
uint8_t *cpuData = nullptr; // data pointer
void *cudaData = nullptr;
std::vector <void*> extraCudaData;
void *deviceData = nullptr;
std::vector <void*> extraDeviceData;
DataDevice dataDevice = DataDevice::CPU;
std::vector <int> dataDeviceIds;
// these two fields are for quantization and do not apply to FLOAT data
int perChannelAxis = -1; // axis for per-channel quantization; -1 means no per-channel split
std::vector <LowBitConfig> perChannelsConfigs; // perChannelsConfigs[i] holds the min/max of channel i; without per-channel quantization, perChannelsConfigs[0] holds the global min/max
std::vector <float> scales, mins;
std::vector <int> zeros;
std::vector <int> weightSum; // when used as a weight, partial sums are sometimes cached to speed up computation
std::string fileName;
long long filePos;
std::shared_ptr<FileMmap> m_file;
Data () {};
Data (DataType type);
Data (DataType type, const std::vector <int> &dims); // constructor
// constructor that copies from data after creation;
// data holds raw values and is quantized when type is not float
Data (DataType type, const std::vector <int> &dims, const std::vector <float> &data);
~Data(); // destructor
Data (const Data &ori); // deep copy
void CopyFrom(const Data &ori); // copy
uint64_t GetBytes() const; // total byte count
void Allocate(); // allocate memory
void Allocate(float v); // allocate memory and fill with v
void Expansion(const std::vector <int> &dims); // pre-expand to the given dims
void MallocSpace(uint64_t size); // allocate on the device
void FreeSpace(); // release device memory
void UpdateUnitSize(); // update unitSize
void Resize(const std::vector <int> &dims); // resize
void Reshape(const std::vector <int> &dims); // change dims without touching the data
uint64_t Count(int i) const; // dims[i] * strides[i]
void PrintShape() const; // print the shape
void Print() const; // print the contents
void CalcWeightSum(); // compute weightSum
void ToDevice(DataDevice device); // move to the given device
void ToDevice(DataDevice device, const std::vector <int> &deviceIds); // move to the given devices
void ToDevice(void *device);
void set_file(std::shared_ptr<FileMmap> file) {
m_file = file;
}
};
struct Tokenizer {
enum TokenizerType {
BPE = 0,
NORMAL = 1,
QWEN = 2
};
struct TrieNode {
int tokenId;
float score;
std::map <int, TrieNode*> next;
TrieNode();
};
struct Symbol {
TrieNode *node;
char *s;
int pos, len;
int prev, next;
int fixId;
Symbol (Tokenizer::TrieNode *node,
char *s, int pos, int len,
int prev, int next, int fixId) {
this->node = node;
this->s = s;
this->pos = pos;
this->len = len;
this->prev = prev;
this->next = next;
this->fixId = fixId;
}
};
struct SymbolPairs {
float score;
int l, r, size;
SymbolPairs(float score, int l, int r, int size) {
this->score = score;
this->l = l;
this->r = r;
this->size = size;
}
};
friend bool operator < (const SymbolPairs &a, const SymbolPairs &b) {
return a.score < b.score || (a.score == b.score && a.l > b.l);
}
TrieNode *root;
TokenizerType type = TokenizerType::BPE;
std::unordered_map <int, std::string> tokenToStringDict;
std::unordered_map <int, float> tokenToScoreDict;
std::unordered_map <std::string, int> stringToTokenDict;
Tokenizer ();
~Tokenizer();
void Clear(); // reset the tokenizer
void TryMergePairs(std::vector<Symbol> &symbols, int l, int r, std::priority_queue <SymbolPairs> &q); // push candidate symbol pairs
void Insert(const std::string &s, int tokenId, float score = 1.0f); // insert a token
Data Encode(const std::string &s); // encode
std::string Decode(const Data &data); // decode
std::string DecodeTokens(const std::vector <int> &tokens); // decode
};
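// Usage sketch (illustrative): Insert() populates the trie with token strings
// and scores; Encode() then maps text to token ids, and Decode()/DecodeTokens()
// map ids back to text.
//     Tokenizer tok;
//     tok.Insert("hello", 100);
//     tok.Insert(" world", 101);
//     Data ids = tok.Encode("hello world");
//     std::string text = tok.DecodeTokens({100, 101});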
std::string GetModelTypeFromFile(const std::string &fileName);
struct WeightMap {
int versionId = 2;
Tokenizer tokenizer;
std::map <std::string, std::string> dicts;
std::map <std::string, Data> weight;
std::map <std::string, std::map <std::string, std::string>> peftDict;
std::set <std::string> embeddingNames;
void LoadFromFile(const std::string &fileName); // load from file
void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model; bit = 0 saves without quantization
void AddTokenizerWord(const std::string &key, int value, float score); // add a tokenizer word
void AddDict(const std::string &key, const std::string &value); // insert a dict entry
void AddAdapterDict(const std::string &name, const std::string &key, const std::string &value);
void AddWeight(const std::string &key, const std::vector <int> &dims,
DataType dataType, WeightType weightType, DataType oriDataType, uint8_t *oriData); // insert a weight
void AddQLinearWeight(const std::string &key, const std::vector <int> &dims,
int bit, float *scales, uint8_t *oriData); // insert a QLinear weight; dequantization rule: float value = scales * oriData
Data &operator [] (const std::string &key);
};
void ClearProfiler();
void PrintProfiler();
void ApplyDeviceMap(const std::map <std::string, int> &deviceMap, int current, int total); // at step current of total; switch devices according to deviceMap
int LLMSampling(Data &logits, int outerOffset,
const GenerationConfig &config, const LastTokensUnit &tokens); // sample from logits[outerOffset * vocabSize, (outerOffset + 1) * vocabSize)
void ToDataType(const Data &input, DataType dataType);
void Attention(const Data &q, const Data &k, const Data &v, const Data &mask, Data &output,
int group, float scale, int attentionType);
void Embedding(const Data &input, Data &weight, Data &output);
void RMSNorm(const Data &input, const Data &weight, float eps, Data &output);
void LayerNorm(Data &input, Data &gamma, Data &beta, int axis, Data &output);
void Linear(Data &input, Data &weight, const Data &bias, Data &output);
void Split(const Data &input, int axis, int start, int end, Data &output);
void Cat(const Data &input0, const Data &input1, int axis, Data &output);
void CatDirect(Data &input0, const Data &input1, int axis); // append input1's data directly after input0 (input0 must be pre-expanded with enough space)
void MatMul(const Data &input0, const Data &input1, Data &output, float alpha = 1.0);
void MatMulTransB(const Data &input0, const Data &input1, Data &output, float alpha = 1.0);
void Softmax(const Data &input, Data &output, int axis);
void Silu(const fastllm::Data &input, fastllm::Data &output);
void GeluNew(const Data &input, Data &output);
void Swiglu(const fastllm::Data &input, fastllm::Data &output);
void Mul(const Data &input, float v, Data &output);
void MulTo(Data &input0, const Data &input1); // input0 *= input1
void AddTo(Data &input0, const Data &input1, float alpha = 1.0); // input0 += input1 * alpha
void AttentionMask(Data &input, const Data &mask, float maskValue); // set positions of input where mask is 1 to maskValue
void AlibiMask(Data &input, const Data &mask, float maskValue); // alibi mask
void Permute(const Data &input, const std::vector<int> &axis, Data &output); // permute axes
void PermuteSelf(const Data &input, const std::vector<int> &axis); // permute axes in place
void TopK(const Data &input, Data &output, int topK); // top-k
void RotatePosition2D(Data &input, const Data &positionIds, Data &sinData, Data &cosData, int rotaryDim); // 2D position
void NearlyRotatePosition2D(Data &input, const Data &positionIds, Data &sinData, Data &cosData, int rotaryDim); // 2D position, rotating adjacent element pairs
void LlamaRotatePosition2D(Data &input, const Data &positionIds, Data &sinData, Data &cosData, int rotaryDim); // 2D position for llama
void RepeatPenalty(Data &input, const Data &penalty); // penalty: input[i] = input[i] < 0 ? input[i] * penalty[i] : input[i] / penalty[i];
void ApplyLognAttn(Data &input, const Data &lognAttn, const Data &positionIds);
void MulBatch(std::vector <Data*> &input, float v, std::vector <Data*> &output);
void SplitBatch(const Data &input, int axis, int part, std::vector <Data*> &outputs); // split input along axis into parts of size 1 and store them in outputs
void CatBatch(std::vector <Data*> &input, int axis, Data &outputs); // concatenate the inputs (each of size 1 along axis) into outputs
void MatMulBatch(std::vector <Data*> &input0, std::vector <Data*> &input1, std::vector <Data*> &output, float alpha = 1.0);
void MatMulTransBBatch(std::vector <Data*> &input0, std::vector <Data*> &input1, std::vector <Data*> &output, float alpha = 1.0);
void SoftmaxBatch(std::vector <Data*> &input, std::vector <Data*> &output, int axis);
void CatDirectBatch(std::vector <Data*> &input0, std::vector <Data*> &input1, int axis);
void LoraLayer(Data &input, Data &weight, Data &loraA, Data &loraB, const Data &bias, Data &output,
std::map <std::string, std::string> loraConfig);
void IA3Layer(Data &input, Data &weight, Data &ia3_l, Data &bias, Data &output,
std::map <std::string, std::string> ia3Config);
}
#endif //TEST_FASTLLM_H
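A small end-to-end sketch of the Data API and one of the ops declared above (illustrative; shape rules are enforced by the individual operator implementations):

#include "fastllm.h"

void MatMulExample() {
    // 2x3 and 3x2 tensors filled with ones; the float-vector constructor
    // quantizes automatically when the type is not a float type.
    fastllm::Data a(fastllm::DataType::FLOAT32, {2, 3}, std::vector<float>(6, 1.0f));
    fastllm::Data b(fastllm::DataType::FLOAT32, {3, 2}, std::vector<float>(6, 1.0f));
    fastllm::Data c;
    fastllm::MatMul(a, b, c); // every entry of the 2x2 result is 3
    c.Print();
}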
//
// Created by huangyuyang on 6/20/23.
//
#ifndef FASTLLM_MODEL_H
#define FASTLLM_MODEL_H
#include "basellm.h"
namespace fastllm {
std::unique_ptr<basellm> CreateLLMModelFromFile(const std::string &fileName);
std::unique_ptr<basellm> CreateEmptyLLMModel(const std::string &modelType);
}
#endif //FASTLLM_MODEL_H
#pragma once
#include "fastllm.h"
#include <thread>
#include <mutex>
#ifdef PY_API
#include "Python.h"
#include <pybind11/pytypes.h>
using RuntimeResult = std::function<void(int index, pybind11::bytes content)>;
using RuntimeResultBatch = std::function<void(int index, std::vector <pybind11::bytes> &contents)>;
#else
using RuntimeResult = std::function<void(int index, const char* content)>;
using RuntimeResultBatch = std::function<void(int index, std::vector <std::string> &contents)>;
#endif
namespace fastllm {
struct ResponseContext {
bool isEnding = false;
std::vector <std::pair <Data, Data> > pastKeyValues;
std::vector <int> currentTokens;
std::queue <int> resultTokenQueue;
std::queue <std::vector <float>*> resultLogits;
GenerationConfig generationConfig;
LastTokensUnit tokens;
int preTokens = 0;
int curTokens = 0;
std::map <std::string, int> intParams;
void Init(int blocks);
};
struct ResponseContextDict {
std::mutex locker;
std::map <int, ResponseContext*> dicts;
int CreateHandle();
ResponseContext* GetHandle(int handleId);
void RemoveHandle(int handleId);
};
class basellm {
public:
basellm() {};
virtual ~basellm() {}; // virtual: models are deleted through basellm*
virtual void LoadFromFile(const std::string &fileName); // load from file
virtual void InitParams(); // initialize parameter info
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector<std::pair<Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr) = 0;
virtual std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
virtual std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
// build the LLM inference inputs from the given tokens
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
// build the LLM inference inputs from the given tokens (batched)
virtual void FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual std::string Response(const std::string &input,
RuntimeResult retCb,
const GenerationConfig &generationConfig = GenerationConfig());
virtual void ResponseBatch(const std::vector<std::string> &inputs,
std::vector<std::string> &outputs,
RuntimeResultBatch retCb = nullptr,
const GenerationConfig &generationConfig = GenerationConfig()); // batch reply for the given inputs
virtual int LaunchResponseTokens(const std::vector <int> &inputTokens,
const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task and return the assigned handleId
virtual int FetchResponseTokens(int handleId); // fetch the next output token of the handle; -1 means the output has finished
virtual int FetchResponseLogits(int handleId, std::vector <float> &logits); // fetch the output logits of the handle
virtual void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model
virtual void SaveModel(const std::string &fileName); // export directly
virtual void WarmUp() {}; // warm up
virtual std::string MakeInput(const std::string &history, int round, const std::string &input) = 0; // build the prompt from the history and the current input
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) = 0; // update the history with the current reply
virtual void SetAdapter(const std::string &name);
virtual void DisableAdapter();
std::string model_type;
std::string pre_prompt; // prompt prefix for the first round of conversation
std::string user_role, bot_role, history_sep; // used to build each round's prompt
int bos_token_id;
int eos_token_id;
int embed_dim = 4096;
int num_attention_heads = 32;
int head_dim = embed_dim / num_attention_heads;
const int max_positions = 32768;
int rotary_dim = 64;
const float scale_attn = sqrt(head_dim);
int block_cnt = 28;
std::vector<std::vector<float> > sin, cos;
WeightMap weight; // weights
Data sinData, cosData;
ResponseContextDict responseContextDict;
std::thread *mainLoop = nullptr;
std::mutex mainLoopLocker, dictLocker;
std::map <std::string, int> deviceMap;
std::string adapterName;
};
}
//
// Created by huangyuyang on 5/11/23.
//
#ifndef FASTLLM_CHATGLM_H
#define FASTLLM_CHATGLM_H
#include "basellm.h"
#include "cmath"
#include <iostream>
namespace fastllm {
class ChatGLMModel: public basellm {
public:
ChatGLMModel (); // constructor
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *retLogits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
// build the LLM inference inputs from the given tokens
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
// build the LLM inference inputs from the given tokens (batched)
virtual void FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual void WarmUp(); // warm up
virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply
int GetVersion();
void UpdateSinCos(float rope);
private:
virtual void CausalMask(Data &data, int start) {}; // causal mask (no-op here)
float rope = 1.0f;
};
}
#endif //FASTLLM_CHATGLM_H
#pragma once
#include "chatglm.h"
#include "moss.h"
#include "basellm.h"
#include "llama.h"
#include "qwen.h"
#include "fastllm.h"
enum LLM_TYPE {
LLM_TYPE_CHATGLM = 0,
LLM_TYPE_MOSS = 1,
LLM_TYPE_VICUNA = 2,
LLM_TYPE_BAICHUAN = 3,
LLM_TYPE_QWEN = 4
};
class factoryllm {
public:
factoryllm() {};
~factoryllm() {};
fastllm::basellm *createllm(LLM_TYPE type) {
fastllm::basellm *pLLM = NULL;
switch (type) {
case LLM_TYPE_CHATGLM:
pLLM = new fastllm::ChatGLMModel();
break;
case LLM_TYPE_MOSS:
pLLM = new fastllm::MOSSModel();
break;
case LLM_TYPE_VICUNA:
pLLM = new fastllm::LlamaModel();
break;
default:
pLLM = new fastllm::QWenModel();
break;
}
return pLLM;
};
};
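A usage sketch (illustrative; the model file path is hypothetical):

#include <memory>

void CreateModelExample() {
    factoryllm factory;
    std::unique_ptr<fastllm::basellm> model(factory.createllm(LLM_TYPE_CHATGLM));
    model->LoadFromFile("chatglm-6b-int8.flm"); // hypothetical model file
}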
//
// Created by huangyuyang on 6/1/23.
//
#ifndef FASTLLM_LLAMA_H
#define FASTLLM_LLAMA_H
#include "basellm.h"
#include "cmath"
#include <iostream>
namespace fastllm {
class LlamaModel: public basellm {
public:
LlamaModel (); // constructor
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
virtual std::string Response(const std::string& input,
RuntimeResult retCb,
const GenerationConfig &generationConfig = GenerationConfig()); // reply to the given input
virtual void ResponseBatch(const std::vector <std::string> &inputs,
std::vector <std::string> &outputs,
RuntimeResultBatch retCb,
const GenerationConfig &generationConfig = GenerationConfig());
virtual int LaunchResponseTokens(const std::vector <int> &inputTokens,
const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task and return the assigned handleId
virtual int FetchResponseTokens(int handleId); // fetch the next output token of the handle; -1 means the output has finished
virtual void WarmUp(); // warm up
virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply
bool is_nsql = false;
};
}
#endif //FASTLLM_LLAMA_H
//
// Created by huangyuyang on 5/12/23.
//
#ifndef TEST_MOSS_H
#define TEST_MOSS_H
#include "basellm.h"
#include "cmath"
namespace fastllm {
class MOSSModel: public basellm {
public:
MOSSModel(); // constructor
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr);
virtual std::string Response(const std::string &input, RuntimeResult retCb,
const GenerationConfig &generationConfig = GenerationConfig()); // reply to the given input
virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual void WarmUp();
private:
virtual void RotatePosition2D(Data &data, const Data &positionIds); // 2D positional encoding
virtual void CausalMask(Data &data, int start); // causal mask
};
}
#endif //TEST_MOSS_H
//
// Created by siemon on 8/9/23.
//
#ifndef TEST_QWEN_H
#define TEST_QWEN_H
#include "basellm.h"
namespace fastllm {
class QWenModel : public basellm {
public:
QWenModel();
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *retLogits = nullptr);
virtual std::string MakeInput(const std::string &history, int round, const std::string &input);
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output);
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual void FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual void WarmUp();
void UpdateRotaryPosEmb(float ntk_alpha);
int seq_length;
float ntk_alpha;
bool use_log_attn;
Data logn_list;
private:
std::string im_start = "<|im_start|>";
std::string im_end = "<|im_end|>";
};
}
#endif //TEST_QWEN_H
#ifndef TFDL_ARMFUNCTIONS_H
#define TFDL_ARMFUNCTIONS_H
/* NEON implementation of sin, cos, exp and log
*
* Inspired by Intel Approximate Math library, and based on the
* corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2011 Julien Pommier
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* (this is the zlib license)
*/
#include <arm_neon.h>
#define c_inv_mant_mask ~0x7f800000u
#define c_cephes_SQRTHF 0.707106781186547524
#define c_cephes_log_p0 7.0376836292E-2
#define c_cephes_log_p1 - 1.1514610310E-1
#define c_cephes_log_p2 1.1676998740E-1
#define c_cephes_log_p3 - 1.2420140846E-1
#define c_cephes_log_p4 + 1.4249322787E-1
#define c_cephes_log_p5 - 1.6668057665E-1
#define c_cephes_log_p6 + 2.0000714765E-1
#define c_cephes_log_p7 - 2.4999993993E-1
#define c_cephes_log_p8 + 3.3333331174E-1
#define c_cephes_log_q1 -2.12194440e-4
#define c_cephes_log_q2 0.693359375
/* natural logarithm computed for 4 simultaneous float
* return NaN for x <= 0
*/
static inline float32x4_t log_ps(float32x4_t x)
{
float32x4_t one = vdupq_n_f32(1);
x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
int32x4_t ux = vreinterpretq_s32_f32(x);
int32x4_t emm0 = vshrq_n_s32(ux, 23);
/* keep only the fractional part */
ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
x = vreinterpretq_f32_s32(ux);
emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
float32x4_t e = vcvtq_f32_s32(emm0);
e = vaddq_f32(e, one);
/* part2:
* if( x < SQRTHF ) {
* e -= 1;
* x = x + x - 1.0;
* } else { x = x - 1.0; }
*/
uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
x = vsubq_f32(x, one);
e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
x = vaddq_f32(x, tmp);
float32x4_t z = vmulq_f32(x,x);
float32x4_t y = vdupq_n_f32(c_cephes_log_p0);
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
y = vmulq_f32(y, x);
y = vmulq_f32(y, z);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
y = vaddq_f32(y, tmp);
tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
y = vsubq_f32(y, tmp);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
x = vaddq_f32(x, y);
x = vaddq_f32(x, tmp);
x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
return x;
}
#define c_exp_hi 88.3762626647949f
#define c_exp_lo -88.3762626647949f
#define c_cephes_LOG2EF 1.44269504088896341
#define c_cephes_exp_C1 0.693359375
#define c_cephes_exp_C2 -2.12194440e-4
#define c_cephes_exp_p0 1.9875691500E-4
#define c_cephes_exp_p1 1.3981999507E-3
#define c_cephes_exp_p2 8.3334519073E-3
#define c_cephes_exp_p3 4.1665795894E-2
#define c_cephes_exp_p4 1.6666665459E-1
#define c_cephes_exp_p5 5.0000001201E-1
/* exp() computed for 4 float at once */
static inline float32x4_t exp_ps(float32x4_t x)
{
float32x4_t tmp, fx;
float32x4_t one = vdupq_n_f32(1);
x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
/* express exp(x) as exp(g + n*log(2)) */
fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
/* perform a floorf */
tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
/* if greater, subtract 1 */
uint32x4_t mask = vcgtq_f32(tmp, fx);
mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
x = vsubq_f32(x, tmp);
x = vsubq_f32(x, z);
static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
float32x4_t y = vld1q_dup_f32(cephes_exp_p+0);
float32x4_t c1 = vld1q_dup_f32(cephes_exp_p+1);
float32x4_t c2 = vld1q_dup_f32(cephes_exp_p+2);
float32x4_t c3 = vld1q_dup_f32(cephes_exp_p+3);
float32x4_t c4 = vld1q_dup_f32(cephes_exp_p+4);
float32x4_t c5 = vld1q_dup_f32(cephes_exp_p+5);
y = vmulq_f32(y, x);
z = vmulq_f32(x, x);
y = vaddq_f32(y, c1);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c2);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c3);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c4);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c5);
y = vmulq_f32(y, z);
y = vaddq_f32(y, x);
y = vaddq_f32(y, one);
/* build 2^n */
int32x4_t mm;
mm = vcvtq_s32_f32(fx);
mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
mm = vshlq_n_s32(mm, 23);
float32x4_t pow2n = vreinterpretq_f32_s32(mm);
y = vmulq_f32(y, pow2n);
return y;
}
// Fast approximate exp using the classic float bit-trick: exp(x) = 2^(x * log2(e)),
// where 2^t is formed by writing a scaled, biased t directly into the float's exponent bits.
// Much cheaper than exp_ps, but far less accurate and with no special-case handling.
static inline float32x4_t faster_exp_ps(float32x4_t x) {
float32x4_t vmul = vdupq_n_f32(1.442695040f); // log2(e)
float32x4_t vmin = vdupq_n_f32(-126.0f); // clamp at the smallest normal exponent
float32x4_t vmagicAdd = vdupq_n_f32(126.94269504f); // exponent bias plus a rounding correction
float32x4_t vmagicMul = vdupq_n_f32(1 << 23); // shift the result into the exponent field
float32x4_t y = vreinterpretq_f32_u32(vcvtq_u32_f32(vmulq_f32(vmagicMul, vaddq_f32(vmaxq_f32(vmin, vmulq_f32(x, vmul)), vmagicAdd))));
return y;
}
#define c_minus_cephes_DP1 -0.78515625
#define c_minus_cephes_DP2 -2.4187564849853515625e-4
#define c_minus_cephes_DP3 -3.77489497744594108e-8
#define c_sincof_p0 -1.9515295891E-4
#define c_sincof_p1 8.3321608736E-3
#define c_sincof_p2 -1.6666654611E-1
#define c_coscof_p0 2.443315711809948E-005
#define c_coscof_p1 -1.388731625493765E-003
#define c_coscof_p2 4.166664568298827E-002
#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
/* evaluation of 4 sines & cosines at once.
*
* The code is the exact rewriting of the cephes sinf function.
* Precision is excellent as long as x < 8192 (I did not bother to
* take into account the special handling they have for greater values
* -- it does not return garbage for arguments over 8192, though, but
* the extra precision is missing).
*
* Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
* surprising but correct result.
*
* Note also that when you compute sin(x), cos(x) is available at
* almost no extra cost, so both sin_ps and cos_ps make use of
* sincos_ps.
*/
static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos)
{
// any x
float32x4_t xmm1, xmm2, xmm3, y;
uint32x4_t emm2;
uint32x4_t sign_mask_sin, sign_mask_cos;
sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
x = vabsq_f32(x);
/* scale by 4/Pi */
y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
/* store the integer part of y in mm0 */
emm2 = vcvtq_u32_f32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
y = vcvtq_f32_u32(emm2);
/* get the polynomial selection mask:
* there is one polynomial for 0 <= x <= Pi/4
* and another one for Pi/4 < x <= Pi/2
*
* Both branches will be computed.
*/
uint32x4_t poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
/* The magic pass: "Extended precision modular arithmetic"
* x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
x = vaddq_f32(x, xmm1);
x = vaddq_f32(x, xmm2);
x = vaddq_f32(x, xmm3);
sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
/* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
* and the second polynomial (Pi/4 < x <= Pi/2) in y2 */
float32x4_t z = vmulq_f32(x,x);
float32x4_t y1, y2;
y1 = vmulq_n_f32(z, c_coscof_p0);
y2 = vmulq_n_f32(z, c_sincof_p0);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, x);
y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
y2 = vaddq_f32(y2, x);
y1 = vaddq_f32(y1, vdupq_n_f32(1));
/* select the correct result from the two polynomials */
float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
*ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
*ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
}
static inline float32x4_t sin_ps(float32x4_t x)
{
float32x4_t ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ysin;
}
static inline float32x4_t cos_ps(float32x4_t x)
{
float32x4_t ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ycos;
}
// Division computed as a * (1/b): reciprocal estimate refined with one Newton-Raphson step
// (uncomment the second step below for more accuracy at a small extra cost)
static inline float32x4_t div_ps(float32x4_t a, float32x4_t b)
{
float32x4_t reciprocal = vrecpeq_f32(b);
reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
// reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
return vmulq_f32(a, reciprocal);
}
static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b)
{
// pow(x, m) = exp(m * log(x)); only valid for x > 0, since log_ps returns NaN for non-positive input
return exp_ps(vmulq_f32(b, log_ps(a)));
}
#define c_cephes_HALFMAXLOGF 44.014845935754205f
#define c_cephes_tanh_C1 0.625f
#define c_cephes_tanh_p0 - 5.70498872745E-3
#define c_cephes_tanh_p1 + 2.06390887954E-2
#define c_cephes_tanh_p2 - 5.37397155531E-2
#define c_cephes_tanh_p3 + 1.33314422036E-1
#define c_cephes_tanh_p4 - 3.33332819422E-1
/* Single-precision hyperbolic tangent computed for 4 floats at once */
static inline float32x4_t tanh_ps(float32x4_t x)
{
float32x4_t x2 = vabsq_f32(x);
uint32x4_t mask_l = vcgeq_f32(x2, vdupq_n_f32(c_cephes_tanh_C1));
uint32x4_t mask_l2 = vcgtq_f32(x2, vdupq_n_f32(c_cephes_HALFMAXLOGF));
// abs(x) >= 0.625
// tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
float32x4_t _one = vdupq_n_f32(1.f);
float32x4_t exp_x_x = exp_ps(vaddq_f32(x, x));
#if __aarch64__
float32x4_t y0 = vdivq_f32(vsubq_f32(exp_x_x, _one), vaddq_f32(exp_x_x, _one));
#else
float32x4_t y0 = div_ps(vsubq_f32(exp_x_x, _one), vaddq_f32(exp_x_x, _one));
#endif
// abs(x) < 0.625
/*
z = x2 * x2;
z =
(((( -5.70498872745E-3 * z
+ 2.06390887954E-2) * z
- 5.37397155531E-2) * z
+ 1.33314422036E-1) * z
- 3.33332819422E-1) * z * x
+ x;
*/
static const float cephes_tanh_p[5] = { c_cephes_tanh_p0, c_cephes_tanh_p1, c_cephes_tanh_p2, c_cephes_tanh_p3, c_cephes_tanh_p4 };
float32x4_t y = vld1q_dup_f32(cephes_tanh_p+0);
float32x4_t c1 = vld1q_dup_f32(cephes_tanh_p+1);
float32x4_t c2 = vld1q_dup_f32(cephes_tanh_p+2);
float32x4_t c3 = vld1q_dup_f32(cephes_tanh_p+3);
float32x4_t c4 = vld1q_dup_f32(cephes_tanh_p+4);
float32x4_t z = vmulq_f32(x, x);
y = vmulq_f32(y, z);
y = vaddq_f32(y, c1);
y = vmulq_f32(y, z);
y = vaddq_f32(y, c2);
y = vmulq_f32(y, z);
y = vaddq_f32(y, c3);
y = vmulq_f32(y, z);
y = vaddq_f32(y, c4);
y = vmulq_f32(y, z);
y = vmulq_f32(y, x);
y = vaddq_f32(y, x);
// abs(x) > HALFMAXLOGF
// return 1.0 or -1.0
uint32x4_t mask_pos = vcgtq_f32(x, vdupq_n_f32(0.f)); // sign of the original x, not |x|, decides between +1 and -1
float32x4_t y1 = vreinterpretq_f32_u32(vbslq_u32(mask_pos, vreinterpretq_u32_f32(vdupq_n_f32(1.f)), vreinterpretq_u32_f32(vdupq_n_f32(-1.f))));
y = vreinterpretq_f32_u32(vbslq_u32(mask_l, vreinterpretq_u32_f32(y0), vreinterpretq_u32_f32(y)));
y = vreinterpretq_f32_u32(vbslq_u32(mask_l2, vreinterpretq_u32_f32(y1), vreinterpretq_u32_f32(y)));
return y;
}
#endif //TFDL_ARMFUNCTIONS_H
//
// Created by huangyuyang on 6/2/23.
//
#pragma once
#ifndef FASTLLM_UTILS_H
#define FASTLLM_UTILS_H
#include <map>
#include <chrono>
#include <string>
#include <cstdio>
#include <cstdint>
#include <cstring> // memcpy, used by as_uint / as_float below
#include <vector>
#if defined(_WIN32) or defined(_WIN64)
#include <Windows.h>
#else
#include <unistd.h>
#endif
#ifdef __AVX__
#include "immintrin.h"
#endif
namespace fastllm {
static void MySleep(int t) {
#if defined(_WIN32) or defined(_WIN64)
Sleep(t * 1000); // Win32 Sleep takes milliseconds; scale so t means seconds, matching POSIX sleep below
#else
sleep(t); // POSIX sleep takes seconds
#endif
}
static void ErrorInFastLLM(const std::string &error) {
printf("FastLLM Error: %s\n", error.c_str());
throw error;
}
static void AssertInFastLLM(bool condition, const std::string &error) {
if (!condition) {
ErrorInFastLLM(error);
}
}
static uint32_t as_uint(const float x) {
uint32_t u;
memcpy(&u, &x, sizeof(u)); // bit-cast via memcpy instead of a pointer cast, avoiding strict-aliasing UB
return u;
}
static float as_float(const uint32_t x) {
float f;
memcpy(&f, &x, sizeof(f));
return f;
}
static float half_to_float(const uint16_t x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
const uint32_t e = (x & 0x7C00) >> 10; // exponent
const uint32_t m = (x & 0x03FF) << 13; // mantissa
const uint32_t v = as_uint((float) m) >> 23; // evil log2 bit hack to count leading zeros in denormalized format
return as_float((x & 0x8000) << 16 | (e != 0) * ((e + 112) << 23 | m) | ((e == 0) & (m != 0)) * ((v - 37) << 23 |
((m << (150 - v)) &
0x007FE000))); // sign : normalized : denormalized
}
static uint16_t float_to_half(const float x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
const uint32_t b = as_uint(x) + 0x00001000; // round-to-nearest-even: add last bit after truncated mantissa
const uint32_t e = (b & 0x7F800000) >> 23; // exponent
const uint32_t m = b &
0x007FFFFF; // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding
return (b & 0x80000000) >> 16 | (e > 112) * ((((e - 112) << 10) & 0x7C00) | m >> 13) |
((e < 113) & (e > 101)) * ((((0x007FF000 + m) >> (125 - e)) + 1) >> 1) |
(e > 143) * 0x7FFF; // sign : normalized : denormalized : saturate
}
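// Illustrative round trip: float_to_half(1.0f) == 0x3C00 and half_to_float(0x3C00) == 1.0f;
// values beyond the fp16 range hit the `(e > 143) * 0x7FFF` saturation term above.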
static double GetSpan(std::chrono::system_clock::time_point time1, std::chrono::system_clock::time_point time2) {
auto duration = std::chrono::duration_cast<std::chrono::microseconds> (time2 - time1);
return double(duration.count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den;
}
static bool StartWith(const std::string &a, const std::string &b) {
return a.size() >= b.size() && a.substr(0, b.size()) == b;
}
// Parse device ids from a spec like "cuda:0,1": everything after "<type>:", with any non-digit acting as a separator
static std::vector <int> ParseDeviceIds(const std::string &s, const std::string &type) {
size_t i = type.size();
std::vector <int> ret;
std::string cur = "";
if (s.size() > i && s[i] == ':') {
i++;
while (i < s.size()) {
if (s[i] >= '0' && s[i] <= '9') {
cur += s[i];
} else {
if (cur != "") {
ret.push_back(atoi(cur.c_str()));
cur = "";
}
}
i++;
}
}
if (cur != "") {
ret.push_back(atoi(cur.c_str()));
}
return ret;
}
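// Example: ParseDeviceIds("cuda:0,2,3", "cuda") yields {0, 2, 3};
// ParseDeviceIds("cuda", "cuda") yields {} (no explicit device ids).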
struct TimeRecord {
std::map<std::string, float> v;
std::chrono::system_clock::time_point t;
void Clear() {
v.clear();
}
void Record() {
t = std::chrono::system_clock::now();
}
void Record(const std::string &key) {
auto now = std::chrono::system_clock::now();
v[key] += GetSpan(t, now);
t = now;
}
void Print() {
float s = 0;
for (auto &it: v) {
printf("%s: %f s.\n", it.first.c_str(), it.second);
s += it.second;
}
printf("Total: %f s.\n", s);
}
};
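// Illustrative usage of TimeRecord:
//   TimeRecord tr;
//   tr.Record();            // set the starting timestamp
//   /* ... stage 1 ... */
//   tr.Record("stage1");    // accumulate stage-1 seconds under the key "stage1"
//   /* ... stage 2 ... */
//   tr.Record("stage2");
//   tr.Print();             // print per-key and total seconds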
#ifdef __AVX__
// horizontal sum of the eight floats in a 256-bit register
static inline float Floatsum(const __m256 a) {
__m128 res = _mm256_extractf128_ps(a, 1);
res = _mm_add_ps(res, _mm256_castps256_ps128(a));
res = _mm_add_ps(res, _mm_movehl_ps(res, res));
res = _mm_add_ss(res, _mm_movehdup_ps(res));
return _mm_cvtss_f32(res);
}
// horizontal sum of the eight 32-bit integers in a 256-bit register
static inline int I32sum(const __m256i a) {
const __m128i sum128 = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(a, 1));
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
}
// horizontal sum of the sixteen 16-bit integers: widen adjacent pairs to 32 bits with madd, then reduce
static inline int I16Sum(const __m256i a) {
int sum = I32sum(_mm256_madd_epi16(a, _mm256_set1_epi16(1)));
return sum;
}
#endif
}
#endif //FASTLLM_UTILS_H
#include "model.h"
struct RunConfig {
std::string path = "chatglm-6b-int4.bin"; // 模型文件路径
int threads = 4; // 使用的线程数
bool lowMemMode = false; // 是否使用低内存模式
};
void Usage() {
std::cout << "Usage:" << std::endl;
std::cout << "[-h|--help]: show this help" << std::endl;
std::cout << "<-p|--path> <args>: path of the model file" << std::endl;
std::cout << "<-t|--threads> <args>: number of threads to use" << std::endl;
std::cout << "<-l|--low>: use low-memory mode" << std::endl;
std::cout << "<--top_p> <args>: sampling parameter top_p" << std::endl;
std::cout << "<--top_k> <args>: sampling parameter top_k" << std::endl;
std::cout << "<--temperature> <args>: sampling temperature; higher values make output less deterministic" << std::endl;
std::cout << "<--repeat_penalty> <args>: sampling repetition penalty" << std::endl;
}
void ParseArgs(int argc, char **argv, RunConfig &config, fastllm::GenerationConfig &generationConfig) {
std::vector <std::string> sargv;
for (int i = 0; i < argc; i++) {
sargv.push_back(std::string(argv[i]));
}
for (int i = 1; i < argc; i++) {
if (sargv[i] == "-h" || sargv[i] == "--help") {
Usage();
exit(0);
} else if (sargv[i] == "-p" || sargv[i] == "--path") {
config.path = sargv[++i];
} else if (sargv[i] == "-t" || sargv[i] == "--threads") {
config.threads = atoi(sargv[++i].c_str());
} else if (sargv[i] == "-l" || sargv[i] == "--low") {
config.lowMemMode = true;
} else if (sargv[i] == "-m" || sargv[i] == "--model") {
i++;
} else if (sargv[i] == "--top_p") {
generationConfig.top_p = atof(sargv[++i].c_str());
} else if (sargv[i] == "--top_k") {
generationConfig.top_k = atof(sargv[++i].c_str());
} else if (sargv[i] == "--temperature") {
generationConfig.temperature = atof(sargv[++i].c_str());
} else if (sargv[i] == "--repeat_penalty") {
generationConfig.repeat_penalty = atof(sargv[++i].c_str());
} else {
Usage();
exit(-1);
}
}
}
int main(int argc, char **argv) {
int round = 0;
std::string history = "";
RunConfig config;
fastllm::GenerationConfig generationConfig;
ParseArgs(argc, argv, config, generationConfig);
fastllm::PrintInstructionInfo();
fastllm::SetThreads(config.threads);
fastllm::SetLowMemMode(config.lowMemMode);
auto model = fastllm::CreateLLMModelFromFile(config.path);
static std::string modelType = model->model_type;
printf("欢迎使用 %s 模型. 输入内容对话,reset清空历史记录,stop退出程序.\n", model->model_type.c_str());
while (true) {
printf("用户: ");
std::string input;
std::getline(std::cin, input);
if (input == "reset") {
history = "";
round = 0;
continue;
}
if (input == "stop") {
break;
}
std::string ret = model->Response(model->MakeInput(history, round, input), [](int index, const char* content) {
if (index == 0) {
printf("%s:%s", modelType.c_str(), content);
fflush(stdout);
}
if (index > 0) {
printf("%s", content);
fflush(stdout);
}
if (index == -1) {
printf("\n");
}
}, generationConfig);
history = model->MakeHistory(history, round, input, ret);
round++;
}
return 0;
}
# pyfastllm
by [wildkid1024](https://github.com/wildkid1024)
pyfastllm is a Python API on top of fastllm. It lets you write more flexible, Pythonic code around the runtime and serve more complex, customized business needs:
- integrate with web frameworks such as fastapi and flask to expose data endpoints
- stream chat responses using Python's yield generators
- hook into fine-tuning methods such as LoRA and P-Tuning for downstream tasks (in development)
- seamlessly accelerate HuggingFace models and migrate existing business code with little friction (in development)
- and more...
## Changelog
### v0.1.3 2023-07-08
- Added usage and API documentation
- Added the fastllm-convert model-conversion interface and command-line tool
- Fixed several bugs caused by the new cpp interfaces
## Build and install
Build and install the fastllm Python bindings locally; there are two ways to build and run:
1. Shared library: build a dynamic library, which must sit on Python's load path
2. Wheel package: build a wheel and install it into Python's site-packages (CUDA not supported yet)
### Shared library
> The shared-library install does not support model conversion yet
First fetch the pybind11 C++ dependency:
```sh
git submodule init
git submodule update # fetch the pybind11 dependency
```
Manual C++ build:
```sh
mkdir build-py
cd build-py
cmake .. -DUSE_CUDA=ON -DPY_API=ON
make -j4
python cli.py -p chatglm-6b-int8.bin -t 8 # behaves the same as the cpp build
```
Build via the Python script:
```sh
cd pyfastllm
python build_libs.py --cuda
python cli.py -p chatglm-6b-int8.bin -t 8
```
### Wheel package
> Note: the wheel install does not support CUDA yet
First install pybind11:
```bash
pip install pybind11
```
```sh
cd pyfastllm
python setup.py build
python setup.py install
python cli.py -p chatglm-6b-int8.bin -t 8
```
## Usage
### Calling from Python
The demo folder contains several common code examples:
demo/cli.py: output answers via a callback function
demo/cli_thread.py: call the API from multiple threads (recommended)
demo/cli_low_api.py: low-level API usage
demo/convert_model.py: model conversion
demo/web_api.py, demo/web_api_client.py: fastapi web API
### Command-line tool
Use the command-line tool to convert models; usage is similar to convert_model.py:
```sh
$ fastllm-convert --help
$ fastllm-convert -m chatglm6B -p hf_model_path -o output_flm_path
```
### Dynamic batching example
```sh
mkdir build-py
cd build-py && cmake .. -DPY_API=ON -DUSE_CUDA=ON && make -j && cd -
cd pyfastllm/demo
python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32
```
You can load-test the service with locust. Partial results on an A100 40G with chatglm fp16:
| Concurrency | Mean latency (s) | TP95 (s) | TP99 (s) |
|----------:|------|------|------|
| 1 | 3.07 | 4.2 | 4.8 |
| 10 | 6.11 | 11.0 | 12.0 |
| 16 | 6.82 | 15.0 | 16.0 |
| 32 | 10.74 | 16.0 | 20.0 |
## API reference
### fastllm data structures
> fastllm.Tensor data types
- fastllm.float32
- fastllm.bfloat16
- fastllm.int16
- fastllm.int8
- fastllm.int4
- fastllm.int2
- fastllm.float16
> fastllm.Tensor: the basic fastllm tensor type
- fastllm.Tensor()
- fastllm.Tensor(Datatype)
- fastllm.Tensor(Datatype, Dims:list[int])
- fastllm.Tensor(Datatype, Dims:list[int], Data:list[float])
- fastllm.Tensor(Data:fastllm.Tensor)
- fastllm.Tensor.to_list() # convert the Tensor to a list and return it
- fastllm.Tensor.to() # move the Tensor to the target device
- fastllm.Tensor.zeros(Dims:list[int]) # create an all-zero tensor of shape Dims
- fastllm.Tensor.cat(Data:list[fastllm.Tensor], axis:int) # concatenate the tensors along axis (default 0)
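A minimal sketch of the Tensor API above, assuming pyfastllm is importable after the build (constructor and method behavior follow the signatures listed here and are not exhaustively verified):
```python
import pyfastllm as fastllm

a = fastllm.Tensor(fastllm.float32, [2, 2], [1.0, 2.0, 3.0, 4.0])  # dtype, dims, flat data
b = fastllm.Tensor(a)              # copy-construct from another Tensor
z = fastllm.Tensor.zeros([2, 2])   # all-zero tensor of shape [2, 2]
print(a.to_list())                 # back to a Python list
```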
### fastllm functions
> fastllm.get_llm_type(model_path:str)->str # get the type of the given model
> fastllm.set_threads(thread:int) -> None # set the number of worker threads, default 4
> fastllm.get_threads()->int # get the current number of worker threads
> fastllm.set_low_memory(flag:bool) # run in low-memory mode, default False
> fastllm.get_low_memory() # check whether low-memory mode is enabled
> fastllm.create_llm(model_path: str)-> fastllm.model # create a model instance from a local weight file, matched by rule
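Taken together, a hedged sketch of typical setup (the model path is a placeholder):
```python
import pyfastllm as fastllm

fastllm.set_threads(8)         # default is 4
fastllm.set_low_memory(False)
print(fastllm.get_llm_type("chatglm-6b-int8.bin"))  # model type inferred from the weight file
model = fastllm.create_llm("chatglm-6b-int8.bin")   # rule-matched model instance
```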
### fastllm modules
> fastllm.Tokenizer: tokenization and encoding/decoding utilities
> Tips: this class cannot be instantiated directly; access the concrete instance via model.weight.tokenizer
- fastllm.Tokenizer.encode(prompt:str) # tokenize and encode the prompt
- fastllm.Tokenizer.decode(output_ids:fastllm.Tensor) # decode a fastllm.Tensor into the corresponding string
- fastllm.Tokenizer.decode(output_ids: list[int]) # decode a list[int] into the corresponding string
- fastllm.Tokenizer.decode_byte(output_ids: fastllm.Tensor) # decode a Tensor into the corresponding byte stream
> fastllm.WeightMap: the model's weight dictionary
> Tips: this class cannot be instantiated directly; access the concrete instance via model.weight
- fastllm.WeightMap.tokenizer: the tokenizer instance stored in the weights
- fastllm.WeightMap.save_lowbit(output_path:str, bit:int): quantize the weights to low bit width and save them
- fastllm.WeightMap.set_kv(key:str, value:str): set an entry in the model's weight dictionary
- fastllm.WeightMap.set_weight(key:str, ): add a concrete Tensor to the weights
- fastllm.WeightMap\['key'\]: get the Tensor with the given key
### fastllm models
> fastllm.ChatGLMModel: a concrete model class; chatglm can be swapped for llama, alpaca, Moss and other models
- fastllm.ChatGLMModel.model_type: model type attribute, distinguishes different models
- fastllm.ChatGLMModel.weight: the model's WeightMap
- fastllm.ChatGLMModel.block_cnt: the number of blocks in the model
- fastllm.ChatGLMModel() # construct a model instance
- __call__(input_ids:fastllm.Tensor, attention_mask:fastllm.Tensor, position_ids:fastllm.Tensor, penalty_factor:fastllm.Tensor, pastKeyValues:memory_view) # run inference by calling the model like a function
- fastllm.ChatGLMModel.load_weights(model_path:str) # load model weights from a file path
- fastllm.ChatGLMModel.make_history(history:str, round:int, input:str, output:str) # build the model's dialogue history from past dialogue plus the current input and output
- fastllm.ChatGLMModel.make_input(history:str, round:int, input:str) # build the model input from past dialogue and the current input
- fastllm.ChatGLMModel.response(inputs:str, callback:function) # send a string to the model and process the returned answer with callback
- fastllm.ChatGLMModel.response_batch(inputs:list[str], callback:function) -> outputs:list[str] # send a list of strings to the model and process the returned answers with callback
- fastllm.ChatGLMModel.warmup() # warm up / pre-fill the GPU to avoid cold starts
- fastllm.ChatGLMModel.launch_response(inputs:str)->handle_id:int # for multithreaded use: fill the first token and return a handle id
- fastllm.ChatGLMModel.fetch_response(handle_id:int) # pop the next message for the given handle id from the message queue (see the sketch below this list)
- fastllm.ChatGLMModel.save_lowbit_model(model_path:str, q_bit:int) # quantize the weights to low bit width and save the model
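A hedged sketch of the threaded streaming flow implied by launch_response/fetch_response; the end-of-stream check below is an assumption, so treat demo/cli_thread.py as the authoritative pattern:
```python
import pyfastllm as fastllm

model = fastllm.create_llm("chatglm-6b-int8.bin")  # placeholder path
handle = model.launch_response("Hello")            # fill the first token, get back a handle id
while True:
    piece = model.fetch_response(handle)           # pop the next message for this handle
    if not piece:                                  # assumed end-of-stream sentinel
        break
    print(piece, end="", flush=True)
```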
Supported models:
| Model | Class | Notes
| -- | -- | --
| ChatGLM-6B | fastllm.ChatGLMModel |
| ChatGLM2-6B | fastllm.ChatGLMModel | version is tagged in the weights
| Moss | fastllm.MossModel |
| Alpaca | fastllm.llamaModel |
## Roadmap (TODO)
- [x] Rework response_batch's output_str handling so answers are returned as return values
- [x] Optimize encoding/decoding and merge the different return types
- [ ] Deep and shallow copies of Tensor, plus basic operator overloading
- [ ] Fix the pastKV copy bug in the low-level API
- [ ] A runtime-config class wrapping the model's runtime parameters: model path, thread count, low-memory mode, penalty factor, temperature, etc.
- [ ] Expose more low-level APIs; define model building blocks module-style so that custom models can be assembled from them
import os
import shutil
import platform
import sys
import argparse
parser = argparse.ArgumentParser(description='build fastllm libs')
parser.add_argument('--cuda', dest='cuda', action='store_true', default=False,
help='build with cuda support')
IS_WINDOWS = (platform.system() == 'Windows')
IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
BUILD_DIR = 'build-py' # build path
def build_libs():
    # create the build dir (wiped if it already exists)
    root_dir = os.path.dirname(os.getcwd())
    cmake_build_dir = os.path.join(root_dir, BUILD_DIR)
    if os.path.exists(cmake_build_dir):
        shutil.rmtree(cmake_build_dir)
    os.makedirs(cmake_build_dir)
    os.chdir(cmake_build_dir)
    # build it
    args = parser.parse_args()
    if IS_WINDOWS:
        os.system('cmake -G "Ninja" -DPY_API=ON .. && ninja pyfastllm')
    elif IS_LINUX:
        extra_opts = ' -DPY_API=ON '
        extra_opts += ' -DUSE_CUDA=ON ' if args.cuda else ' '
        build_cmd = 'cmake ' + extra_opts + ' .. && make pyfastllm -j4'
        print(build_cmd)
        os.system(build_cmd)  # reuse the command built above instead of reassembling it
    else:
        extra_opts = ' -DPY_API=ON '  # surrounding spaces matter: the pieces are concatenated directly below
        os.system('cmake' + extra_opts + '.. && make pyfastllm -j4')
if __name__ == '__main__':
    build_libs()
# -*- coding: utf-8 -*-
import sys
import platform
import logging
import argparse
sys.path.append('./build-py')
import pyfastllm # or fastllm, depending on how the module was built
logging.info(f"python compiler: {platform.python_compiler()}")
def args_parser():
    parser = argparse.ArgumentParser(description='pyfastllm')
    parser.add_argument('-m', '--model', type=int, required=False, default=0, help='model type, default 0; one of 0 (chatglm), 1 (moss), 2 (vicuna), 3 (baichuan)')
    parser.add_argument('-p', '--path', type=str, required=True, default='', help='path of the model file')
    parser.add_argument('-t', '--threads', type=int, default=4, help='number of threads to use')
    parser.add_argument('-l', '--low', action='store_true', help='use low-memory mode')
    args = parser.parse_args()
    return args
LLM_TYPE = ""
def print_back(idx: int, content: bytearray):
    content = content.decode(encoding="utf-8", errors="replace")
    if idx >= 0:
        print(f"\r{LLM_TYPE}:{content}", end='', flush=True)
    elif idx == -1:
        print()
    sys.stdout.flush()
def main(args):
    model_path = args.path
    OLD_API = False
    if OLD_API:
        model = pyfastllm.ChatGLMModel()
        model.load_weights(model_path)
        model.warmup()
    else:
        global LLM_TYPE
        LLM_TYPE = pyfastllm.get_llm_type(model_path)
        print(f"llm model: {LLM_TYPE}")
        model = pyfastllm.create_llm(model_path)
    while True:
        prompt = input("User: ")
        if prompt == "stop":  # exit before sending the sentinel to the model
            break
        config = pyfastllm.GenerationConfig()
        model.response(model.make_input("", 0, prompt), print_back, config)
        print()
        sys.stdout.flush()
if __name__ == "__main__":
    args = args_parser()
    main(args)