Commit aefd9f11 authored by zhouxiang

fastllm inference framework for the DCU platform

//
// Created by huangyuyang on 6/13/23.
//
#ifndef FASTLLM_DEVICE_H
#define FASTLLM_DEVICE_H
#include "fastllm.h"
namespace fastllm {
typedef std::map <std::string, Data*> DataDict;
typedef std::map <std::string, float> FloatDict;
typedef std::map <std::string, int> IntDict;
class BaseOperator {
public:
// whether this operator can run the given op
virtual bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
// shape inference for the given op
virtual void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
// execute the given op
virtual void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams) = 0;
};
class BaseBatchOperator : BaseOperator {
public:
// shape inference for the given op
virtual void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams,
const IntDict &intParams);
};
class BaseDevice {
public:
virtual bool Malloc (void **ret, size_t size) = 0; // allocate a buffer of size bytes
virtual bool Malloc (void **ret, Data &data); // allocate space matching data's shape
virtual bool Free(void *ret) = 0; // free ret
virtual bool CopyDataToCPU(void *dst, void *src, size_t size) = 0; // copy src on this device to dst on the CPU
virtual bool CopyDataToCPU(Data &data); // move data from this device to the CPU
virtual bool CopyDataFromCPU(void *dst, void *src, size_t size) = 0; // copy src on the CPU to dst on this device
virtual bool CopyDataFromCPU(Data &data); // move data from the CPU to this device
// whether this device can run the given op
virtual bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
// shape inference for the given op
virtual void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
// execute the given op
virtual void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
std::string deviceType;
std::string deviceName;
std::vector <int> deviceIds;
std::map <std::string, BaseOperator*> ops;
};
}
#endif //FASTLLM_DEVICE_H
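A minimal sketch of how an operator plugs into the interface above (illustrative, not part of this commit; MyNoopOp is a hypothetical name). Run is the only pure-virtual member, so CanRun and Reshape fall back to the base defaults; real operators are registered into a device's ops map inside its constructor.

#include "device.h"

class MyNoopOp : public fastllm::BaseOperator {
    void Run(const std::string &opType, const fastllm::DataDict &datas,
             const fastllm::FloatDict &floatParams, const fastllm::IntDict &intParams) {
        // inputs and outputs both arrive through `datas`, keyed by name
    }
};

// Registration inside a BaseDevice constructor would look like:
//     this->ops["Noop"] = (fastllm::BaseOperator*) new MyNoopOp();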
//
// Created by huangyuyang on 6/13/23.
//
#ifndef FASTLLM_CPUDEVICE_H
#define FASTLLM_CPUDEVICE_H
#include "device.h"
#include "cputhreadpool.h"
namespace fastllm {
class CpuDevice : BaseDevice {
public:
CpuDevice ();
bool Malloc (void **ret, size_t size); // allocate a buffer of size bytes
bool Free(void *ret); // free ret
bool CopyDataToCPU(void *dst, void *src, size_t size); // unused; the CPU device never performs this operation
bool CopyDataFromCPU(void *dst, void *src, size_t size); // unused; the CPU device never performs this operation
int threads = 4;
ThreadPool *threadPool = nullptr;
};
class CpuToFloat16 : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuToFloat32 : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuAttention : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuEmbedding : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuLayerNormOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuRMSNormOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuLinearOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSplitOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuCatOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuCatDirectOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMatMulOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMatMulTransBOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSoftMaxOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSiluOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuGeluNewOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSwigluOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMulOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMulToOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuAddToOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuAttentionMaskOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuAlibiMaskOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuTopKOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuPermuteOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuPermuteSelfOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuNearlyRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuLlamaRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuRepeatPenaltyOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuApplyLognAttnOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSplitBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuCatBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMulBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMatMulBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuMatMulTransBBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuSoftmaxBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CpuCatDirectBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
}
#endif //FASTLLM_CPUDEVICE_H
//
// Created by huangyuyang on 7/5/23.
//
#ifndef FASTLLCPUTHREADPOOL_H
#define FASTLLCPUTHREADPOOL_H
#include <condition_variable>
#include <mutex>
#include <queue>
#include <functional>
#include <future>
#include <thread>
#include <utility>
#include <vector>
namespace fastllm {
template <typename T>
class TaskQueue {
private:
std::queue <T> q;
std::mutex locker;
public:
TaskQueue() {}
~TaskQueue() {}
bool Empty() {
std::unique_lock<std::mutex> lock(locker);
return q.empty();
}
int Size() {
std::unique_lock<std::mutex> lock(locker);
return q.size();
}
void Push(T &t) {
std::unique_lock<std::mutex> lock(locker);
q.emplace(t);
}
bool Pop(T &t) {
std::unique_lock<std::mutex> lock(locker);
if (q.empty()) {
return false;
}
t = std::move(q.front());
q.pop();
return true;
}
};
class ThreadPool {
private:
class ThreadWorker
{
private:
int id;
ThreadPool *pool;
public:
ThreadWorker(ThreadPool *pool, const int id) : pool(pool), id(id) {}
void operator()() {
std::function<void()> func;
while (true) {
{
std::unique_lock<std::mutex> lock(pool->locker);
// the predicate re-checks the queue under the pool lock, guarding
// against spurious wakeups and wakeups that arrive before the wait
pool->cv.wait(lock, [this] { return pool->shutdown || !pool->queue.Empty(); });
if (pool->shutdown) {
return;
}
}
if (pool->queue.Pop(func)) {
func();
}
}
}
};
bool shutdown = false;
TaskQueue<std::function<void()>> queue;
std::vector<std::thread> threads;
std::mutex locker;
std::condition_variable cv;
public:
ThreadPool(const int t = 4) : threads(std::vector<std::thread>(t)) {
for (int i = 0; i < threads.size(); ++i) {
threads[i] = std::thread(ThreadWorker(this, i));
}
}
void Shutdown() {
{
// set the flag under the lock so waiting workers cannot miss it
std::unique_lock<std::mutex> lock(locker);
shutdown = true;
}
cv.notify_all();
for (int i = 0; i < threads.size(); ++i) {
if (threads[i].joinable()) {
threads[i].join();
}
}
}
template<typename F, typename... Args>
auto Submit(F &&f, Args &&...args) -> std::future<decltype(f(args...))> {
std::function<decltype(f(args...))()> func = std::bind(std::forward<F>(f), std::forward<Args>(args)...);
auto task_ptr = std::make_shared<std::packaged_task<decltype(f(args...))()>>(func);
std::function<void()> wrapper_func = [task_ptr]() {
(*task_ptr)();
};
{
// push under the pool lock so a worker between its predicate check
// and its wait cannot miss the notification
std::unique_lock<std::mutex> lock(locker);
queue.Push(wrapper_func);
}
cv.notify_one();
return task_ptr->get_future();
}
};
}
#endif //FASTLLCPUTHREADPOOL_H
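A usage sketch for the pool above (illustrative): Submit wraps the callable in a std::packaged_task, so the returned std::future carries the result. Note that Shutdown must be called explicitly; the pool has no destructor that joins the workers.

#include "cputhreadpool.h"
#include <cstdio>

int main() {
    fastllm::ThreadPool pool(4);
    auto sum = pool.Submit([](int a, int b) { return a + b; }, 1, 2);
    printf("%d\n", sum.get()); // blocks until a worker has run the task, then prints 3
    pool.Shutdown();           // joins all worker threads
    return 0;
}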
//
// Created by huangyuyang on 6/14/23.
//
#ifndef FASTLLM_CUDADEVICE_H
#define FASTLLM_CUDADEVICE_H
#include "device.h"
namespace fastllm {
class CudaDevice : BaseDevice {
public:
CudaDevice ();
bool Malloc (void **ret, size_t size); // allocate a buffer of size bytes
bool Free(void *ret); // free ret
bool CopyDataToCPU(void *dst, void *src, size_t size);
bool CopyDataFromCPU(void *dst, void *src, size_t size);
};
class CudaAttention : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaLayerNormOp : BaseOperator {
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaRMSNormOp : BaseOperator {
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaLinearOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSplitOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaCatDirectOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMatMulOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMatMulTransBOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSoftMaxOp : BaseOperator {
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaGeluNewOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSiluOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSwigluOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMulOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaAddToOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMulToOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaAttentionMaskOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaAlibiMaskOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaTopKOp : BaseOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaPermuteSelfOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaLlamaRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaNearlyRotatePosition2DOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaApplyLognAttnOp : BaseOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSplitBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaCatBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMulBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMatMulBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaMatMulTransBBatchOp : BaseBatchOperator {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaSoftmaxBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
class CudaCatDirectBatchOp : BaseBatchOperator {
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
};
}
#endif //FASTLLM_CUDADEVICE_H
#include "fastllm.h"
#ifdef __cplusplus
extern "C" {
#endif
void FastllmInitCublas(void);
void FastllmCudaMallocBigBuffer(size_t size);
void FastllmCudaClearBigBuffer();
void *FastllmCudaMalloc(size_t size);
void FastllmCudaFree(void *ret);
void FastllmCudaCopyFromHostToDevice(void *dst, void *src, size_t size);
void FastllmCudaCopyFromDeviceToHost(void *dst, void *src, size_t size);
void FastllmCudaCopyFromDeviceToDevice(void *dst, void *src, size_t size);
void FastllmCudaMemcpy2DDeviceToDevice(void * dst, size_t dpitch, const void * src,
size_t spitch, size_t width, size_t height);
void FastllmCudaMemcpy2DDeviceToDeviceBatch(void ** dsts, size_t * dpitchs, void ** srcs,
size_t * spitchs, size_t *widths, size_t * heights,
int batch);
bool FastllmCudaAttention(const fastllm::Data &q, const fastllm::Data &k, const fastllm::Data &v,
const fastllm::Data &mask, const fastllm::Data &output, int group, float scale);
bool FastllmCudaGeluNew(const fastllm::Data &input, fastllm::Data &output);
bool FastllmCudaSilu(const fastllm::Data &input, fastllm::Data &output);
bool FastllmCudaSwiglu(const fastllm::Data &input, fastllm::Data &output);
bool FastllmCudaMul(const fastllm::Data &input, float v, fastllm::Data &output);
bool FastllmCudaSoftmax(const fastllm::Data &input, fastllm::Data &output, int axis);
bool FastllmCudaAddTo(fastllm::Data &input0, const fastllm::Data &input1, float alpha);
bool FastllmCudaMulTo(fastllm::Data &input0, const fastllm::Data &input1, float alpha);
bool FastllmCudaAttentionMask(fastllm::Data &input, const fastllm::Data &mask, float maskValue);
bool FastllmCudaAlibiMask(fastllm::Data &input, const fastllm::Data &mask, float maskValue);
bool FastllmCudaRMSNorm(const fastllm::Data &input, fastllm::Data &weight, fastllm::Data &output, float eps);
bool FastllmCudaLayerNorm(const fastllm::Data &input, fastllm::Data &gamma, fastllm::Data &beta, fastllm::Data &output, int axis);
bool FastllmCudaTopK(const fastllm::Data &input, fastllm::Data &output, int topk);
bool FastllmCudaPermute(fastllm::Data &input, const std::vector<int> &axis);
bool FastllmCudaMatMulFloatInt8(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloatInt4(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloatInt4NoZero(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloat32(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloat16(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaBatchMatMul(const fastllm::Data &input0, const fastllm::Data &input1, fastllm::Data &output,
int input0Spatial, int input1Spatial, int outputSpatial,
int input0Stride, int input1Stride,
int batch, int n, int m, int k, float alpha);
bool FastllmCudaBatchMatMulTransB(const fastllm::Data &input0, const fastllm::Data &input1, fastllm::Data &output,
int input0Spatial, int input1Spatial, int outputSpatial,
int input0Stride, int input1Stride,
int batch, int n, int m, int k, float alpha);
bool FastllmCudaRotatePosition2D(fastllm::Data &data, const fastllm::Data &positionIds,
const fastllm::Data &sinData, const fastllm::Data &cosData, int rotaryDim);
bool FastllmCudaNearlyRotatePosition2D(fastllm::Data &data, const fastllm::Data &positionIds,
const fastllm::Data &sinData, const fastllm::Data &cosData, int rotaryDim);
bool FastllmCudaLlamaRotatePosition2D(fastllm::Data &data, const fastllm::Data &positionIds,
const fastllm::Data &sinData, const fastllm::Data &cosData, int rotaryDim);
bool FastllmCudaApplyLognAttn (fastllm::Data &input, fastllm::Data &lognAttn, fastllm::Data &positionIds);
bool FastllmCudaSplitBatch(fastllm::Data &input, fastllm::Data **outputs, int axis);
bool FastllmCudaCatBatch(fastllm::Data **inputs, fastllm::Data &output, int axis);
bool FastllmCudaMulBatch(fastllm::Data **inputs, float v, int batch, fastllm::Data **outputs);
bool FastllmCudaSoftmaxBatch(fastllm::Data **inputs, fastllm::Data **outputs, int axis, int batch);
bool FastllmCudaBatchMatMulTransBBatch(void **i0s, void **i1s, void **os,
int *ns, int *ms, int *ks,
int *i0Strides, int *i1Strides, float alpha, int batch);
bool FastllmCudaBatchMatMulBatch(void **i0s, void **i1s, void **os,
int *ns, int *ms, int *ks,
int *i0Strides, int *i1Strides, float alpha, int batch);
void FastllmCudaSetDevice(int gpu_id);
#ifdef __cplusplus
}
#endif
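An illustrative round trip through the memory helpers declared above, assuming a CUDA/DCU build of the library (error handling omitted):

#include <cstring>

void CudaCopyExample() {
    float host[16];
    memset(host, 0, sizeof(host));
    void *dev = FastllmCudaMalloc(sizeof(host));              // device-side buffer
    FastllmCudaCopyFromHostToDevice(dev, host, sizeof(host)); // upload
    FastllmCudaCopyFromDeviceToHost(host, dev, sizeof(host)); // download
    FastllmCudaFree(dev);                                     // release the device buffer
}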
//
// Created by huangyuyang on 6/13/23.
//
#ifndef FASTLLM_EXECUTOR_H
#define FASTLLM_EXECUTOR_H
#include "device.h"
namespace fastllm {
class Executor {
private:
std::vector <BaseDevice*> devices;
std::map <std::string, float> profiler;
public:
Executor (); // build the default executor
~Executor(); // destructor
void ClearDevices(); // clear the device list
void AddDevice(BaseDevice *device); // add a device
void SetFirstDevice(const std::string &device); // set the preferred device
std::vector <int> GetDeviceIds(const std::string &device); // get the deviceIds of the given device
// run a single op
void Run(const std::string &opType, const fastllm::DataDict &datas, const fastllm::FloatDict &floatParams,
const fastllm::IntDict &intParams);
void ClearProfiler();
void PrintProfiler();
};
}
#endif //FASTLLM_EXECUTOR_H
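A dispatch sketch (illustrative): Run asks each registered device whether it CanRun the op and executes on the first one that accepts. The dictionary keys below ("input", "output", "v") follow the naming pattern used by the op wrappers in fastllm.h, but are an assumption here.

#include "executor.h"

void MulByTwo(fastllm::Executor &exe, fastllm::Data &input, fastllm::Data &output) {
    exe.Run("Mul",
            {{"input", &input}, {"output", &output}}, // DataDict: name -> Data*
            {{"v", 2.0f}},                            // FloatDict: scalar parameters
            {});                                      // IntDict: none needed for Mul
}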
//
// Created by huangyuyang on 5/11/23.
//
#ifndef TEST_FASTLLM_H
#define TEST_FASTLLM_H
#include <vector>
#include <cstdint>
#include <string>
#include <map>
#include <set>
#include <queue>
#include <unordered_map>
#include <cmath>
#include <algorithm>
#include <iostream>
#include <functional>
#include <memory>
#include "devices/cpu/cputhreadpool.h"
namespace fastllm {
void SetDeviceMap(const std::map <std::string, int> &deviceMap);
std::map <std::string, int> GetDeviceMap();
void PrintInstructionInfo();
void SetThreads(int t);
void SetLowMemMode(bool m);
void SetKVCacheInCPU(bool kvCacheInCPU);
bool GetLowMemMode();
int GetThreads();
bool GetKVCacheInCPU();
ThreadPool *GetPool();
struct GenerationConfig {
int output_token_limit = -1; // maximum number of output tokens; <= 0 means unlimited
int last_n = 64; // the last last_n tokens count toward the repetition penalty
float repeat_penalty = 1.0f; // repetition penalty factor; 1.0 means no penalty
int top_k = 1; // top-k sampling
float top_p = 1.0; // top-p sampling
float temperature = 1.0; // temperature, typically 0.1 ~ 1.0; larger values give more diverse results
bool output_logits = false; // whether to return logits
bool enable_hash_id = false; // attach a hash id to the session
bool IsSimpleGreedy() const {
if (fabs(repeat_penalty - 1) > 1e-8) {
return false;
}
if (top_k > 1) {
return false;
}
return true;
}
};
struct LastTokensUnit {
int tot = 0;
std::multiset <int> tokenSet;
std::queue <int> tokenQueue;
LastTokensUnit () {}
LastTokensUnit (int tot) {
Init(tot);
}
void Init(int tot) {
this->tot = tot;
tokenSet.clear();
while (tokenQueue.size() > 0) {
tokenQueue.pop();
}
}
void Push(int id) {
if (tokenQueue.size() == tot) {
tokenSet.erase(tokenSet.find(tokenQueue.front()));
tokenQueue.pop();
}
tokenQueue.push(id);
tokenSet.insert(id);
}
};
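// Worked example (illustrative): with tot = 3, pushing 5, 6, 7, 8 leaves
// tokenQueue = {6, 7, 8}; tokenSet is a multiset, so repeated ids are counted
// individually, which is what the repetition penalty consumes.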
struct LastTokensManager {
std::vector <LastTokensUnit> units;
LastTokensManager () {}
LastTokensManager (int batch, int lastN) {
units.resize(batch);
for (int i = 0; i < batch; i++) {
units[i].Init(lastN);
}
}
};
struct LowBitConfig {
int bit;
float min, max;
uint8_t zeroPoint;
float scale;
int type; // 0: with zero point, 1: without zero point
LowBitConfig(float min, float max, int bit, int type) {
this->min = min;
this->max = max;
this->bit = bit;
this->type = type;
Reset();
}
LowBitConfig () {
}
void Reset() {
/*if (type == 1) {
this->scale = (max - min) / 15.0;
return;
}*/
/*if (type == 1) {
this->scale = std::max(fabs(max), fabs(min)) / 7.0;
this->min = this->scale * (-7.0);
return;
}*/
min = std::min(min, 0.f);
max = std::max(max, 0.f);
const float qmin = 0;
const float qmax = (1 << bit) - 1;
scale = (max - min) / (qmax - qmin);
const float initial_zero_point = qmin - min / scale;
zeroPoint = 0;
if (initial_zero_point < qmin) {
zeroPoint = qmin;
} else if (initial_zero_point > qmax) {
zeroPoint = qmax;
} else {
zeroPoint = static_cast<uint8_t>(std::round(initial_zero_point));
}
if (type == 1) {
this->min = -this->scale * zeroPoint;
return;
}
}
uint8_t quantization(const float &realNumber) const {
if (type == 0) {
return (uint8_t) (std::min((double) ((1 << bit) - 1),
std::max(realNumber / scale + zeroPoint + 0.5, 0.0)));
} else {
return (uint8_t) (std::max(0.f, std::min(15.f, (realNumber - min) / scale + 0.5f)));
}
}
float invQuantization(const uint8_t &qNumber) const {
if (type == 0) {
return (scale * ((float) qNumber - (float) zeroPoint));
} else {
return min + scale * qNumber;
}
}
};
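// Worked example of the asymmetric (type == 0) scheme above, illustrative only.
// With bit = 8, min = -2.0f, max = 6.0f:
//     scale = (6 - (-2)) / 255 ≈ 0.031373
//     initial_zero_point = 0 - (-2 / 0.031373) = 63.75  ->  zeroPoint = 64
//     quantization(1.0f)  = (uint8_t)(1.0 / 0.031373 + 64 + 0.5) = 96
//     invQuantization(96) = 0.031373 * (96 - 64) ≈ 1.004 (round-trip error ≈ 0.4%)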
enum DataType {
FLOAT32 = 0, BFLOAT16 = 1, INT16 = 2, INT8 = 3, INT4 = 4, INT2 = 5, BIT = 6, FLOAT16 = 7,
INT4_NOZERO = 8, // int4 without zeroPoint: floatValue = min + uint4Value * scale
INT32PARAM = 100 // int32 parameters; data of this type always stays on the CPU
};
enum DataDevice {
CPU = 0, CUDA = 1
};
enum WeightType {
NONE = 0, LINEAR = 1, EMBEDDING = 2
};
struct FileMmap {
public:
FileMmap(const std::string &path);
~FileMmap();
char *data;
size_t size;
};
struct ModelLoader {
ModelLoader(const char *buffer, size_t size) : data(buffer), size(size), ptr(buffer) {}
int64_t tell() const { return ptr - data; }
void seek(int64_t offset, int whence);
template <typename T>
T read_basic() {
T obj = *(T *)ptr;
ptr += sizeof(T);
return obj;
}
std::string ReadString();
int ReadInt();
float ReadFloat();
uint8_t* ReadBytes(uint64_t bytes);
const char *const data;
size_t size;
const char *ptr;
};
class Data {
public:
bool lockInCPU = false; // if locked on the CPU, the data may not be moved to other devices
WeightType weightType = WeightType::NONE; // weight type; NONE means not a weight (or unknown)
DataType dataType = DataType::FLOAT32; // data type
int unitSize, unitSizeDiv = 1; // bytes per element = unitSize / unitSizeDiv
std::vector <int> dims; // shape
std::vector <uint64_t> strides; // strides
uint64_t expansionSize = 0; // element count after expansion
uint64_t expansionBytes = 0; // byte count after expansion
std::vector <int> expansionDims; // pre-expanded shape
uint8_t *cpuData = nullptr; // data pointer
void *cudaData = nullptr;
std::vector <void*> extraCudaData;
void *deviceData = nullptr;
std::vector <void*> extraDeviceData;
DataDevice dataDevice = DataDevice::CPU;
std::vector <int> dataDeviceIds;
// these two fields are for quantization and do not apply to FLOAT data
int perChannelAxis = -1; // axis for per-channel quantization; -1 means no per-channel split
std::vector <LowBitConfig> perChannelsConfigs; // perChannelsConfigs[i] holds the min/max of channel i; without per-channel quantization, perChannelsConfigs[0] holds the global min/max
std::vector <float> scales, mins;
std::vector <int> zeros;
std::vector <int> weightSum; // when used as a weight, partial sums are sometimes cached to speed up computation
std::string fileName;
long long filePos;
std::shared_ptr<FileMmap> m_file;
Data () {};
Data (DataType type);
Data (DataType type, const std::vector <int> &dims); // constructor
// constructor that copies from data after creation;
// data holds raw values and is quantized when type is not float
Data (DataType type, const std::vector <int> &dims, const std::vector <float> &data);
~Data(); // destructor
Data (const Data &ori); // deep copy
void CopyFrom(const Data &ori); // copy
uint64_t GetBytes() const; // total byte count
void Allocate(); // allocate memory
void Allocate(float v); // allocate memory and fill with v
void Expansion(const std::vector <int> &dims); // pre-expand to the given dims
void MallocSpace(uint64_t size); // allocate on the device
void FreeSpace(); // release device memory
void UpdateUnitSize(); // update unitSize
void Resize(const std::vector <int> &dims); // resize
void Reshape(const std::vector <int> &dims); // change dims without touching the data
uint64_t Count(int i) const; // dims[i] * strides[i]
void PrintShape() const; // print the shape
void Print() const; // print the contents
void CalcWeightSum(); // compute weightSum
void ToDevice(DataDevice device); // move to the given device
void ToDevice(DataDevice device, const std::vector <int> &deviceIds); // move to the given devices
void ToDevice(void *device);
void set_file(std::shared_ptr<FileMmap> file) {
m_file = file;
}
};
struct Tokenizer {
enum TokenizerType {
BPE = 0,
NORMAL = 1,
QWEN = 2
};
struct TrieNode {
int tokenId;
float score;
std::map <int, TrieNode*> next;
TrieNode();
};
struct Symbol {
TrieNode *node;
char *s;
int pos, len;
int prev, next;
int fixId;
Symbol (Tokenizer::TrieNode *node,
char *s, int pos, int len,
int prev, int next, int fixId) {
this->node = node;
this->s = s;
this->pos = pos;
this->len = len;
this->prev = prev;
this->next = next;
this->fixId = fixId;
}
};
struct SymbolPairs {
float score;
int l, r, size;
SymbolPairs(float score, int l, int r, int size) {
this->score = score;
this->l = l;
this->r = r;
this->size = size;
}
};
friend bool operator < (const SymbolPairs &a, const SymbolPairs &b) {
return a.score < b.score || (a.score == b.score && a.l > b.l);
}
TrieNode *root;
TokenizerType type = TokenizerType::BPE;
std::unordered_map <int, std::string> tokenToStringDict;
std::unordered_map <int, float> tokenToScoreDict;
std::unordered_map <std::string, int> stringToTokenDict;
Tokenizer ();
~Tokenizer();
void Clear(); // reset the tokenizer
void TryMergePairs(std::vector<Symbol> &symbols, int l, int r, std::priority_queue <SymbolPairs> &q); // push candidate symbol pairs
void Insert(const std::string &s, int tokenId, float score = 1.0f); // insert a token
Data Encode(const std::string &s); // encode
std::string Decode(const Data &data); // decode
std::string DecodeTokens(const std::vector <int> &tokens); // decode
};
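// Usage sketch (illustrative): Insert() populates the trie with token strings
// and scores; Encode() then maps text to token ids, and Decode()/DecodeTokens()
// map ids back to text.
//     Tokenizer tok;
//     tok.Insert("hello", 100);
//     tok.Insert(" world", 101);
//     Data ids = tok.Encode("hello world");
//     std::string text = tok.DecodeTokens({100, 101});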
std::string GetModelTypeFromFile(const std::string &fileName);
struct WeightMap {
int versionId = 2;
Tokenizer tokenizer;
std::map <std::string, std::string> dicts;
std::map <std::string, Data> weight;
std::map <std::string, std::map <std::string, std::string>> peftDict;
std::set <std::string> embeddingNames;
void LoadFromFile(const std::string &fileName); // load from file
void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model; bit = 0 saves without quantization
void AddTokenizerWord(const std::string &key, int value, float score); // add a tokenizer word
void AddDict(const std::string &key, const std::string &value); // insert a dict entry
void AddAdapterDict(const std::string &name, const std::string &key, const std::string &value);
void AddWeight(const std::string &key, const std::vector <int> &dims,
DataType dataType, WeightType weightType, DataType oriDataType, uint8_t *oriData); // insert a weight
void AddQLinearWeight(const std::string &key, const std::vector <int> &dims,
int bit, float *scales, uint8_t *oriData); // insert a QLinear weight; dequantization rule: float value = scales * oriData
Data &operator [] (const std::string &key);
};
void ClearProfiler();
void PrintProfiler();
void ApplyDeviceMap(const std::map <std::string, int> &deviceMap, int current, int total); // at step current of total; switch devices according to deviceMap
int LLMSampling(Data &logits, int outerOffset,
const GenerationConfig &config, const LastTokensUnit &tokens); // sample from logits[outerOffset * vocabSize, (outerOffset + 1) * vocabSize)
void ToDataType(const Data &input, DataType dataType);
void Attention(const Data &q, const Data &k, const Data &v, const Data &mask, Data &output,
int group, float scale, int attentionType);
void Embedding(const Data &input, Data &weight, Data &output);
void RMSNorm(const Data &input, const Data &weight, float eps, Data &output);
void LayerNorm(Data &input, Data &gamma, Data &beta, int axis, Data &output);
void Linear(Data &input, Data &weight, const Data &bias, Data &output);
void Split(const Data &input, int axis, int start, int end, Data &output);
void Cat(const Data &input0, const Data &input1, int axis, Data &output);
void CatDirect(Data &input0, const Data &input1, int axis); // append input1's data directly after input0 (input0 must be pre-expanded with enough space)
void MatMul(const Data &input0, const Data &input1, Data &output, float alpha = 1.0);
void MatMulTransB(const Data &input0, const Data &input1, Data &output, float alpha = 1.0);
void Softmax(const Data &input, Data &output, int axis);
void Silu(const fastllm::Data &input, fastllm::Data &output);
void GeluNew(const Data &input, Data &output);
void Swiglu(const fastllm::Data &input, fastllm::Data &output);
void Mul(const Data &input, float v, Data &output);
void MulTo(Data &input0, const Data &input1); // input0 *= input1
void AddTo(Data &input0, const Data &input1, float alpha = 1.0); // input0 += input1 * alpha
void AttentionMask(Data &input, const Data &mask, float maskValue); // set positions of input where mask is 1 to maskValue
void AlibiMask(Data &input, const Data &mask, float maskValue); // alibi mask
void Permute(const Data &input, const std::vector<int> &axis, Data &output); // permute axes
void PermuteSelf(const Data &input, const std::vector<int> &axis); // permute axes in place
void TopK(const Data &input, Data &output, int topK); // top-k
void RotatePosition2D(Data &input, const Data &positionIds, Data &sinData, Data &cosData, int rotaryDim); // 2D position
void NearlyRotatePosition2D(Data &input, const Data &positionIds, Data &sinData, Data &cosData, int rotaryDim); // 2D position, rotating adjacent element pairs
void LlamaRotatePosition2D(Data &input, const Data &positionIds, Data &sinData, Data &cosData, int rotaryDim); // 2D position for llama
void RepeatPenalty(Data &input, const Data &penalty); // penalty: input[i] = input[i] < 0 ? input[i] * penalty[i] : input[i] / penalty[i];
void ApplyLognAttn(Data &input, const Data &lognAttn, const Data &positionIds);
void MulBatch(std::vector <Data*> &input, float v, std::vector <Data*> &output);
void SplitBatch(const Data &input, int axis, int part, std::vector <Data*> &outputs); // split input along axis into parts of size 1 and store them in outputs
void CatBatch(std::vector <Data*> &input, int axis, Data &outputs); // concatenate the inputs (each of size 1 along axis) into outputs
void MatMulBatch(std::vector <Data*> &input0, std::vector <Data*> &input1, std::vector <Data*> &output, float alpha = 1.0);
void MatMulTransBBatch(std::vector <Data*> &input0, std::vector <Data*> &input1, std::vector <Data*> &output, float alpha = 1.0);
void SoftmaxBatch(std::vector <Data*> &input, std::vector <Data*> &output, int axis);
void CatDirectBatch(std::vector <Data*> &input0, std::vector <Data*> &input1, int axis);
void LoraLayer(Data &input, Data &weight, Data &loraA, Data &loraB, const Data &bias, Data &output,
std::map <std::string, std::string> loraConfig);
void IA3Layer(Data &input, Data &weight, Data &ia3_l, Data &bias, Data &output,
std::map <std::string, std::string> ia3Config);
}
#endif //TEST_FASTLLM_H
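A small end-to-end sketch of the Data API and one of the ops declared above (illustrative; shape rules are enforced by the individual operator implementations):

#include "fastllm.h"

void MatMulExample() {
    // 2x3 and 3x2 tensors filled with ones; the float-vector constructor
    // quantizes automatically when the type is not a float type.
    fastllm::Data a(fastllm::DataType::FLOAT32, {2, 3}, std::vector<float>(6, 1.0f));
    fastllm::Data b(fastllm::DataType::FLOAT32, {3, 2}, std::vector<float>(6, 1.0f));
    fastllm::Data c;
    fastllm::MatMul(a, b, c); // every entry of the 2x2 result is 3
    c.Print();
}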
//
// Created by huangyuyang on 6/20/23.
//
#ifndef FASTLLM_MODEL_H
#define FASTLLM_MODEL_H
#include "basellm.h"
namespace fastllm {
std::unique_ptr<basellm> CreateLLMModelFromFile(const std::string &fileName);
std::unique_ptr<basellm> CreateEmptyLLMModel(const std::string &modelType);
}
#endif //FASTLLM_MODEL_H
#pragma once
#include "fastllm.h"
#include <thread>
#include <mutex>
#ifdef PY_API
#include "Python.h"
#include <pybind11/pytypes.h>
using RuntimeResult = std::function<void(int index, pybind11::bytes content)>;
using RuntimeResultBatch = std::function<void(int index, std::vector <pybind11::bytes> &contents)>;
#else
using RuntimeResult = std::function<void(int index, const char* content)>;
using RuntimeResultBatch = std::function<void(int index, std::vector <std::string> &contents)>;
#endif
namespace fastllm {
struct ResponseContext {
bool isEnding = false;
std::vector <std::pair <Data, Data> > pastKeyValues;
std::vector <int> currentTokens;
std::queue <int> resultTokenQueue;
std::queue <std::vector <float>*> resultLogits;
GenerationConfig generationConfig;
LastTokensUnit tokens;
int preTokens = 0;
int curTokens = 0;
std::map <std::string, int> intParams;
void Init(int blocks);
};
struct ResponseContextDict {
std::mutex locker;
std::map <int, ResponseContext*> dicts;
int CreateHandle();
ResponseContext* GetHandle(int handleId);
void RemoveHandle(int handleId);
};
class basellm {
public:
basellm() {};
virtual ~basellm() {}; // virtual: models are deleted through basellm*
virtual void LoadFromFile(const std::string &fileName); // load from file
virtual void InitParams(); // initialize parameter info
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector<std::pair<Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr) = 0;
virtual std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
virtual std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
// build the LLM inference inputs from the given tokens
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
// build the LLM inference inputs from the given tokens (batched)
virtual void FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual std::string Response(const std::string &input,
RuntimeResult retCb,
const GenerationConfig &generationConfig = GenerationConfig());
virtual void ResponseBatch(const std::vector<std::string> &inputs,
std::vector<std::string> &outputs,
RuntimeResultBatch retCb = nullptr,
const GenerationConfig &generationConfig = GenerationConfig()); // batch reply for the given inputs
virtual int LaunchResponseTokens(const std::vector <int> &inputTokens,
const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task and return the assigned handleId
virtual int FetchResponseTokens(int handleId); // fetch the next output token of the handle; -1 means the output has finished
virtual int FetchResponseLogits(int handleId, std::vector <float> &logits); // fetch the output logits of the handle
virtual void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model
virtual void SaveModel(const std::string &fileName); // export directly
virtual void WarmUp() {}; // warm up
virtual std::string MakeInput(const std::string &history, int round, const std::string &input) = 0; // build the prompt from the history and the current input
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) = 0; // update the history with the current reply
virtual void SetAdapter(const std::string &name);
virtual void DisableAdapter();
std::string model_type;
std::string pre_prompt; // prompt prefix for the first round of conversation
std::string user_role, bot_role, history_sep; // used to build each round's prompt
int bos_token_id;
int eos_token_id;
int embed_dim = 4096;
int num_attention_heads = 32;
int head_dim = embed_dim / num_attention_heads;
const int max_positions = 32768;
int rotary_dim = 64;
const float scale_attn = sqrt(head_dim);
int block_cnt = 28;
std::vector<std::vector<float> > sin, cos;
WeightMap weight; // weights
Data sinData, cosData;
ResponseContextDict responseContextDict;
std::thread *mainLoop = nullptr;
std::mutex mainLoopLocker, dictLocker;
std::map <std::string, int> deviceMap;
std::string adapterName;
};
}
//
// Created by huangyuyang on 5/11/23.
//
#ifndef FASTLLM_CHATGLM_H
#define FASTLLM_CHATGLM_H
#include "basellm.h"
#include "cmath"
#include <iostream>
namespace fastllm {
class ChatGLMModel: public basellm {
public:
ChatGLMModel (); // constructor
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *retLogits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
// build the LLM inference inputs from the given tokens
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
// build the LLM inference inputs from the given tokens (batched)
virtual void FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual void WarmUp(); // warm up
virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply
int GetVersion();
void UpdateSinCos(float rope);
private:
virtual void CausalMask(Data &data, int start) {}; // causal mask (no-op here)
float rope = 1.0f;
};
}
#endif //FASTLLM_CHATGLM_H
#pragma once
#include "chatglm.h"
#include "moss.h"
#include "basellm.h"
#include "llama.h"
#include "qwen.h"
#include "fastllm.h"
enum LLM_TYPE {
LLM_TYPE_CHATGLM = 0,
LLM_TYPE_MOSS = 1,
LLM_TYPE_VICUNA = 2,
LLM_TYPE_BAICHUAN = 3,
LLM_TYPE_QWEN = 4
};
class factoryllm {
public:
factoryllm() {};
~factoryllm() {};
fastllm::basellm *createllm(LLM_TYPE type) {
fastllm::basellm *pLLM = NULL;
switch (type) {
case LLM_TYPE_CHATGLM:
pLLM = new fastllm::ChatGLMModel();
break;
case LLM_TYPE_MOSS:
pLLM = new fastllm::MOSSModel();
break;
case LLM_TYPE_VICUNA:
pLLM = new fastllm::LlamaModel();
break;
default:
pLLM = new fastllm::QWenModel();
break;
}
return pLLM;
};
};
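A usage sketch (illustrative; the model file path is hypothetical):

#include <memory>

void CreateModelExample() {
    factoryllm factory;
    std::unique_ptr<fastllm::basellm> model(factory.createllm(LLM_TYPE_CHATGLM));
    model->LoadFromFile("chatglm-6b-int8.flm"); // hypothetical model file
}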
//
// Created by huangyuyang on 6/1/23.
//
#ifndef FASTLLM_LLAMA_H
#define FASTLLM_LLAMA_H
#include "basellm.h"
#include "cmath"
#include <iostream>
namespace fastllm {
class LlamaModel: public basellm {
public:
LlamaModel (); // constructor
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
virtual std::string Response(const std::string& input,
RuntimeResult retCb,
const GenerationConfig &generationConfig = GenerationConfig()); // reply to the given input
virtual void ResponseBatch(const std::vector <std::string> &inputs,
std::vector <std::string> &outputs,
RuntimeResultBatch retCb,
const GenerationConfig &generationConfig = GenerationConfig());
virtual int LaunchResponseTokens(const std::vector <int> &inputTokens,
const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task and return the assigned handleId
virtual int FetchResponseTokens(int handleId); // fetch the next output token of the handle; -1 means the output has finished
virtual void WarmUp(); // warm up
virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply
bool is_nsql = false;
};
}
#endif //FASTLLM_LLAMA_H
//
// Created by huangyuyang on 5/12/23.
//
#ifndef TEST_MOSS_H
#define TEST_MOSS_H
#include "basellm.h"
#include "cmath"
namespace fastllm {
class MOSSModel: public basellm {
public:
MOSSModel(); // constructor
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr);
virtual std::string Response(const std::string &input, RuntimeResult retCb,
const GenerationConfig &generationConfig = GenerationConfig()); // reply to the given input
virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual void WarmUp();
private:
virtual void RotatePosition2D(Data &data, const Data &positionIds); // 2D positional encoding
virtual void CausalMask(Data &data, int start); // causal mask
};
}
#endif //TEST_MOSS_H
//
// Created by siemon on 8/9/23.
//
#ifndef TEST_QWEN_H
#define TEST_QWEN_H
#include "basellm.h"
namespace fastllm {
class QWenModel : public basellm {
public:
QWenModel();
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const std::vector <Data*> &attentionMask,
const std::vector <Data*> &positionIds,
const std::vector <int> &seqLens,
std::vector <std::pair <Data*, Data*> > &pastKeyValues,
const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *retLogits = nullptr);
virtual std::string MakeInput(const std::string &history, int round, const std::string &input);
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output);
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual void FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual void WarmUp();
void UpdateRotaryPosEmb(float ntk_alpha);
int seq_length;
float ntk_alpha;
bool use_log_attn;
Data logn_list;
private:
std::string im_start = "<|im_start|>";
std::string im_end = "<|im_end|>";
};
}
#endif //TEST_QWEN_H
#ifndef TFDL_ARMFUNCTIONS_H
#define TFDL_ARMFUNCTIONS_H
/* NEON implementation of sin, cos, exp and log
*
* Inspired by Intel Approximate Math library, and based on the
* corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2011 Julien Pommier
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* (this is the zlib license)
*/
#include <arm_neon.h>
#define c_inv_mant_mask ~0x7f800000u
#define c_cephes_SQRTHF 0.707106781186547524
#define c_cephes_log_p0 7.0376836292E-2
#define c_cephes_log_p1 - 1.1514610310E-1
#define c_cephes_log_p2 1.1676998740E-1
#define c_cephes_log_p3 - 1.2420140846E-1
#define c_cephes_log_p4 + 1.4249322787E-1
#define c_cephes_log_p5 - 1.6668057665E-1
#define c_cephes_log_p6 + 2.0000714765E-1
#define c_cephes_log_p7 - 2.4999993993E-1
#define c_cephes_log_p8 + 3.3333331174E-1
#define c_cephes_log_q1 -2.12194440e-4
#define c_cephes_log_q2 0.693359375
/* natural logarithm computed for 4 simultaneous float
* return NaN for x <= 0
*/
static inline float32x4_t log_ps(float32x4_t x)
{
float32x4_t one = vdupq_n_f32(1);
x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
int32x4_t ux = vreinterpretq_s32_f32(x);
int32x4_t emm0 = vshrq_n_s32(ux, 23);
/* keep only the fractional part */
ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
x = vreinterpretq_f32_s32(ux);
emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
float32x4_t e = vcvtq_f32_s32(emm0);
e = vaddq_f32(e, one);
/* part2:
* if( x < SQRTHF ) {
* e -= 1;
* x = x + x - 1.0;
* } else { x = x - 1.0; }
*/
uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
x = vsubq_f32(x, one);
e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
x = vaddq_f32(x, tmp);
float32x4_t z = vmulq_f32(x,x);
float32x4_t y = vdupq_n_f32(c_cephes_log_p0);
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
y = vmulq_f32(y, x);
y = vmulq_f32(y, z);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
y = vaddq_f32(y, tmp);
tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
y = vsubq_f32(y, tmp);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
x = vaddq_f32(x, y);
x = vaddq_f32(x, tmp);
x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
return x;
}
#define c_exp_hi 88.3762626647949f
#define c_exp_lo -88.3762626647949f
#define c_cephes_LOG2EF 1.44269504088896341
#define c_cephes_exp_C1 0.693359375
#define c_cephes_exp_C2 -2.12194440e-4
#define c_cephes_exp_p0 1.9875691500E-4
#define c_cephes_exp_p1 1.3981999507E-3
#define c_cephes_exp_p2 8.3334519073E-3
#define c_cephes_exp_p3 4.1665795894E-2
#define c_cephes_exp_p4 1.6666665459E-1
#define c_cephes_exp_p5 5.0000001201E-1
/* exp() computed for 4 float at once */
static inline float32x4_t exp_ps(float32x4_t x)
{
float32x4_t tmp, fx;
float32x4_t one = vdupq_n_f32(1);
x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
/* express exp(x) as exp(g + n*log(2)) */
fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
/* perform a floorf */
tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
/* if greater, subtract 1 */
uint32x4_t mask = vcgtq_f32(tmp, fx);
mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
x = vsubq_f32(x, tmp);
x = vsubq_f32(x, z);
static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
float32x4_t y = vld1q_dup_f32(cephes_exp_p+0);
float32x4_t c1 = vld1q_dup_f32(cephes_exp_p+1);
float32x4_t c2 = vld1q_dup_f32(cephes_exp_p+2);
float32x4_t c3 = vld1q_dup_f32(cephes_exp_p+3);
float32x4_t c4 = vld1q_dup_f32(cephes_exp_p+4);
float32x4_t c5 = vld1q_dup_f32(cephes_exp_p+5);
y = vmulq_f32(y, x);
z = vmulq_f32(x, x);
y = vaddq_f32(y, c1);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c2);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c3);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c4);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c5);
y = vmulq_f32(y, z);
y = vaddq_f32(y, x);
y = vaddq_f32(y, one);
/* build 2^n */
int32x4_t mm;
mm = vcvtq_s32_f32(fx);
mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
mm = vshlq_n_s32(mm, 23);
float32x4_t pow2n = vreinterpretq_f32_s32(mm);
y = vmulq_f32(y, pow2n);
return y;
}
// Fast approximate exp using the classic float bit-trick: exp(x) = 2^(x * log2(e)),
// where 2^t is formed by writing a scaled, biased t directly into the float's exponent bits.
// Much cheaper than exp_ps, but far less accurate and with no special-case handling.
static inline float32x4_t faster_exp_ps(float32x4_t x) {
float32x4_t vmul = vdupq_n_f32(1.442695040f); // log2(e)
float32x4_t vmin = vdupq_n_f32(-126.0f); // clamp at the smallest normal exponent
float32x4_t vmagicAdd = vdupq_n_f32(126.94269504f); // exponent bias plus a rounding correction
float32x4_t vmagicMul = vdupq_n_f32(1 << 23); // shift the result into the exponent field
float32x4_t y = vreinterpretq_f32_u32(vcvtq_u32_f32(vmulq_f32(vmagicMul, vaddq_f32(vmaxq_f32(vmin, vmulq_f32(x, vmul)), vmagicAdd))));
return y;
}
#define c_minus_cephes_DP1 -0.78515625
#define c_minus_cephes_DP2 -2.4187564849853515625e-4
#define c_minus_cephes_DP3 -3.77489497744594108e-8
#define c_sincof_p0 -1.9515295891E-4
#define c_sincof_p1 8.3321608736E-3
#define c_sincof_p2 -1.6666654611E-1
#define c_coscof_p0 2.443315711809948E-005
#define c_coscof_p1 -1.388731625493765E-003
#define c_coscof_p2 4.166664568298827E-002
#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
/* evaluation of 4 sines & cosines at once.
*
* The code is the exact rewriting of the cephes sinf function.
* Precision is excellent as long as x < 8192 (I did not bother to
* take into account the special handling they have for greater values
* -- it does not return garbage for arguments over 8192, though, but
* the extra precision is missing).
*
* Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
* surprising but correct result.
*
* Note also that when you compute sin(x), cos(x) is available at
* almost no extra cost, so both sin_ps and cos_ps make use of
* sincos_ps.
*/
static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos)
{
// any x
float32x4_t xmm1, xmm2, xmm3, y;
uint32x4_t emm2;
uint32x4_t sign_mask_sin, sign_mask_cos;
sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
x = vabsq_f32(x);
/* scale by 4/Pi */
y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
/* store the integer part of y in mm0 */
emm2 = vcvtq_u32_f32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
y = vcvtq_f32_u32(emm2);
/* get the polynomial selection mask:
* there is one polynomial for 0 <= x <= Pi/4
* and another one for Pi/4 < x <= Pi/2
*
* Both branches will be computed.
*/
uint32x4_t poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
/* The magic pass: "Extended precision modular arithmetic"
* x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
x = vaddq_f32(x, xmm1);
x = vaddq_f32(x, xmm2);
x = vaddq_f32(x, xmm3);
sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
/* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
* and the second polynomial (Pi/4 < x <= Pi/2) in y2 */
float32x4_t z = vmulq_f32(x,x);
float32x4_t y1, y2;
y1 = vmulq_n_f32(z, c_coscof_p0);
y2 = vmulq_n_f32(z, c_sincof_p0);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, x);
y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
y2 = vaddq_f32(y2, x);
y1 = vaddq_f32(y1, vdupq_n_f32(1));
/* select the correct result from the two polynomials */
float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
*ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
*ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
}
static inline float32x4_t sin_ps(float32x4_t x)
{
float32x4_t ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ysin;
}
static inline float32x4_t cos_ps(float32x4_t x)
{
float32x4_t ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ycos;
}
// Division computed as a * (1/b): reciprocal estimate refined with one Newton-Raphson step
// (uncomment the second step below for more accuracy at a small extra cost)
static inline float32x4_t div_ps(float32x4_t a, float32x4_t b)
{
float32x4_t reciprocal = vrecpeq_f32(b);
reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
// reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
return vmulq_f32(a, reciprocal);
}
static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b)
{
// pow(x, m) = exp(m * log(x)); only valid for x > 0, since log_ps returns NaN for non-positive input
return exp_ps(vmulq_f32(b, log_ps(a)));
}
#define c_cephes_HALFMAXLOGF 44.014845935754205f
#define c_cephes_tanh_C1 0.625f
#define c_cephes_tanh_p0 - 5.70498872745E-3
#define c_cephes_tanh_p1 + 2.06390887954E-2
#define c_cephes_tanh_p2 - 5.37397155531E-2
#define c_cephes_tanh_p3 + 1.33314422036E-1
#define c_cephes_tanh_p4 - 3.33332819422E-1
/* Single-precision hyperbolic tangent computed for 4 floats at once */
static inline float32x4_t tanh_ps(float32x4_t x)
{
float32x4_t x2 = vabsq_f32(x);
uint32x4_t mask_l = vcgeq_f32(x2, vdupq_n_f32(c_cephes_tanh_C1));
uint32x4_t mask_l2 = vcgtq_f32(x2, vdupq_n_f32(c_cephes_HALFMAXLOGF));
// abs(x) >= 0.625
// tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
float32x4_t _one = vdupq_n_f32(1.f);
float32x4_t exp_x_x = exp_ps(vaddq_f32(x, x));
#if __aarch64__
float32x4_t y0 = vdivq_f32(vsubq_f32(exp_x_x, _one), vaddq_f32(exp_x_x, _one));
#else
float32x4_t y0 = div_ps(vsubq_f32(exp_x_x, _one), vaddq_f32(exp_x_x, _one));
#endif
// abs(x) < 0.625
/*
z = x2 * x2;
z =
(((( -5.70498872745E-3 * z
+ 2.06390887954E-2) * z
- 5.37397155531E-2) * z
+ 1.33314422036E-1) * z
- 3.33332819422E-1) * z * x
+ x;
*/
static const float cephes_tanh_p[5] = { c_cephes_tanh_p0, c_cephes_tanh_p1, c_cephes_tanh_p2, c_cephes_tanh_p3, c_cephes_tanh_p4 };
float32x4_t y = vld1q_dup_f32(cephes_tanh_p+0);
float32x4_t c1 = vld1q_dup_f32(cephes_tanh_p+1);
float32x4_t c2 = vld1q_dup_f32(cephes_tanh_p+2);
float32x4_t c3 = vld1q_dup_f32(cephes_tanh_p+3);
float32x4_t c4 = vld1q_dup_f32(cephes_tanh_p+4);
float32x4_t z = vmulq_f32(x, x);
y = vmulq_f32(y, z);
y = vaddq_f32(y, c1);
y = vmulq_f32(y, z);
y = vaddq_f32(y, c2);
y = vmulq_f32(y, z);
y = vaddq_f32(y, c3);
y = vmulq_f32(y, z);
y = vaddq_f32(y, c4);
y = vmulq_f32(y, z);
y = vmulq_f32(y, x);
y = vaddq_f32(y, x);
// abs(x) > HALFMAXLOGF
// return 1.0 or -1.0
uint32x4_t mask_pos = vcgtq_f32(x, vdupq_n_f32(0.f)); // sign of the original x, not |x|, decides between +1 and -1
float32x4_t y1 = vreinterpretq_f32_u32(vbslq_u32(mask_pos, vreinterpretq_u32_f32(vdupq_n_f32(1.f)), vreinterpretq_u32_f32(vdupq_n_f32(-1.f))));
y = vreinterpretq_f32_u32(vbslq_u32(mask_l, vreinterpretq_u32_f32(y0), vreinterpretq_u32_f32(y)));
y = vreinterpretq_f32_u32(vbslq_u32(mask_l2, vreinterpretq_u32_f32(y1), vreinterpretq_u32_f32(y)));
return y;
}
#endif //TFDL_ARMFUNCTIONS_H
//
// Created by huangyuyang on 6/2/23.
//
#pragma once
#ifndef FASTLLM_UTILS_H
#define FASTLLM_UTILS_H
#include <map>
#include <chrono>
#include <string>
#include <cstdio>
#include <cstdint>
#include <cstring> // memcpy, used by as_uint / as_float below
#include <vector>
#if defined(_WIN32) or defined(_WIN64)
#include <Windows.h>
#else
#include <unistd.h>
#endif
#ifdef __AVX__
#include "immintrin.h"
#endif
namespace fastllm {
static void MySleep(int t) {
#if defined(_WIN32) or defined(_WIN64)
Sleep(t * 1000); // Win32 Sleep takes milliseconds; scale so t means seconds, matching POSIX sleep below
#else
sleep(t); // POSIX sleep takes seconds
#endif
}
static void ErrorInFastLLM(const std::string &error) {
printf("FastLLM Error: %s\n", error.c_str());
throw error;
}
static void AssertInFastLLM(bool condition, const std::string &error) {
if (!condition) {
ErrorInFastLLM(error);
}
}
static uint32_t as_uint(const float x) {
uint32_t u;
memcpy(&u, &x, sizeof(u)); // bit-cast via memcpy instead of a pointer cast, avoiding strict-aliasing UB
return u;
}
static float as_float(const uint32_t x) {
float f;
memcpy(&f, &x, sizeof(f));
return f;
}
static float half_to_float(const uint16_t x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
const uint32_t e = (x & 0x7C00) >> 10; // exponent
const uint32_t m = (x & 0x03FF) << 13; // mantissa
const uint32_t v = as_uint((float) m) >> 23; // evil log2 bit hack to count leading zeros in denormalized format
return as_float((x & 0x8000) << 16 | (e != 0) * ((e + 112) << 23 | m) | ((e == 0) & (m != 0)) * ((v - 37) << 23 |
((m << (150 - v)) &
0x007FE000))); // sign : normalized : denormalized
}
static uint16_t float_to_half(const float x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
const uint32_t b = as_uint(x) + 0x00001000; // round-to-nearest-even: add last bit after truncated mantissa
const uint32_t e = (b & 0x7F800000) >> 23; // exponent
const uint32_t m = b &
0x007FFFFF; // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding
return (b & 0x80000000) >> 16 | (e > 112) * ((((e - 112) << 10) & 0x7C00) | m >> 13) |
((e < 113) & (e > 101)) * ((((0x007FF000 + m) >> (125 - e)) + 1) >> 1) |
(e > 143) * 0x7FFF; // sign : normalized : denormalized : saturate
}
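// Illustrative round trip: float_to_half(1.0f) == 0x3C00 and half_to_float(0x3C00) == 1.0f;
// values beyond the fp16 range hit the `(e > 143) * 0x7FFF` saturation term above.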
static double GetSpan(std::chrono::system_clock::time_point time1, std::chrono::system_clock::time_point time2) {
auto duration = std::chrono::duration_cast<std::chrono::microseconds> (time2 - time1);
return double(duration.count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den;
}
static bool StartWith(const std::string &a, const std::string &b) {
return a.size() >= b.size() && a.substr(0, b.size()) == b;
}
// Parse device ids from a spec like "cuda:0,1": everything after "<type>:", with any non-digit acting as a separator
static std::vector <int> ParseDeviceIds(const std::string &s, const std::string &type) {
size_t i = type.size();
std::vector <int> ret;
std::string cur = "";
if (s.size() > i && s[i] == ':') {
i++;
while (i < s.size()) {
if (s[i] >= '0' && s[i] <= '9') {
cur += s[i];
} else {
if (cur != "") {
ret.push_back(atoi(cur.c_str()));
cur = "";
}
}
i++;
}
}
if (cur != "") {
ret.push_back(atoi(cur.c_str()));
}
return ret;
}
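// Example: ParseDeviceIds("cuda:0,2,3", "cuda") yields {0, 2, 3};
// ParseDeviceIds("cuda", "cuda") yields {} (no explicit device ids).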
struct TimeRecord {
std::map<std::string, float> v;
std::chrono::system_clock::time_point t;
void Clear() {
v.clear();
}
void Record() {
t = std::chrono::system_clock::now();
}
void Record(const std::string &key) {
auto now = std::chrono::system_clock::now();
v[key] += GetSpan(t, now);
t = now;
}
void Print() {
float s = 0;
for (auto &it: v) {
printf("%s: %f s.\n", it.first.c_str(), it.second);
s += it.second;
}
printf("Total: %f s.\n", s);
}
};
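// Illustrative usage of TimeRecord:
//   TimeRecord tr;
//   tr.Record();            // set the starting timestamp
//   /* ... stage 1 ... */
//   tr.Record("stage1");    // accumulate stage-1 seconds under the key "stage1"
//   /* ... stage 2 ... */
//   tr.Record("stage2");
//   tr.Print();             // print per-key and total seconds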
#ifdef __AVX__
// horizontal sum of the eight floats in a 256-bit register
static inline float Floatsum(const __m256 a) {
__m128 res = _mm256_extractf128_ps(a, 1);
res = _mm_add_ps(res, _mm256_castps256_ps128(a));
res = _mm_add_ps(res, _mm_movehl_ps(res, res));
res = _mm_add_ss(res, _mm_movehdup_ps(res));
return _mm_cvtss_f32(res);
}
// horizontal sum of the eight 32-bit integers in a 256-bit register
static inline int I32sum(const __m256i a) {
const __m128i sum128 = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(a, 1));
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
}
// horizontal sum of the sixteen 16-bit integers: widen adjacent pairs to 32 bits with madd, then reduce
static inline int I16Sum(const __m256i a) {
int sum = I32sum(_mm256_madd_epi16(a, _mm256_set1_epi16(1)));
return sum;
}
#endif
}
#endif //FASTLLM_UTILS_H
#include "model.h"
struct RunConfig {
std::string path = "chatglm-6b-int4.bin"; // 模型文件路径
int threads = 4; // 使用的线程数
bool lowMemMode = false; // 是否使用低内存模式
};
void Usage() {
std::cout << "Usage:" << std::endl;
std::cout << "[-h|--help]: show this help" << std::endl;
std::cout << "<-p|--path> <args>: path of the model file" << std::endl;
std::cout << "<-t|--threads> <args>: number of threads to use" << std::endl;
std::cout << "<-l|--low>: use low-memory mode" << std::endl;
std::cout << "<--top_p> <args>: sampling parameter top_p" << std::endl;
std::cout << "<--top_k> <args>: sampling parameter top_k" << std::endl;
std::cout << "<--temperature> <args>: sampling temperature; higher values make output less deterministic" << std::endl;
std::cout << "<--repeat_penalty> <args>: sampling repetition penalty" << std::endl;
}
void ParseArgs(int argc, char **argv, RunConfig &config, fastllm::GenerationConfig &generationConfig) {
std::vector <std::string> sargv;
for (int i = 0; i < argc; i++) {
sargv.push_back(std::string(argv[i]));
}
for (int i = 1; i < argc; i++) {
if (sargv[i] == "-h" || sargv[i] == "--help") {
Usage();
exit(0);
} else if (sargv[i] == "-p" || sargv[i] == "--path") {
config.path = sargv[++i];
} else if (sargv[i] == "-t" || sargv[i] == "--threads") {
config.threads = atoi(sargv[++i].c_str());
} else if (sargv[i] == "-l" || sargv[i] == "--low") {
config.lowMemMode = true;
} else if (sargv[i] == "-m" || sargv[i] == "--model") {
i++;
} else if (sargv[i] == "--top_p") {
generationConfig.top_p = atof(sargv[++i].c_str());
} else if (sargv[i] == "--top_k") {
generationConfig.top_k = atof(sargv[++i].c_str());
} else if (sargv[i] == "--temperature") {
generationConfig.temperature = atof(sargv[++i].c_str());
} else if (sargv[i] == "--repeat_penalty") {
generationConfig.repeat_penalty = atof(sargv[++i].c_str());
} else {
Usage();
exit(-1);
}
}
}
int main(int argc, char **argv) {
int round = 0;
std::string history = "";
RunConfig config;
fastllm::GenerationConfig generationConfig;
ParseArgs(argc, argv, config, generationConfig);
fastllm::PrintInstructionInfo();
fastllm::SetThreads(config.threads);
fastllm::SetLowMemMode(config.lowMemMode);
auto model = fastllm::CreateLLMModelFromFile(config.path);
static std::string modelType = model->model_type;
printf("欢迎使用 %s 模型. 输入内容对话,reset清空历史记录,stop退出程序.\n", model->model_type.c_str());
while (true) {
printf("用户: ");
std::string input;
std::getline(std::cin, input);
if (input == "reset") {
history = "";
round = 0;
continue;
}
if (input == "stop") {
break;
}
std::string ret = model->Response(model->MakeInput(history, round, input), [](int index, const char* content) {
if (index == 0) {
printf("%s:%s", modelType.c_str(), content);
fflush(stdout);
}
if (index > 0) {
printf("%s", content);
fflush(stdout);
}
if (index == -1) {
printf("\n");
}
}, generationConfig);
history = model->MakeHistory(history, round, input, ret);
round++;
}
return 0;
}
# pyfastllm
by [wildkid1024](https://github.com/wildkid1024)
pyfastllm is a Python API on top of fastllm. It lets you write more flexible, Pythonic code around the runtime and serve more complex, customized business needs:
- integrate with web frameworks such as fastapi and flask to expose data endpoints
- stream chat responses using Python's yield generators
- hook into fine-tuning methods such as LoRA and P-Tuning for downstream tasks (in development)
- seamlessly accelerate HuggingFace models and migrate existing business code with little friction (in development)
- and more...
## Changelog
### v0.1.3 2023-07-08
- Added usage and API documentation
- Added the fastllm-convert model-conversion interface and command-line tool
- Fixed several bugs caused by the new cpp interfaces
## Build and install
Build and install the fastllm Python bindings locally; there are two ways to build and run:
1. Shared library: build a dynamic library, which must sit on Python's load path
2. Wheel package: build a wheel and install it into Python's site-packages (CUDA not supported yet)
### Shared library
> The shared-library install does not support model conversion yet
First fetch the pybind11 C++ dependency:
```sh
git submodule init
git submodule update # fetch the pybind11 dependency
```
Manual C++ build:
```sh
mkdir build-py
cd build-py
cmake .. -DUSE_CUDA=ON -DPY_API=ON
make -j4
python cli.py -p chatglm-6b-int8.bin -t 8 # behaves the same as the cpp build
```
Build via the Python script:
```sh
cd pyfastllm
python build_libs.py --cuda
python cli.py -p chatglm-6b-int8.bin -t 8
```
### Wheel package
> Note: the wheel install does not support CUDA yet
First install pybind11:
```bash
pip install pybind11
```
```sh
cd pyfastllm
python setup.py build
python setup.py install
python cli.py -p chatglm-6b-int8.bin -t 8
```
## Usage
### Calling from Python
The demo folder contains several common code examples:
demo/cli.py: output answers via a callback function
demo/cli_thread.py: call the API from multiple threads (recommended)
demo/cli_low_api.py: low-level API usage
demo/convert_model.py: model conversion
demo/web_api.py, demo/web_api_client.py: fastapi web API
### Command-line tool
Use the command-line tool to convert models; usage is similar to convert_model.py:
```sh
$ fastllm-convert --help
$ fastllm-convert -m chatglm6B -p hf_model_path -o output_flm_path
```
### Dynamic batching example
```sh
mkdir build-py
cd build-py && cmake .. -DPY_API=ON -DUSE_CUDA=ON && make -j && cd -
cd pyfastllm/demo
python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32
```
You can load-test the service with locust. Partial results on an A100 40G with chatglm fp16:
| Concurrency | Mean latency (s) | TP95 (s) | TP99 (s) |
|----------:|------|------|------|
| 1 | 3.07 | 4.2 | 4.8 |
| 10 | 6.11 | 11.0 | 12.0 |
| 16 | 6.82 | 15.0 | 16.0 |
| 32 | 10.74 | 16.0 | 20.0 |
## API reference
### fastllm data structures
> fastllm.Tensor data types
- fastllm.float32
- fastllm.bfloat16
- fastllm.int16
- fastllm.int8
- fastllm.int4
- fastllm.int2
- fastllm.float16
> fastllm.Tensor: the basic fastllm tensor type
- fastllm.Tensor()
- fastllm.Tensor(Datatype)
- fastllm.Tensor(Datatype, Dims:list[int])
- fastllm.Tensor(Datatype, Dims:list[int], Data:list[float])
- fastllm.Tensor(Data:fastllm.Tensor)
- fastllm.Tensor.to_list() # convert the Tensor to a list and return it
- fastllm.Tensor.to() # move the Tensor to the target device
- fastllm.Tensor.zeros(Dims:list[int]) # create an all-zero tensor of shape Dims
- fastllm.Tensor.cat(Data:list[fastllm.Tensor], axis:int) # concatenate the tensors along axis (default 0)
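A minimal sketch of the Tensor API above, assuming pyfastllm is importable after the build (constructor and method behavior follow the signatures listed here and are not exhaustively verified):
```python
import pyfastllm as fastllm

a = fastllm.Tensor(fastllm.float32, [2, 2], [1.0, 2.0, 3.0, 4.0])  # dtype, dims, flat data
b = fastllm.Tensor(a)              # copy-construct from another Tensor
z = fastllm.Tensor.zeros([2, 2])   # all-zero tensor of shape [2, 2]
print(a.to_list())                 # back to a Python list
```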
### fastllm functions
> fastllm.get_llm_type(model_path:str)->str # get the type of the given model
> fastllm.set_threads(thread:int) -> None # set the number of worker threads, default 4
> fastllm.get_threads()->int # get the current number of worker threads
> fastllm.set_low_memory(flag:bool) # run in low-memory mode, default False
> fastllm.get_low_memory() # check whether low-memory mode is enabled
> fastllm.create_llm(model_path: str)-> fastllm.model # create a model instance from a local weight file, matched by rule
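Taken together, a hedged sketch of typical setup (the model path is a placeholder):
```python
import pyfastllm as fastllm

fastllm.set_threads(8)         # default is 4
fastllm.set_low_memory(False)
print(fastllm.get_llm_type("chatglm-6b-int8.bin"))  # model type inferred from the weight file
model = fastllm.create_llm("chatglm-6b-int8.bin")   # rule-matched model instance
```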
### fastllm modules
> fastllm.Tokenizer: tokenization and encoding/decoding utilities
> Tips: this class cannot be instantiated directly; access the concrete instance via model.weight.tokenizer
- fastllm.Tokenizer.encode(prompt:str) # tokenize and encode the prompt
- fastllm.Tokenizer.decode(output_ids:fastllm.Tensor) # decode a fastllm.Tensor into the corresponding string
- fastllm.Tokenizer.decode(output_ids: list[int]) # decode a list[int] into the corresponding string
- fastllm.Tokenizer.decode_byte(output_ids: fastllm.Tensor) # decode a Tensor into the corresponding byte stream
> fastllm.WeightMap: the model's weight dictionary
> Tips: this class cannot be instantiated directly; access the concrete instance via model.weight
- fastllm.WeightMap.tokenizer: the tokenizer instance stored in the weights
- fastllm.WeightMap.save_lowbit(output_path:str, bit:int): quantize the weights to low bit width and save them
- fastllm.WeightMap.set_kv(key:str, value:str): set an entry in the model's weight dictionary
- fastllm.WeightMap.set_weight(key:str, ): add a concrete Tensor to the weights
- fastllm.WeightMap\['key'\]: get the Tensor with the given key
### fastllm models
> fastllm.ChatGLMModel: a concrete model class; chatglm can be swapped for llama, alpaca, Moss and other models
- fastllm.ChatGLMModel.model_type: model type attribute, distinguishes different models
- fastllm.ChatGLMModel.weight: the model's WeightMap
- fastllm.ChatGLMModel.block_cnt: the number of blocks in the model
- fastllm.ChatGLMModel() # construct a model instance
- __call__(input_ids:fastllm.Tensor, attention_mask:fastllm.Tensor, position_ids:fastllm.Tensor, penalty_factor:fastllm.Tensor, pastKeyValues:memory_view) # run inference by calling the model like a function
- fastllm.ChatGLMModel.load_weights(model_path:str) # load model weights from a file path
- fastllm.ChatGLMModel.make_history(history:str, round:int, input:str, output:str) # build the model's dialogue history from past dialogue plus the current input and output
- fastllm.ChatGLMModel.make_input(history:str, round:int, input:str) # build the model input from past dialogue and the current input
- fastllm.ChatGLMModel.response(inputs:str, callback:function) # send a string to the model and process the returned answer with callback
- fastllm.ChatGLMModel.response_batch(inputs:list[str], callback:function) -> outputs:list[str] # send a list of strings to the model and process the returned answers with callback
- fastllm.ChatGLMModel.warmup() # warm up / pre-fill the GPU to avoid cold starts
- fastllm.ChatGLMModel.launch_response(inputs:str)->handle_id:int # for multithreaded use: fill the first token and return a handle id
- fastllm.ChatGLMModel.fetch_response(handle_id:int) # pop the next message for the given handle id from the message queue (see the sketch below this list)
- fastllm.ChatGLMModel.save_lowbit_model(model_path:str, q_bit:int) # quantize the weights to low bit width and save the model
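A hedged sketch of the threaded streaming flow implied by launch_response/fetch_response; the end-of-stream check below is an assumption, so treat demo/cli_thread.py as the authoritative pattern:
```python
import pyfastllm as fastllm

model = fastllm.create_llm("chatglm-6b-int8.bin")  # placeholder path
handle = model.launch_response("Hello")            # fill the first token, get back a handle id
while True:
    piece = model.fetch_response(handle)           # pop the next message for this handle
    if not piece:                                  # assumed end-of-stream sentinel
        break
    print(piece, end="", flush=True)
```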
Supported models:
| Model | Class | Notes
| -- | -- | --
| ChatGLM-6B | fastllm.ChatGLMModel |
| ChatGLM2-6B | fastllm.ChatGLMModel | version is tagged in the weights
| Moss | fastllm.MossModel |
| Alpaca | fastllm.llamaModel |
## Roadmap (TODO)
- [x] Rework response_batch's output_str handling so answers are returned as return values
- [x] Optimize encoding/decoding and merge the different return types
- [ ] Deep and shallow copies of Tensor, plus basic operator overloading
- [ ] Fix the pastKV copy bug in the low-level API
- [ ] A runtime-config class wrapping the model's runtime parameters: model path, thread count, low-memory mode, penalty factor, temperature, etc.
- [ ] Expose more low-level APIs; define model building blocks module-style so that custom models can be assembled from them
import os
import shutil
import platform
import sys
import argparse
parser = argparse.ArgumentParser(description='build fastllm libs')
parser.add_argument('--cuda', dest='cuda', action='store_true', default=False,
help='build with cuda support')
IS_WINDOWS = (platform.system() == 'Windows')
IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
BUILD_DIR = 'build-py' # build path
def build_libs():
    # create the build dir (wiped if it already exists)
    root_dir = os.path.dirname(os.getcwd())
    cmake_build_dir = os.path.join(root_dir, BUILD_DIR)
    if os.path.exists(cmake_build_dir):
        shutil.rmtree(cmake_build_dir)
    os.makedirs(cmake_build_dir)
    os.chdir(cmake_build_dir)
    # build it
    args = parser.parse_args()
    if IS_WINDOWS:
        os.system('cmake -G "Ninja" -DPY_API=ON .. && ninja pyfastllm')
    elif IS_LINUX:
        extra_opts = ' -DPY_API=ON '
        extra_opts += ' -DUSE_CUDA=ON ' if args.cuda else ' '
        build_cmd = 'cmake ' + extra_opts + ' .. && make pyfastllm -j4'
        print(build_cmd)
        os.system(build_cmd)  # reuse the command built above instead of reassembling it
    else:
        extra_opts = ' -DPY_API=ON '  # surrounding spaces matter: the pieces are concatenated directly below
        os.system('cmake' + extra_opts + '.. && make pyfastllm -j4')
if __name__ == '__main__':
    build_libs()
# -*- coding: utf-8 -*-
import sys
import platform
import logging
import argparse
sys.path.append('./build-py')
import pyfastllm # or fastllm, depending on how the module was built
logging.info(f"python compiler: {platform.python_compiler()}")
def args_parser():
    parser = argparse.ArgumentParser(description='pyfastllm')
    parser.add_argument('-m', '--model', type=int, required=False, default=0, help='model type, default 0; one of 0 (chatglm), 1 (moss), 2 (vicuna), 3 (baichuan)')
    parser.add_argument('-p', '--path', type=str, required=True, default='', help='path of the model file')
    parser.add_argument('-t', '--threads', type=int, default=4, help='number of threads to use')
    parser.add_argument('-l', '--low', action='store_true', help='use low-memory mode')
    args = parser.parse_args()
    return args
LLM_TYPE = ""
def print_back(idx: int, content: bytearray):
    content = content.decode(encoding="utf-8", errors="replace")
    if idx >= 0:
        print(f"\r{LLM_TYPE}:{content}", end='', flush=True)
    elif idx == -1:
        print()
    sys.stdout.flush()
def main(args):
    model_path = args.path
    OLD_API = False
    if OLD_API:
        model = pyfastllm.ChatGLMModel()
        model.load_weights(model_path)
        model.warmup()
    else:
        global LLM_TYPE
        LLM_TYPE = pyfastllm.get_llm_type(model_path)
        print(f"llm model: {LLM_TYPE}")
        model = pyfastllm.create_llm(model_path)
    while True:
        prompt = input("User: ")
        if prompt == "stop":  # exit before sending the sentinel to the model
            break
        config = pyfastllm.GenerationConfig()
        model.response(model.make_input("", 0, prompt), print_back, config)
        print()
        sys.stdout.flush()
if __name__ == "__main__":
    args = args_parser()
    main(args)