"src/vscode:/vscode.git/clone" did not exist on "bc3c73ad0b75ee550fdcce6e124d5a222834d6ed"
Commit 8b19e436 authored by zhouxiang's avatar zhouxiang
Browse files

提交初版chatglm6b的推理代码

parents
cmake_minimum_required(VERSION 3.5)
project(fastllm LANGUAGES CXX)
option(USE_CUDA "use cuda" ON)
message(STATUS "USE_CUDA: ${USE_CUDA}")
set(CMAKE_BUILD_TYPE "Release")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread --std=c++17 -O2 -g ")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-copy-dt-needed-entries")
message(STATUS "CMAKE_CXX_FLAGS" ${CMAKE_CXX_FLAGS})
# 指定动态库位置
LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/lib)
set(CMAKE_INSTALL_RPATH "$ORIGIN/lib")
#enable_language(CUDA)
if (USE_CUDA)
    add_compile_definitions(USE_CUDA)
endif()
include_directories(include /opt/dtk-23.04/cuda/include/)
add_executable(main main.cpp)
target_link_libraries(main fastllm)
add_executable(quant quant.cpp)
target_link_libraries(quant fastllm)
add_executable(benchmark benchmark/benchmark.cpp)
target_link_libraries(benchmark fastllm)
# ChatGLM6B_CPP
## Model Introduction
ChatGLM-6B is an open-source, bilingual (Chinese/English) conversational language model based on the [General Language Model (GLM)](https://github.com/THUDM/GLM) architecture, with 6.2 billion parameters.
ChatGLM-6B uses techniques similar to ChatGPT and is optimized for Chinese question answering and dialogue. After training on roughly 1T tokens of Chinese and English text, combined with supervised fine-tuning, feedback bootstrapping, and reinforcement learning from human feedback, the 6.2-billion-parameter ChatGLM-6B can generate answers that align well with human preferences. For more information, see our [blog post](https://chatglm.cn/blog).
This project focuses on optimizing ChatGLM-6B inference performance to achieve fast conversational responses on the DCU platform.
## Model Inference
### Download the Image
A Docker image for inference can be pulled from the SourceFind (光源) registry. The recommended image for this project is:
```
docker pull
```
### Build Instructions
```
mkdir build
cd build
cmake ..
make -j4
```
After building, the following binaries are generated in the build directory:
1. main: demo program
2. quant: quantization tool
3. benchmark: performance benchmark
### Converting the Original ChatGLM Model
```
# Copy the conversion script tools/chatglm_export.py into your Python ChatGLM-6B environment.
# If you use your own fine-tuned model, you may need to modify the code in chatglm_export.py that creates the tokenizer and the model.
# Step 1: export the float model; the argument is the output model path.
python3 chatglm_export.py ./chatglm-6b.bin
# Step 2: quantize the exported model to lower precision. Place the model file from step 1 inside the container, e.g. under /home/model/.
# Run inside the container. -p specifies the path of the unquantized model, -o specifies where to save the quantized model.
cd build
./quant -p /home/model/chatglm-6b.bin -o chatglm-6b-fp16.bin -b 16 # export the fp16 model
./quant -p /home/model/chatglm-6b.bin -o chatglm-6b-int8.bin -b 8  # export the int8 model
```
### Running the Demo Program
Run `./main -h` to see the available options. A simple example follows.
### Running the ChatGLM-6B Model
```
./main -p chatglm-6b-int8.bin
```
### Inference Benchmark
Use the benchmark program to measure inference speed; configure and run it as described by `./benchmark -h`. Speed will vary somewhat with the configuration and the input.
```
./benchmark -p ~/chatglm-6b-int4.bin -f ../benchmark/prompts/beijing.txt -b 1
./benchmark -p ~/chatglm-6b-int8.bin -f ../benchmark/prompts/beijing.txt -b 1
./benchmark -p ~/chatglm-6b-fp16.bin -f ../benchmark/prompts/hello.txt -b 512 -l 18
```
## Source Repository and Issue Tracker
https://developer.hpccube.com/codes/modelzoo/chatglm6b_cpp
## References
https://github.com/THUDM/ChatGLM-6B
//
// Created by huangyuyang on 6/9/23.
//
#include "fstream"
#include <chrono>
#include "chatglm.h"
//static factoryllm fllm;
//static int modeltype = 0;
//static char* modelpath = NULL;
//static fastllm::basellm* chatGlm = fllm.createllm(LLM_TYPE_CHATGLM);
//static fastllm::basellm* moss = fllm.createllm(LLM_TYPE_MOSS);
//static fastllm::basellm* vicuna = fllm.createllm(LLM_TYPE_VICUNA);
struct BenchmarkConfig {
std::string path = "chatglm-6b-fp16.bin"; // 模型文件路径
int limit = -1; // 输出token数限制,如果 < 0 则代表无限制
int batch = -1; // batch数, -1时使用文件中的行数作为batch
std::string file; // 输入文件
std::string output; // 输出文件,如果不设定则输出到屏幕
};
void Usage() {
std::cout << "Usage:" << std::endl;
std::cout << "[-h|--help]: 显示帮助" << std::endl;
std::cout << "<-p|--path> <args>: 模型文件的路径" << std::endl;
std::cout << "<-l|--limit> <args>: 输出token数限制,不设定则表示无限制" << std::endl;
std::cout << "<-b|--batch> <args>: batch数,不设定时使用文件中的行数作为batch" << std::endl;
std::cout << "<-f|--file> <args>: 输入文件,文件中每行一个prompt,如果行数不足batch则用之前的prompt补充" << std::endl;
std::cout << "<-o|--output> <args>: 输出结果写文件,如果不设定则输出到屏幕" << std::endl;
}
void ParseArgs(int argc, char **argv, BenchmarkConfig &config) {
std::vector <std::string> sargv;
for (int i = 0; i < argc; i++) {
sargv.push_back(std::string(argv[i]));
}
for (int i = 1; i < argc; i++) {
if (sargv[i] == "-h" || sargv[i] == "--help") {
Usage();
exit(0);
}
else if (sargv[i] == "-p" || sargv[i] == "--path") {
config.path = sargv[++i];
} else if (sargv[i] == "-l" || sargv[i] == "--limit") {
config.limit = atoi(sargv[++i].c_str());
} else if (sargv[i] == "-b" || sargv[i] == "--batch") {
config.batch = atoi(sargv[++i].c_str());
} else if (sargv[i] == "-f" || sargv[i] == "--file") {
config.file = sargv[++i];
} else if (sargv[i] == "-o" || sargv[i] == "--output") {
config.output = sargv[++i];
} else {
Usage();
exit(-1);
}
}
}
static double GetSpan(std::chrono::system_clock::time_point time1, std::chrono::system_clock::time_point time2) {
auto duration = std::chrono::duration_cast<std::chrono::microseconds> (time2 - time1);
return double(duration.count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den;
};
int main(int argc, char **argv) {
BenchmarkConfig config;
ParseArgs(argc, argv, config);
fastllm::ChatGLMModel chatGlm;
chatGlm.LoadFromFile(config.path);
chatGlm.WarmUp();
chatGlm.output_token_limit = config.limit;
std::vector <std::string> inputs;
if (config.file != "") {
std::ifstream finputs(config.file, std::ios::in);
while (true) {
std::string input = "";
std::getline(finputs, input);
if (input == "") {
break;
} else {
inputs.push_back(input);
}
}
} else {
inputs.push_back("Hello!");
}
if (config.batch < 0) {
config.batch = inputs.size();
}
while ((int) inputs.size() < config.batch) {
inputs.push_back(inputs[rand() % inputs.size()]);
}
if ((int) inputs.size() > config.batch) {
inputs.resize(config.batch);
}
std::vector <std::string> outputs;
static int tokens = 0;
auto st = std::chrono::system_clock::now();
chatGlm.ResponseBatch(inputs, outputs, [](int index, std::vector <std::string> &contents) {
if (index != -1) {
for (int i = 0; i < contents.size(); i++) {
tokens += (contents[i].size() > 0);
}
}
});
float spend = GetSpan(st, std::chrono::system_clock::now());
if (config.output != "") {
FILE *fo = fopen(config.output.c_str(), "w");
for (int i = 0; i < outputs.size(); i++) {
fprintf(fo, "[ user: \"%s\", model: \"%s\"]\n", inputs[i].c_str(), outputs[i].c_str());
}
fclose(fo);
} else {
for (int i = 0; i < outputs.size(); i++) {
printf("[ user: \"%s\", model: \"%s\"]\n", inputs[i].c_str(), outputs[i].c_str());
}
}
printf("batch: %d\n", (int)inputs.size());
printf("output %d tokens\nuse %f s\nspeed = %f tokens / s\n", tokens, spend, tokens / spend);
return 0;
}
北京有什么景点?
Hello!
#pragma once
#include "fastllm.h"
// typedef void(*RuntimeResult) (int index, const char* content); // streaming callback for generated content; index: 0 = reply starts, -1 = this reply is finished
// typedef void(*RuntimeResultBatch) (int index, std::vector <std::string> &contents); // streaming callback for generated content; index: 0 = reply starts, -1 = this reply is finished
using RuntimeResult = std::function<void(int index, const char* content)>;
using RuntimeResultBatch = std::function<void(int index, std::vector <std::string> &contents)>;
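// Illustrative usage (a sketch only, assuming a basellm subclass such as ChatGLMModel;
// it mirrors how main.cpp drives the callback):
//
//     RuntimeResult printCb = [](int index, const char *content) {
//         if (index == 0) printf("ChatGLM: %s", content);   // index == 0: first chunk of the reply
//         else if (index > 0) printf("%s", content);        // index > 0: subsequent chunks
//         else printf("\n");                                // index == -1: reply finished
//     };
//     // model.Response(input, printCb) then streams generated text through printCb.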
namespace fastllm {
class basellm {
public:
basellm() {};
virtual ~basellm() {};
virtual void LoadFromFile(const std::string &fileName) = 0; // load from file
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
const Data &penaltyFactor,
std::vector <std::pair <Data, Data> > &pastKeyValues) = 0;
virtual std::string Response(const std::string& input, RuntimeResult retCb) = 0; // generate a reply for the given input
virtual void ResponseBatch(const std::vector <std::string> &inputs,
std::vector <std::string> &outputs,
RuntimeResultBatch retCb = nullptr) {} // generate replies for a batch of inputs
virtual void SaveLowBitModel(const std::string &fileName, int bit) {}; // save as a quantized model
virtual void WarmUp() {}; // warm up
virtual void RotatePosition2D(Data &data, const Data &positionIds) {}; // 2D rotary position embedding
virtual void CausalMask(Data &data, int start) {}; // causal mask
int output_token_limit = -1;
int embed_dim = 4096;
int num_attention_heads = 32;
int head_dim = embed_dim / num_attention_heads;
const int max_positions = 2048;
int rotary_dim = 64;
const float scale_attn = sqrt(head_dim);
int block_cnt = 28;
bool do_sample = false; // whether to sample; if false, greedily take the argmax
int last_n = 64; // the last last_n tokens are counted for the repetition penalty
float repeat_penalty = 1.0f; // repetition penalty factor
int top_k = 1; // top-k sampling
float top_p = 1.0; // top-p sampling
float temperature = 1.0; // temperature, usually between 0.1 and 1.0; larger values give more diverse results
std::vector <std::vector <float> > sin, cos;
WeightMap weight; // weights
Data sinData, cosData;
};
}
//
// Created by huangyuyang on 5/11/23.
//
#ifndef FASTLLM_CHATGLM_H
#define FASTLLM_CHATGLM_H
#include "basellm.h"
#include "cmath"
#include <iostream>
namespace fastllm {
class ChatGLMModel: public basellm {
public:
ChatGLMModel (); // constructor
virtual void LoadFromFile(const std::string &fileName); // load from file
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
const Data &penaltyFactor,
std::vector <std::pair <Data, Data> > &pastKeyValues);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
const Data &penaltyFactor,
std::vector <std::pair <Data, Data> > &pastKeyValues);
virtual std::string Response(const std::string& input, RuntimeResult retCb); // generate a reply for the given input
virtual void ResponseBatch(const std::vector <std::string> &inputs,
std::vector <std::string> &outputs,
RuntimeResultBatch retCb);
virtual void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model
virtual void WarmUp(); // warm up
private:
virtual void CausalMask(Data &data, int start) {}; // causal mask
};
}
#endif //FASTLLM_CHATGLM_H
//
// Created by 41020 on 2023/6/19.
//
#ifndef FASTLLM_CHATGLM_C_H
#define FASTLLM_CHATGLM_C_H
#ifdef __cplusplus
extern "C" {
#endif
typedef void* ModelHandle;
/*
* @brief Initialize the inference engine
* @param model Inference engine handle (returned to the caller)
* @param modelPath Path of the model file
* @param deviceId DCU device id to run inference on
* @return 0 on success; an error code on failure
*/
int initLLMEngine(ModelHandle& model, const char* modelPath, int deviceId);
/*
* @brief Generate a reply for a single prompt
* @param model Inference engine handle
* @param prompt Input question
* @param output Output answer; the buffer is allocated inside the API and is freed and regenerated on the next call
* @return 0 on success; an error code on failure
*/
int chat(ModelHandle model, const char* prompt, char*& output);
/*
* @brief Release the inference engine resources
* @param model Inference engine handle to release
* @return 0 on success; an error code on failure
*/
int releaseLLMEngine(ModelHandle model);
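/*
 * Usage sketch (illustrative only; the model path and device id below are examples,
 * and the flow mirrors the C-interface branch of main.cpp):
 *
 *   ModelHandle engine = NULL;
 *   if (initLLMEngine(engine, "/home/model/chatglm-6b-int8.bin", 0) == 0) {
 *       char *answer = NULL;
 *       chat(engine, "Hello!", answer);   // answer is allocated inside the API
 *       printf("%s\n", answer);
 *       releaseLLMEngine(engine);
 *   }
 */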
#ifdef __cplusplus
}
#endif
#endif //FASTLLM_CHATGLM_C_H
//
// Created by huangyuyang on 5/11/23.
//
#ifndef TEST_FASTLLM_H
#define TEST_FASTLLM_H
#include <vector>
#include <cstdint>
#include <string>
#include <map>
#include <set>
#include <queue>
#include <unordered_map>
#include <cmath>
#include <algorithm>
#include <iostream>
#include <functional>
namespace fastllm {
void SetThreads(int t);
void SetLowMemMode(bool m);
void SetKVCacheInCPU(bool kvCacheInCPU);
bool GetLowMemMode();
int GetThreads();
bool GetKVCacheInCPU();
struct LowBitConfig {
int bit;
float min, max;
uint8_t zeroPoint;
float scale;
LowBitConfig(float min, float max, int bit) {
this->min = min;
this->max = max;
this->bit = bit;
Reset();
}
LowBitConfig () {
}
void Reset() {
min = std::min(min, 0.f);
max = std::max(max, 0.f);
const float qmin = 0;
const float qmax = (1 << bit) - 1;
scale = (max - min) / (qmax - qmin);
const float initial_zero_point = qmin - min / scale;
zeroPoint = 0;
if (initial_zero_point < qmin) {
zeroPoint = qmin;
} else if (initial_zero_point > qmax) {
zeroPoint = qmax;
} else {
zeroPoint = static_cast<uint8_t>(std::round(initial_zero_point));
}
}
uint8_t quantization(const float &realNumber) const {
return (uint8_t) (std::min((double)((1 << bit) - 1), std::max(realNumber / scale + zeroPoint + 0.5, 0.0)));
}
float invQuantization(const uint8_t &qNumber) const {
return (scale * ((float) qNumber - (float) zeroPoint));
}
};
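// Example (a sketch, not part of the library): round-tripping a value through the
// asymmetric min/max quantization implemented above.
//
//     LowBitConfig cfg(-2.0f, 6.0f, 8);      // min = -2, max = 6, 8-bit
//     uint8_t q = cfg.quantization(1.0f);    // maps 1.0 into [0, 255] using scale and zeroPoint
//     float back = cfg.invQuantization(q);   // recovers roughly 1.0, up to one quantization step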
enum DataType {
FLOAT32 = 0, BFLOAT16 = 1, INT16 = 2, INT8 = 3, INT4 = 4, INT2 = 5, BIT = 6, FLOAT16 = 7,
INT32PARAM = 100 // int32 parameters; data of this type always stays on the CPU
};
enum DataDevice {
CPU = 0, CUDA = 1
};
enum WeightType {
NONE = 0, LINEAR = 1, EMBEDDING = 2
};
struct Data {
bool lockInCPU = false; // if locked in CPU, the data may not be moved to other devices
WeightType weightType = WeightType::NONE; // weight type; NONE means not a weight (or unknown)
DataType dataType = DataType::FLOAT32; // data type
int unitSize, unitSizeDiv = 1; // bytes per element = unitSize / unitSizeDiv
std::vector <int> dims; // data shape
std::vector <uint64_t> strides; // strides
uint64_t expansionSize = 0; // size after expansion
uint64_t expansionBytes = 0; // number of bytes after expansion
std::vector <int> expansionDims; // pre-expanded shape
uint8_t *cpuData = nullptr; // data pointer
void *cudaData = nullptr;
std::vector <void*> extraCudaData;
void *deviceData = nullptr;
std::vector <void*> extraDeviceData;
DataDevice dataDevice = DataDevice::CPU;
// the following two fields are used for quantization and do not apply to FLOAT data
int perChannelAxis = -1; // axis along which per-channel quantization is done; -1 means no per-channel quantization
std::vector <LowBitConfig> perChannelsConfigs; // perChannelsConfigs[i] holds the min/max of channel i; without per-channel quantization, perChannelsConfigs[0] holds the global min/max
std::vector <float> scales;
std::vector <int> zeros;
std::vector <int> weightSum; // when used as a weight, partial sums are sometimes stored to speed up computation
std::string fileName;
long long filePos;
Data () {};
Data (DataType type);
Data (DataType type, const std::vector <int> &dims); // constructor
// constructor; copies data from `data` after creation
// `data` holds raw float values; if type is not float they are quantized
Data (DataType type, const std::vector <int> &dims, const std::vector <float> &data);
~Data(); // destructor
Data (const Data &ori); // deep copy
void CopyFrom(const Data &ori); // copy
uint64_t GetBytes() const; // total number of bytes
void Allocate(); // allocate memory
void Allocate(float v); // allocate memory and fill with v
void Expansion(const std::vector <int> &dims); // pre-expand to the given shape
void MallocSpace(uint64_t size); // allocate on the device
void FreeSpace(); // free device memory
void UpdateUnitSize(); // update unitSize
void Resize(const std::vector <int> &dims); // change the shape
void Reshape(const std::vector <int> &dims); // change the shape without touching the data
uint64_t Count(int i) const; // dims[i] * strides[i]
void PrintShape() const; // print the shape
void Print() const; // print the contents
void CalcWeightSum(); // compute weightSum
void ToDevice(DataDevice device); // move to the given device
void ToDevice(void *device);
};
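// Example (a sketch, assuming the constructors behave as declared above): building a
// small FLOAT32 tensor from raw values and inspecting it.
//
//     Data x(DataType::FLOAT32, {2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
//     x.PrintShape();   // print the 2 x 3 shape
//     x.Print();        // print the six values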
struct Tokenizer {
struct TrieNode {
int tokenId;
std::map <int, TrieNode*> next;
TrieNode();
};
TrieNode *root;
std::unordered_map <int, std::string> tokenToStringDict;
Tokenizer ();
~Tokenizer();
void Clear(); // clear the tokenizer
void Insert(const std::string &s, int tokenId); // insert a token
Data Encode(const std::string &s); // encode
std::string Decode(const Data &data); // decode
};
struct WeightMap {
int versionId;
Tokenizer tokenizer;
std::map <std::string, std::string> dicts;
std::map <std::string, Data> weight;
std::set <std::string> embeddingNames;
void LoadFromFile(const std::string &fileName); // load from file
void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model
Data &operator [] (const std::string &key);
};
struct TokenPenaltyManager {
Data penalty = Data();
std::map <int, int> cnt;
std::queue <int> q;
int vocabSize, lastN;
float value;
void Init (int vocabSize, int lastN, float value);
void Clear();
void InsertToken(int token);
};
void Embedding(const Data &input, Data &weight, Data &output);
void RMSNorm(const Data &input, const Data &weight, float eps, Data &output);
void LayerNorm(Data &input, Data &gamma, Data &beta, int axis, Data &output);
void Linear(Data &input, Data &weight, const Data &bias, Data &output);
void Split(const Data &input, int axis, int start, int end, Data &output);
void Cat(const Data &input0, const Data &input1, int axis, Data &output);
void CatDirect(Data &input0, const Data &input1, int axis); // copy input1's data directly after input0 (input0 must have been pre-expanded with enough space)
void MatMul(const Data &input0, const Data &input1, Data &output, float alpha = 1.0);
void MatMulTransB(const Data &input0, const Data &input1, Data &output, float alpha = 1.0);
void Softmax(const Data &input, Data &output, int axis);
void Silu(const fastllm::Data &input, fastllm::Data &output);
void GeluNew(const Data &input, Data &output);
void Mul(const Data &input, float v, Data &output);
void MulTo(Data &input0, const Data &input1); // input0 *= input1
void AddTo(Data &input0, const Data &input1, float alpha = 1.0); // input0 += input1 * alpha
void AttentionMask(Data &input, const Data &mask, float maskValue); // set the positions of input where mask is 1 to maskValue
void Permute(const Data &input, const std::vector<int> &axis, Data &output); // transpose
void PermuteSelf(const Data &input, const std::vector<int> &axis); // in-place transpose
void TopK(const Data &input, Data &output, int topK); // compute top-k
void RotatePosition2D(Data &input, const Data &positionIds, Data &sinData, Data &cosData, int rotaryDim); // 2D rotary position embedding
void RepeatPenalty(Data &input, const Data &penalty); // repetition penalty: input[i] = input[i] < 0 ? input[i] * penalty[i] : input[i] / penalty[i];
}
#endif //TEST_FASTLLM_H
#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <chrono>
#include <string>
#include <cstdio>
#include <cstdint>
#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif
#include <stdlib.h>
#include "chatglm.h"
#include "chatglm_c.h"
//static int multi_round_flag = 0;
static double GetSpan(std::chrono::system_clock::time_point time1, std::chrono::system_clock::time_point time2) {
auto duration = std::chrono::duration_cast<std::chrono::microseconds> (time2 - time1);
return double(duration.count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den;
};
std::map <std::string, int> modelDict = {
{"chatglm", 0}, {"chatglm—c", 1}
};
struct RunConfig {
int model = 0; // interface type: 0 = chatglm C++ interface, 1 = chatglm C interface
std::string path = "chatglm-6b-int8.bin"; // model file path
int device = 0;
int multi_round_flag = 0;
// int threads = 4; // number of threads to use
// bool lowMemMode = false; // whether to use low-memory mode
};
void Usage() {
std::cout << "Usage:" << std::endl;
std::cout << "[-h|--help]: 显示帮助" << std::endl;
std::cout << "<-p|--path> <args>: 模型文件的路径" << std::endl;
std::cout << "<-m|--model> <args>: 接口类型,默认为0,可以设置为0(chatglm c++接口),1(chatglm c接口)" << std::endl;
std::cout << "<-d|--device> <args>: 推理使用的DCU设备号" << std::endl;
std::cout << "<-r|--multi-round> <args>: 是否启用多轮方式对话,默认为0,可以设置为0(单轮对话),1(多轮对话)" << std::endl;
}
void ParseArgs(int argc, char **argv, RunConfig &config) {
std::vector <std::string> sargv;
for (int i = 0; i < argc; i++) {
sargv.push_back(std::string(argv[i]));
}
for (int i = 1; i < argc; i++) {
if (sargv[i] == "-h" || sargv[i] == "--help") {
Usage();
exit(0);
}
else if (sargv[i] == "-m" || sargv[i] == "--model") {
if (modelDict.find(sargv[i + 1]) != modelDict.end()) {
config.model = modelDict[sargv[++i]];
} else {
config.model = atoi(sargv[++i].c_str());
}
}
else if (sargv[i] == "-p" || sargv[i] == "--path") {
config.path = sargv[++i];
}
else if (sargv[i] == "-d" || sargv[i] == "--device") {
config.device = atoi(sargv[++i].c_str());
}
else if (sargv[i] == "-r" || sargv[i] == "--multi-round") {
config.multi_round_flag = atoi(sargv[++i].c_str());
}
else {
Usage();
exit(-1);
}
}
}
bool fileExists(const std::string& filename) {
std::ifstream file(filename);
return file.good();
}
// count UTF-8 characters (in UTF-8, an ASCII character occupies 1 byte and a Chinese character occupies 3 bytes)
static int getUtf8LetterNumber(const char *s)
{
int i = 0, j = 0;
while (s[i])
{
if ((s[i] & 0xc0) != 0x80) j++;
i++;
}
return j;
}
int chat_history(fastllm::ChatGLMModel* chatGlm, const char* input_Str) {
static int sRound = 0;
static std::string history;
static int tokens = 0;
std::string ret = "";
std::string input(input_Str);
if (input == "reset") {
history = "";
sRound = 0;
return 0;
}
history += ("[Round " + std::to_string(sRound++) + "]\n问:" + input);
if(getUtf8LetterNumber(history.c_str()) > 2048)
{
history = "";
sRound = 0;
}
tokens = 0;
auto prompt = sRound > 1 ? history : input;
auto st = std::chrono::system_clock::now();
ret = chatGlm->Response((prompt), [](int index, const char* content) {
if (index == 0) {
printf("ChatGLM:%s", content);
tokens += 1;
}
if (index > 0) {
printf("%s", content);
tokens += 1;
}
if (index == -1) {
printf("\n");
}
});
float spend = GetSpan(st, std::chrono::system_clock::now());
// character count
int str_len = getUtf8LetterNumber(ret.c_str());
printf("word_count = %d, token_count = %d, spend = %fs, word/s = %f, tokens/s = %f: .\n", str_len, tokens, spend, str_len/spend, tokens/spend);
history += ("\n答:" + ret + "\n");
return ret.length();
}
int main(int argc, char **argv) {
RunConfig config;
ParseArgs(argc, argv, config);
if(!fileExists(config.path)){
printf("model path is not exist!\n");
return -1;
}
if (config.model == 0) {
#ifdef USE_CUDA
cudaSetDevice(config.device);
#endif
fastllm::ChatGLMModel chatGlm;
chatGlm.LoadFromFile(config.path);
chatGlm.WarmUp();
static int tokens = 0;
while (true) {
printf("用户: ");
std::string input;
std::getline(std::cin, input);
if (input == "stop") {
break;
}
if(0 == config.multi_round_flag){
tokens = 0;
auto st = std::chrono::system_clock::now();
std::string ret = chatGlm.Response((input), [](int index, const char* content) {
if (index == 0) {
printf("ChatGLM:%s", content);
tokens += 1;
}
if (index > 0) {
printf("%s", content);
tokens += 1;
}
if (index == -1) {
printf("\n");
}
});
float spend = GetSpan(st, std::chrono::system_clock::now());
// character count
int str_len = getUtf8LetterNumber(ret.c_str());
printf("word_count = %d, token_count = %d, spend = %fs, word/s = %f, tokens/s = %f: .\n", str_len, tokens, spend, str_len/spend, tokens/spend);
}
else{
chat_history(&chatGlm, input.c_str());
}
}
} else if (config.model == 1) {
void* modelEngine = NULL;
initLLMEngine(modelEngine, config.path.c_str(), config.device);
char *output = NULL;
while (true) {
printf("用户: ");
std::string input;
std::getline(std::cin, input);
//input = "晚上睡不着怎么办";
if (input == "stop") {
break;
}
chat(modelEngine, input.c_str(), output);
printf("ChatGLM:%s\n", output);
}
releaseLLMEngine(modelEngine);
}
else {
Usage();
exit(-1);
}
return 0;
}
# Model name
modelName=ChatGLM6B_CPP
# Model description
modelDescription=ChatGLM-6B is an open-source, bilingual (Chinese/English) conversational language model based on the General Language Model (GLM) architecture, with 6.2 billion parameters
# Application scenario
appScenario=Chat
# Framework type
frameType=C/C++,CUDA
//
// Created by huangyuyang on 5/13/23.
//
#include <cstdio>
#include <cstring>
#include <iostream>
#include "chatglm.h"
struct QuantConfig {
std::string path; // model file path
std::string output; // output file path
int bits; // quantization bit width
};
void Usage() {
std::cout << "Usage:" << std::endl;
std::cout << "[-h|--help]: 显示帮助" << std::endl;
std::cout << "<-p|--path> <args>: 模型文件的路径" << std::endl;
std::cout << "<-b|--bits> <args>: 量化位数, 4 = int4, 8 = int8, 16 = fp16" << std::endl;
std::cout << "<-o|--output> <args>: 输出文件路径" << std::endl;
}
void ParseArgs(int argc, char **argv, QuantConfig &config) {
std::vector <std::string> sargv;
for (int i = 0; i < argc; i++) {
sargv.push_back(std::string(argv[i]));
}
for (int i = 1; i < argc; i++) {
if (sargv[i] == "-h" || sargv[i] == "--help") {
Usage();
exit(0);
} else if (sargv[i] == "-p" || sargv[i] == "--path") {
config.path = sargv[++i];
} else if (sargv[i] == "-b" || sargv[i] == "--bits") {
config.bits = atoi(sargv[++i].c_str());
} else if (sargv[i] == "-o" || sargv[i] == "--output") {
config.output = sargv[++i];
} else {
Usage();
exit(-1);
}
}
}
int main(int argc, char **argv) {
QuantConfig config;
ParseArgs(argc, argv, config);
fastllm::ChatGLMModel chatGlm;
chatGlm.LoadFromFile(config.path);
chatGlm.SaveLowBitModel(config.output, config.bits);
return 0;
}
import os
import platform
import signal
import sys
import struct
import numpy as np
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).float()
model = model.eval()
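# Output file layout written below (inferred from the write calls; all integers are packed as int32):
#   version_id
#   vocab_size, then for each token: byte_length, byte_length int32 byte values, token_id
#   weight_count, then for each tensor: name_length, name bytes, ndim, dims..., dtype flag (0 = float32), raw float32 data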
if __name__ == "__main__":
exportPath = sys.argv[1] if (sys.argv[1] is not None) else "chatglm-6b.bin";
dict = model.state_dict();
fo = open(exportPath, "wb");
# 0. version id
fo.write(struct.pack('i', 0));
# 1. vocab
vocab = tokenizer.get_vocab();
fo.write(struct.pack('i', len(vocab)));
for v in vocab.keys():
s = v.encode();
fo.write(struct.pack('i', len(s)));
for c in s:
fo.write(struct.pack('i', c));
fo.write(struct.pack('i', vocab[v]));
# 2. weight
fo.write(struct.pack('i', len(dict)));
for key in dict:
cur = dict[key].numpy().astype(np.float32);
#cur = dict[key].numpy();
fo.write(struct.pack('i', len(key)));
fo.write(key.encode());
fo.write(struct.pack('i', len(cur.shape)));
for i in cur.shape:
fo.write(struct.pack('i', i));
fo.write(struct.pack('i', 0));
fo.write(cur.data);
fo.close();