Commit 56215723 authored by zhouxiang

1. Synced to the latest version; 2. Added a batch inference API; 3. Fixed a memory leak; 4. Fixed choppy streaming output in the llama family of models

parent 44be91d3
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
branch = v2.10.5
......@@ -5,12 +5,21 @@ project(fastllm LANGUAGES CXX)
option(USE_CUDA "use cuda" ON)
option(PY_API "python api" OFF)
option(USE_MMAP "use mmap" OFF)
option(USE_SENTENCEPIECE "use sentencepiece" OFF)
option(USE_IVCOREX "use iluvatar corex gpu" OFF)
message(STATUS "USE_CUDA: ${USE_CUDA}")
message(STATUS "PYTHON_API: ${PY_API}")
message(STATUS "USE_SENTENCEPIECE: ${USE_SENTENCEPIECE}")
message(STATUS "USE_IVCOREX: ${USE_IVCOREX}")
set(CMAKE_BUILD_TYPE "Release")
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
......@@ -25,7 +34,7 @@ endif()
message(STATUS "CMAKE_CXX_FLAGS" ${CMAKE_CXX_FLAGS})
set(FASTLLM_CXX_SOURCES src/fastllm.cpp src/device.cpp src/model.cpp src/executor.cpp
src/devices/cpu/cpudevice.cpp src/devices/cpu/cpudevicebatch.cpp
src/models/chatglm.cpp src/models/moss.cpp src/models/llama.cpp src/models/qwen.cpp src/models/basellm.cpp)
src/models/chatglm.cpp src/models/moss.cpp src/models/llama.cpp src/models/qwen.cpp src/models/basellm.cpp src/models/glm.cpp)
include_directories(include)
include_directories(include/utils)
......@@ -35,6 +44,12 @@ if (USE_MMAP)
add_compile_definitions(USE_MMAP)
endif()
if (USE_SENTENCEPIECE)
set(CMAKE_CXX_STANDARD 17)
add_compile_definitions(USE_SENTENCEPIECE)
set(FASTLLM_LINKED_LIBS ${FASTLLM_LINKED_LIBS} sentencepiece)
endif()
if (USE_CUDA)
enable_language(CUDA)
add_compile_definitions(USE_CUDA)
......@@ -47,6 +62,11 @@ if (USE_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g --gpu-max-threads-per-block=1024")
endif()
if (USE_IVCOREX)
set(FASTLLM_LINKED_LIBS ${FASTLLM_LINKED_LIBS} cudart)
set(CMAKE_CUDA_ARCHITECTURES ${IVCOREX_ARCH})
endif()
if (PY_API)
set(PYBIND third_party/pybind11)
add_subdirectory(${PYBIND})
......@@ -73,6 +93,9 @@ target_link_libraries(main fastllm)
add_executable(quant tools/src/quant.cpp)
target_link_libraries(quant fastllm)
add_executable(testOps test/ops/cppOps.cpp)
target_link_libraries(testOps fastllm)
add_executable(webui example/webui/webui.cpp)
target_link_libraries(webui fastllm)
add_custom_command(
......@@ -84,6 +107,11 @@ add_custom_command(
add_executable(benchmark example/benchmark/benchmark.cpp)
target_link_libraries(benchmark fastllm)
add_custom_command(
TARGET benchmark
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/example/benchmark/loop_benchmark.sh ${CMAKE_BINARY_DIR}/
)
add_executable(apiserver example/apiserver/apiserver.cpp example/apiserver/json11.cpp)
target_link_libraries(apiserver fastllm)
......
......@@ -170,6 +170,18 @@ python3 tools/baichuan2flm.py baichuan-13b-int8.flm int8 #导出int8模型
python3 tools/baichuan2flm.py baichuan-13b-int4.flm int4 # export an int4 model
```
### Exporting baichuan2 models (by default the script exports baichuan2-7b-chat)
``` sh
# Install the baichuan2 environment first
# If you use your own fine-tuned model, modify the code in baichuan2_2flm.py that creates the tokenizer and model
# Export the model at the precision you need
cd build
python3 tools/baichuan2_2flm.py baichuan2-7b-fp16.flm float16 # export a float16 model
python3 tools/baichuan2_2flm.py baichuan2-7b-int8.flm int8 # export an int8 model
python3 tools/baichuan2_2flm.py baichuan2-7b-int4.flm int4 # export an int4 model
```
### Exporting the MOSS model
``` sh
......
# FAQ
## CMAKE
### CMAKE_CUDA_ARCHITECTURES must be non-empty if set.
**Symptom:**
> CMake Error at cmake/Modules/CMakeDetermineCUDACompiler.cmake:277 (message):
> CMAKE_CUDA_ARCHITECTURES must be non-empty if set.
> Call Stack (most recent call first):
> CMakeLists.txt:39 (enable_language)
**Solution:**
Some versions of cmake have this problem; `CMAKE_CUDA_ARCHITECTURES` must be specified manually. Run:
```shell
cmake .. -DUSE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native
```
### Unsupported gpu architecture 'compute_native'
**Symptom:**
> nvcc fatal : Unsupported gpu architecture 'compute_native'
**Solution:**
Edit CMakeLists.txt by hand and specify your GPU's [Compute Capability](https://developer.nvidia.com/cuda-gpus) according to the GPU model. For example:
``` diff
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,7 +52,7 @@
#message(${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
set(FASTLLM_CUDA_SOURCES src/devices/cuda/cudadevice.cpp src/devices/cuda/cudadevicebatch.cpp src/devices/cuda/fastllm-cuda.cu)
set(FASTLLM_LINKED_LIBS ${FASTLLM_LINKED_LIBS} cublas)
- set(CMAKE_CUDA_ARCHITECTURES "native")
+ set(CMAKE_CUDA_ARCHITECTURES 61 75 86 89)
endif()
if (PY_API)
```
## Windows
### fastllm.h error
**Symptom:**
> include\fastllm.h(50): error : identifier "top_k" is undefined
> include\fastllm.h(172): error : expected a "}"
> include\fastllm.h(234): error : identifier "DataDevice" is undefined
> ....
**Solution:** Builds done with cmake usually do not hit this problem; see [example\README.md](/example/README.md). After checking out the code, **edit include/fastllm.h**: in Visual Studio click "File" -> "Advanced Save Options" and select "Unicode (UTF-8 **with signature**) - Codepage 65001", or convert the file to "UTF-8 BOM" encoding in another text editor. (Because gcc on Linux does not recognize the BOM while MSVC relies on it to detect the file encoding, this change can only be made manually.)
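If Visual Studio is not at hand, the same conversion can be scripted. A minimal sketch in Python (only an illustration, not part of the project's tooling; it assumes the file is already valid UTF-8 and that the path matches your checkout):
```python
# Re-save include/fastllm.h as UTF-8 with a BOM so that MSVC detects the
# encoding correctly. "utf-8-sig" strips an existing BOM on read and writes
# one on save, so running the script twice is harmless.
path = "include/fastllm.h"

with open(path, "r", encoding="utf-8-sig", newline="") as f:
    text = f.read()  # newline="" keeps the original CRLF/LF line endings

with open(path, "w", encoding="utf-8-sig", newline="") as f:
    f.write(text)
```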
### main.exe cannot read Chinese input
**Cause:** cmd on Windows does not support UTF-8 encoding.
**Solution:** Build [Win32Demo](/example/README.md#win32demo-windows平台) or use the [WebUI](/example/README.md#web-ui).
### FileNotFoundError on import
**Symptom:**
> File "...Python\lib\ctypes\_\_init\_\_.py", line 374, in \_\_init\_\_
> self._handle = _dlopen(self._name, mode)
> FileNotFoundError: Could not find module 'tools\fastllm_pytools\fastllm_tools.dll' (or one of its dependencies). Try using the full path with constructor syntax.
**Solution:** Some Python versions hit this issue when fastllm is built with anything other than the CPU-only configuration.
For GPU builds, copy the cudart and cublas DLLs matching the CUDA version you used into the same directory as fastllm_tools (or add the CUDA bin directory to the DLL search path, as sketched after the list below), for example:
* CUDA 9.2
* %CUDA_PATH%\bin\cublas64_92.dll
* %CUDA_PATH%\bin\cudart64_92.dll
* CUDA 11.x
* %CUDA_PATH%\bin\cudart64_110.dll
* %CUDA_PATH%\bin\cublas64_11.dll
* %CUDA_PATH%\bin\cublasLt64_11.dll
* CUDA 12.x
* %CUDA_PATH%\bin\cudart64_12.dll
* %CUDA_PATH%\bin\cublas64_12.dll
* %CUDA_PATH%\bin\cublasLt64_12.dll
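Alternatively, on Python 3.8+ the CUDA `bin` directory can be added to the DLL search path before the import instead of copying files. A hedged sketch (it assumes `CUDA_PATH` is set as above and that the package is imported as `fastllm_pytools`):
```python
import os

# Make the CUDA runtime DLLs (cudart/cublas/cublasLt) resolvable for
# fastllm_tools.dll. CUDA_PATH must point at the toolkit version that
# fastllm was built against.
cuda_bin = os.path.join(os.environ["CUDA_PATH"], "bin")
if hasattr(os, "add_dll_directory"):  # Windows, Python 3.8+
    os.add_dll_directory(cuda_bin)

from fastllm_pytools import llm  # import only after the search path is set
```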
## fastllm_pytools
### Error when releasing memory: CUDA error when release memory
**Symptom:**
The following error is printed on exit:
> Error: CUDA error when release memory!
> CUDA error = 4, cudaErrorCudartUnloading at fastllm/src/devices/cuda/fastllm-cuda.cu:1493
> 'driver shutting down'
**Cause:** When the Python interpreter terminates, it often tears down its own process first without destructing the third-party libraries it loaded, so by the time Python exits the CUDA Runtime has already shut down and the attempt to free GPU memory fails. Since the GPU memory has in most cases already been released by then, this usually causes no real problem.
**Solution:** Call `llm.release_memory()` explicitly before the Python program exits.
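For example (a minimal sketch; the model file name is only an illustration, while `llm.release_memory()` is the call mentioned above):
```python
from fastllm_pytools import llm

model = llm.model("chatglm2-6b-int8.flm")  # example model file
print(model.response("你好"))

# Free GPU memory explicitly while the CUDA runtime is still alive,
# instead of relying on interpreter shutdown (which may unload CUDA first).
llm.release_memory()
```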
# Default ignored files
/shelf/
/workspace.xml
XiaoZhihuiAssistant
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<bytecodeTargetLevel target="11" />
</component>
</project>
\ No newline at end of file
This diff is collapsed.
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="deploymentTargetDropDown">
<runningDeviceTargetSelectedWithDropDown>
<Target>
<type value="RUNNING_DEVICE_TARGET" />
<deviceKey>
<Key>
<type value="SERIAL_NUMBER" />
<value value="adb-JRF67HLJ9HX85XCU-GNBKo9._adb-tls-connect._tcp" />
</Key>
</deviceKey>
</Target>
</runningDeviceTargetSelectedWithDropDown>
<timeTargetWasSelectedWithDropDown value="2023-07-28T10:10:24.047223600Z" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="GradleMigrationSettings" migrationVersion="1" />
<component name="GradleSettings">
<option name="linkedExternalProjectsSettings">
<GradleProjectSettings>
<option name="testRunner" value="GRADLE" />
<option name="distributionType" value="DEFAULT_WRAPPED" />
<option name="externalProjectPath" value="$PROJECT_DIR$" />
<option name="gradleJvm" value="Embedded JDK" />
<option name="modules">
<set>
<option value="$PROJECT_DIR$" />
<option value="$PROJECT_DIR$/app" />
</set>
</option>
</GradleProjectSettings>
</option>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="11" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/build/classes" />
</component>
<component name="ProjectType">
<option name="id" value="Android" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/../../.." vcs="Git" />
</component>
</project>
\ No newline at end of file
......@@ -2,7 +2,9 @@
## Benchmark
A speed-test example program that makes it easy to measure inference performance on different hardware and software. The speeds measured by the author are listed [here](doc/benchmark.md).
In real-world use it is hard to meet the batching conditions, and decoding is not greedy, so these numbers differ somewhat from the speed you will see in practice.
### Usage:
......@@ -38,13 +40,17 @@ fastllm工程目前分为CPU版本和GPU版本,为简单上手,在没有cmak
After checking out the code, **edit include/fastllm.h**: in Visual Studio click "File" -> "Advanced Save Options" and select "Unicode (UTF-8 **with signature**) - Codepage 65001", or convert the file to "UTF-8 BOM" encoding in another text editor. (Because gcc on Linux does not recognize the BOM, this change can only be made manually.)
* **CPU version**
  * If CUDA is not installed on this machine, open the Win32Demo project "Properties", go to "Linker" -> "Input" -> "Additional Dependencies" and click "Inherit from parent or project defaults".
* **GPU version**
  - CUDA, including its Visual Studio Integration, must be installed correctly;
  - Configure the CUDA_PATH environment variable correctly so that it points to the CUDA version to build against;
  - In Solution Explorer, remove fastllm.vcproj and add fastllm-gpu.vcproj;
  - For the fastllm-gpu project, manually add the customization file of the installed CUDA under "Build Dependencies" -> "Build Customizations";
  - For the fastllm-gpu project, open "Properties" and configure the supported [GPU compute capabilities](https://developer.nvidia.com/cuda-gpus#compute) under "CUDA C/C++" -> "Device" -> "Code Generation";
  - On the Win32Demo project, choose "Add" -> "Reference" and check the fastllm-gpu project;
  - On the Win32Demo project, define the preprocessor macro "USE_CUDA".
### Usage:
......@@ -61,15 +67,15 @@ Android,使用Android studio工具建立的一個Android平台上运行LLM程
### Usage:
1. Open the project directly in Android Studio and run it.
2. Or download the apk from the release directory and try it out.
3. You can also build the main binary with the CMake toolchain (see the README on the project home page for the exact steps) and run it via adb shell:
    1. `adb push main /data/local/tmp` to push the main binary to the phone's tmp directory,
    2. `adb shell`,
    3. `cd /data/local/tmp`,
    4. `./main` to run it.
Note: the demo apk copies the model file into the app's data directory so that native code can read it, so the device needs free space of at least twice the model size.
\ No newline at end of file
......@@ -135,6 +135,7 @@ int chatllm(const char* prompt, int type) {
}, *generationConfig);
history = model->MakeHistory(history, sRound, input, ret);
sRound++;
return ret.length();
}
......
......@@ -71,7 +71,6 @@
<LinkIncremental>true</LinkIncremental>
<LibraryPath>$(CUDA_PATH)\lib\Win32;$(LibraryPath)</LibraryPath>
<IncludePath>$(CUDA_PATH)\include;$(IncludePath)</IncludePath>
<TargetExt>.lib</TargetExt>
<OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
<IntDir>$(Platform)\$(Configuration)\</IntDir>
</PropertyGroup>
......@@ -79,13 +78,11 @@
<LinkIncremental>true</LinkIncremental>
<IncludePath>$(CUDA_PATH)\include;$(IncludePath)</IncludePath>
<LibraryPath>$(CUDA_PATH)\lib\x64;$(LibraryPath)</LibraryPath>
<TargetExt>.lib</TargetExt>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<LibraryPath>$(CUDA_PATH)\lib\Win32;$(LibraryPath)</LibraryPath>
<IncludePath>$(CUDA_PATH)\include;$(IncludePath)</IncludePath>
<TargetExt>.lib</TargetExt>
<OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
<IntDir>$(Platform)\$(Configuration)\</IntDir>
</PropertyGroup>
......@@ -93,7 +90,6 @@
<LinkIncremental>false</LinkIncremental>
<IncludePath>$(CUDA_PATH)\include;$(IncludePath)</IncludePath>
<LibraryPath>$(CUDA_PATH)\lib\x64;$(LibraryPath)</LibraryPath>
<TargetExt>.lib</TargetExt>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
......@@ -111,7 +107,7 @@
<SubSystem>Windows</SubSystem>
</Link>
<CudaCompile>
<CodeGeneration>compute_61,sm_61;%(CodeGeneration)</CodeGeneration>
<CodeGeneration>compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration)</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
......@@ -130,7 +126,7 @@
<SubSystem>Windows</SubSystem>
</Link>
<CudaCompile>
<CodeGeneration>compute_61,sm_61;%(CodeGeneration)</CodeGeneration>
<CodeGeneration>compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration)</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
......@@ -153,7 +149,7 @@
<OptimizeReferences>true</OptimizeReferences>
</Link>
<CudaCompile>
<CodeGeneration>compute_61,sm_61;%(CodeGeneration)</CodeGeneration>
<CodeGeneration>compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration)</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
......@@ -176,7 +172,7 @@
<OptimizeReferences>true</OptimizeReferences>
</Link>
<CudaCompile>
<CodeGeneration>compute_61,sm_61;%(CodeGeneration)</CodeGeneration>
<CodeGeneration>compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration)</CodeGeneration>
<FastMath>true</FastMath>
</CudaCompile>
</ItemDefinitionGroup>
......@@ -191,6 +187,7 @@
<ClInclude Include="..\..\include\models\basellm.h" />
<ClInclude Include="..\..\include\models\chatglm.h" />
<ClInclude Include="..\..\include\models\factoryllm.h" />
<ClInclude Include="..\..\include\models\glm.h" />
<ClInclude Include="..\..\include\models\llama.h" />
<ClInclude Include="..\..\include\models\moss.h" />
<ClInclude Include="..\..\include\models\qwen.h" />
......@@ -208,6 +205,7 @@
<ClCompile Include="..\..\src\model.cpp" />
<ClCompile Include="..\..\src\models\basellm.cpp" />
<ClCompile Include="..\..\src\models\chatglm.cpp" />
<ClCompile Include="..\..\src\models\glm.cpp" />
<ClCompile Include="..\..\src\models\llama.cpp" />
<ClCompile Include="..\..\src\models\moss.cpp" />
<ClCompile Include="..\..\src\models\qwen.cpp" />
......
......@@ -72,25 +72,21 @@
<LinkIncremental>true</LinkIncremental>
<LibraryPath>$(CUDA_PATH)\lib\Win32;$(LibraryPath)</LibraryPath>
<IncludePath>$(CUDA_PATH)\include;$(IncludePath)</IncludePath>
<TargetExt>.lib</TargetExt>
<OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
<IntDir>$(Platform)\$(Configuration)\</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetExt>.lib</TargetExt>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<LibraryPath>$(CUDA_PATH)\lib\Win32;$(LibraryPath)</LibraryPath>
<IncludePath>$(CUDA_PATH)\include;$(IncludePath)</IncludePath>
<TargetExt>.lib</TargetExt>
<OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
<IntDir>$(Platform)\$(Configuration)\</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetExt>.lib</TargetExt>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
......@@ -172,10 +168,6 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
<CudaCompile>
<CodeGeneration>compute_61,sm_61;%(CodeGeneration)</CodeGeneration>
<FastMath>true</FastMath>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\..\include\device.h" />
......@@ -187,6 +179,7 @@
<ClInclude Include="..\..\include\models\basellm.h" />
<ClInclude Include="..\..\include\models\chatglm.h" />
<ClInclude Include="..\..\include\models\factoryllm.h" />
<ClInclude Include="..\..\include\models\glm.h" />
<ClInclude Include="..\..\include\models\llama.h" />
<ClInclude Include="..\..\include\models\moss.h" />
<ClInclude Include="..\..\include\models\qwen.h" />
......@@ -202,6 +195,7 @@
<ClCompile Include="..\..\src\model.cpp" />
<ClCompile Include="..\..\src\models\basellm.cpp" />
<ClCompile Include="..\..\src\models\chatglm.cpp" />
<ClCompile Include="..\..\src\models\glm.cpp" />
<ClCompile Include="..\..\src\models\llama.cpp" />
<ClCompile Include="..\..\src\models\moss.cpp" />
<ClCompile Include="..\..\src\models\qwen.cpp" />
......
......@@ -252,7 +252,11 @@ struct WorkQueue {
while (true) {
std::unique_lock <std::mutex> lock(ts->locker);
if (ts->activateQueryNumber >= ts->maxActivateQueryNumber) {
#ifdef WIN32
Sleep(0);
#else
sleep(0);
#endif
continue;
}
if (ts->q.empty()) {
......@@ -435,7 +439,11 @@ int main(int argc, char** argv) {
buff[size] = 0;
while (workQueue.q.size() > workQueue.maxActivateQueryNumber) {
sleep(0);
#ifdef WIN32
Sleep(0);
#else
sleep(0);
#endif
}
workQueue.Push(buff, client);
}
......
//
// Created by huangyuyang on 6/9/23.
//
#include "model.h"
#include "utils.h"
#include "fstream"
#include <unistd.h>
#if defined(_WIN32) or defined(_WIN64)
#include <codecvt>
......@@ -28,6 +25,7 @@ struct BenchmarkConfig {
int batch = -1; // batch size; -1 means use the number of lines in the input file as the batch
std::string file; // input file
std::string output; // output file; if unset, results are printed to the screen
int runloop = 0;
};
void Usage() {
......@@ -38,6 +36,7 @@ void Usage() {
std::cout << "<-l|--limit> <args>: 输出token数限制" << std::endl;
std::cout << "<-b|--batch> <args>: batch数" << std::endl;
std::cout << "<-f|--file> <args>: 输入文件,文件中每行一个prompt,如果行数不足batch则用之前的prompt补充" << std::endl;
std::cout << "<-o|--output> <args>: 输出结果写文件,如果不设定则输出到屏幕" << std::endl;
}
void ParseArgs(int argc, char **argv, BenchmarkConfig &config) {
......@@ -62,6 +61,8 @@ void ParseArgs(int argc, char **argv, BenchmarkConfig &config) {
config.file = sargv[++i];
} else if (sargv[i] == "-o" || sargv[i] == "--output") {
config.output = sargv[++i];
} else if (sargv[i] == "--loop") {
config.runloop = 1;
} else {
Usage();
exit(-1);
......@@ -69,6 +70,11 @@ void ParseArgs(int argc, char **argv, BenchmarkConfig &config) {
}
}
static double GetSpan(std::chrono::system_clock::time_point time1, std::chrono::system_clock::time_point time2) {
auto duration = std::chrono::duration_cast<std::chrono::microseconds> (time2 - time1);
return double(duration.count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den;
};
int main(int argc, char **argv) {
BenchmarkConfig config;
ParseArgs(argc, argv, config);
......@@ -79,11 +85,13 @@ int main(int argc, char **argv) {
exit(0);
}
model_file.close();
// fastllm::SetDeviceMap({{"cuda:0", 1}, {"cuda:1", 1}});
auto model = fastllm::CreateLLMModelFromFile(config.path);
fastllm::GenerationConfig generationConfig;
generationConfig.output_token_limit = config.limit;
generationConfig.repeat_penalty = 1.1;
fastllm::PrintInstructionInfo();
// fastllm::PrintInstructionInfo();
std::vector <std::string> inputs;
if (config.file != "") {
std::ifstream finputs(config.file, std::ios::in);
......@@ -119,44 +127,81 @@ int main(int argc, char **argv) {
promptTokenNum += model->weight.tokenizer.Encode(inputs[i]).Count(0);
}
std::vector <std::string> outputs;
static int tokens = 0;
auto st = std::chrono::system_clock::now();
static auto promptTime = st;
model->ResponseBatch(inputs, outputs, [](int index, std::vector<std::string> &contents) {
if (index != -1) {
if (index == 0) {
promptTime = std::chrono::system_clock::now();
} else {
for (int i = 0; i < contents.size(); i++) {
tokens += (contents[i].size() > 0);
if(config.runloop != 1) {
std::vector <std::string> outputs;
static int tokens = 0;
auto st = std::chrono::system_clock::now();
static auto promptTime = st;
model->ResponseBatch(inputs, outputs, [](int index, std::vector<std::string> &contents) {
if (index != -1) {
if (index == 0) {
promptTime = std::chrono::system_clock::now();
} else {
for (int i = 0; i < contents.size(); i++) {
tokens += (contents[i].size() > 0);
}
}
}
}
}, generationConfig);
float promptSpend = fastllm::GetSpan(st, promptTime);
float spend = fastllm::GetSpan(promptTime, std::chrono::system_clock::now());
}, generationConfig);
float promptSpend = fastllm::GetSpan(st, promptTime);
float spend = fastllm::GetSpan(promptTime, std::chrono::system_clock::now());
if (config.output != "") {
FILE *fo = fopen(config.output.c_str(), "w");
for (int i = 0; i < outputs.size(); i++) {
fprintf(fo, "[ user: \"%s\", model: \"%s\"]\n", inputs[i].c_str(), outputs[i].c_str());
}
fclose(fo);
} else {
for (int i = 0; i < outputs.size(); i++) {
#if defined(_WIN32) or defined(_WIN64)
printf("[ user: \"%s\", model: \"%s\"]\n", utf8_to_gbk(inputs[i]).c_str(), utf8_to_gbk(outputs[i]).c_str());
#else
printf("[ user: \"%s\", model: \"%s\"]\n", inputs[i].c_str(), outputs[i].c_str());
#endif
if (config.output != "") {
FILE *fo = fopen(config.output.c_str(), "w");
for (int i = 0; i < outputs.size(); i++) {
fprintf(fo, "[ user: \"%s\", model: \"%s\"]\n", inputs[i].c_str(), outputs[i].c_str());
}
fclose(fo);
} else {
for (int i = 0; i < outputs.size(); i++) {
#if defined(_WIN32) or defined(_WIN64)
printf("[ user: \"%s\", model: \"%s\"]\n", utf8_to_gbk(inputs[i]).c_str(), utf8_to_gbk(outputs[i]).c_str());
#else
printf("[ user: \"%s\", model: \"%s\"]\n", inputs[i].c_str(), outputs[i].c_str());
#endif
}
}
printf("batch: %d\n", (int)inputs.size());
printf("prompt token number = %d\n", promptTokenNum);
printf("prompt use %f s\n", promptSpend);
printf("prompt speed = %f tokens / s\n", (float)promptTokenNum / promptSpend);
printf("output %d tokens\nuse %f s\nspeed = %f tokens / s\n", tokens, spend, tokens / spend);
}
else
{
static int tokens = 0;
while(true){
tokens = 0;
std::vector <std::string> outputs;
auto st = std::chrono::system_clock::now();
static auto promptTime = st;
model->ResponseBatch(inputs, outputs, [](int index, std::vector<std::string> &contents) {
if (index != -1) {
if (index == 0) {
promptTime = std::chrono::system_clock::now();
} else {
for (int i = 0; i < contents.size(); i++) {
tokens += (contents[i].size() > 0);
}
}
}
}, generationConfig);
float promptSpend = fastllm::GetSpan(st, promptTime);
float spend = fastllm::GetSpan(promptTime, std::chrono::system_clock::now());
printf("batch: %d\n", (int)inputs.size());
printf("prompt token number = %d\n", promptTokenNum);
printf("prompt use %f s\n", promptSpend);
printf("prompt speed = %f tokens / s\n", (float)promptTokenNum / promptSpend);
printf("output %d tokens\nuse %f s\nspeed = %f tokens / s\n", tokens, spend, tokens / spend);
if (config.output != "") {
FILE *fo = fopen(config.output.c_str(), "w");
for (int i = 0; i < outputs.size(); i++) {
fprintf(fo, "[ user: \"%s\", model: \"%s\"]\n", inputs[i].c_str(), outputs[i].c_str());
}
fclose(fo);
}
pid_t pid = getpid();
printf("pid %d : batch=%d, prompt %d tokens, prompt use %0.2fs, prompt speed=%0.2ftokens/s, output %d tokens, use %0.2fs, tps=%0.2ftokens/s\n",
pid, (int)inputs.size(), promptTokenNum, promptSpend, (float)promptTokenNum / promptSpend, tokens, spend, tokens / spend);
}
}
return 0;
}
\ No newline at end of file
#!/bin/bash
#fp16
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts/beijing.txt -l 128 -b 1
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts/beijing.txt -l 128 -b 16
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts/beijing.txt -l 128 -b 18
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 32
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 64
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 96
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 100
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 102
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 1
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 16
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 18
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 32
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 64
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 92
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 96
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 1
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 6
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 8
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 16
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 30
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 32
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 34
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 1
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 2
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 8
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 16
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-fp16.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 20
#int8
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts/beijing.txt -l 128 -b 1
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts/beijing.txt -l 128 -b 16
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts/beijing.txt -l 128 -b 18
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 32
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 64
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 96
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 112
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 120
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 124
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 128
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 144
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 146
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 148
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//beijing.txt -l 128 -b 150
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 1
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 16
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 18
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 32
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 64
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 96
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 112
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//100tokens.txt -l 128 -b 128
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 1
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 6
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 8
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 15
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 16
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 32
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 36
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 40
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 48
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//500tokens.txt -l 128 -b 52
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 1
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 2
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 7
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 8
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 16
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 18
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 20
./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 21
#./benchmark -p /home/xzhou/baichuan-13b-chat-flm/baichuan-13b-chat-int8.flm -f /home/xzhou/prompts//1000tokens.txt -l 128 -b 26
\ No newline at end of file
#!/bin/bash
# Command to run
program="./benchmark -p ../../../models/chatglm2-6b/chatglm2-6b-flm/chatglm2-6b-int8.flm -f ../../../models/chatglm2-6b/prompts/100tokens.txt --loop"
# Number of instances to launch
num_instances=2
# Start the background processes
for ((i=1; i<=num_instances; i++)); do
$program &
done
# Stress-test duration in seconds
test_duration=120
# Let the stress test run for the configured duration
sleep $test_duration
# Kill all background benchmark processes
pkill -f benchmark
pkill -f benchmark