Commit 9484fd1c authored by xiabo

Adapt to 0.1.0

parent 477f2db8
......@@ -17,10 +17,10 @@ project(TurboMind LANGUAGES CXX CUDA)
find_package(CUDA 10.2 REQUIRED)
if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11")
add_definitions("-DENABLE_BF16")
message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag")
endif()
# if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11")
# add_definitions("-DENABLE_BF16")
# message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag")
# endif()
# if((${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11" AND ${CUDA_VERSION_MINOR} VERSION_GREATER_EQUAL "8") OR (${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "12"))
# add_definitions("-DENABLE_FP8")
......@@ -44,18 +44,18 @@ option(BUILD_TEST "Build tests" OFF)
include(FetchContent)
FetchContent_Declare(
repo-cutlass
GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
GIT_TAG 6f47420213f757831fae65c686aa471749fa8d60
)
#FetchContent_Declare(
# repo-cutlass
# GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
# GIT_TAG cc85b64cf676c45f98a17e3a47c0aafcf817f088
#)
set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
#set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_MakeAvailable(repo-cutlass)
#FetchContent_MakeAvailable(repo-cutlass)
set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
#set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
#set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)
......@@ -128,6 +128,7 @@ endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl") # -Xptxas -v
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --gpu-max-threads-per-block=1024")
set(SM_SETS 52 60 61 70 75 80 86 89 90)
set(USING_WMMA False)
......@@ -292,7 +293,8 @@ endif()
if (BUILD_MULTI_GPU)
list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})
list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
#list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
list(APPEND COMMON_LIB_DIRS /opt/mpi/lib)
endif()
if(USE_TRITONSERVER_DATATYPE)
......@@ -337,11 +339,11 @@ endif()
add_library(transformer-shared SHARED
$<TARGET_OBJECTS:BaseSamplingLayer>
$<TARGET_OBJECTS:DynamicDecodeLayer>
$<TARGET_OBJECTS:llama_fmha>
$<TARGET_OBJECTS:flash_attention2>
# $<TARGET_OBJECTS:llama_fmha>
# $<TARGET_OBJECTS:flash_attention2>
$<TARGET_OBJECTS:Llama>
$<TARGET_OBJECTS:LlamaTritonBackend>
$<TARGET_OBJECTS:gemm_s4_f16>
# $<TARGET_OBJECTS:gemm_s4_f16>
$<TARGET_OBJECTS:TopKSamplingLayer>
$<TARGET_OBJECTS:TopPSamplingLayer>
$<TARGET_OBJECTS:TransformerTritonBackend>
......@@ -380,15 +382,16 @@ target_link_libraries(transformer-shared PUBLIC
endif()
if(USE_NVTX)
target_link_libraries(transformer-shared PUBLIC
-lnvToolsExt
)
#target_link_libraries(transformer-shared PUBLIC
# -lnvToolsExt
#)
endif()
set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
#set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
#target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcurand)
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TurboMind)
......
<div align="center">
<img src="resources/lmdeploy-logo.svg" width="450"/>
# <div align="center"><strong>LMDeploy</strong></div>
## Introduction
LMDeploy, jointly developed by the [MMDeploy](https://github.com/open-mmlab/mmdeploy) and [MMRazor](https://github.com/open-mmlab/mmrazor) teams, is a full suite of lightweight compression, deployment, and serving solutions for LLM tasks.
This toolkit provides the following core features:
[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy.readthedocs.io/en/latest/)
[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions)
[![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy)
[![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
- **Efficient Inference Engine (TurboMind)**: Based on [FasterTransformer](https://github.com/NVIDIA/FasterTransformer), we have implemented the efficient inference engine TurboMind, which supports inference of InternLM, LLaMA, vicuna and other models on NVIDIA GPUs.
English | [简体中文](README_zh-CN.md)
- **Interactive Inference Mode**: By caching the attention k/v during multi-round dialogues, the engine remembers the conversation history and avoids reprocessing past sessions.
</div>
- **Multi-GPU Deployment and Quantization**: We provide comprehensive model deployment and quantization support, validated at different scales.
<p align="center">
👋 join us on <a href="https://twitter.com/intern_lm" target="_blank">Twitter</a>, <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=internwx" target="_blank">WeChat</a>
</p>
- **Persistent Batch Inference**: Further optimizes model execution efficiency.
______________________________________________________________________
Persistent batch inference: further optimizes model execution efficiency.
Official LMDeploy GitHub repository: [https://github.com/InternLM/lmdeploy](https://github.com/InternLM/lmdeploy)
## Supported Models
| Model | Model Parallel | FP16 | KV INT8 |
| :----------: | :------: | :--: | :-----: |
| Llama | Yes | Yes | Yes |
| Llama2 | Yes | Yes | Yes |
| InternLM-7B | Yes | Yes | Yes |
| InternLM-20B | Yes | Yes | Yes |
| QWen-7B | Yes | Yes | Yes |
| QWen-14B | Yes | Yes | Yes |
| Baichuan-7B | Yes | Yes | Yes |
| Baichuan2-7B | Yes | Yes | No |
## News 🎉
## Installation
- \[2023/11\] Turbomind supports loading hf model directly. Click [here](./docs/en/load_hf.md) for details.
- \[2023/11\] TurboMind major upgrades, including: Paged Attention, faster attention kernels without sequence length limitation, 2x faster KV8 kernels, Split-K decoding (Flash Decoding), and W4A16 inference for sm_75
- \[2023/09\] TurboMind supports Qwen-14B
- \[2023/09\] TurboMind supports InternLM-20B
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2.
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling
- \[2023/08\] TurboMind supports Windows (tp=1)
- \[2023/08\] TurboMind supports 4-bit inference, 2.4x faster than FP16, the fastest open-source implementation🚀. Check [this](./docs/en/w4a16.md) guide for detailed info
- \[2023/08\] LMDeploy has launched on the [HuggingFace Hub](https://huggingface.co/lmdeploy), providing ready-to-use 4-bit models.
- \[2023/08\] LMDeploy supports 4-bit quantization using the [AWQ](https://arxiv.org/abs/2306.00978) algorithm.
- \[2023/07\] TurboMind supports Llama-2 70B with GQA.
- \[2023/07\] TurboMind supports Llama-2 7B/13B.
- \[2023/07\] TurboMind supports tensor-parallel inference of InternLM.
### Installing by building from source
______________________________________________________________________
## Introduction
LMDeploy is a toolkit for compressing, deploying, and serving LLMs, developed by the [MMRazor](https://github.com/open-mmlab/mmrazor) and [MMDeploy](https://github.com/open-mmlab/mmdeploy) teams. It has the following core features:
- **Efficient Inference Engine (TurboMind)**: Based on [FasterTransformer](https://github.com/NVIDIA/FasterTransformer), we have implemented an efficient inference engine - TurboMind, which supports the inference of LLaMA and its variant models on NVIDIA GPUs.
- **Interactive Inference Mode**: By caching the k/v of attention during multi-round dialogue processes, it remembers dialogue history, thus avoiding repetitive processing of historical sessions.
- **Multi-GPU Model Deployment and Quantization**: We provide comprehensive model deployment and quantization support, validated at different scales.
- **Persistent Batch Inference**: Further optimization of model execution efficiency.
![PersistentBatchInference](https://github.com/InternLM/lmdeploy/assets/67539920/e3876167-0671-44fc-ac52-5a0f9382493e)
## Supported Models
`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`. You can run `lmdeploy list` to check the supported model names.
### TurboMind
> **Note**<br />
> W4A16 inference requires Nvidia GPU with Ampere architecture or above.
| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :----------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
| SOLAR | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | Yes | No |
| QWen-14B | Yes | Yes | Yes | Yes | No |
| Baichuan-7B | Yes | Yes | Yes | Yes | No |
| Baichuan2-7B | Yes | Yes | Yes | Yes | No |
| Code Llama | Yes | Yes | No | No | No |
### Pytorch
| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :---------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | No | No | No |
| Llama2 | Yes | Yes | No | No | No |
| InternLM-7B | Yes | Yes | No | No | No |
## Performance
**Case I**: output token throughput with fixed input token and output token number (1, 2048)
**Case II**: request throughput with real conversation data
Test Setting: LLaMA-7B, NVIDIA A100(80G)
The output token throughput of TurboMind exceeds 2000 tokens/s, which is about 5% - 15% higher than DeepSpeed overall and outperforms huggingface transformers by up to 2.3x.
And the request throughput of TurboMind is 30% higher than vLLM.
![benchmark](https://github.com/InternLM/lmdeploy/assets/4560679/7775c518-608e-4e5b-be73-7645a444e774)
## Quick Start
### Installation
Install lmdeploy with pip (Python 3.8+) or [from source](./docs/en/build.md)
```shell
pip install lmdeploy
#### Preparing the build environment
Pull the image provided by SourceFind and start a Docker container:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:lmdeploy-dtk2310-torch1.13-py38
> **Note**<br />
> `pip install lmdeploy` can only install the runtime required packages. If users want to run codes from modules like `lmdeploy.lite` and `lmdeploy.serve`, they need to install the extra required packages.
> For instance, running `pip install lmdeploy[lite]` would install extra dependencies for `lmdeploy.lite` module.
>
> - `all`: Install lmdeploy with all dependencies in `requirements.txt`
> - `lite`: Install lmdeploy with extra dependencies in `requirements/lite.txt`
> - `serve`: Install lmdeploy with dependencies in `requirements/serve.txt`
# <Image ID>: replace with the ID of the Docker image pulled above
# <Host Path>: path on the host
# <Container Path>: mapped path inside the container
docker run -it --name baichuan --shm-size=1024G -v /opt/hyhal:/opt/hyhal --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v <Host Path>:<Container Path> <Image ID> /bin/bash
```
Notes:
### Deploy InternLM
1. When starting Docker, the `-v /opt/hyhal:/opt/hyhal` mount must not be omitted.
To use the TurboMind inference engine, you need to first convert the model into TurboMind format. Currently, we support both online and offline conversion. With online conversion, TurboMind can load the Huggingface model directly; with offline conversion, you need to save the converted model before using it.
2. If you are not using the image provided by SourceFind, set up the environment as follows (if installation is slow, add a mirror, e.g. pip3 install xxx -i https://pypi.tuna.tsinghua.edu.cn/simple/):
```
pip3 install -r requirements.txt
pip3 install urllib3==1.24
yum install rapidjson
# gcc >= 9 is required, so install a newer gcc; if gcc 7 must be used, check out the corresponding gcc7 branch
yum install -y centos-release-scl
yum install -y devtoolset-9
scl enable devtoolset-9 bash
# set the NCCL environment variable
export NCCL_LAUNCH_MODE=GROUP
```
The following uses [internlm/internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) as an example to show how to use TurboMind with online conversion. You can refer to [load_hf.md](docs/en/load_hf.md) for other methods.
#### Building and installing from source
- Download the code
  Check out the branch that matches your needs.
- Two build options are provided (run them from the lmdeploy directory), as shown below:
```
1. Build and install from source
mkdir build && cd build
sh ../generate.sh
make -j 32 && make install
cd .. && python3 setup.py install
2. Build a wheel package and install it
# install wheel
pip3 install wheel
mkdir build && cd build
sh ../generate.sh
make -j 32 && make install
cd .. && python3 setup.py bdist_wheel
cd dist && pip3 install lmdeploy*
```
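After either build method, a quick sanity check of the installation (the version string below is illustrative):
```
python3 -c "import lmdeploy; print(lmdeploy.__version__)"   # e.g. 0.1.0
lmdeploy list   # list the model names supported by the installed backends
```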
#### Inference by TurboMind
## Model Serving
```shell
lmdeploy chat turbomind internlm/internlm-chat-7b --model-name internlm-chat-7b
### Deploying the [LLaMA](https://huggingface.co/huggyllama) service
Download the llama model from [here](https://huggingface.co/huggyllama) and deploy the service with the following commands:
Take the 7B model as an example:
```
1. Convert the model
# <model_name>: name of the model ('llama', 'internlm', 'vicuna', 'internlm-chat-7b', 'internlm-chat', 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'baichuan-7b', 'baichuan2-7b', 'llama2', 'qwen-7b', 'qwen-14b')
# <model_path>: path to the model
# <model_format>: format of the model ('llama', 'hf', 'qwen')
# <tokenizer_path>: path to the tokenizer model (default None; qwen.tiktoken is looked up under model_path)
# <dst_path>: destination path for the converted output (default ./workspace)
# <tp>: number of GPUs for tensor parallelism; should be a power of 2
> **Note**<br /> The internlm/internlm-chat-7b model will be downloaded under `.cache` folder. You can also use a local path here.
lmdeploy convert --model_name llama --model_path /path/to/model --model_format hf --tokenizer_path None --dst_path ./workspace_llama --tp 1
> **Note**<br />
> When inferring with FP16 precision, the InternLM-7B model requires at least 15.7G of GPU memory overhead on TurboMind. <br />
> It is recommended to use NVIDIA cards such as 3090, V100, A100, etc.
> Disabling GPU ECC can free up 10% of memory; try `sudo nvidia-smi --ecc-config=0` and reboot the system.
2. Run
# run in the terminal
lmdeploy chat turbomind --model_path ./workspace_llama --tp 1 # after typing a question, press Enter twice to run inference
> **Note**<br />
> Tensor parallel is available to perform inference on multiple GPUs. Add `--tp=<num_gpu>` on `chat` to enable runtime TP.
# run as a server:
Run in the terminal:
# <model_path_or_server>: path to the deployed model, a tritonserver URL, or a restful api URL. The first runs the service directly with gradio; a tritonserver URL runs against tritonserver by default. If the URL points to a restful api, also enable the `restful_api` flag.
# <server_name>: IP address of the gradio server
# <server_port>: port of the gradio server
# <batch_size>: batch size when running TurboMind directly (default 32)
# <tp>: number of GPUs for tensor parallelism; should be a power of 2 (keep it consistent with the value used for model conversion)
# <restful_api>: flag describing model_path_or_server (default False)
#### Serving with gradio
lmdeploy serve gradio --model_path_or_server ./workspace_llama --server_name {ip} --server_port {port} --batch_size 32 --tp 1 --restful_api False
```shell
# install lmdeploy with extra dependencies
pip install lmdeploy[serve]
Open {ip}:{port} in a browser to start chatting.
lmdeploy serve gradio internlm/internlm-chat-7b --model-name internlm-chat-7b
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
#### Serving with Restful API
Launch inference server by:
```shell
# install lmdeploy with extra dependencies
pip install lmdeploy[serve]
lmdeploy serve api_server internlm/internlm-chat-7b --model-name internlm-chat-7b --instance_num 32 --tp 1
### Deploying the [llama2](https://huggingface.co/meta-llama) service
Download the llama2 model from [here](https://huggingface.co/meta-llama) and deploy the service with the following commands:
Take the 7B model as an example:
```
Then, you can communicate with it by command line,
```shell
# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
lmdeploy serve api_client api_server_url
1. Convert the model
lmdeploy convert --model_name llama2 --model_path /path/to/model --model_format hf --tokenizer_path None --dst_path ./workspace_llama2 --tp 1 #
2. Run
# run in the terminal
lmdeploy chat turbomind --model_path ./workspace_llama2 --tp 1
# run as a server:
Run in the terminal:
lmdeploy serve gradio --model_path_or_server ./workspace_llama2 --server_name {ip} --server_port {port} --batch_size 32 --tp 1 --restful_api False
Open {ip}:{port} in a browser to start chatting.
```
or webui,
```shell
# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
### Deploying the [internlm](https://huggingface.co/internlm/) service
Download the internlm model from [here](https://huggingface.co/internlm) and deploy the service with the following commands:
Take the 7B model as an example:
```
Refer to [restful_api.md](docs/en/restful_api.md) for more details.
### Inference with PyTorch
For detailed instructions on Inference pytorch models, see [here](docs/en/pytorch.md).
#### Single GPU
```shell
lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
--seed 0
1. Convert the model
lmdeploy convert --model_name model_name --model_path /path/to/model --model_format hf --tokenizer_path None --dst_path ./workspace_intern --tp 1 # set model_name to internlm-chat or internlm depending on the model type
2. Run
# run in the terminal
lmdeploy chat turbomind --model_path ./workspace_intern --tp 1
# run as a server:
Run in the terminal:
lmdeploy serve gradio --model_path_or_server ./workspace_intern --server_name {ip} --server_port {port} --batch_size 32 --tp 1 --restful_api False
Open {ip}:{port} in a browser to start chatting.
```
#### Tensor Parallel with DeepSpeed
```shell
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \
$NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
--seed 0
### Deploying the [baichuan](https://huggingface.co/baichuan-inc) service
Download the baichuan model from [here](https://huggingface.co/baichuan-inc) and deploy the service with the following commands:
Take the 7B model as an example:
```
1. Convert the model
lmdeploy convert --model_name baichuan-7b --model_path /path/to/model --model_format hf --tokenizer_path None --dst_path ./workspace_baichuan --tp 1
2. Run
# run in the terminal
lmdeploy chat turbomind --model_path ./workspace_baichuan --tp 1
# run as a server:
Run in the terminal:
lmdeploy serve gradio --model_path_or_server ./workspace_baichuan --server_name {ip} --server_port {port} --batch_size 32 --tp 1 --restful_api False
Open {ip}:{port} in a browser to start chatting.
```
You need to install deepspeed first to use this feature.
### Deploying the [baichuan2](https://huggingface.co/baichuan-inc) service
Download the baichuan2 model from [here](https://huggingface.co/baichuan-inc) and deploy the service with the following commands:
Take the 7B model as an example:
```
pip install deepspeed
1. Convert the model
lmdeploy convert --model_name baichuan2-7b --model_path /path/to/model --model_format hf --tokenizer_path None --dst_path ./workspace_baichuan2 --tp 1
2. Run
# run in the terminal
lmdeploy chat turbomind --model_path ./workspace_baichuan2 --tp 1
# run as a server:
Run in the terminal:
lmdeploy serve gradio --model_path_or_server ./workspace_baichuan2 --server_name {ip} --server_port {port} --batch_size 32 --tp 1 --restful_api False
Open {ip}:{port} in a browser to start chatting.
```
## Quantization
#### Weight INT4 Quantization
LMDeploy uses the [AWQ](https://arxiv.org/abs/2306.00978) algorithm for model weight quantization.
[Click here](./docs/en/w4a16.md) to view the test results for weight int4 usage.
#### KV Cache INT8 Quantization
[Click here](./docs/en/kv_int8.md) to view the usage method, implementation formula, and test results for kv int8.
> **Warning**<br />
> Runtime tensor parallelism for quantized models is not available. Please set `--tp` on `deploy` to enable static TP.
## Contributing
### Deploying the [qwen](https://huggingface.co/Qwen) service
Download the qwen model from [here](https://huggingface.co/Qwen) and deploy the service with the following commands:
Take the 7B model as an example:
```
1. Convert the model
lmdeploy convert --model_name qwen-7b --model_path /path/to/model --model_format qwen --tokenizer_path None --dst_path ./workspace_qwen --tp 1
2. Run
# run in the terminal
lmdeploy chat turbomind --model_path ./workspace_qwen --tp 1
# run as a server:
Run in the terminal:
lmdeploy serve gradio --model_path_or_server ./workspace_qwen --server_name {ip} --server_port {port} --batch_size 32 --tp 1 --restful_api False
Open {ip}:{port} in a browser to start chatting.
```
We appreciate all contributions to LMDeploy. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline.
## Result
![qwen inference](docs/dcu/qwen推理.gif)
## Acknowledgement
### For details, refer to [docs](./docs/zh_cn/serving.md)
## Checking the version
- python -c "import lmdeploy; lmdeploy.\_\_version__": queries the package's version number, e.g. 0.0.6; the version number is kept in sync with the official release.
- [FasterTransformer](https://github.com/NVIDIA/FasterTransformer)
- [llm-awq](https://github.com/mit-han-lab/llm-awq)
## Known Issue
-
## License
## Note
+ If downloading with pip install is too slow, you can add the Tsinghua PyPI mirror: -i https://pypi.tuna.tsinghua.edu.cn/simple/
This project is released under the [Apache 2.0 license](LICENSE).
## Other References
- [README_origin](README_origin.md)
- [README_zh-CN](README_zh-CN.md)
<div align="center">
<img src="resources/lmdeploy-logo.svg" width="450"/>
[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy.readthedocs.io/en/latest/)
[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions)
[![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy)
[![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
English | [简体中文](README_zh-CN.md)
</div>
<p align="center">
👋 join us on <a href="https://twitter.com/intern_lm" target="_blank">Twitter</a>, <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=internwx" target="_blank">WeChat</a>
</p>
______________________________________________________________________
## News 🎉
- \[2023/11\] Turbomind supports loading hf model directly. Click [here](./docs/en/load_hf.md) for details.
- \[2023/11\] TurboMind major upgrades, including: Paged Attention, faster attention kernels without sequence length limitation, 2x faster KV8 kernels, Split-K decoding (Flash Decoding), and W4A16 inference for sm_75
- \[2023/09\] TurboMind supports Qwen-14B
- \[2023/09\] TurboMind supports InternLM-20B
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2.
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling
- \[2023/08\] TurboMind supports Windows (tp=1)
- \[2023/08\] TurboMind supports 4-bit inference, 2.4x faster than FP16, the fastest open-source implementation🚀. Check [this](./docs/en/w4a16.md) guide for detailed info
- \[2023/08\] LMDeploy has launched on the [HuggingFace Hub](https://huggingface.co/lmdeploy), providing ready-to-use 4-bit models.
- \[2023/08\] LMDeploy supports 4-bit quantization using the [AWQ](https://arxiv.org/abs/2306.00978) algorithm.
- \[2023/07\] TurboMind supports Llama-2 70B with GQA.
- \[2023/07\] TurboMind supports Llama-2 7B/13B.
- \[2023/07\] TurboMind supports tensor-parallel inference of InternLM.
______________________________________________________________________
## Introduction
LMDeploy is a toolkit for compressing, deploying, and serving LLMs, developed by the [MMRazor](https://github.com/open-mmlab/mmrazor) and [MMDeploy](https://github.com/open-mmlab/mmdeploy) teams. It has the following core features:
- **Efficient Inference Engine (TurboMind)**: Based on [FasterTransformer](https://github.com/NVIDIA/FasterTransformer), we have implemented an efficient inference engine - TurboMind, which supports the inference of LLaMA and its variant models on NVIDIA GPUs.
- **Interactive Inference Mode**: By caching the k/v of attention during multi-round dialogue processes, it remembers dialogue history, thus avoiding repetitive processing of historical sessions.
- **Multi-GPU Model Deployment and Quantization**: We provide comprehensive model deployment and quantization support, validated at different scales.
- **Persistent Batch Inference**: Further optimization of model execution efficiency.
![PersistentBatchInference](https://github.com/InternLM/lmdeploy/assets/67539920/e3876167-0671-44fc-ac52-5a0f9382493e)
## Supported Models
`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`. You can run `lmdeploy list` to check the supported model names.
### TurboMind
> **Note**<br />
> W4A16 inference requires Nvidia GPU with Ampere architecture or above.
| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :----------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
| SOLAR | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | Yes | No |
| QWen-14B | Yes | Yes | Yes | Yes | No |
| Baichuan-7B | Yes | Yes | Yes | Yes | No |
| Baichuan2-7B | Yes | Yes | Yes | Yes | No |
| Code Llama | Yes | Yes | No | No | No |
### Pytorch
| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :---------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | No | No | No |
| Llama2 | Yes | Yes | No | No | No |
| InternLM-7B | Yes | Yes | No | No | No |
## Performance
**Case I**: output token throughput with fixed input token and output token number (1, 2048)
**Case II**: request throughput with real conversation data
Test Setting: LLaMA-7B, NVIDIA A100(80G)
The output token throughput of TurboMind exceeds 2000 tokens/s, which is about 5% - 15% higher than DeepSpeed overall and outperforms huggingface transformers by up to 2.3x.
And the request throughput of TurboMind is 30% higher than vLLM.
![benchmark](https://github.com/InternLM/lmdeploy/assets/4560679/7775c518-608e-4e5b-be73-7645a444e774)
## Quick Start
### Installation
Install lmdeploy with pip (Python 3.8+) or [from source](./docs/en/build.md)
```shell
pip install lmdeploy
```
> **Note**<br />
> `pip install lmdeploy` can only install the runtime required packages. If users want to run codes from modules like `lmdeploy.lite` and `lmdeploy.serve`, they need to install the extra required packages.
> For instance, running `pip install lmdeploy[lite]` would install extra dependencies for `lmdeploy.lite` module.
>
> - `all`: Install lmdeploy with all dependencies in `requirements.txt`
> - `lite`: Install lmdeploy with extra dependencies in `requirements/lite.txt`
> - `serve`: Install lmdeploy with dependencies in `requirements/serve.txt`
### Deploy InternLM
To use the TurboMind inference engine, you need to first convert the model into TurboMind format. Currently, we support both online and offline conversion. With online conversion, TurboMind can load the Huggingface model directly; with offline conversion, you need to save the converted model before using it.
The following uses [internlm/internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) as an example to show how to use TurboMind with online conversion. You can refer to [load_hf.md](docs/en/load_hf.md) for other methods.
#### Inference by TurboMind
```shell
lmdeploy chat turbomind internlm/internlm-chat-7b --model-name internlm-chat-7b
```
> **Note**<br /> The internlm/internlm-chat-7b model will be downloaded under `.cache` folder. You can also use a local path here.
> **Note**<br />
> When inferring with FP16 precision, the InternLM-7B model requires at least 15.7G of GPU memory overhead on TurboMind. <br />
> It is recommended to use NVIDIA cards such as 3090, V100, A100, etc.
> Disabling GPU ECC can free up 10% of memory; try `sudo nvidia-smi --ecc-config=0` and reboot the system.
> **Note**<br />
> Tensor parallel is available to perform inference on multiple GPUs. Add `--tp=<num_gpu>` on `chat` to enable runtime TP.
#### Serving with gradio
```shell
# install lmdeploy with extra dependencies
pip install lmdeploy[serve]
lmdeploy serve gradio internlm/internlm-chat-7b --model-name internlm-chat-7b
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
#### Serving with Restful API
Launch inference server by:
```shell
# install lmdeploy with extra dependencies
pip install lmdeploy[serve]
lmdeploy serve api_server internlm/internlm-chat-7b --model-name internlm-chat-7b --instance_num 32 --tp 1
```
Then, you can communicate with it by command line,
```shell
# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
lmdeploy serve api_client api_server_url
```
or webui,
```shell
# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
Refer to [restful_api.md](docs/en/restful_api.md) for more details.
### Inference with PyTorch
For detailed instructions on Inference pytorch models, see [here](docs/en/pytorch.md).
#### Single GPU
```shell
lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
--seed 0
```
#### Tensor Parallel with DeepSpeed
```shell
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \
$NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
--seed 0
```
You need to install deepspeed first to use this feature.
```
pip install deepspeed
```
## Quantization
#### Weight INT4 Quantization
LMDeploy uses the [AWQ](https://arxiv.org/abs/2306.00978) algorithm for model weight quantization.
[Click here](./docs/en/w4a16.md) to view the test results for weight int4 usage.
#### KV Cache INT8 Quantization
[Click here](./docs/en/kv_int8.md) to view the usage method, implementation formula, and test results for kv int8.
> **Warning**<br />
> Runtime tensor parallelism for quantized models is not available. Please set `--tp` on `deploy` to enable static TP.
## Contributing
We appreciate all contributions to LMDeploy. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline.
## Acknowledgement
- [FasterTransformer](https://github.com/NVIDIA/FasterTransformer)
- [llm-awq](https://github.com/mit-han-lab/llm-awq)
## License
This project is released under the [Apache 2.0 license](LICENSE).
#!/bin/sh
builder="-G Ninja"
if [ "$1" == "make" ]; then
builder=""
fi
cmake ${builder} .. \
cmake .. \
-DCMAKE_CXX_COMPILER=nvcc \
-DCMAKE_C_COMPILER=nvcc \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-DCMAKE_INSTALL_PREFIX=./install \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON
-DUSE_NVTX=OFF \
# -DBUILD_TEST=ON
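For reference, the script is invoked from a build directory, matching the build steps in the README above (a minimal sketch):
```
mkdir -p build && cd build
sh ../generate.sh
make -j 32 && make install
```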
# Copyright (c) OpenMMLab. All rights reserved.
from .version import *
from lmdeploy.api import client, pipeline, serve
__all__ = ['pipeline', 'serve', 'client']
......
......@@ -453,7 +453,7 @@ If a question does not make any sense, or is not factually coherent, explain why
ret += f'{user} {self.e_inst} '
return ret
@MODELS.register_module(name='qwen-72b')
@MODELS.register_module(name='qwen-14b')
@MODELS.register_module(name='qwen-7b')
class Qwen7BChat(BaseModel):
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
__dcu_version__ = '0.1.0'
__version__ = '0.1.0'
short_version = __version__
......@@ -27,4 +27,4 @@ def parse_version_info(version_str: str) -> Tuple:
version_info = parse_version_info(__version__)
__all__ = ['__version__', 'version_info', 'parse_version_info']
__all__ = ['__version__', '__dcu_version__', 'version_info', 'parse_version_info']
mmengine-lite
torch
transformers
urllib3<2.0.0
urllib3==1.24
......@@ -5,4 +5,4 @@ safetensors
sentencepiece
tiktoken
torch
transformers>=4.33.0
transformers==4.33.2
fastapi
gradio<4.0.0
gradio==3.50.2
pydantic>2.0.0
shortuuid
uvicorn
......@@ -4,6 +4,11 @@ import sys
from setuptools import find_packages, setup
import subprocess
from typing import Optional, Union
from pathlib import Path
import torch
pwd = os.path.dirname(__file__)
version_file = 'lmdeploy/version.py'
......@@ -14,10 +19,63 @@ def readme():
return content
def get_sha(pytorch_root: Union[str, Path]) -> str:
try:
return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=pytorch_root).decode('ascii').strip()
except Exception:
return 'Unknown'
def get_abi():
try:
command = "echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI"
result = subprocess.run(command, shell=True, capture_output=True, text=True)
output = result.stdout.strip()
abi = "abi" + output.split(" ")[-1]
return abi
except Exception:
return 'abiUnknown'
def get_version_add(sha: Optional[str] = None) -> str:
version=''
lmdeploy_root = os.path.dirname(os.path.abspath(__file__))
add_version_path = os.path.join(os.path.join(lmdeploy_root, "lmdeploy"), "version.py")
if sha != 'Unknown':
if sha is None:
sha = get_sha(lmdeploy_root)
version = 'git' + sha[:7]
# abi
version += "." + get_abi()
# dtk version
if os.getenv("ROCM_PATH"):
rocm_path = os.getenv('ROCM_PATH', "")
rocm_version_path = os.path.join(rocm_path, '.info', "rocm_version")
with open(rocm_version_path, 'r',encoding='utf-8') as file:
lines = file.readlines()
rocm_version=lines[0][:-2].replace(".", "")
version += ".dtk" + rocm_version
# torch version
version += ".torch" + torch.__version__[:4]
lines=[]
with open(add_version_path, 'r',encoding='utf-8') as file:
lines = file.readlines()
lines[2] = "__dcu_version__ = '0.1.0+{}'\n".format(version)
with open(add_version_path, encoding="utf-8",mode="w") as file:
file.writelines(lines)
file.close()
def get_version():
with open(os.path.join(pwd, version_file), 'r') as f:
get_version_add()
version_file = 'lmdeploy/version.py'
with open(version_file, encoding='utf-8') as f:
exec(compile(f.read(), version_file, 'exec'))
return locals()['__version__']
return locals()['__dcu_version__']
def check_ext_modules():
......
......@@ -13,62 +13,63 @@
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
add_library(ban_bad_words STATIC ban_bad_words.cu)
set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(stop_criteria STATIC stop_criteria_kernels.cu)
set_property(TARGET stop_criteria PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET stop_criteria PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET stop_criteria PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET stop_criteria PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(activation_kernels STATIC activation_kernels.cu)
set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(logprob_kernels STATIC logprob_kernels.cu)
set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(unfused_attention_kernels STATIC unfused_attention_kernels.cu)
set_property(TARGET unfused_attention_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET unfused_attention_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET unfused_attention_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET unfused_attention_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(bert_preprocess_kernels STATIC bert_preprocess_kernels.cu)
set_property(TARGET bert_preprocess_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET bert_preprocess_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET bert_preprocess_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET bert_preprocess_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set(decoder_masked_multihead_attention_files
decoder_masked_multihead_attention.cu
)
file(GLOB decoder_masked_multihead_attention_files ${decoder_masked_multihead_attention_files} ./decoder_masked_multihead_attention/*.cu)
add_library(decoder_masked_multihead_attention STATIC ${decoder_masked_multihead_attention_files})
set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_kernels STATIC decoding_kernels.cu)
set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gpt_kernels STATIC gpt_kernels.cu)
set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_topk_kernels STATIC sampling_topk_kernels.cu)
set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_topp_kernels STATIC sampling_topp_kernels.cu)
set_property(TARGET sampling_topp_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_topp_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET sampling_topp_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET sampling_topp_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_penalty_kernels STATIC sampling_penalty_kernels.cu)
set_property(TARGET sampling_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET sampling_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET sampling_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(custom_ar_kernels STATIC custom_ar_kernels.cu)
set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_subdirectory(gemm_s_f16)
add_subdirectory(decoder_multihead_attention)
#add_subdirectory(gemm_s_f16)
......@@ -37,14 +37,14 @@ __forceinline__ __device__ float copysignf_pos(float a, float b)
__inline__ __device__ float tanh_opt(float x)
{
#if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000)
float r;
asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x));
return r;
#else
// #if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000)
// float r;
// asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x));
// return r;
// #else
const float exp_val = -1.f * fabs(2 * x);
return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x);
#endif
// #endif
}
template<typename T>
......@@ -106,7 +106,8 @@ struct ReluActivation<half2> {
static __device__ __forceinline__ half2 apply(const half2& val)
{
const half zero_half = static_cast<half>(0.0f);
return make_half2(val.x > zero_half ? val.x : zero_half, val.y > zero_half ? val.y : zero_half);
// return make_half2(val.x > zero_half ? val.x : zero_half, val.y > zero_half ? val.y : zero_half);
return make_half2(static_cast<half>(val.data[0]) > zero_half ? static_cast<half>(val.data[0]) : zero_half, static_cast<half>(val.data[1]) > zero_half ? static_cast<half>(val.data[1]) : zero_half);
}
};
......@@ -142,7 +143,8 @@ struct SiluActivation<half2> {
using return_type = float2;
static __device__ __forceinline__ float2 apply(const half2& val)
{
return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
// return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
return make_float2(SiluActivation<float>::apply(val.data[0]), SiluActivation<float>::apply(val.data[1]));
}
};
......
......@@ -24,7 +24,12 @@ namespace turbomind {
static inline __device__ uint32_t hadd2(const uint32_t& a, const uint32_t& b)
{
uint32_t c;
asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
// asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
const __half * ha = reinterpret_cast<const __half*>(&a);
const __half * hb = reinterpret_cast<const __half*>(&b);
__half2 h2c = make_half2(ha[0] + hb[0], ha[1] + hb[1]);
__builtin_memcpy(&c, &h2c, sizeof(h2c));
// asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
return c;
}
......@@ -33,7 +38,12 @@ static inline __device__ uint32_t hadd2(const uint32_t& a, const uint32_t& b)
static inline __device__ uint32_t fadd(const uint32_t& a, const uint32_t& b)
{
uint32_t c;
asm volatile("add.f32 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
// asm volatile("add.f32 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
union {float *f_p; const uint32_t *u_p;} x, y, z;
x.u_p = &a;
y.u_p = &b;
z.u_p = &c;
*z.f_p = *x.f_p + *y.f_p;
return c;
}
......@@ -42,10 +52,12 @@ static inline __device__ uint32_t fadd(const uint32_t& a, const uint32_t& b)
static inline __device__ void st_flag_release(uint32_t& flag, uint32_t* flag_addr)
{
#if __CUDA_ARCH__ >= 700
asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
// asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
*flag_addr = flag;
#else
__threadfence_system();
asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
// asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
*flag_addr = flag;
#endif
}
......@@ -54,9 +66,11 @@ static inline __device__ void st_flag_release(uint32_t& flag, uint32_t* flag_add
static inline __device__ void ld_flag_acquire(uint32_t& flag, uint32_t* flag_addr)
{
#if __CUDA_ARCH__ >= 700
asm volatile("ld.global.acquire.sys.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
// asm volatile("ld.global.acquire.sys.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
flag = *flag_addr;
#else
asm volatile("ld.global.volatile.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
// asm volatile("ld.global.volatile.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
flag = *flag_addr;
#endif
}
......
......@@ -27,7 +27,8 @@
#define MAX_ALL_REDUCE_BLOCKS 24
#define FLAG(a) ((uint32_t)((a) % 0x146))
#define RANKS_PER_NODE 8
#define WARP_SIZE 32
// #define WARP_SIZE 32
#define WARP_SIZE 64
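// DCU/ROCm wavefronts are 64 lanes wide, so the warp size is doubled from CUDA's 32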
#define DEFAULT_BLOCK_SIZE 1024
#define DEFALUT_ALGO_AR_SIZE_THRESHOLD 393216
......
......@@ -661,19 +661,68 @@ struct Qk_dot {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
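// Packed f16 dot-accumulate helper: a and b each hold two packed halfs; computes *d = *c + a.lo*b.lo + a.hi*b.hi in fp32.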
__device__ inline void f16mulf16addf32(uint32_t & a, uint32_t & b, const float * c, float * d){
// uint32_t res = 0;
// asm volatile("v_pk_fma_f16 %0, %1,%2,%3" : "=v"(res) : "v"(a), "v"(b), "v"(res));
// __half * h = reinterpret_cast<__half*>(&res);
__half * ha = reinterpret_cast<__half*>(&a);
__half * hb = reinterpret_cast<__half*>(&b);
*d = *c + __half2float(ha[0])*__half2float(hb[0]) + __half2float(ha[1])*__half2float(hb[1]);
}
// row 8 col 4
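// Emulates the warp-level mma.m16n8k8 (f16 inputs, fp32 accumulate) by staging each lane's A/B fragments
// in shared memory and accumulating with scalar multiply-adds, since the tensor-core PTX instruction is
// unavailable on DCU.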
__device__ inline void m16n8k8(const uint32_t * A, const uint32_t * B, /*const float * C,*/ float * D) {
int tid = threadIdx.x;
int baseId = tid / 32 * 32;
__shared__ uint32_t smem[1024*3];
int base = tid*3;
__builtin_memcpy(smem+base, A, sizeof(uint32_t));
__builtin_memcpy(smem+(base+1), A+1, sizeof(uint32_t));
__builtin_memcpy(smem+(base+2), B, sizeof(uint32_t));
__syncthreads();
/* From the perspective of D, each thread computes its own part of D: starting from lane 0 it loops,
   fetching one row of A and two columns of B.
   s is the lane index into the B matrix
   baseA is the lane index into A
   baseB0 is the first column of B fetched by the current thread, baseB1 is the second column
*/
int s = baseId+(tid%4)*8, e = s+4;
for (int i = s; i < e; ++i) {
// A[0]->i A[1]->i+1 B[0]->i+2
int baseA = (tid-tid%4+i-s)*3; // lane index of the first column in the row that the current tid belongs to, plus the stride, then *3
int baseB0 = i*3, baseB1 = (i+4)*3;
f16mulf16addf32(smem[baseA], smem[baseB0+2], D, D);
f16mulf16addf32(smem[baseA], smem[baseB1+2], D+1, D+1);
f16mulf16addf32(smem[baseA+1], smem[baseB0+2], D+2, D+2);
f16mulf16addf32(smem[baseA+1], smem[baseB1+2], D+3, D+3);
}
}
inline __device__ float4 hmma_fp32(const uint2& a, uint32_t b)
{
float4 c = make_float4(0.f, 0.f, 0.f, 0.f); // zero the accumulator, matching the zero C operand of the original mma
float zero = 0.f;
asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n"
" {%0, %1, %2, %3}, \n"
" {%4, %5}, \n"
" {%6}, \n"
" {%7, %7, %7, %7}; \n"
: "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w)
: "r"(a.x), "r"(a.y), "r"(b), "f"(zero));
const uint32_t * A = reinterpret_cast<const uint32_t*>(&a);
const uint32_t * B = reinterpret_cast<const uint32_t*>(&b);
float * C = reinterpret_cast<float*>(&c);
m16n8k8(A, B, C);
// asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n"
// " {%0, %1, %2, %3}, \n"
// " {%4, %5}, \n"
// " {%6}, \n"
// " {%7, %7, %7, %7}; \n"
// : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w)
// : "r"(a.x), "r"(a.y), "r"(b), "f"(zero));
return c;
}
......@@ -721,7 +770,8 @@ struct Qk_dot<uint16_t, 4> {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int WARPS_PER_BLOCK, int WARP_SIZE = 32>
// template<int WARPS_PER_BLOCK, int WARP_SIZE = 32>
template<int WARPS_PER_BLOCK, int WARP_SIZE = 64>
inline __device__ float block_sum(float* red_smem, float sum)
{
......@@ -1192,12 +1242,21 @@ inline __device__ Float8_ dequant(int64_t a, const float scale, const float zp)
inline __device__ int8_t cast_to_int8(float val)
{
union {
int8_t int8[2];
int16_t int16;
};
asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
return int8[0];
// union {
// int8_t int8[2];
// int16_t int16;
// };
// asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
// return int8[0];
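// portable replacement: saturate to [-128, 127] and cast; note that static_cast truncates toward zero rather than rounding to nearest as cvt.rni did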
int8_t dst;
if (val >= 128){
dst = 127;
}else if (val < -128){
dst = -128;
}else{
dst = static_cast<int8_t>(val);
}
return dst;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
......@@ -1321,7 +1380,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
static_assert(Dh_MAX % THREADS_PER_VALUE == 0, "");
// The size of a warp.
constexpr int WARP_SIZE = 32;
// constexpr int WARP_SIZE = 32;
constexpr int WARP_SIZE = 64;
// The number of warps in a threadblock.
constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
......@@ -1554,7 +1614,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
}
// We don't need to apply the linear position bias here since qi - ki = 0 yields the position bias 0.
printf("QK_last[%d] = %f\n", hi, qk);
//printf("QK_last[%d] = %f\n", hi, qk);
qk_max = qk;
qk_smem[tlength - first_step] = qk;
......@@ -1717,9 +1777,9 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
// Broadcast to all the threads in the warp.
qk_max = __shfl_sync(uint32_t(-1), qk_max, 0);
if (threadIdx.x == 0) {
printf("QK_MAX[%d] = %f\n", hi, (float)qk_max);
}
// if (threadIdx.x == 0) {
// printf("QK_MAX[%d] = %f\n", hi, (float)qk_max);
// }
// Compute the logits and start the sum.
float sum = 0.f;
......@@ -1746,9 +1806,9 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
// Compute the sum.
sum = block_sum<WARPS_PER_BLOCK>(&red_smem[WARPS_PER_BLOCK], sum);
if (threadIdx.x == 0) {
printf("SUM[%d] = %f\n", hi, (float)sum);
}
// if (threadIdx.x == 0) {
// printf("SUM[%d] = %f\n", hi, (float)sum);
// }
// Normalize the logits.
float inv_sum = __fdividef(1.f, sum + 1.e-6f);
......
......@@ -218,7 +218,8 @@ inline __device__ bf16_8_t add(bf16_8_t a, bf16_8_t b)
inline __device__ uint16_t add(uint16_t a, uint16_t b)
{
uint16_t c;
asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
// asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
asm volatile("v_add_f16 %0, %1, %2;" : "=v"(c) : "v"(a), "v"(b));
return c;
}
......@@ -227,7 +228,11 @@ inline __device__ uint16_t add(uint16_t a, uint16_t b)
inline __device__ uint32_t add(uint32_t a, uint32_t b)
{
uint32_t c;
asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
// asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
const __half *ha = reinterpret_cast<const __half*>(&a);
const __half *hb = reinterpret_cast<const __half*>(&b);
__half2 h2c = make_half2(ha[0]+hb[0], ha[1]+hb[1]);
__builtin_memcpy(&c, &h2c, sizeof(h2c));
return c;
}
......@@ -263,9 +268,13 @@ inline __device__ uint16_t float_to_half(float f)
} tmp;
#if 0 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // Is it better?
float zero = 0.f;
asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f));
// asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f));
__half h=__float2half(f);
tmp.u16[0] = reinterpret_cast<const uint16_t&>(h);
#else
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
// asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
__half h=__float2half(f);
tmp.u16[0] = reinterpret_cast<const uint16_t&>(h);
#endif
return tmp.u16[0];
}
......@@ -279,10 +288,18 @@ inline __device__ uint32_t float2_to_half2(float2 f)
uint16_t u16[2];
} tmp;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
// asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
__half h1 = __float2half(f.x);
__half h2 = __float2half(f.y);
tmp.u16[0] = reinterpret_cast<const uint16_t&>(h1);
tmp.u16[1] = reinterpret_cast<const uint16_t&>(h2);
#else
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
// asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
// asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
__half h1 = __float2half(f.x);
__half h2 = __float2half(f.y);
tmp.u16[0] = reinterpret_cast<const uint16_t&>(h1);
tmp.u16[1] = reinterpret_cast<const uint16_t&>(h2);
#endif
return tmp.u32;
}
......@@ -292,7 +309,8 @@ inline __device__ uint32_t float2_to_half2(float2 f)
inline __device__ float half_to_float(uint16_t h)
{
float f;
asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
// asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
f = __half2float(reinterpret_cast<const __half&>(h));
return f;
}
......@@ -301,7 +319,9 @@ inline __device__ float half_to_float(uint16_t h)
inline __device__ float2 half2_to_float2(uint32_t v)
{
uint16_t lo, hi;
asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
// asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
lo = v & 0xffff;
hi = (v >> 16) & 0xffff;
return make_float2(half_to_float(lo), half_to_float(hi));
}
......@@ -376,7 +396,11 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb)
inline __device__ uint32_t h0_h0(uint16_t a)
{
uint32_t b;
asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
// asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
uint16_t tmp[2];
tmp[0] = a;
tmp[1] = a;
__builtin_memcpy(&b, tmp, sizeof(uint16_t) * 2);
return b;
}
......@@ -501,7 +525,8 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb)
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c)
{
uint32_t d;
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
// asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
return d;
}
......@@ -840,7 +865,8 @@ template<>
inline __device__ uint16_t mul(uint16_t a, uint16_t b)
{
uint16_t c;
asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
// asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
return c;
}
......@@ -850,7 +876,8 @@ template<>
inline __device__ uint32_t mul(uint32_t a, uint32_t b)
{
uint32_t c;
asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
// asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
return c;
}
......
......@@ -20,7 +20,8 @@
#elif (CUDART_VERSION >= 11000)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
// #include "3rdparty/cub/cub.cuh"
#include <cub/cub.cuh>
#endif
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/utils/memory_utils.h"
......
......@@ -23,7 +23,8 @@
#elif (CUDART_VERSION >= 11000)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
// #include "3rdparty/cub/cub.cuh"
#include <cub/cub.cuh>
#endif
#include "src/turbomind/kernels/logprob_kernels.h"
......
......@@ -16,11 +16,11 @@
#pragma once
#include <array>
#include <assert.h>
#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
#include <cooperative_groups/reduce.h>
#else
// #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
// #include <cooperative_groups/reduce.h>
// #else
#include <cooperative_groups.h>
#endif
// #endif
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include <cuda_fp16.h>
......@@ -244,15 +244,15 @@ __inline__ __device__ void cgBlockReduceSumElements(float* element_list, float*
const int tid = cta.thread_rank();
const int blockz = blockDim.x;
for (int i = 0; i < NUM; i++) {
#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
cgBlockReduceSumElements_shm[i * blockz + tid] = cg::reduce(tile, element_list[i], cg::plus<float>());
#else
// #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
// cgBlockReduceSumElements_shm[i * blockz + tid] = cg::reduce(tile, element_list[i], cg::plus<float>());
// #else
// TODO Add implementation here
if (threadIdx.x == 0 && blockIdx.x == 0) {
printf("[ERROR] Not support cgBlockReduceSumElements when CUDA < 11 \n");
assert(false);
}
#endif
// #endif
}
cg::sync(cta);
if (tid == 0) {
......