Commit 54638dc2 authored by gaoqiong

update v0.0.6

parent 83d1f4b3
# <div align="center"><strong>AutoAWQ_kernel</strong></div>
## Introduction
AutoAWQ_kernel is a component split out of AutoAWQ to reduce compilation time.
## Installation
### Installing by building from source
#### Preparing the build environment
Pull the base image from the SourceFind (光源) registry and start a Docker container:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
# <Image ID>: replace with the ID of the image pulled above
# <Host Path>: path on the host to mount
# <Container Path>: mount path inside the container
docker run -it --name baichuan --shm-size=1024G -v /opt/hyhal:/opt/hyhal:ro --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v <Host Path>:<Container Path> <Image ID> /bin/bash
```
Note:
1. The `-v /opt/hyhal:/opt/hyhal` volume mount is required when starting the Docker container.
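Once inside the container, a quick sanity check can confirm the DCUs are visible; this assumes the DTK build of PyTorch shipped in the image above, which exposes DCUs through the standard `torch.cuda` API:
```
# Sanity check inside the container: the DTK (ROCm-based) build of
# PyTorch reports DCUs through the regular torch.cuda API.
import torch

print(torch.__version__)           # 2.1.0 for the image above
print(torch.cuda.is_available())   # True once /dev/kfd and /dev/dri are mapped
print(torch.cuda.device_count())   # number of visible DCUs
```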
#### Building and installing from source
- Download the code
  Check out the branch that matches your requirements.
- Two source-build options are provided (run from inside the AutoAWQ directory), as shown below:
```
# Option 1: build and install directly from source
pip3 install -e .

# Option 2: build a wheel package and install it
# (install the wheel package first)
python3 setup.py bdist_wheel
cd dist && pip3 install autoawq*
```
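Whichever route you choose, a minimal import check confirms the extension was built. The module name `awq_ext` below is an assumption carried over from upstream AutoAWQ_kernels; adjust it if your branch exports a different name:
```
# Minimal post-install check; `awq_ext` is assumed from upstream
# AutoAWQ_kernels -- adjust if your branch differs.
import awq_ext

print("awq_ext loaded from:", awq_ext.__file__)
```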
AutoAWQ Kernels is a new package that is split out from the [main repository](https://github.com/casper-hansen/AutoAWQ) to avoid long compilation times.
## Requirements
- Windows: Must use WSL2.
- NVIDIA:
- GPU: Must be compute capability 7.5 or higher.
- CUDA Toolkit: Must be 11.8 or higher.
- AMD:
- ROCm: Must be 5.6 or higher.
## Install
### Install from PyPI
The package is available on PyPI with CUDA 12.1.1 wheels:
```
pip install autoawq-kernels
```
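To confirm what was installed afterwards, a standard metadata lookup works:
```
# Confirm the installed kernel package and its version.
from importlib.metadata import version

print(version("autoawq-kernels"))
```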
### Install release wheels
For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/):
```
pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl
```
### Build from source
You can also build from source:
```
git clone https://github.com/casper-hansen/AutoAWQ_kernels
cd AutoAWQ_kernels
pip install -e .
```
To build for ROCm, you need to first install the following packages `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`.
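The build script selects the CUDA or ROCm path from the PyTorch build it detects (see the `setup.py` diff further down this page), so it is worth confirming which backend your PyTorch reports before compiling; a quick check:
```
# setup.py keys its version logic off torch.version.cuda / torch.version.hip.
import torch

print("CUDA:", torch.version.cuda)  # a version string, or None on ROCm builds
print("HIP: ", torch.version.hip)   # e.g. a 5.6.x string on ROCm 5.6, None on CUDA builds
```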
@@ -46,10 +46,10 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
 #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
 #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
+//__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
+//__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
 #endif
 #endif
@@ -247,14 +247,17 @@ void q4_matmul_recons_cuda
     w->reconstruct(buffers->temp_dq);

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
-    const float alpha = 1.0f;
-    const float beta = no_zero ? 1.0f : 0.0f;
-    cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
-                  x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
-#else
     const half alpha = __float2half(1.0f);
     const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
     cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
-#endif
+    // #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
+    // const float alpha = 1.0f;
+    // const float beta = no_zero ? 1.0f : 0.0f;
+    // cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
+    //               x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
+    // #else
+    // const half alpha = __float2half(1.0f);
+    // const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
+    // cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
+    // #endif
 }
@@ -44,10 +44,10 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
 #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
 #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
+//__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
+//__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
 #endif
 #endif
@@ -20,6 +20,37 @@
 #include "compat_gemm.cuh"

+// #if defined(USE_ROCM)
+// #include <hipblas/hipblas.h>
+// __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle,
+//                                                                hipblasOperation_t transA,
+//                                                                hipblasOperation_t transB,
+//                                                                int m,
+//                                                                int n,
+//                                                                int k,
+//                                                                const half* alpha,
+//                                                                const half* AP,
+//                                                                int lda,
+//                                                                const half* BP,
+//                                                                int ldb,
+//                                                                const half* beta,
+//                                                                half* CP,
+//                                                                int ldc) {
+//     return hipblasHgemm(handle, transA, transB, m, n, k,
+//                         reinterpret_cast<const hipblasHalf *>(alpha),
+//                         reinterpret_cast<const hipblasHalf *>(AP), lda,
+//                         reinterpret_cast<const hipblasHalf *>(BP), ldb,
+//                         reinterpret_cast<const hipblasHalf *>(beta),
+//                         reinterpret_cast<hipblasHalf *>(CP), ldc);
+// }
+// #define hipblasHgemm __compat_hipblasHgemm
+
+// // Previous version of PyTorch were converting to rocBLAS instead of hipBLAS.
+// #define rocblas_operation_none HIPBLAS_OP_N
+// #define rocblas_hgemm __compat_hipblasHgemm
+// #endif
+
 void gemm_half_q_half_cuda_part
 (
     const half* a,
 import os
 import torch
+import subprocess
 from pathlib import Path
 from setuptools import setup, find_packages
 from distutils.sysconfig import get_python_lib
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+from typing import Optional, Union

 os.environ["CC"] = "g++"
 os.environ["CXX"] = "g++"
@@ -12,6 +14,65 @@ PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
 CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
 ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip

+def get_sha(pytorch_root: Union[str, Path]) -> str:
+    try:
+        return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=pytorch_root).decode('ascii').strip()
+    except Exception:
+        return 'Unknown'
+
+def get_abi():
+    try:
+        command = "echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI"
+        result = subprocess.run(command, shell=True, capture_output=True, text=True)
+        output = result.stdout.strip()
+        abi = "abi" + output.split(" ")[-1]
+        return abi
+    except Exception:
+        return 'abiUnknown'
+
+def get_version_add(sha: Optional[str] = None) -> str:
+    version = ''
+    autoawq_root = os.path.dirname(os.path.abspath(__file__))
+    add_version_path = os.path.join(os.path.join(autoawq_root, ""), "version.py")
+    if sha != 'Unknown':
+        if sha is None:
+            sha = get_sha(autoawq_root)
+        version = 'git' + sha[:7]
+    # abi
+    version += "." + get_abi()
+    # dtk version
+    if os.getenv("ROCM_PATH"):
+        rocm_path = os.getenv('ROCM_PATH', "")
+        rocm_version_path = os.path.join(rocm_path, '.info', "rocm_version")
+        with open(rocm_version_path, 'r', encoding='utf-8') as file:
+            lines = file.readlines()
+            rocm_version = lines[0][:-2].replace(".", "")
+            version += ".dtk" + rocm_version
+    # torch version
+    version += ".torch" + torch.__version__[:5]
+
+    lines = []
+    with open(add_version_path, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+        lines[1] = "__dcu_version__ = '0.0.6+das1.1.{}'\n".format(version)
+    with open(add_version_path, encoding="utf-8", mode="w") as file:
+        file.writelines(lines)
+        file.close()
+
+def get_version():
+    get_version_add()
+    version_file = 'version.py'
+    with open(version_file, encoding='utf-8') as f:
+        exec(compile(f.read(), version_file, 'exec'))
+    return locals()['__dcu_version__']

 if not PYPI_BUILD:
     # only adding CUDA/ROCM version if we are not building for PyPI to comply with PEP 440
@@ -20,7 +81,8 @@ if not PYPI_BUILD:
         AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
     elif ROCM_VERSION:
         ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:3]
-        AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
+        #AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
+        AUTOAWQ_KERNELS_VERSION = get_version()
     else:
         raise RuntimeError(
             "Your system must have either Nvidia or AMD GPU to build this package."
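For reference, `get_version_add` composes the local version identifier from the git SHA, the C++ ABI flag, the DTK (ROCm) release, and the torch version. A small sketch with illustrative values (the sha, ABI, DTK, and torch numbers below are made up, not from a real build):
```
# Illustration only: how get_version_add() composes __dcu_version__.
sha7 = "83d1f4b"     # first 7 chars of `git rev-parse HEAD` (illustrative)
abi = "abi1"         # from the _GLIBCXX_USE_CXX11_ABI macro
dtk = "2404"         # $ROCM_PATH/.info/rocm_version with dots stripped
torch_ver = "2.1.0"  # torch.__version__[:5]

print(f"0.0.6+das1.1.git{sha7}.{abi}.dtk{dtk}.torch{torch_ver}")
# -> 0.0.6+das1.1.git83d1f4b.abi1.dtk2404.torch2.1.0
```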