Commit 54638dc2 authored by gaoqiong

update v0.0.6

parent 83d1f4b3
# <div align="center"><strong>AutoAWQ_kernel</strong></div>
## Introduction
AutoAWQ_kernel is a component split out of AutoAWQ to reduce compilation time.
## Installation
### Installing by building from source
#### Preparing the build environment
Pull the base image from the SourceFind (光源) registry and start a Docker container:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
# <Image ID>: replace with the ID of the image pulled above
# <Host Path>: path on the host to mount
# <Container Path>: mount path inside the container
docker run -it --name baichuan --shm-size=1024G -v /opt/hyhal:/opt/hyhal:ro --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v <Host Path>:<Container Path> <Image ID> /bin/bash
```
Note:
1. The `-v /opt/hyhal:/opt/hyhal` volume mount is required when starting the Docker container.
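Once inside the container, a quick sanity check can confirm the DCUs are visible; this assumes the DTK build of PyTorch shipped in the image above, which exposes DCUs through the standard `torch.cuda` API:
```
# Sanity check inside the container: the DTK (ROCm-based) build of
# PyTorch reports DCUs through the regular torch.cuda API.
import torch

print(torch.__version__)           # 2.1.0 for the image above
print(torch.cuda.is_available())   # True once /dev/kfd and /dev/dri are mapped
print(torch.cuda.device_count())   # number of visible DCUs
```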
#### Building and installing from source
- Download the code
  Check out the branch that matches your requirements.
- Two source-build options are provided (run from inside the AutoAWQ directory), as shown below:
```
# Option 1: build and install directly from source
pip3 install -e .

# Option 2: build a wheel package and install it
# (install the wheel package first)
python3 setup.py bdist_wheel
cd dist && pip3 install autoawq*
```
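Whichever route you choose, a minimal import check confirms the extension was built. The module name `awq_ext` below is an assumption carried over from upstream AutoAWQ_kernels; adjust it if your branch exports a different name:
```
# Minimal post-install check; `awq_ext` is assumed from upstream
# AutoAWQ_kernels -- adjust if your branch differs.
import awq_ext

print("awq_ext loaded from:", awq_ext.__file__)
```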
AutoAWQ Kernels is a new package that is split out from the [main repository](https://github.com/casper-hansen/AutoAWQ) to avoid long compilation times.
## Requirements
- Windows: Must use WSL2.
- NVIDIA:
- GPU: Must be compute capability 7.5 or higher.
- CUDA Toolkit: Must be 11.8 or higher.
- AMD:
- ROCm: Must be 5.6 or higher.
## Install
### Install from PyPI
The package is available on PyPI with CUDA 12.1.1 wheels:
```
pip install autoawq-kernels
```
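To confirm what was installed afterwards, a standard metadata lookup works:
```
# Confirm the installed kernel package and its version.
from importlib.metadata import version

print(version("autoawq-kernels"))
```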
### Install release wheels
For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/):
```
pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl
```
### Build from source
You can also build from source:
```
git clone https://github.com/casper-hansen/AutoAWQ_kernels
cd AutoAWQ_kernels
pip install -e .
```
To build for ROCm, you need to first install the following packages `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`.
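The build script selects the CUDA or ROCm path from the PyTorch build it detects (see the `setup.py` diff further down this page), so it is worth confirming which backend your PyTorch reports before compiling; a quick check:
```
# setup.py keys its version logic off torch.version.cuda / torch.version.hip.
import torch

print("CUDA:", torch.version.cuda)  # a version string, or None on ROCm builds
print("HIP: ", torch.version.hip)   # e.g. a 5.6.x string on ROCm 5.6, None on CUDA builds
```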
@@ -46,10 +46,10 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
 #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
 #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
+//__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
+//__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
 #endif
 #endif
@@ -247,14 +247,17 @@ void q4_matmul_recons_cuda
     w->reconstruct(buffers->temp_dq);

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
-    const float alpha = 1.0f;
-    const float beta = no_zero ? 1.0f : 0.0f;
-    cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
-                  x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
-#else
     const half alpha = __float2half(1.0f);
     const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
     cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
-#endif
+    // #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
+    // const float alpha = 1.0f;
+    // const float beta = no_zero ? 1.0f : 0.0f;
+    // cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
+    //               x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
+    // #else
+    // const half alpha = __float2half(1.0f);
+    // const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
+    // cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
+    // #endif
 }
@@ -44,10 +44,10 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
 #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
 #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
+//__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
+//__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
 #endif
 #endif
@@ -20,6 +20,37 @@
 #include "compat_gemm.cuh"

+// #if defined(USE_ROCM)
+// #include <hipblas/hipblas.h>
+// __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle,
+//                                                                hipblasOperation_t transA,
+//                                                                hipblasOperation_t transB,
+//                                                                int m,
+//                                                                int n,
+//                                                                int k,
+//                                                                const half* alpha,
+//                                                                const half* AP,
+//                                                                int lda,
+//                                                                const half* BP,
+//                                                                int ldb,
+//                                                                const half* beta,
+//                                                                half* CP,
+//                                                                int ldc) {
+//     return hipblasHgemm(handle, transA, transB, m, n, k,
+//                         reinterpret_cast<const hipblasHalf *>(alpha),
+//                         reinterpret_cast<const hipblasHalf *>(AP), lda,
+//                         reinterpret_cast<const hipblasHalf *>(BP), ldb,
+//                         reinterpret_cast<const hipblasHalf *>(beta),
+//                         reinterpret_cast<hipblasHalf *>(CP), ldc);
+// }
+// #define hipblasHgemm __compat_hipblasHgemm
+
+// // Previous version of PyTorch were converting to rocBLAS instead of hipBLAS.
+// #define rocblas_operation_none HIPBLAS_OP_N
+// #define rocblas_hgemm __compat_hipblasHgemm
+// #endif
+
 void gemm_half_q_half_cuda_part
 (
     const half* a,
 import os
 import torch
+import subprocess
 from pathlib import Path
 from setuptools import setup, find_packages
 from distutils.sysconfig import get_python_lib
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+from typing import Optional, Union

 os.environ["CC"] = "g++"
 os.environ["CXX"] = "g++"
@@ -12,6 +14,65 @@ PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
 CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
 ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip

+def get_sha(pytorch_root: Union[str, Path]) -> str:
+    try:
+        return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=pytorch_root).decode('ascii').strip()
+    except Exception:
+        return 'Unknown'
+
+def get_abi():
+    try:
+        command = "echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI"
+        result = subprocess.run(command, shell=True, capture_output=True, text=True)
+        output = result.stdout.strip()
+        abi = "abi" + output.split(" ")[-1]
+        return abi
+    except Exception:
+        return 'abiUnknown'
+
+def get_version_add(sha: Optional[str] = None) -> str:
+    version = ''
+    autoawq_root = os.path.dirname(os.path.abspath(__file__))
+    add_version_path = os.path.join(os.path.join(autoawq_root, ""), "version.py")
+    if sha != 'Unknown':
+        if sha is None:
+            sha = get_sha(autoawq_root)
+        version = 'git' + sha[:7]
+    # abi
+    version += "." + get_abi()
+    # dtk version
+    if os.getenv("ROCM_PATH"):
+        rocm_path = os.getenv('ROCM_PATH', "")
+        rocm_version_path = os.path.join(rocm_path, '.info', "rocm_version")
+        with open(rocm_version_path, 'r', encoding='utf-8') as file:
+            lines = file.readlines()
+            rocm_version = lines[0][:-2].replace(".", "")
+            version += ".dtk" + rocm_version
+    # torch version
+    version += ".torch" + torch.__version__[:5]
+
+    lines = []
+    with open(add_version_path, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+        lines[1] = "__dcu_version__ = '0.0.6+das1.1.{}'\n".format(version)
+    with open(add_version_path, encoding="utf-8", mode="w") as file:
+        file.writelines(lines)
+        file.close()
+
+def get_version():
+    get_version_add()
+    version_file = 'version.py'
+    with open(version_file, encoding='utf-8') as f:
+        exec(compile(f.read(), version_file, 'exec'))
+    return locals()['__dcu_version__']

 if not PYPI_BUILD:
     # only adding CUDA/ROCM version if we are not building for PyPI to comply with PEP 440
@@ -20,7 +81,8 @@ if not PYPI_BUILD:
         AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
     elif ROCM_VERSION:
         ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:3]
-        AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
+        #AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
+        AUTOAWQ_KERNELS_VERSION = get_version()
     else:
         raise RuntimeError(
             "Your system must have either Nvidia or AMD GPU to build this package."
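For reference, `get_version_add` composes the local version identifier from the git SHA, the C++ ABI flag, the DTK (ROCm) release, and the torch version. A small sketch with illustrative values (the sha, ABI, DTK, and torch numbers below are made up, not from a real build):
```
# Illustration only: how get_version_add() composes __dcu_version__.
sha7 = "83d1f4b"     # first 7 chars of `git rev-parse HEAD` (illustrative)
abi = "abi1"         # from the _GLIBCXX_USE_CXX11_ABI macro
dtk = "2404"         # $ROCM_PATH/.info/rocm_version with dots stripped
torch_ver = "2.1.0"  # torch.__version__[:5]

print(f"0.0.6+das1.1.git{sha7}.{abi}.dtk{dtk}.torch{torch_ver}")
# -> 0.0.6+das1.1.git83d1f4b.abi1.dtk2404.torch2.1.0
```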