Commit 54638dc2 authored by gaoqiong

update v0.0.6

parent 83d1f4b3
# <div align="center"><strong>AutoAWQ_kernel</strong></div>
## Introduction
AutoAWQ_kernel is a component split out of AutoAWQ to reduce compilation time.
## Installation
### Installing by building from source
#### Preparing the build environment
Pull the image from the SourceFind (光源) registry and start a Docker container:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
# Replace <Image ID> with the ID of the image pulled above
# <Host Path>: host-side path to mount
# <Container Path>: mount point inside the container
docker run -it --name baichuan --shm-size=1024G -v /opt/hyhal:/opt/hyhal:ro --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v <Host Path>:<Container Path> <Image ID> /bin/bash
```
Note:
1. The `-v /opt/hyhal:/opt/hyhal` mount in the `docker run` command above must not be omitted.
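Once inside the container, a quick sanity check (our suggestion, not part of the original steps) confirms that the DTK-enabled PyTorch build can see the DCU devices:
```
python3 -c "import torch; print(torch.__version__, torch.version.hip, torch.cuda.is_available())"
```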
#### Building and installing from source
- Download the code
  Check out the branch that matches your requirements.
- Two source-build options are provided (run from the AutoAWQ_kernels source directory):
```
# Option 1: install directly from source
pip3 install -e .
# Option 2: build a wheel package and install it
# (requires the wheel package: pip3 install wheel)
python3 setup.py bdist_wheel
cd dist && pip3 install autoawq*
```
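With either option, you can check that the compiled extension imports cleanly; `awq_ext` is the extension-module name defined in AutoAWQ_kernels' upstream setup.py (adjust if this fork renames it):
```
python3 -c "import awq_ext; print('autoawq kernels loaded')"
```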
AutoAWQ Kernels is a new package, split out from the [main repository](https://github.com/casper-hansen/AutoAWQ) to reduce compilation times.
## Requirements
- Windows: Must use WSL2.
- NVIDIA:
- GPU: Must be compute capability 7.5 or higher.
- CUDA Toolkit: Must be 11.8 or higher.
- AMD:
- ROCm: Must be 5.6 or higher.
## Install
### Install from PyPI
The package is available on PyPI with CUDA 12.1.1 wheels:
```
pip install autoawq-kernels
```
### Install release wheels
For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/):
```
pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl
```
### Build from source
You can also build from source:
```
git clone https://github.com/casper-hansen/AutoAWQ_kernels
cd AutoAWQ_kernels
pip install -e .
```
To build for ROCm, you first need to install the following packages: `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`.
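For example, on an Ubuntu-based ROCm setup this could be done as follows (package names as listed above; the package manager and repository setup depend on your distribution):
```
sudo apt install rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev
```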
@@ -46,10 +46,10 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
 #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
 #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
+//__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
+//__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
 #endif
 #endif
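For context, the `atomicAdd_half` fallback these wrappers call is conventionally implemented with a 32-bit compare-and-swap, since pre-Volta CUDA (and some HIP targets) lack a native half-precision atomicAdd. A sketch of the usual exllama-style implementation (illustrative; the actual definition lives earlier in this file and is not shown in the diff):
```
#include <cuda_fp16.h>

// CAS-based atomic add for half on architectures without native support.
__device__ __forceinline__ void atomicAdd_half(half* address, half val)
{
    // Align down to the 32-bit word containing this half.
    unsigned int* address_as_ui =
        (unsigned int*)((char*)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    do
    {
        assumed = old;
        __half_raw hsum;
        // Extract the 16-bit lane being updated (high or low half-word).
        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
        half tmpres = __hadd(hsum, val);
        hsum = __half_raw(tmpres);
        // Splice the updated half back into the 32-bit word.
        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16)
                                  : (old & 0xffff0000) | hsum.x;
        old = atomicCAS(address_as_ui, assumed, old);
    }
    while (assumed != old); // retry if another thread raced us
}
```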
@@ -247,14 +247,17 @@ void q4_matmul_recons_cuda
     w->reconstruct(buffers->temp_dq);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
-    const float alpha = 1.0f;
-    const float beta = no_zero ? 1.0f : 0.0f;
-    cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
-        x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
-#else
-    const half alpha = __float2half(1.0f);
-    const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
-    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
-#endif
+    // #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
+    // const float alpha = 1.0f;
+    // const float beta = no_zero ? 1.0f : 0.0f;
+    // cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
+    //     x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
+    // #else
+    // const half alpha = __float2half(1.0f);
+    // const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
+    // cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
+    // #endif
 }
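With both branches commented out, the reconstruction path no longer issues its GEMM through cuBLAS; on ROCm the equivalent call would normally go through hipBLAS instead. A hypothetical sketch of that call, reusing the in-scope variables of `q4_matmul_recons_cuda` above and the `hipblasHalf` casts from the compat shim shown further down (assumes a valid `hipblasHandle_t handle`; not code from this commit):
```
#include <hipblas/hipblas.h>

// Hypothetical hipBLAS equivalent of the commented-out cublasHgemm call.
const half alpha = __float2half(1.0f);
const half beta  = no_zero ? __float2half(1.0f) : __float2half(0.0f);
hipblasHgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, width, height, dim,
             reinterpret_cast<const hipblasHalf*>(&alpha),
             reinterpret_cast<const hipblasHalf*>(buffers->temp_dq), width,
             reinterpret_cast<const hipblasHalf*>(x_mapped), dim,
             reinterpret_cast<const hipblasHalf*>(&beta),
             reinterpret_cast<hipblasHalf*>(out), width);
```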
@@ -44,10 +44,10 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
 #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
 #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
+//__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
+//__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
 #endif
 #endif
@@ -20,6 +20,37 @@
 #include "compat_gemm.cuh"
+// #if defined(USE_ROCM)
+// #include <hipblas/hipblas.h>
+// __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle,
+//     hipblasOperation_t transA,
+//     hipblasOperation_t transB,
+//     int m,
+//     int n,
+//     int k,
+//     const half* alpha,
+//     const half* AP,
+//     int lda,
+//     const half* BP,
+//     int ldb,
+//     const half* beta,
+//     half* CP,
+//     int ldc) {
+//     return hipblasHgemm(handle, transA, transB, m, n, k,
+//         reinterpret_cast<const hipblasHalf *>(alpha),
+//         reinterpret_cast<const hipblasHalf *>(AP), lda,
+//         reinterpret_cast<const hipblasHalf *>(BP), ldb,
+//         reinterpret_cast<const hipblasHalf *>(beta),
+//         reinterpret_cast<hipblasHalf *>(CP), ldc);
+// }
+// #define hipblasHgemm __compat_hipblasHgemm
+// // Previous versions of PyTorch converted to rocBLAS instead of hipBLAS.
+// #define rocblas_operation_none HIPBLAS_OP_N
+// #define rocblas_hgemm __compat_hipblasHgemm
+// #endif
 void gemm_half_q_half_cuda_part
 (
     const half* a,
import os
import torch
import subprocess
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from typing import Optional, Union
os.environ["CC"] = "g++"
os.environ["CXX"] = "g++"
@@ -12,6 +14,65 @@ PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
 CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
 ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip
+
+def get_sha(pytorch_root: Union[str, Path]) -> str:
+    try:
+        return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=pytorch_root).decode('ascii').strip()
+    except Exception:
+        return 'Unknown'
+
+def get_abi():
+    try:
+        command = "echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI"
+        result = subprocess.run(command, shell=True, capture_output=True, text=True)
+        output = result.stdout.strip()
+        abi = "abi" + output.split(" ")[-1]
+        return abi
+    except Exception:
+        return 'abiUnknown'
+
+def get_version_add(sha: Optional[str] = None) -> str:
+    version = ''
+    autoawq_root = os.path.dirname(os.path.abspath(__file__))
+    add_version_path = os.path.join(os.path.join(autoawq_root, ""), "version.py")
+    if sha != 'Unknown':
+        if sha is None:
+            sha = get_sha(autoawq_root)
+        version = 'git' + sha[:7]
+    # abi
+    version += "." + get_abi()
+    # dtk version
+    if os.getenv("ROCM_PATH"):
+        rocm_path = os.getenv('ROCM_PATH', "")
+        rocm_version_path = os.path.join(rocm_path, '.info', "rocm_version")
+        with open(rocm_version_path, 'r', encoding='utf-8') as file:
+            lines = file.readlines()
+            rocm_version = lines[0][:-2].replace(".", "")
+        version += ".dtk" + rocm_version
+    # torch version
+    version += ".torch" + torch.__version__[:5]
+    lines = []
+    with open(add_version_path, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+    lines[1] = "__dcu_version__ = '0.0.6+das1.1.{}'\n".format(version)
+    with open(add_version_path, encoding="utf-8", mode="w") as file:
+        file.writelines(lines)
+    file.close()
+
+def get_version():
+    get_version_add()
+    version_file = 'version.py'
+    with open(version_file, encoding='utf-8') as f:
+        exec(compile(f.read(), version_file, 'exec'))
+    return locals()['__dcu_version__']
+
 if not PYPI_BUILD:
     # only adding CUDA/ROCM version if we are not building for PyPI to comply with PEP 440
@@ -20,7 +81,8 @@ if not PYPI_BUILD:
         AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
     elif ROCM_VERSION:
         ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:3]
-        AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
+        #AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
+        AUTOAWQ_KERNELS_VERSION = get_version()
     else:
         raise RuntimeError(
             "Your system must have either Nvidia or AMD GPU to build this package."
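To illustrate the effect of the versioning change above: with hypothetical values (commit `1a2b3c4...`, `_GLIBCXX_USE_CXX11_ABI` detected as 1, a DTK whose `rocm_version` file yields `2404`, and torch 2.1.0), `get_version()` rewrites `version.py` and the package version becomes something like:
```
# All concrete values below are hypothetical examples.
AUTOAWQ_KERNELS_VERSION = get_version()
print(AUTOAWQ_KERNELS_VERSION)
# -> 0.0.6+das1.1.git1a2b3c4.abi1.dtk2404.torch2.1.0
```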