Unverified Commit 3aed4bfe authored by Ilyas Moutawwakil, committed by GitHub

AMD ROCm support (#5)

* added rocm support

* added rocm build workflow

* rocm version

* updated readme

* rocm 5.6

* remove unnecessary packages

* fix for rocm6.0

* fix

* fix extra_compile_args
parent 4bb0c022
@@ -32,8 +32,8 @@ jobs:
const script = require('.github/workflows/scripts/github_create_release.js')
await script(github, context, core)
-build_wheels:
-name: Build AWQ
+build_cuda_wheels:
+name: Build AWQ with CUDA
runs-on: ${{ matrix.os }}
needs: release
@@ -119,3 +119,116 @@ jobs:
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/*.whl
build_rocm_wheels:
name: Build AWQ with ROCm
runs-on: ${{ matrix.os }}
needs: release
strategy:
matrix:
os: [ubuntu-20.04]
python: ["3.8", "3.9", "3.10", "3.11"]
rocm: ["5.6.1", "5.7.1"] # we build only for rocm5.6 & 5.7 to match PyTorch 2.1.0 and PyTorch 2.2 nightly
defaults:
run:
shell: bash
env:
ROCM_VERSION: ${{ matrix.rocm }}
steps:
- uses: actions/checkout@v3
- name: Free Disk Space
run: |
df -h
echo "Removing large packages"
sudo apt-get remove -y '^dotnet-.*'
sudo apt-get remove -y 'php.*'
sudo apt-get remove -y azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get autoclean -y >/dev/null 2>&1
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python }}
- name: Setup Mamba
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.python }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Set up ROCm
run: |
echo "Using python:"
python --version
which python
if [[ "${{ matrix.rocm }}" == "5.4.2" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.4.50402-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.6.50601-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.7.50701-1_all.deb
else
echo Unknown rocm version
exit 1
fi
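# note: the build matrix above only uses 5.6.1 and 5.7.1, so the 5.4.2 branch is currently unused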
curl -O https://repo.radeon.com/amdgpu-install/${{ matrix.rocm }}/ubuntu/focal/$ROCM_DL_FILE
sudo dpkg -i $ROCM_DL_FILE
sudo DEBIAN_FRONTEND=noninteractive amdgpu-install --usecase=rocm --no-dkms --no-32 -y
- name: Install Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
python -m pip install --upgrade build setuptools wheel
if [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
echo "Using PyTorch nightly"
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm5.7
elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
echo "Using PyTorch stable"
python -m pip install torch --index-url https://download.pytorch.org/whl/rocm5.6
else
echo Unknown rocm version for python install
exit 1
fi
- name: Build Wheel
run: |
echo "Using python for build:"
python --version
which python
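# ROCM_VERSION set here overrides torch.version.hip when setup.py derives the "+rocm" version suffix (see the setup.py changes below)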
ROCM_VERSION=${{ matrix.rocm }} python setup.py sdist bdist_wheel
- name: Upload Assets
uses: shogo82148/actions-upload-release-asset@v1
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/*.whl
@@ -159,3 +159,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*hip*
!hip_compact.hip
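# presumably hipify-generated files; keep only the hand-written hip_compact.hip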
@@ -5,21 +5,38 @@ AutoAWQ Kernels is a new package that is split up from the [main repository](htt
## Requirements
- Windows: Must use WSL2.
-- GPU: Must be compute capability 7.5 or higher.
-- CUDA Toolkit: Must be 11.8 or higher.
+- NVIDIA:
+  - GPU: Must be compute capability 7.5 or higher.
+  - CUDA Toolkit: Must be 11.8 or higher.
+- AMD:
+  - ROCm: Must be 5.6 or higher.
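A quick way to sanity-check these requirements from an existing PyTorch install (a minimal sketch; note that `torch.version.cuda` / `torch.version.hip` report the toolkit PyTorch was built against, not your driver):

```
import torch

print(torch.version.cuda)  # e.g. "12.1" on a CUDA build, None on a ROCm build
print(torch.version.hip)   # e.g. "5.6.31061" on a ROCm build, None otherwise

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"compute capability: {major}.{minor}")  # NVIDIA GPUs need >= 7.5
```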
## Install
### Install from PyPI
The package is available on PyPI with CUDA 12.1.1 wheels:
```
pip install autoawq-kernels
```
### Install release wheels
For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/):
```
pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl
```
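The local version tag in the wheel filename (`+rocm561` above) encodes the toolkit the wheel was built against. A hypothetical helper illustrating the scheme used by `setup.py` in this commit (`version_suffix` is illustrative only, not part of the package):

```
# Illustrative only: reproduces the "+cu121" / "+rocm561" tags seen in the wheel names.
def version_suffix(base, cuda=None, rocm=None):
    if cuda:  # "12.1.1" -> "cu121"
        return f"{base}+cu{''.join(cuda.split('.'))[:3]}"
    if rocm:  # "5.6.1" -> "rocm561"
        return f"{base}+rocm{''.join(rocm.split('.'))[:3]}"
    return base

assert version_suffix("0.0.2", cuda="12.1.1") == "0.0.2+cu121"
assert version_suffix("0.0.2", rocm="5.6.1") == "0.0.2+rocm561"
```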
### Build from source
You can also build from source:
```
git clone https://github.com/casper-hansen/AutoAWQ_kernels
cd AutoAWQ_kernels
pip install -e .
```
To build for ROCm, you first need to install the following packages: `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`.
\ No newline at end of file
@@ -9,9 +9,9 @@ __device__ __forceinline__ __half __compat_hrcp(__half x) {
static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
}
// ROCm 6.0 compatible from: /opt/rocm-6.0.0/include/hip/amd_detail/amd_hip_fp16.h:1708
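// (likely rationale: ROCm 6.0 drops the per-component x.x/x.y access used by the old form,
//  so the reciprocal is computed on the packed x.data instead)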
__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
-return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
-static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
+return _Float16_2{_Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data};
}
#define hrcp __compat_hrcp
@@ -3,21 +3,30 @@ import torch
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
-from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
os.environ["CC"] = "g++"
os.environ["CXX"] = "g++"
AUTOAWQ_KERNELS_VERSION = "0.0.2"
PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
+CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
+ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip
if not PYPI_BUILD:
-try:
-CUDA_VERSION = "".join(
-os.environ.get("CUDA_VERSION", torch.version.cuda).split(".")
-)[:3]
+# only adding CUDA/ROCM version if we are not building for PyPI to comply with PEP 440
+if CUDA_VERSION:
+CUDA_VERSION = "".join(CUDA_VERSION.split("."))[:3]
AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
-except Exception as ex:
-raise RuntimeError("Your system must have an Nvidia GPU for installing AutoAWQ")
+elif ROCM_VERSION:
+ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:3]
+AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
+else:
+raise RuntimeError(
+"Your system must have either Nvidia or AMD GPU to build this package."
+)
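# e.g. "0.0.2+cu121" (CUDA 12.1) or "0.0.2+rocm561" (ROCm 5.6.1)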
print(f"Building AutoAWQ Kernels version {AUTOAWQ_KERNELS_VERSION}")
common_setup_kwargs = {
"version": AUTOAWQ_KERNELS_VERSION,
@@ -54,11 +63,13 @@ requirements = [
def get_include_dirs():
include_dirs = []
if CUDA_VERSION:
conda_cuda_include_dir = os.path.join(
get_python_lib(), "nvidia/cuda_runtime/include"
)
if os.path.isdir(conda_cuda_include_dir):
include_dirs.append(conda_cuda_include_dir)
this_dir = os.path.dirname(os.path.abspath(__file__))
include_dirs.append(this_dir)
@@ -67,6 +78,8 @@ def get_include_dirs():
def get_generator_flag():
generator_flag = []
# if CUDA_VERSION:
torch_dir = torch.__path__[0]
if os.path.exists(
os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")
@@ -76,52 +89,37 @@ def get_generator_flag():
return generator_flag
-def check_dependencies():
-if CUDA_HOME is None:
-raise RuntimeError(
-f"Cannot find CUDA_HOME. CUDA must be available to build the package."
-)
def get_compute_capabilities():
-# Collect the compute capabilities of all available GPUs.
+capability_flags = []
+if CUDA_VERSION:
+# Collect the compute capabilities of all available CUDA GPUs
for i in range(torch.cuda.device_count()):
major, minor = torch.cuda.get_device_capability(i)
cc = major * 10 + minor
if cc < 75:
raise RuntimeError(
"GPUs with compute capability less than 7.5 are not supported."
)
-# figure out compute capability
+# Figure out compute capability
compute_capabilities = {75, 80, 86, 89, 90}
-capability_flags = []
for cap in compute_capabilities:
capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]
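# yields e.g. ["-gencode", "arch=compute_75,code=sm_75", ..., "-gencode", "arch=compute_90,code=sm_90"]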
return capability_flags
-check_dependencies()
-extra_link_args = []
-include_dirs = get_include_dirs()
-generator_flags = get_generator_flag()
-arch_flags = get_compute_capabilities()
+def get_extra_compile_args(arch_flags, generator_flags):
+extra_compile_args = {}
if os.name == "nt":
if os.name == "nt" and CUDA_VERSION:
include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"
# Relaxed args on Windows
if include_arch:
extra_compile_args = {"nvcc": arch_flags}
else:
extra_compile_args = {}
-cuda_path = os.environ.get("CUDA_PATH", None)
-extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
-else:
+elif CUDA_VERSION:
extra_compile_args = {
"cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
"nvcc": [
@@ -142,7 +140,30 @@ else:
+ generator_flags,
}
-extensions = [
+return extra_compile_args
def get_extra_link_args():
extra_link_args = []
if os.name == "nt" and CUDA_VERSION:
cuda_path = os.environ.get("CUDA_PATH", None)
extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
return extra_link_args
include_dirs = get_include_dirs()
extra_link_args = get_extra_link_args()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()
extra_compile_args = get_extra_compile_args(arch_flags, generator_flags)
extensions = []
if CUDA_VERSION:
# contain un-hipifiable inline PTX
extensions.append(
CUDAExtension(
"awq_ext",
[
@@ -154,7 +175,7 @@ extensions = [
],
extra_compile_args=extra_compile_args,
)
-]
+)
extensions.append(
CUDAExtension(
@@ -183,8 +204,8 @@ extensions.append(
)
)
if os.name != "nt":
if os.name != "nt" and CUDA_VERSION:
# FasterTransformer kernels
extensions.append(
CUDAExtension(
"awq_ft_ext",