Unverified Commit 3aed4bfe authored by Ilyas Moutawwakil, committed by GitHub

AMD ROCm support (#5)

* added rocm support

* added rocm build workflow

* rocm version

* updated readme

* rocm 5.6

* remove unnecessary packages

* fix for rocm6.0

* fix

* fix extra_compile_args
parent 4bb0c022
@@ -32,8 +32,8 @@ jobs:
const script = require('.github/workflows/scripts/github_create_release.js')
await script(github, context, core)
-build_wheels:
-name: Build AWQ
+build_cuda_wheels:
+name: Build AWQ with CUDA
runs-on: ${{ matrix.os }}
needs: release
@@ -119,3 +119,116 @@ jobs:
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/*.whl
build_rocm_wheels:
name: Build AWQ with ROCm
runs-on: ${{ matrix.os }}
needs: release
strategy:
matrix:
os: [ubuntu-20.04]
python: ["3.8", "3.9", "3.10", "3.11"]
rocm: ["5.6.1", "5.7.1"] # we build only for rocm5.6 & 5.7 to match PyTorch 2.1.0 and PyTorch 2.2 nightly
defaults:
run:
shell: bash
env:
ROCM_VERSION: ${{ matrix.rocm }}
steps:
- uses: actions/checkout@v3
- name: Free Disk Space
run: |
df -h
echo "Removing large packages"
sudo apt-get remove -y '^dotnet-.*'
sudo apt-get remove -y 'php.*'
sudo apt-get remove -y azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get autoclean -y >/dev/null 2>&1
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python }}
- name: Setup Mamba
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.python }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Set up ROCm
run: |
echo "Using python:"
python --version
which python
if [[ "${{ matrix.rocm }}" == "5.4.2" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.4.50402-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.6.50601-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.7.50701-1_all.deb
else
echo Unknown rocm version
exit 1
fi
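# note: the build matrix above only uses 5.6.1 and 5.7.1, so the 5.4.2 branch is currently unused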
curl -O https://repo.radeon.com/amdgpu-install/${{ matrix.rocm }}/ubuntu/focal/$ROCM_DL_FILE
sudo dpkg -i $ROCM_DL_FILE
sudo DEBIAN_FRONTEND=noninteractive amdgpu-install --usecase=rocm --no-dkms --no-32 -y
- name: Install Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
python -m pip install --upgrade build setuptools wheel
if [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
echo "Using PyTorch nightly"
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm5.7
elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
echo "Using PyTorch stable"
python -m pip install torch --index-url https://download.pytorch.org/whl/rocm5.6
else
echo Unknown rocm version for python install
exit 1
fi
- name: Build Wheel
run: |
echo "Using python for build:"
python --version
which python
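# ROCM_VERSION set here overrides torch.version.hip when setup.py derives the "+rocm" version suffix (see the setup.py changes below)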
ROCM_VERSION=${{ matrix.rocm }} python setup.py sdist bdist_wheel
- name: Upload Assets
uses: shogo82148/actions-upload-release-asset@v1
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/*.whl
@@ -159,3 +159,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*hip*
!hip_compact.hip
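# presumably hipify-generated files; keep only the hand-written hip_compact.hip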
@@ -5,21 +5,38 @@ AutoAWQ Kernels is a new package that is split up from the [main repository](htt
## Requirements
- Windows: Must use WSL2.
-- GPU: Must be compute capability 7.5 or higher.
-- CUDA Toolkit: Must be 11.8 or higher.
+- NVIDIA:
+  - GPU: Must be compute capability 7.5 or higher.
+  - CUDA Toolkit: Must be 11.8 or higher.
+- AMD:
+  - ROCm: Must be 5.6 or higher.
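A quick way to sanity-check these requirements from an existing PyTorch install (a minimal sketch; note that `torch.version.cuda` / `torch.version.hip` report the toolkit PyTorch was built against, not your driver):

```
import torch

print(torch.version.cuda)  # e.g. "12.1" on a CUDA build, None on a ROCm build
print(torch.version.hip)   # e.g. "5.6.31061" on a ROCm build, None otherwise

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"compute capability: {major}.{minor}")  # NVIDIA GPUs need >= 7.5
```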
## Install
### Install from PyPI
The package is available on PyPI with CUDA 12.1.1 wheels:
```
pip install autoawq-kernels
```
### Install release wheels
For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/):
```
pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl
```
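The local version tag in the wheel filename (`+rocm561` above) encodes the toolkit the wheel was built against. A hypothetical helper illustrating the scheme used by `setup.py` in this commit (`version_suffix` is illustrative only, not part of the package):

```
# Illustrative only: reproduces the "+cu121" / "+rocm561" tags seen in the wheel names.
def version_suffix(base, cuda=None, rocm=None):
    if cuda:  # "12.1.1" -> "cu121"
        return f"{base}+cu{''.join(cuda.split('.'))[:3]}"
    if rocm:  # "5.6.1" -> "rocm561"
        return f"{base}+rocm{''.join(rocm.split('.'))[:3]}"
    return base

assert version_suffix("0.0.2", cuda="12.1.1") == "0.0.2+cu121"
assert version_suffix("0.0.2", rocm="5.6.1") == "0.0.2+rocm561"
```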
### Build from source
You can also build from source:
```
git clone https://github.com/casper-hansen/AutoAWQ_kernels
cd AutoAWQ_kernels
pip install -e .
```
To build for ROCm, you first need to install the following packages: `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`.
\ No newline at end of file
@@ -9,9 +9,9 @@ __device__ __forceinline__ __half __compat_hrcp(__half x) {
static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
}
// ROCm 6.0 compatible from: /opt/rocm-6.0.0/include/hip/amd_detail/amd_hip_fp16.h:1708
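// (likely rationale: ROCm 6.0 drops the per-component x.x/x.y access used by the old form,
//  so the reciprocal is computed on the packed x.data instead)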
__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
-return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
-static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
+return _Float16_2{_Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data};
}
#define hrcp __compat_hrcp
@@ -3,21 +3,30 @@ import torch
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
-from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
os.environ["CC"] = "g++"
os.environ["CXX"] = "g++"
AUTOAWQ_KERNELS_VERSION = "0.0.2"
PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
+CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
+ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip
if not PYPI_BUILD:
-try:
-CUDA_VERSION = "".join(
-os.environ.get("CUDA_VERSION", torch.version.cuda).split(".")
-)[:3]
+# only adding CUDA/ROCM version if we are not building for PyPI to comply with PEP 440
+if CUDA_VERSION:
+CUDA_VERSION = "".join(CUDA_VERSION.split("."))[:3]
AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
-except Exception as ex:
-raise RuntimeError("Your system must have an Nvidia GPU for installing AutoAWQ")
+elif ROCM_VERSION:
+ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:3]
+AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
+else:
+raise RuntimeError(
+"Your system must have either Nvidia or AMD GPU to build this package."
+)
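# e.g. "0.0.2+cu121" (CUDA 12.1) or "0.0.2+rocm561" (ROCm 5.6.1)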
print(f"Building AutoAWQ Kernels version {AUTOAWQ_KERNELS_VERSION}")
common_setup_kwargs = {
"version": AUTOAWQ_KERNELS_VERSION,
@@ -54,11 +63,13 @@ requirements = [
def get_include_dirs():
include_dirs = []
if CUDA_VERSION:
conda_cuda_include_dir = os.path.join(
get_python_lib(), "nvidia/cuda_runtime/include"
)
if os.path.isdir(conda_cuda_include_dir):
include_dirs.append(conda_cuda_include_dir)
this_dir = os.path.dirname(os.path.abspath(__file__))
include_dirs.append(this_dir)
@@ -67,6 +78,8 @@ def get_include_dirs():
def get_generator_flag():
generator_flag = []
# if CUDA_VERSION:
torch_dir = torch.__path__[0]
if os.path.exists(
os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")
@@ -76,52 +89,37 @@ def get_generator_flag():
return generator_flag
-def check_dependencies():
-if CUDA_HOME is None:
-raise RuntimeError(
-f"Cannot find CUDA_HOME. CUDA must be available to build the package."
-)
def get_compute_capabilities():
-# Collect the compute capabilities of all available GPUs.
+capability_flags = []
+if CUDA_VERSION:
+# Collect the compute capabilities of all available CUDA GPUs
for i in range(torch.cuda.device_count()):
major, minor = torch.cuda.get_device_capability(i)
cc = major * 10 + minor
if cc < 75:
raise RuntimeError(
"GPUs with compute capability less than 7.5 are not supported."
)
-# figure out compute capability
+# Figure out compute capability
compute_capabilities = {75, 80, 86, 89, 90}
-capability_flags = []
for cap in compute_capabilities:
capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]
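# yields e.g. ["-gencode", "arch=compute_75,code=sm_75", ..., "-gencode", "arch=compute_90,code=sm_90"]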
return capability_flags
-check_dependencies()
-extra_link_args = []
-include_dirs = get_include_dirs()
-generator_flags = get_generator_flag()
-arch_flags = get_compute_capabilities()
+def get_extra_compile_args(arch_flags, generator_flags):
+extra_compile_args = {}
if os.name == "nt":
if os.name == "nt" and CUDA_VERSION:
include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"
# Relaxed args on Windows
if include_arch:
extra_compile_args = {"nvcc": arch_flags}
else:
extra_compile_args = {}
-cuda_path = os.environ.get("CUDA_PATH", None)
-extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
-else:
+elif CUDA_VERSION:
extra_compile_args = {
"cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
"nvcc": [
@@ -142,7 +140,30 @@ else:
+ generator_flags,
}
-extensions = [
+return extra_compile_args
def get_extra_link_args():
extra_link_args = []
if os.name == "nt" and CUDA_VERSION:
cuda_path = os.environ.get("CUDA_PATH", None)
extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
return extra_link_args
include_dirs = get_include_dirs()
extra_link_args = get_extra_link_args()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()
extra_compile_args = get_extra_compile_args(arch_flags, generator_flags)
extensions = []
if CUDA_VERSION:
# contain un-hipifiable inline PTX
extensions.append(
CUDAExtension(
"awq_ext",
[
@@ -154,7 +175,7 @@ extensions = [
],
extra_compile_args=extra_compile_args,
)
-]
+)
extensions.append(
CUDAExtension(
@@ -183,8 +204,8 @@ extensions.append(
)
)
if os.name != "nt":
if os.name != "nt" and CUDA_VERSION:
# FasterTransformer kernels
extensions.append(
CUDAExtension(
"awq_ft_ext",