AMD ROCm support (#5)

* added rocm support * added rocm build workflow * rocm version * updated readme * rocm 5.6 * remove unnecessary packages * fix for rocm6.0 * fix * fix extra_compile_args

AMD ROCm support (#5)
* added rocm support * added rocm build workflow * rocm version * updated readme * rocm 5.6 * remove unnecessary packages * fix for rocm6.0 * fix * fix extra_compile_args
3aed4bfe · Ilyas Moutawwakil · GitHub · 4bb0c022 · 3aed4bfe · 3aed4bfe
Unverified Commit 3aed4bfe authored Jan 26, 2024 by Ilyas Moutawwakil Committed by GitHub Jan 26, 2024
5 changed files
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -32,11 +32,11 @@ jobs:
            const script = require('.github/workflows/scripts/github_create_release.js')
            await script(github, context, core)
-  build_wheels:
+  build_cuda_wheels:
-    name: Build AWQ
+    name: Build AWQ with CUDA
    runs-on: ${{ matrix.os }}
    needs: release
    strategy:
      matrix:
        os: [ubuntu-20.04, windows-latest]
@@ -48,7 +48,7 @@ jobs:
    env:
      PYPI_CUDA_VERSION: "12.1.1"
      CUDA_VERSION: ${{ matrix.cuda }}
    steps:
      - name: Free Disk Space
        uses: jlumbroso/free-disk-space@v1.3.0
@@ -61,7 +61,7 @@ jobs:
          large-packages: false
          docker-images: true
          swap-storage: false
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
@@ -78,7 +78,7 @@ jobs:
          use-mamba: true
          add-pip-as-python-dependency: true
          auto-activate-base: false
      - name: Install Dependencies
        run: |
          # Install CUDA toolkit
@@ -87,7 +87,7 @@ jobs:
          # Env variables
          $env:CUDA_PATH = $env:CONDA_PREFIX
          $env:CUDA_HOME = $env:CONDA_PREFIX
          # Install torch
          $cudaVersion = $env:CUDA_VERSION.Replace('.', '')
          $cudaVersionPytorch = $cudaVersion.Substring(0, $cudaVersion.Length - 1)
@@ -113,9 +113,122 @@ jobs:
          }
          python setup.py sdist bdist_wheel
+      - name: Upload Assets
+        uses: shogo82148/actions-upload-release-asset@v1
+        with:
+          upload_url: ${{ needs.release.outputs.upload_url }}
+          asset_path: ./dist/*.whl
+  build_rocm_wheels:
+    name: Build AWQ with ROCm
+    runs-on: ${{ matrix.os }}
+    needs: release
+    strategy:
+      matrix:
+        os: [ubuntu-20.04]
+        python: ["3.8", "3.9", "3.10", "3.11"]
+        rocm: ["5.6.1", "5.7.1"] # we build only for rocm5.6 & 5.7 to match PyTorch 2.1.0 and PyTorch 2.2 nightly
+    defaults:
+      run:
+        shell: bash
+    env:
+      ROCM_VERSION: ${{ matrix.rocm }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: Free Disk Space
+        run: |
+          df -h
+          echo "Removing large packages"
+          sudo apt-get remove -y '^dotnet-.*'
+          sudo apt-get remove -y 'php.*'
+          sudo apt-get remove -y azure-cli google-chrome-stable firefox powershell mono-devel
+          df -h
+          sudo apt-get autoremove -y >/dev/null 2>&1
+          sudo apt-get clean
+          sudo apt-get autoremove -y >/dev/null 2>&1
+          sudo apt-get autoclean -y >/dev/null 2>&1
+          df -h
+          echo "https://github.com/actions/virtual-environments/issues/709"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          df -h
+          echo "remove big /usr/local"
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
+          df -h
+          sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
+          sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
+          sudo rm -rf /usr/share/swift > /dev/null 2>&1
+          df -h
+      - uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Setup Mamba
+        uses: conda-incubator/setup-miniconda@v2.2.0
+        with:
+          activate-environment: "build"
+          python-version: ${{ matrix.python }}
+          mamba-version: "*"
+          use-mamba: false
+          channels: conda-forge,defaults
+          channel-priority: true
+          add-pip-as-python-dependency: true
+          auto-activate-base: false
+      - name: Set up ROCm
+        run: |
+          echo "Using python:"
+          python --version
+          which python
+          if [[ "${{ matrix.rocm }}" == "5.4.2" ]]; then
+            export ROCM_DL_FILE=amdgpu-install_5.4.50402-1_all.deb
+          elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
+            export ROCM_DL_FILE=amdgpu-install_5.6.50601-1_all.deb
+          elif [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
+            export ROCM_DL_FILE=amdgpu-install_5.7.50701-1_all.deb
+          else
+            echo Unknown rocm version
+            exit 1
+          fi
+          curl -O https://repo.radeon.com/amdgpu-install/${{ matrix.rocm }}/ubuntu/focal/$ROCM_DL_FILE
+          sudo dpkg -i $ROCM_DL_FILE
+          sudo DEBIAN_FRONTEND=noninteractive amdgpu-install --usecase=rocm --no-dkms --no-32 -y
+      - name: Install Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
+          python -m pip install --upgrade build setuptools wheel
+          if [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
+            echo "Using PyTorch nightly"
+            python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm5.7
+          elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
+            echo "Using PyTorch stable"
+            python -m pip install torch --index-url https://download.pytorch.org/whl/rocm5.6
+          else
+            echo Unknown rocm version for python install
+            exit 1
+          fi
+      - name: Build Wheel
+        run: |
+          echo "Using python for build:"
+          python --version
+          which python
+          ROCM_VERSION=${{ matrix.rocm }} python setup.py sdist bdist_wheel
      - name: Upload Assets
        uses: shogo82148/actions-upload-release-asset@v1
        with:
          upload_url: ${{ needs.release.outputs.upload_url }}
          asset_path: ./dist/*.whl
\ No newline at end of file
--- a/.gitignore
+++ b/.gitignore
@@ -159,3 +159,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+*hip*
+!hip_compact.hip
--- a/README.md
+++ b/README.md
@@ -5,21 +5,38 @@ AutoAWQ Kernels is a new package that is split up from the [main repository](htt
 ## Requirements
 - Windows: Must use WSL2.
- GPU: Must be compute capability 7.5 or higher.
- CUDA Toolkit: Must be 11.8 or higher.
+- NVIDIA:
+  - GPU: Must be compute capability 7.5 or higher.
+  - CUDA Toolkit: Must be 11.8 or higher.
+- AMD:
+  - ROCm: Must be 5.6 or higher.
 ## Install
 ### Install from PyPi
+The package is available on PyPi with CUDA 12.1.1 wheels:
 ```
 pip install autoawq-kernels
 ```
+### Install release wheels
+For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/):
+```
+pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl
+```
 ### Build from source
+You can also build from source:
 ```
 git clone https://github.com/casper-hansen/AutoAWQ_kernels
 cd AutoAWQ_kernels
 pip install -e .
 ```
+To build for ROCm, you need to first install the following packages `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`.
\ No newline at end of file
--- a/awq_ext/exllama/hip_compat.cuh
+++ b/awq_ext/exllama/hip_compat.cuh
@@ -9,9 +9,9 @@ __device__ __forceinline__ __half __compat_hrcp(__half x) {
        static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
 }
+// ROCm 6.0 compatible from: /opt/rocm-6.0.0/include/hip/amd_detail/amd_hip_fp16.h:1708
 __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
-    return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
+    return _Float16_2{_Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data};
-        static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
 }
 #define hrcp __compat_hrcp

--- a/setup.py
+++ b/setup.py
@@ -3,21 +3,30 @@ import torch
 from pathlib import Path
 from setuptools import setup, find_packages
 from distutils.sysconfig import get_python_lib
-from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 os.environ["CC"] = "g++"
 os.environ["CXX"] = "g++"
 AUTOAWQ_KERNELS_VERSION = "0.0.2"
 PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
+CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
+ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip
 if not PYPI_BUILD:
-    try:
+    # only adding CUDA/ROCM version if we are not building for PyPI to comply with PEP 440
-        CUDA_VERSION = "".join(
+    if CUDA_VERSION:
-            os.environ.get("CUDA_VERSION", torch.version.cuda).split(".")
+        CUDA_VERSION = "".join(CUDA_VERSION.split("."))[:3]
-        )[:3]
        AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
-    except Exception as ex:
+    elif ROCM_VERSION:
-        raise RuntimeError("Your system must have an Nvidia GPU for installing AutoAWQ")
+        ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:3]
+        AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
+    else:
+        raise RuntimeError(
+            "Your system must have either Nvidia or AMD GPU to build this package."
+        )
+print(f"Building AutoAWQ Kernels version {AUTOAWQ_KERNELS_VERSION}")
 common_setup_kwargs = {
    "version": AUTOAWQ_KERNELS_VERSION,
@@ -54,11 +63,13 @@ requirements = [
 def get_include_dirs():
    include_dirs = []
-    conda_cuda_include_dir = os.path.join(
+    if CUDA_VERSION:
-        get_python_lib(), "nvidia/cuda_runtime/include"
+        conda_cuda_include_dir = os.path.join(
-    )
+            get_python_lib(), "nvidia/cuda_runtime/include"
-    if os.path.isdir(conda_cuda_include_dir):
+        )
-        include_dirs.append(conda_cuda_include_dir)
+        if os.path.isdir(conda_cuda_include_dir):
+            include_dirs.append(conda_cuda_include_dir)
    this_dir = os.path.dirname(os.path.abspath(__file__))
    include_dirs.append(this_dir)
@@ -67,6 +78,8 @@ def get_include_dirs():
 def get_generator_flag():
    generator_flag = []
+    # if CUDA_VERSION:
    torch_dir = torch.__path__[0]
    if os.path.exists(
        os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")
@@ -76,85 +89,93 @@ def get_generator_flag():
    return generator_flag
-def check_dependencies():
+def get_compute_capabilities():
-    if CUDA_HOME is None:
+    capability_flags = []
-        raise RuntimeError(
-            f"Cannot find CUDA_HOME. CUDA must be available to build the package."
-        )
+    if CUDA_VERSION:
+        # Collect the compute capabilities of all available CUDA GPUs
+        for i in range(torch.cuda.device_count()):
+            major, minor = torch.cuda.get_device_capability(i)
+            cc = major * 10 + minor
+            if cc < 75:
+                raise RuntimeError(
+                    "GPUs with compute capability less than 7.5 are not supported."
+                )
+        # Figure out compute capability
+        compute_capabilities = {75, 80, 86, 89, 90}
+        for cap in compute_capabilities:
+            capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]
-def get_compute_capabilities():
+    return capability_flags
-    # Collect the compute capabilities of all available GPUs.
-    for i in range(torch.cuda.device_count()):
-        major, minor = torch.cuda.get_device_capability(i)
-        cc = major * 10 + minor
-        if cc < 75:
-            raise RuntimeError(
-                "GPUs with compute capability less than 7.5 are not supported."
-            )
-    # figure out compute capability
+def get_extra_compile_args(arch_flags, generator_flags):
-    compute_capabilities = {75, 80, 86, 89, 90}
+    extra_compile_args = {}
-    capability_flags = []
+    if os.name == "nt" and CUDA_VERSION:
-    for cap in compute_capabilities:
+        include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"
-        capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]
+        # Relaxed args on Windows
+        if include_arch:
+            extra_compile_args = {"nvcc": arch_flags}
-    return capability_flags
+    elif CUDA_VERSION:
+        extra_compile_args = {
+            "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
+            "nvcc": [
+                "-O3",
+                "-std=c++17",
+                "-DENABLE_BF16",
+                "-U__CUDA_NO_HALF_OPERATORS__",
+                "-U__CUDA_NO_HALF_CONVERSIONS__",
+                "-U__CUDA_NO_BFLOAT16_OPERATORS__",
+                "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
+                "-U__CUDA_NO_BFLOAT162_OPERATORS__",
+                "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
+                "--expt-relaxed-constexpr",
+                "--expt-extended-lambda",
+                "--use_fast_math",
+            ]
+            + arch_flags
+            + generator_flags,
+        }
+    return extra_compile_args
+def get_extra_link_args():
+    extra_link_args = []
+    if os.name == "nt" and CUDA_VERSION:
+        cuda_path = os.environ.get("CUDA_PATH", None)
+        extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
+    return extra_link_args
-check_dependencies()
-extra_link_args = []
 include_dirs = get_include_dirs()
+extra_link_args = get_extra_link_args()
 generator_flags = get_generator_flag()
 arch_flags = get_compute_capabilities()
+extra_compile_args = get_extra_compile_args(arch_flags, generator_flags)
-if os.name == "nt":
-    include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"
-    # Relaxed args on Windows
+extensions = []
-    if include_arch:
+if CUDA_VERSION:
-        extra_compile_args = {"nvcc": arch_flags}
+    # contain un-hipifiable inline PTX
-    else:
+    extensions.append(
-        extra_compile_args = {}
+        CUDAExtension(
+            "awq_ext",
-    cuda_path = os.environ.get("CUDA_PATH", None)
+            [
-    extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
+                "awq_ext/pybind_awq.cpp",
-else:
+                "awq_ext/quantization/gemm_cuda_gen.cu",
-    extra_compile_args = {
+                "awq_ext/layernorm/layernorm.cu",
-        "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
+                "awq_ext/position_embedding/pos_encoding_kernels.cu",
-        "nvcc": [
+                "awq_ext/quantization/gemv_cuda.cu",
-            "-O3",
+            ],
-            "-std=c++17",
+            extra_compile_args=extra_compile_args,
-            "-DENABLE_BF16",
+        )
-            "-U__CUDA_NO_HALF_OPERATORS__",
-            "-U__CUDA_NO_HALF_CONVERSIONS__",
-            "-U__CUDA_NO_BFLOAT16_OPERATORS__",
-            "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
-            "-U__CUDA_NO_BFLOAT162_OPERATORS__",
-            "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
-            "--expt-relaxed-constexpr",
-            "--expt-extended-lambda",
-            "--use_fast_math",
-        ]
-        + arch_flags
-        + generator_flags,
-    }
-extensions = [
-    CUDAExtension(
-        "awq_ext",
-        [
-            "awq_ext/pybind_awq.cpp",
-            "awq_ext/quantization/gemm_cuda_gen.cu",
-            "awq_ext/layernorm/layernorm.cu",
-            "awq_ext/position_embedding/pos_encoding_kernels.cu",
-            "awq_ext/quantization/gemv_cuda.cu",
-        ],
-        extra_compile_args=extra_compile_args,
    )
-]
 extensions.append(
    CUDAExtension(
@@ -183,8 +204,8 @@ extensions.append(
    )
 )
+if os.name != "nt" and CUDA_VERSION:
-if os.name != "nt":
+    # FasterTransformer kernels
    extensions.append(
        CUDAExtension(
            "awq_ft_ext",