Unverified Commit 3aed4bfe authored by Ilyas Moutawwakil's avatar Ilyas Moutawwakil Committed by GitHub
Browse files

AMD ROCm support (#5)

* added rocm support

* added rocm build workflow

* rocm version

* updated readme

* rocm 5.6

* remove unnecessary packages

* fix for rocm6.0

* fix

* fix extra_compile_args
parent 4bb0c022
...@@ -32,11 +32,11 @@ jobs: ...@@ -32,11 +32,11 @@ jobs:
const script = require('.github/workflows/scripts/github_create_release.js') const script = require('.github/workflows/scripts/github_create_release.js')
await script(github, context, core) await script(github, context, core)
build_wheels: build_cuda_wheels:
name: Build AWQ name: Build AWQ with CUDA
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
needs: release needs: release
strategy: strategy:
matrix: matrix:
os: [ubuntu-20.04, windows-latest] os: [ubuntu-20.04, windows-latest]
...@@ -48,7 +48,7 @@ jobs: ...@@ -48,7 +48,7 @@ jobs:
env: env:
PYPI_CUDA_VERSION: "12.1.1" PYPI_CUDA_VERSION: "12.1.1"
CUDA_VERSION: ${{ matrix.cuda }} CUDA_VERSION: ${{ matrix.cuda }}
steps: steps:
- name: Free Disk Space - name: Free Disk Space
uses: jlumbroso/free-disk-space@v1.3.0 uses: jlumbroso/free-disk-space@v1.3.0
...@@ -61,7 +61,7 @@ jobs: ...@@ -61,7 +61,7 @@ jobs:
large-packages: false large-packages: false
docker-images: true docker-images: true
swap-storage: false swap-storage: false
- uses: actions/checkout@v3 - uses: actions/checkout@v3
- uses: actions/setup-python@v3 - uses: actions/setup-python@v3
...@@ -78,7 +78,7 @@ jobs: ...@@ -78,7 +78,7 @@ jobs:
use-mamba: true use-mamba: true
add-pip-as-python-dependency: true add-pip-as-python-dependency: true
auto-activate-base: false auto-activate-base: false
- name: Install Dependencies - name: Install Dependencies
run: | run: |
# Install CUDA toolkit # Install CUDA toolkit
...@@ -87,7 +87,7 @@ jobs: ...@@ -87,7 +87,7 @@ jobs:
# Env variables # Env variables
$env:CUDA_PATH = $env:CONDA_PREFIX $env:CUDA_PATH = $env:CONDA_PREFIX
$env:CUDA_HOME = $env:CONDA_PREFIX $env:CUDA_HOME = $env:CONDA_PREFIX
# Install torch # Install torch
$cudaVersion = $env:CUDA_VERSION.Replace('.', '') $cudaVersion = $env:CUDA_VERSION.Replace('.', '')
$cudaVersionPytorch = $cudaVersion.Substring(0, $cudaVersion.Length - 1) $cudaVersionPytorch = $cudaVersion.Substring(0, $cudaVersion.Length - 1)
...@@ -113,9 +113,122 @@ jobs: ...@@ -113,9 +113,122 @@ jobs:
} }
python setup.py sdist bdist_wheel python setup.py sdist bdist_wheel
- name: Upload Assets
uses: shogo82148/actions-upload-release-asset@v1
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/*.whl
build_rocm_wheels:
name: Build AWQ with ROCm
runs-on: ${{ matrix.os }}
needs: release
strategy:
matrix:
os: [ubuntu-20.04]
python: ["3.8", "3.9", "3.10", "3.11"]
rocm: ["5.6.1", "5.7.1"] # we build only for rocm5.6 & 5.7 to match PyTorch 2.1.0 and PyTorch 2.2 nightly
defaults:
run:
shell: bash
env:
ROCM_VERSION: ${{ matrix.rocm }}
steps:
- uses: actions/checkout@v3
- name: Free Disk Space
run: |
df -h
echo "Removing large packages"
sudo apt-get remove -y '^dotnet-.*'
sudo apt-get remove -y 'php.*'
sudo apt-get remove -y azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get autoclean -y >/dev/null 2>&1
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python }}
- name: Setup Mamba
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.python }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Set up ROCm
run: |
echo "Using python:"
python --version
which python
if [[ "${{ matrix.rocm }}" == "5.4.2" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.4.50402-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.6.50601-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.7.50701-1_all.deb
else
echo Unknown rocm version
exit 1
fi
curl -O https://repo.radeon.com/amdgpu-install/${{ matrix.rocm }}/ubuntu/focal/$ROCM_DL_FILE
sudo dpkg -i $ROCM_DL_FILE
sudo DEBIAN_FRONTEND=noninteractive amdgpu-install --usecase=rocm --no-dkms --no-32 -y
- name: Install Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
python -m pip install --upgrade build setuptools wheel
if [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
echo "Using PyTorch nightly"
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm5.7
elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
echo "Using PyTorch stable"
python -m pip install torch --index-url https://download.pytorch.org/whl/rocm5.6
else
echo Unknown rocm version for python install
exit 1
fi
- name: Build Wheel
run: |
echo "Using python for build:"
python --version
which python
ROCM_VERSION=${{ matrix.rocm }} python setup.py sdist bdist_wheel
- name: Upload Assets - name: Upload Assets
uses: shogo82148/actions-upload-release-asset@v1 uses: shogo82148/actions-upload-release-asset@v1
with: with:
upload_url: ${{ needs.release.outputs.upload_url }} upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/*.whl asset_path: ./dist/*.whl
\ No newline at end of file
...@@ -159,3 +159,6 @@ cython_debug/ ...@@ -159,3 +159,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear # and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder. # option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/ #.idea/
*hip*
!hip_compact.hip
...@@ -5,21 +5,38 @@ AutoAWQ Kernels is a new package that is split up from the [main repository](htt ...@@ -5,21 +5,38 @@ AutoAWQ Kernels is a new package that is split up from the [main repository](htt
## Requirements ## Requirements
- Windows: Must use WSL2. - Windows: Must use WSL2.
- GPU: Must be compute capability 7.5 or higher.
- CUDA Toolkit: Must be 11.8 or higher. - NVIDIA:
- GPU: Must be compute capability 7.5 or higher.
- CUDA Toolkit: Must be 11.8 or higher.
- AMD:
- ROCm: Must be 5.6 or higher.
## Install ## Install
### Install from PyPi ### Install from PyPi
The package is available on PyPi with CUDA 12.1.1 wheels:
``` ```
pip install autoawq-kernels pip install autoawq-kernels
``` ```
### Install release wheels
For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/):
```
pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl
```
### Build from source ### Build from source
You can also build from source:
``` ```
git clone https://github.com/casper-hansen/AutoAWQ_kernels git clone https://github.com/casper-hansen/AutoAWQ_kernels
cd AutoAWQ_kernels cd AutoAWQ_kernels
pip install -e . pip install -e .
``` ```
To build for ROCm, you need to first install the following packages `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`.
\ No newline at end of file
...@@ -9,9 +9,9 @@ __device__ __forceinline__ __half __compat_hrcp(__half x) { ...@@ -9,9 +9,9 @@ __device__ __forceinline__ __half __compat_hrcp(__half x) {
static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))}; static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
} }
// ROCm 6.0 compatible from: /opt/rocm-6.0.0/include/hip/amd_detail/amd_hip_fp16.h:1708
__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)), return _Float16_2{_Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data};
static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
} }
#define hrcp __compat_hrcp #define hrcp __compat_hrcp
......
...@@ -3,21 +3,30 @@ import torch ...@@ -3,21 +3,30 @@ import torch
from pathlib import Path from pathlib import Path
from setuptools import setup, find_packages from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension from torch.utils.cpp_extension import BuildExtension, CUDAExtension
os.environ["CC"] = "g++" os.environ["CC"] = "g++"
os.environ["CXX"] = "g++" os.environ["CXX"] = "g++"
AUTOAWQ_KERNELS_VERSION = "0.0.2" AUTOAWQ_KERNELS_VERSION = "0.0.2"
PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1" PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip
if not PYPI_BUILD: if not PYPI_BUILD:
try: # only adding CUDA/ROCM version if we are not building for PyPI to comply with PEP 440
CUDA_VERSION = "".join( if CUDA_VERSION:
os.environ.get("CUDA_VERSION", torch.version.cuda).split(".") CUDA_VERSION = "".join(CUDA_VERSION.split("."))[:3]
)[:3]
AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}" AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
except Exception as ex: elif ROCM_VERSION:
raise RuntimeError("Your system must have an Nvidia GPU for installing AutoAWQ") ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:3]
AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
else:
raise RuntimeError(
"Your system must have either Nvidia or AMD GPU to build this package."
)
print(f"Building AutoAWQ Kernels version {AUTOAWQ_KERNELS_VERSION}")
common_setup_kwargs = { common_setup_kwargs = {
"version": AUTOAWQ_KERNELS_VERSION, "version": AUTOAWQ_KERNELS_VERSION,
...@@ -54,11 +63,13 @@ requirements = [ ...@@ -54,11 +63,13 @@ requirements = [
def get_include_dirs(): def get_include_dirs():
include_dirs = [] include_dirs = []
conda_cuda_include_dir = os.path.join( if CUDA_VERSION:
get_python_lib(), "nvidia/cuda_runtime/include" conda_cuda_include_dir = os.path.join(
) get_python_lib(), "nvidia/cuda_runtime/include"
if os.path.isdir(conda_cuda_include_dir): )
include_dirs.append(conda_cuda_include_dir) if os.path.isdir(conda_cuda_include_dir):
include_dirs.append(conda_cuda_include_dir)
this_dir = os.path.dirname(os.path.abspath(__file__)) this_dir = os.path.dirname(os.path.abspath(__file__))
include_dirs.append(this_dir) include_dirs.append(this_dir)
...@@ -67,6 +78,8 @@ def get_include_dirs(): ...@@ -67,6 +78,8 @@ def get_include_dirs():
def get_generator_flag(): def get_generator_flag():
generator_flag = [] generator_flag = []
# if CUDA_VERSION:
torch_dir = torch.__path__[0] torch_dir = torch.__path__[0]
if os.path.exists( if os.path.exists(
os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h") os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")
...@@ -76,85 +89,93 @@ def get_generator_flag(): ...@@ -76,85 +89,93 @@ def get_generator_flag():
return generator_flag return generator_flag
def check_dependencies(): def get_compute_capabilities():
if CUDA_HOME is None: capability_flags = []
raise RuntimeError(
f"Cannot find CUDA_HOME. CUDA must be available to build the package."
)
if CUDA_VERSION:
# Collect the compute capabilities of all available CUDA GPUs
for i in range(torch.cuda.device_count()):
major, minor = torch.cuda.get_device_capability(i)
cc = major * 10 + minor
if cc < 75:
raise RuntimeError(
"GPUs with compute capability less than 7.5 are not supported."
)
# Figure out compute capability
compute_capabilities = {75, 80, 86, 89, 90}
for cap in compute_capabilities:
capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]
def get_compute_capabilities(): return capability_flags
# Collect the compute capabilities of all available GPUs.
for i in range(torch.cuda.device_count()):
major, minor = torch.cuda.get_device_capability(i)
cc = major * 10 + minor
if cc < 75:
raise RuntimeError(
"GPUs with compute capability less than 7.5 are not supported."
)
# figure out compute capability def get_extra_compile_args(arch_flags, generator_flags):
compute_capabilities = {75, 80, 86, 89, 90} extra_compile_args = {}
capability_flags = [] if os.name == "nt" and CUDA_VERSION:
for cap in compute_capabilities: include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"
capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"] # Relaxed args on Windows
if include_arch:
extra_compile_args = {"nvcc": arch_flags}
return capability_flags elif CUDA_VERSION:
extra_compile_args = {
"cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
"nvcc": [
"-O3",
"-std=c++17",
"-DENABLE_BF16",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
]
+ arch_flags
+ generator_flags,
}
return extra_compile_args
def get_extra_link_args():
extra_link_args = []
if os.name == "nt" and CUDA_VERSION:
cuda_path = os.environ.get("CUDA_PATH", None)
extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
return extra_link_args
check_dependencies()
extra_link_args = []
include_dirs = get_include_dirs() include_dirs = get_include_dirs()
extra_link_args = get_extra_link_args()
generator_flags = get_generator_flag() generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities() arch_flags = get_compute_capabilities()
extra_compile_args = get_extra_compile_args(arch_flags, generator_flags)
if os.name == "nt":
include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"
# Relaxed args on Windows extensions = []
if include_arch: if CUDA_VERSION:
extra_compile_args = {"nvcc": arch_flags} # contain un-hipifiable inline PTX
else: extensions.append(
extra_compile_args = {} CUDAExtension(
"awq_ext",
cuda_path = os.environ.get("CUDA_PATH", None) [
extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"] "awq_ext/pybind_awq.cpp",
else: "awq_ext/quantization/gemm_cuda_gen.cu",
extra_compile_args = { "awq_ext/layernorm/layernorm.cu",
"cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"], "awq_ext/position_embedding/pos_encoding_kernels.cu",
"nvcc": [ "awq_ext/quantization/gemv_cuda.cu",
"-O3", ],
"-std=c++17", extra_compile_args=extra_compile_args,
"-DENABLE_BF16", )
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
]
+ arch_flags
+ generator_flags,
}
extensions = [
CUDAExtension(
"awq_ext",
[
"awq_ext/pybind_awq.cpp",
"awq_ext/quantization/gemm_cuda_gen.cu",
"awq_ext/layernorm/layernorm.cu",
"awq_ext/position_embedding/pos_encoding_kernels.cu",
"awq_ext/quantization/gemv_cuda.cu",
],
extra_compile_args=extra_compile_args,
) )
]
extensions.append( extensions.append(
CUDAExtension( CUDAExtension(
...@@ -183,8 +204,8 @@ extensions.append( ...@@ -183,8 +204,8 @@ extensions.append(
) )
) )
if os.name != "nt" and CUDA_VERSION:
if os.name != "nt": # FasterTransformer kernels
extensions.append( extensions.append(
CUDAExtension( CUDAExtension(
"awq_ft_ext", "awq_ft_ext",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment