Commit 8907d182 authored by Casper's avatar Casper
Browse files

Bump version

parent e1ed4bd6
import os import os
import torch import torch
from pathlib import Path from pathlib import Path
from setuptools import setup, find_packages from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDAExtension from torch.utils.cpp_extension import BuildExtension, CUDAExtension
os.environ["CC"] = "g++" os.environ["CC"] = "g++"
os.environ["CXX"] = "g++" os.environ["CXX"] = "g++"
AUTOAWQ_KERNELS_VERSION = "0.0.3" AUTOAWQ_KERNELS_VERSION = "0.0.4"
PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1" PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip
if not PYPI_BUILD:
    # Local-version suffixes (+cuXXX / +rocmXXX) are only attached to
    # non-PyPI builds so the PyPI release stays PEP 440 compliant.
    if CUDA_VERSION:
        CUDA_VERSION = CUDA_VERSION.replace(".", "")[:3]
        AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
    elif ROCM_VERSION:
        ROCM_VERSION = ROCM_VERSION.replace(".", "")[:3]
        AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
    else:
        raise RuntimeError(
            "Your system must have either Nvidia or AMD GPU to build this package."
        )

print(f"Building AutoAWQ Kernels version {AUTOAWQ_KERNELS_VERSION}")
# Long description comes straight from the repository README.
_readme_path = Path(__file__).parent / "README.md"

common_setup_kwargs = {
    "version": AUTOAWQ_KERNELS_VERSION,
    "name": "autoawq_kernels",
    "author": "Casper Hansen",
    "license": "MIT",
    "python_requires": ">=3.8.0",
    "description": "AutoAWQ Kernels implements the AWQ kernels.",
    "long_description": _readme_path.read_text(encoding="UTF-8"),
    "long_description_content_type": "text/markdown",
    "url": "https://github.com/casper-hansen/AutoAWQ_kernels",
    "keywords": ["awq", "autoawq", "quantization", "transformers"],
    "platforms": ["linux", "windows"],
    "classifiers": [
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ],
}

# Runtime dependency of the built wheel, not of the build itself.
requirements = [
    "torch>=2.0.1",
]
def get_include_dirs():
    """Collect extra include directories for the extension build.

    Adds the conda-packaged CUDA runtime headers when that directory exists,
    plus the repository root (where the awq_ext sources live).
    """
    dirs = []
    if CUDA_VERSION:
        conda_headers = os.path.join(
            get_python_lib(), "nvidia/cuda_runtime/include"
        )
        if os.path.isdir(conda_headers):
            dirs.append(conda_headers)
    dirs.append(os.path.dirname(os.path.abspath(__file__)))
    return dirs
def get_generator_flag():
    """Return ["-DOLD_GENERATOR_PATH"] when the installed torch still ships
    the legacy ATen/CUDAGeneratorImpl.h header, otherwise an empty list."""
    legacy_header = os.path.join(
        torch.__path__[0], "include", "ATen", "CUDAGeneratorImpl.h"
    )
    return ["-DOLD_GENERATOR_PATH"] if os.path.exists(legacy_header) else []
def get_compute_capabilities():
    """Build the list of nvcc ``-gencode`` flags for the supported archs.

    Validates that every visible CUDA GPU has compute capability >= 7.5,
    then emits flags for the fixed set of supported architectures
    (75, 80, 86, 89, 90) regardless of which GPUs are locally installed.

    Returns:
        list[str]: alternating ``-gencode`` / ``arch=...`` flags; empty when
        building without CUDA (e.g. ROCm).

    Raises:
        RuntimeError: if any detected GPU is older than compute capability 7.5.
    """
    capability_flags = []
    if CUDA_VERSION:
        # Reject GPUs the kernels cannot run on before wasting a build.
        for i in range(torch.cuda.device_count()):
            major, minor = torch.cuda.get_device_capability(i)
            if major * 10 + minor < 75:
                raise RuntimeError(
                    "GPUs with compute capability less than 7.5 are not supported."
                )
        # sorted(): set iteration order is not a language guarantee, and the
        # flag order should be deterministic across builds.
        for cap in sorted({75, 80, 86, 89, 90}):
            capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]
    return capability_flags
def get_extra_compile_args(arch_flags, generator_flags):
    """Assemble the per-compiler ("cxx"/"nvcc") flag dict for CUDA builds.

    Windows gets a deliberately minimal nvcc argument set (arch flags only,
    and even those can be disabled via INCLUDE_ARCH=0); everything else gets
    the full optimized flag set. Returns an empty dict without CUDA.
    """
    if os.name == "nt" and CUDA_VERSION:
        # Relaxed args on Windows.
        if os.getenv("INCLUDE_ARCH", "1") == "1":
            return {"nvcc": arch_flags}
        return {}
    if CUDA_VERSION:
        nvcc_flags = [
            "-O3",
            "-std=c++17",
            "-DENABLE_BF16",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_BFLOAT16_OPERATORS__",
            "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
            "-U__CUDA_NO_BFLOAT162_OPERATORS__",
            "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
            "--expt-relaxed-constexpr",
            "--expt-extended-lambda",
            "--use_fast_math",
        ]
        return {
            "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
            "nvcc": nvcc_flags + arch_flags + generator_flags,
        }
    return {}
def get_extra_link_args():
    """Extra linker arguments; only cuBLAS on Windows needs any."""
    if os.name == "nt" and CUDA_VERSION:
        cuda_path = os.environ.get("CUDA_PATH", None)
        # NOTE(review): "-L" conventionally takes a directory, not a .lib
        # file, and when CUDA_PATH is unset this interpolates the literal
        # string "None" — confirm this links as intended on Windows.
        return ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
    return []
# Resolve the build configuration once, up front.
include_dirs = get_include_dirs()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()
extra_link_args = get_extra_link_args()
extra_compile_args = get_extra_compile_args(arch_flags, generator_flags)
extensions = []

if CUDA_VERSION:
    # awq_ext contains un-hipifiable inline PTX, so it is CUDA-only.
    extensions.append(
        CUDAExtension(
            "awq_ext",
            [
                "awq_ext/pybind_awq.cpp",
                "awq_ext/quantization/gemm_cuda_gen.cu",
                "awq_ext/layernorm/layernorm.cu",
                "awq_ext/position_embedding/pos_encoding_kernels.cu",
                "awq_ext/quantization/gemv_cuda.cu",
                "awq_ext/vllm/moe_alig_block.cu",
                "awq_ext/vllm/activation.cu",
                "awq_ext/vllm/topk_softmax_kernels.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    )
    extensions.append(
        CUDAExtension(
            "exl_ext",
            [
                "awq_ext/exllama/exllama_ext.cpp",
                "awq_ext/exllama/cuda_buffers.cu",
                "awq_ext/exllama/cuda_func/column_remap.cu",
                "awq_ext/exllama/cuda_func/q4_matmul.cu",
                "awq_ext/exllama/cuda_func/q4_matrix.cu",
            ],
            extra_compile_args=extra_compile_args,
            extra_link_args=extra_link_args,
        )
    )
    extensions.append(
        CUDAExtension(
            "exlv2_ext",
            [
                "awq_ext/exllamav2/ext.cpp",
                "awq_ext/exllamav2/cuda/q_matrix.cu",
                "awq_ext/exllamav2/cuda/q_gemm.cu",
            ],
            extra_compile_args=extra_compile_args,
            extra_link_args=extra_link_args,
        )
    )

if os.name != "nt" and CUDA_VERSION:
    # FasterTransformer kernels do not build on Windows.
    extensions.append(
        CUDAExtension(
            "awq_ft_ext",
            [
                "awq_ext/pybind_awq_ft.cpp",
                "awq_ext/attention/ft_attention.cpp",
                "awq_ext/attention/decoder_masked_multihead_attention.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    )
# Fold the extension build configuration into the shared kwargs and run setup.
additional_setup_kwargs = {
    "ext_modules": extensions,
    "cmdclass": {"build_ext": BuildExtension},
}
common_setup_kwargs.update(additional_setup_kwargs)

setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs,
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment