Commit 8907d182 authored by Casper

Bump version

parent e1ed4bd6

setup.py:
import os
import torch
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

os.environ["CC"] = "g++"
os.environ["CXX"] = "g++"

-AUTOAWQ_KERNELS_VERSION = "0.0.3"
+AUTOAWQ_KERNELS_VERSION = "0.0.4"
PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
ROCM_VERSION = os.environ.get("ROCM_VERSION", None) or torch.version.hip
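
# Example of the suffix logic below: CUDA_VERSION="11.8" becomes "118" and
# yields "0.0.4+cu118"; ROCM_VERSION="5.6" becomes "56" and yields
# "0.0.4+rocm56". (Illustrative values, not read from any particular system.)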
if not PYPI_BUILD:
    # PyPI rejects local version identifiers (the PEP 440 "+..." segment),
    # so the CUDA/ROCm suffix is only appended for non-PyPI builds.
    if CUDA_VERSION:
        CUDA_VERSION = "".join(CUDA_VERSION.split("."))[:3]
        AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
    elif ROCM_VERSION:
        ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:3]
        AUTOAWQ_KERNELS_VERSION += f"+rocm{ROCM_VERSION}"
    else:
        raise RuntimeError(
            "Your system must have either an NVIDIA or an AMD GPU to build this package."
        )
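
# Illustrative build invocations (assumed workflow, not from this repo's docs):
#   CUDA_VERSION=11.8 pip install -e .        # local build -> 0.0.4+cu118
#   PYPI_BUILD=1 python setup.py bdist_wheel  # PyPI build  -> 0.0.4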
print(f"Building AutoAWQ Kernels version {AUTOAWQ_KERNELS_VERSION}")
common_setup_kwargs = {
"version": AUTOAWQ_KERNELS_VERSION,
"name": "autoawq_kernels",
"author": "Casper Hansen",
"license": "MIT",
"python_requires": ">=3.8.0",
"description": "AutoAWQ Kernels implements the AWQ kernels.",
"long_description": (Path(__file__).parent / "README.md").read_text(
encoding="UTF-8"
),
"long_description_content_type": "text/markdown",
"url": "https://github.com/casper-hansen/AutoAWQ_kernels",
"keywords": ["awq", "autoawq", "quantization", "transformers"],
"platforms": ["linux", "windows"],
"classifiers": [
"Environment :: GPU :: NVIDIA CUDA :: 11.8",
"Environment :: GPU :: NVIDIA CUDA :: 12",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: C++",
],
}
requirements = [
    "torch>=2.0.1",
]
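
# get_include_dirs() adds the CUDA runtime headers shipped inside the Python
# environment (the nvidia/cuda_runtime/include directory under site-packages,
# when it exists) plus the repository root.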
def get_include_dirs():
    include_dirs = []

    if CUDA_VERSION:
        conda_cuda_include_dir = os.path.join(
            get_python_lib(), "nvidia/cuda_runtime/include"
        )
        if os.path.isdir(conda_cuda_include_dir):
            include_dirs.append(conda_cuda_include_dir)

    this_dir = os.path.dirname(os.path.abspath(__file__))
    include_dirs.append(this_dir)

    return include_dirs
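
# Older torch releases keep CUDAGeneratorImpl.h directly under include/ATen;
# the -DOLD_GENERATOR_PATH define lets the kernel sources include it from the
# location that matches the installed torch.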
def get_generator_flag():
    generator_flag = []
    torch_dir = torch.__path__[0]
    if os.path.exists(
        os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")
    ):
        generator_flag = ["-DOLD_GENERATOR_PATH"]

    return generator_flag
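
# Every locally visible GPU must be compute capability 7.5 (Turing) or newer;
# the -gencode flags then target a fixed set of architectures (7.5, 8.0, 8.6,
# 8.9, 9.0) rather than only the GPUs present at build time.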
def get_compute_capabilities():
    capability_flags = []

    if CUDA_VERSION:
        # Validate the compute capability of every locally visible CUDA GPU.
        for i in range(torch.cuda.device_count()):
            major, minor = torch.cuda.get_device_capability(i)
            cc = major * 10 + minor
            if cc < 75:
                raise RuntimeError(
                    "GPUs with compute capability less than 7.5 are not supported."
                )

        # Generate code for the full set of supported architectures.
        compute_capabilities = {75, 80, 86, 89, 90}
        for cap in compute_capabilities:
            capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]

    return capability_flags
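
# Compiler flags: Windows gets a relaxed flag set (only the arch flags, unless
# INCLUDE_ARCH=0), while other platforms compile with C++17, OpenMP, fast math,
# and bfloat16 support enabled.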
def get_extra_compile_args(arch_flags, generator_flags):
    extra_compile_args = {}

    if os.name == "nt" and CUDA_VERSION:
        include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"

        # Relaxed args on Windows.
        if include_arch:
            extra_compile_args = {"nvcc": arch_flags}
    elif CUDA_VERSION:
        extra_compile_args = {
            "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
            "nvcc": [
                "-O3",
                "-std=c++17",
                "-DENABLE_BF16",
                "-U__CUDA_NO_HALF_OPERATORS__",
                "-U__CUDA_NO_HALF_CONVERSIONS__",
                "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                "-U__CUDA_NO_BFLOAT162_OPERATORS__",
                "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
                "--expt-relaxed-constexpr",
                "--expt-extended-lambda",
                "--use_fast_math",
            ]
            + arch_flags
            + generator_flags,
        }

    return extra_compile_args
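
# On Windows, cuBLAS is linked explicitly from the CUDA_PATH installation.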
def get_extra_link_args():
    extra_link_args = []
    if os.name == "nt" and CUDA_VERSION:
        cuda_path = os.environ.get("CUDA_PATH", None)
        extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]

    return extra_link_args

include_dirs = get_include_dirs()
extra_link_args = get_extra_link_args()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()
extra_compile_args = get_extra_compile_args(arch_flags, generator_flags)
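
# Four extension modules are defined: the core AWQ kernels (awq_ext, CUDA-only
# because of the inline PTX noted below), the ExLlama/ExLlamaV2 kernels
# (exl_ext, exlv2_ext), and, on non-Windows CUDA systems, the
# FasterTransformer attention kernels (awq_ft_ext).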
extensions = []

if CUDA_VERSION:
    # The awq_ext sources contain inline PTX that cannot be hipified,
    # so this module is built only for CUDA.
    extensions.append(
        CUDAExtension(
            "awq_ext",
            [
                "awq_ext/pybind_awq.cpp",
                "awq_ext/quantization/gemm_cuda_gen.cu",
                "awq_ext/layernorm/layernorm.cu",
                "awq_ext/position_embedding/pos_encoding_kernels.cu",
                "awq_ext/quantization/gemv_cuda.cu",
                "awq_ext/vllm/moe_alig_block.cu",
                "awq_ext/vllm/activation.cu",
                "awq_ext/vllm/topk_softmax_kernels.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    )

extensions.append(
    CUDAExtension(
        "exl_ext",
        [
            "awq_ext/exllama/exllama_ext.cpp",
            "awq_ext/exllama/cuda_buffers.cu",
            "awq_ext/exllama/cuda_func/column_remap.cu",
            "awq_ext/exllama/cuda_func/q4_matmul.cu",
            "awq_ext/exllama/cuda_func/q4_matrix.cu",
        ],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )
)

extensions.append(
    CUDAExtension(
        "exlv2_ext",
        [
            "awq_ext/exllamav2/ext.cpp",
            "awq_ext/exllamav2/cuda/q_matrix.cu",
            "awq_ext/exllamav2/cuda/q_gemm.cu",
        ],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )
)

if os.name != "nt" and CUDA_VERSION:
# FasterTransformer kernels
extensions.append(
CUDAExtension(
"awq_ft_ext",
[
"awq_ext/pybind_awq_ft.cpp",
"awq_ext/attention/ft_attention.cpp",
"awq_ext/attention/decoder_masked_multihead_attention.cu",
],
extra_compile_args=extra_compile_args,
)
)
additional_setup_kwargs = {
    "ext_modules": extensions,
    "cmdclass": {"build_ext": BuildExtension},
}
common_setup_kwargs.update(additional_setup_kwargs)

setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs,
)
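
# After a successful install, each built module is importable at the top level,
# e.g. (an illustrative smoke test, not part of this repo):
#   python -c "import awq_ext, exl_ext, exlv2_ext"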