import os
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

if CUDA_HOME is None:
    raise RuntimeError(
        "Cannot find CUDA_HOME. CUDA must be available to build the package.")

# Collect the compute capabilities of all available GPUs.
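# Each capability is stored as major * 10 + minor (e.g. 8.0 -> 80, 8.6 -> 86).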
compute_capabilities = set()
for i in range(torch.cuda.device_count()):
    major, minor = torch.cuda.get_device_capability(i)
    if major < 8:
        raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.")
    compute_capabilities.add(major * 10 + minor)

# Get environment variables
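# BUILD_CUDA_EXT=0 skips building the CUDA extension below; TORCH_IS_PREBUILT=1 keeps
# torch/torchvision out of install_requires (e.g. when a base image already provides them).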
build_cuda_extension = os.environ.get('BUILD_CUDA_EXT', '1') == '1'
torch_is_prebuilt = os.environ.get('TORCH_IS_PREBUILT', '0') == '1'

# Define dependencies
dependencies = [
    "accelerate", "sentencepiece", "tokenizers>=0.12.1",
    "transformers>=4.32.0", 
    "lm_eval", "texttable",
    "toml", "attributedict",
    "protobuf"
]

if not torch_is_prebuilt:
    dependencies.extend(["torch>=2.0.0", "torchvision"])

# Set up the CUDA extension
ext_modules = []

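# When enabled, the AWQ CUDA kernels listed below are compiled into a single
# "awq_inference_engine" extension module.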
if build_cuda_extension:
    n_threads = min(os.cpu_count(), 8)

    cxx_args = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17"]
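    # nvcc's --threads flag (CUDA 11.2+) runs up to n_threads compilation steps in parallel.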
    nvcc_args = ["-O3", "-std=c++17", "--threads", str(n_threads)]

    ext_modules.append(
        CUDAExtension(
            name="awq_inference_engine",
            sources=[
                "awq_cuda/pybind.cpp",
                "awq_cuda/quantization/gemm_cuda_gen.cu",
                "awq_cuda/layernorm/layernorm.cu",
                "awq_cuda/position_embedding/pos_encoding_kernels.cu"
            ],
            extra_compile_args={
                "cxx": cxx_args,
                "nvcc": nvcc_args
            },
        )
    )

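# Example installs from a local checkout (commands are illustrative):
#   pip install .                    # build and install together with the CUDA extension
#   BUILD_CUDA_EXT=0 pip install .   # skip compiling the CUDA kernels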
setup(
    name="autoawq",
    version="0.1.0",
    author="Casper Hansen",
    license="MIT",
    description="AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    python_requires=">=3.8",
    url="https://github.com/casper-hansen/AutoAWQ",
    keywords=["awq", "autoawq", "quantization", "transformers"],
    classifiers=[
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ],
    install_requires=dependencies,
    packages=find_packages(exclude=["examples*"]),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension}
)