import os
import torch
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension

# Force g++ as the host compiler for both the C and C++ toolchain slots.
os.environ["CC"] = "g++"
os.environ["CXX"] = "g++"

AUTOAWQ_KERNELS_VERSION = "0.0.2"
# PyPI wheels must not carry a local version suffix (+cuXXX), so the suffix
# is only appended for non-PyPI builds.
PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"

if not PYPI_BUILD:
    try:
        # e.g. "12.1" -> "121"; keep at most 3 digits ("11.8" -> "118").
        # CUDA_VERSION env var overrides the version torch was built with.
        CUDA_VERSION = "".join(
            os.environ.get("CUDA_VERSION", torch.version.cuda).split(".")
        )[:3]
        # Tag the package version with the CUDA toolkit it targets.
        AUTOAWQ_KERNELS_VERSION += f"+cu{CUDA_VERSION}"
    except Exception as ex:
        # torch.version.cuda is None on CPU-only torch builds, which makes
        # .split(".") raise AttributeError. Chain the cause so the original
        # error is visible in the traceback.
        raise RuntimeError(
            "Your system must have an Nvidia GPU for installing AutoAWQ"
        ) from ex
# README.md sits next to this setup.py and becomes the PyPI long description.
_readme_text = (Path(__file__).parent / "README.md").read_text(encoding="UTF-8")

# Package metadata shared by the final setup() call.
common_setup_kwargs = dict(
    version=AUTOAWQ_KERNELS_VERSION,
    name="autoawq_kernels",
    author="Casper Hansen",
    license="MIT",
    python_requires=">=3.8.0",
    description="AutoAWQ Kernels implements the AWQ kernels.",
    long_description=_readme_text,
    long_description_content_type="text/markdown",
    url="https://github.com/casper-hansen/AutoAWQ_kernels",
    keywords=["awq", "autoawq", "quantization", "transformers"],
    platforms=["linux", "windows"],
    classifiers=[
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ],
)

# Runtime dependency of the built kernels.
requirements = [
    "torch>=2.0.1",
]

def get_include_dirs():
    """Return header search paths for the extension build.

    Includes the conda-installed CUDA runtime headers when present, followed
    by the directory containing this setup.py.
    """
    conda_cuda_headers = os.path.join(
        get_python_lib(), "nvidia/cuda_runtime/include"
    )
    # Only add the conda CUDA headers when that directory actually exists.
    dirs = [conda_cuda_headers] if os.path.isdir(conda_cuda_headers) else []
    dirs.append(os.path.dirname(os.path.abspath(__file__)))
    return dirs

def get_generator_flag():
    """Return ["-DOLD_GENERATOR_PATH"] when the installed torch ships
    ATen/CUDAGeneratorImpl.h at its legacy location, else an empty list."""
    legacy_header = os.path.join(
        torch.__path__[0], "include", "ATen", "CUDAGeneratorImpl.h"
    )
    return ["-DOLD_GENERATOR_PATH"] if os.path.exists(legacy_header) else []

def check_dependencies():
    """Raise RuntimeError when the CUDA toolkit cannot be located.

    CUDA_HOME is resolved by torch.utils.cpp_extension and is None when no
    CUDA installation is found.
    """
    if CUDA_HOME is None:
        # Plain string literal: the original used an f-string with no
        # placeholders (flake8 F541); the message text is unchanged.
        raise RuntimeError(
            "Cannot find CUDA_HOME. CUDA must be available to build the package."
        )

def get_compute_capabilities():
    """Build the nvcc -gencode flags for all supported architectures.

    Also validates every locally visible GPU: the kernels require compute
    capability >= 7.5.

    Returns:
        list[str]: alternating "-gencode" / "arch=compute_XX,code=sm_XX"
        entries, in ascending architecture order.

    Raises:
        RuntimeError: if any visible GPU is below compute capability 7.5.
    """
    # Validate the visible GPUs; nothing is collected here — the target
    # architectures below are fixed regardless of the local hardware.
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        if major * 10 + minor < 75:
            raise RuntimeError(
                "GPUs with compute capability less than 7.5 are not supported."
            )

    # Ordered tuple (was an unordered set) so the generated flag order is
    # deterministic and builds are reproducible.
    compute_capabilities = (75, 80, 86, 89, 90)

    capability_flags = []
    for cap in compute_capabilities:
        capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]

    return capability_flags

# Fail early if no CUDA toolkit is available.
check_dependencies()

extra_link_args = []
include_dirs = get_include_dirs()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()

if os.name == "nt":
    # Windows: relaxed compile args; arch flags can be disabled via env var.
    if os.getenv("INCLUDE_ARCH", "1") == "1":
        extra_compile_args = {"nvcc": arch_flags}
    else:
        extra_compile_args = {}

    # NOTE(review): "-L" normally takes a directory, but this passes a file
    # path to cublas.lib — confirm this links as intended on Windows.
    cuda_path = os.environ.get("CUDA_PATH", None)
    extra_link_args = ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
else:
    # Linux: full optimization plus half/bfloat16 intrinsic support.
    nvcc_flags = [
        "-O3",
        "-std=c++17",
        "-DENABLE_BF16",
        "-U__CUDA_NO_HALF_OPERATORS__",
        "-U__CUDA_NO_HALF_CONVERSIONS__",
        "-U__CUDA_NO_BFLOAT16_OPERATORS__",
        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
        "-U__CUDA_NO_BFLOAT162_OPERATORS__",
        "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
        "--expt-relaxed-constexpr",
        "--expt-extended-lambda",
        "--use_fast_math",
    ]
    extra_compile_args = {
        "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
        "nvcc": nvcc_flags + arch_flags + generator_flags,
    }

# Core AWQ quantization/layernorm/positional-encoding kernels.
awq_ext_sources = [
    "awq_ext/pybind_awq.cpp",
    "awq_ext/quantization/gemm_cuda_gen.cu",
    "awq_ext/layernorm/layernorm.cu",
    "awq_ext/position_embedding/pos_encoding_kernels.cu",
    "awq_ext/quantization/gemv_cuda.cu",
]
extensions = [
    CUDAExtension(
        "awq_ext",
        awq_ext_sources,
        extra_compile_args=extra_compile_args,
    )
]
# ExLlama (v1) GEMM kernels; needs cuBLAS at link time on Windows.
exllama_sources = [
    "awq_ext/exllama/exllama_ext.cpp",
    "awq_ext/exllama/cuda_buffers.cu",
    "awq_ext/exllama/cuda_func/column_remap.cu",
    "awq_ext/exllama/cuda_func/q4_matmul.cu",
    "awq_ext/exllama/cuda_func/q4_matrix.cu",
]
extensions.append(
    CUDAExtension(
        "exllama_kernels",
        exllama_sources,
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )
)
# ExLlamaV2 GEMM kernels.
exllamav2_sources = [
    "awq_ext/exllamav2/ext.cpp",
    "awq_ext/exllamav2/cuda/q_matrix.cu",
    "awq_ext/exllamav2/cuda/q_gemm.cu",
]
extensions.append(
    CUDAExtension(
        "exllamav2_kernels",
        exllamav2_sources,
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )
)

# The FasterTransformer-style attention extension is skipped on Windows.
if os.name != "nt":
    ft_sources = [
        "awq_ext/pybind_awq_ft.cpp",
        "awq_ext/attention/ft_attention.cpp",
        "awq_ext/attention/decoder_masked_multihead_attention.cu",
    ]
    extensions.append(
        CUDAExtension(
            "awq_ft_ext",
            ft_sources,
            extra_compile_args=extra_compile_args,
        )
    )

# Merge the build-specific pieces into the shared metadata and run setup().
common_setup_kwargs.update(
    {
        "ext_modules": extensions,
        "cmdclass": {"build_ext": BuildExtension},
    }
)

setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs,
)