Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
AutoAWQ_kernels
Commits
8907d182
You need to sign in or sign up before continuing.
Commit
8907d182
authored
Feb 14, 2024
by
Casper
Browse files
Bump version
parent
e1ed4bd6
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
236 additions
and
236 deletions
+236
-236
setup.py
setup.py
+236
-236
No files found.
setup.py
View file @
8907d182
import
os
import
torch
from
pathlib
import
Path
from
setuptools
import
setup
,
find_packages
from
distutils.sysconfig
import
get_python_lib
from
torch.utils.cpp_extension
import
BuildExtension
,
CUDAExtension
# Force both the C and the C++ compiler used for the extension build to g++.
os.environ.update({"CC": "g++", "CXX": "g++"})

AUTOAWQ_KERNELS_VERSION = "0.0.3"

# PYPI_BUILD=1 requests a plain version string without a "+cu..."/"+rocm..." suffix.
PYPI_BUILD = os.environ.get("PYPI_BUILD", "0") == "1"

# A non-empty environment variable takes precedence; otherwise fall back to the
# version recorded in torch.version.
CUDA_VERSION = os.environ.get("CUDA_VERSION") or torch.version.cuda
ROCM_VERSION = os.environ.get("ROCM_VERSION") or torch.version.hip

if not PYPI_BUILD:
    # only adding CUDA/ROCM version if we are not building for PyPI to comply with PEP 440
    if CUDA_VERSION:
        # Strip the dots and keep at most three characters, e.g. "12.1" -> "121".
        CUDA_VERSION = CUDA_VERSION.replace(".", "")[:3]
        AUTOAWQ_KERNELS_VERSION = f"{AUTOAWQ_KERNELS_VERSION}+cu{CUDA_VERSION}"
    elif ROCM_VERSION:
        ROCM_VERSION = ROCM_VERSION.replace(".", "")[:3]
        AUTOAWQ_KERNELS_VERSION = f"{AUTOAWQ_KERNELS_VERSION}+rocm{ROCM_VERSION}"
    else:
        raise RuntimeError(
            "Your system must have either Nvidia or AMD GPU to build this package."
        )

print(f"Building AutoAWQ Kernels version {AUTOAWQ_KERNELS_VERSION}")
# Package metadata passed to setup(); "ext_modules" and "cmdclass" are merged
# into this mapping further down, just before setup() is called.
common_setup_kwargs = dict(
    version=AUTOAWQ_KERNELS_VERSION,
    name="autoawq_kernels",
    author="Casper Hansen",
    license="MIT",
    python_requires=">=3.8.0",
    description="AutoAWQ Kernels implements the AWQ kernels.",
    # The README next to this file is used as the long description.
    long_description=(Path(__file__).parent / "README.md").read_text(
        encoding="UTF-8"
    ),
    long_description_content_type="text/markdown",
    url="https://github.com/casper-hansen/AutoAWQ_kernels",
    keywords=["awq", "autoawq", "quantization", "transformers"],
    platforms=["linux", "windows"],
    classifiers=[
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ],
)

# Runtime dependencies of the installed package.
requirements = ["torch>=2.0.1"]
def get_include_dirs():
    """Return the include directories for the extension build.

    The directory containing this file is always the last entry. When
    ``CUDA_VERSION`` is set and ``nvidia/cuda_runtime/include`` exists under
    ``get_python_lib()``, that directory is listed before it.
    """
    search_paths = []

    if CUDA_VERSION:
        runtime_headers = os.path.join(
            get_python_lib(), "nvidia/cuda_runtime/include"
        )
        if os.path.isdir(runtime_headers):
            search_paths.append(runtime_headers)

    search_paths.append(os.path.dirname(os.path.abspath(__file__)))
    return search_paths
def get_generator_flag():
    """Return the define selecting the generator header location.

    Returns ``["-DOLD_GENERATOR_PATH"]`` when the installed torch package
    contains ``include/ATen/CUDAGeneratorImpl.h`` and ``[]`` otherwise. The
    check does not depend on ``CUDA_VERSION``.
    """
    old_header = os.path.join(
        torch.__path__[0], "include", "ATen", "CUDAGeneratorImpl.h"
    )
    if os.path.exists(old_header):
        return ["-DOLD_GENERATOR_PATH"]
    return []
def get_compute_capabilities():
    """Return the nvcc ``-gencode`` flags for the architectures to build.

    Returns:
        A flat list of ``"-gencode", "arch=compute_XX,code=sm_XX"`` pairs in
        ascending architecture order, or ``[]`` when ``CUDA_VERSION`` is not
        set.

    Raises:
        RuntimeError: if any CUDA device reported by torch has a compute
            capability below 7.5.
    """
    if not CUDA_VERSION:
        return []

    # Collect the compute capabilities of all available CUDA GPUs
    for device_index in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(device_index)
        if major * 10 + minor < 75:
            raise RuntimeError(
                "GPUs with compute capability less than 7.5 are not supported."
            )

    # An ordered tuple rather than a set: iterating a set yields the
    # architectures in hash-table order, which scrambles the generated
    # command line. The set of flags produced is unchanged.
    compute_capabilities = (75, 80, 86, 89, 90)

    capability_flags = []
    for cap in compute_capabilities:
        capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]
    return capability_flags
def get_extra_compile_args(arch_flags, generator_flags):
    """Build the ``extra_compile_args`` mapping for the CUDA extensions.

    Args:
        arch_flags: list of nvcc architecture flags.
        generator_flags: list of extra nvcc defines; only used on the
            non-Windows CUDA path.

    Returns:
        ``{}`` when ``CUDA_VERSION`` is not set. On Windows, ``{"nvcc":
        arch_flags}`` unless the ``INCLUDE_ARCH`` environment variable is set
        to something other than ``"1"``, in which case ``{}``. Otherwise a
        mapping with the full ``"cxx"`` and ``"nvcc"`` flag lists.
    """
    if not CUDA_VERSION:
        return {}

    if os.name == "nt":
        # Relaxed args on Windows
        if os.getenv("INCLUDE_ARCH", "1") == "1":
            return {"nvcc": arch_flags}
        return {}

    host_flags = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"]
    device_flags = [
        "-O3",
        "-std=c++17",
        "-DENABLE_BF16",
        "-U__CUDA_NO_HALF_OPERATORS__",
        "-U__CUDA_NO_HALF_CONVERSIONS__",
        "-U__CUDA_NO_BFLOAT16_OPERATORS__",
        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
        "-U__CUDA_NO_BFLOAT162_OPERATORS__",
        "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
        "--expt-relaxed-constexpr",
        "--expt-extended-lambda",
        "--use_fast_math",
    ]
    return {
        "cxx": host_flags,
        "nvcc": device_flags + arch_flags + generator_flags,
    }
def get_extra_link_args():
    """Return the extra linker arguments for the CUDA extensions.

    Returns:
        ``["-L", "<CUDA_PATH>/lib/x64/cublas.lib"]`` for a CUDA build on
        Windows, and ``[]`` in every other configuration.

    Raises:
        RuntimeError: for a CUDA build on Windows when the ``CUDA_PATH``
            environment variable is unset or empty.
    """
    if os.name != "nt" or not CUDA_VERSION:
        return []

    cuda_path = os.environ.get("CUDA_PATH")
    if not cuda_path:
        # Without this check the f-string below would produce the bogus path
        # "None/lib/x64/cublas.lib"; report the missing variable instead.
        raise RuntimeError(
            "The CUDA_PATH environment variable must be set to build on Windows."
        )
    return ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
# Resolve every build setting once; the extension definitions below share them.
include_dirs = get_include_dirs()
extra_link_args = get_extra_link_args()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()
extra_compile_args = get_extra_compile_args(arch_flags, generator_flags)

extensions = []

if CUDA_VERSION:
    # contain un-hipifiable inline PTX
    extensions.append(
        CUDAExtension(
            "awq_ext",
            [
                "awq_ext/pybind_awq.cpp",
                "awq_ext/quantization/gemm_cuda_gen.cu",
                "awq_ext/layernorm/layernorm.cu",
                "awq_ext/position_embedding/pos_encoding_kernels.cu",
                "awq_ext/quantization/gemv_cuda.cu",
                "awq_ext/vllm/moe_alig_block.cu",
                "awq_ext/vllm/activation.cu",
                "awq_ext/vllm/topk_softmax_kernels.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    )

# The two exllama extensions are registered regardless of CUDA_VERSION and are
# the only ones that receive the extra link arguments.
extensions += [
    CUDAExtension(
        "exl_ext",
        [
            "awq_ext/exllama/exllama_ext.cpp",
            "awq_ext/exllama/cuda_buffers.cu",
            "awq_ext/exllama/cuda_func/column_remap.cu",
            "awq_ext/exllama/cuda_func/q4_matmul.cu",
            "awq_ext/exllama/cuda_func/q4_matrix.cu",
        ],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    ),
    CUDAExtension(
        "exlv2_ext",
        [
            "awq_ext/exllamav2/ext.cpp",
            "awq_ext/exllamav2/cuda/q_matrix.cu",
            "awq_ext/exllamav2/cuda/q_gemm.cu",
        ],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    ),
]

if os.name != "nt" and CUDA_VERSION:
    # FasterTransformer kernels
    extensions.append(
        CUDAExtension(
            "awq_ft_ext",
            [
                "awq_ext/pybind_awq_ft.cpp",
                "awq_ext/attention/ft_attention.cpp",
                "awq_ext/attention/decoder_masked_multihead_attention.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    )
# Build-specific arguments: the compiled extensions and torch's build_ext command.
additional_setup_kwargs = {
    "ext_modules": extensions,
    "cmdclass": {"build_ext": BuildExtension},
}
common_setup_kwargs.update(additional_setup_kwargs)

# Hand the merged metadata and build configuration to setuptools.
setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs,
)
import
os
import
torch
from
pathlib
import
Path
from
setuptools
import
setup
,
find_packages
from
distutils.sysconfig
import
get_python_lib
from
torch.utils.cpp_extension
import
BuildExtension
,
CUDAExtension
# Force both the C and the C++ compiler used for the extension build to g++.
os.environ.update({"CC": "g++", "CXX": "g++"})

AUTOAWQ_KERNELS_VERSION = "0.0.4"

# PYPI_BUILD=1 requests a plain version string without a "+cu..."/"+rocm..." suffix.
PYPI_BUILD = os.environ.get("PYPI_BUILD", "0") == "1"

# A non-empty environment variable takes precedence; otherwise fall back to the
# version recorded in torch.version.
CUDA_VERSION = os.environ.get("CUDA_VERSION") or torch.version.cuda
ROCM_VERSION = os.environ.get("ROCM_VERSION") or torch.version.hip

if not PYPI_BUILD:
    # only adding CUDA/ROCM version if we are not building for PyPI to comply with PEP 440
    if CUDA_VERSION:
        # Strip the dots and keep at most three characters, e.g. "12.1" -> "121".
        CUDA_VERSION = CUDA_VERSION.replace(".", "")[:3]
        AUTOAWQ_KERNELS_VERSION = f"{AUTOAWQ_KERNELS_VERSION}+cu{CUDA_VERSION}"
    elif ROCM_VERSION:
        ROCM_VERSION = ROCM_VERSION.replace(".", "")[:3]
        AUTOAWQ_KERNELS_VERSION = f"{AUTOAWQ_KERNELS_VERSION}+rocm{ROCM_VERSION}"
    else:
        raise RuntimeError(
            "Your system must have either Nvidia or AMD GPU to build this package."
        )

print(f"Building AutoAWQ Kernels version {AUTOAWQ_KERNELS_VERSION}")
# Package metadata passed to setup(); "ext_modules" and "cmdclass" are merged
# into this mapping further down, just before setup() is called.
common_setup_kwargs = dict(
    version=AUTOAWQ_KERNELS_VERSION,
    name="autoawq_kernels",
    author="Casper Hansen",
    license="MIT",
    python_requires=">=3.8.0",
    description="AutoAWQ Kernels implements the AWQ kernels.",
    # The README next to this file is used as the long description.
    long_description=(Path(__file__).parent / "README.md").read_text(
        encoding="UTF-8"
    ),
    long_description_content_type="text/markdown",
    url="https://github.com/casper-hansen/AutoAWQ_kernels",
    keywords=["awq", "autoawq", "quantization", "transformers"],
    platforms=["linux", "windows"],
    classifiers=[
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ],
)

# Runtime dependencies of the installed package.
requirements = ["torch>=2.0.1"]
def get_include_dirs():
    """Return the include directories for the extension build.

    The directory containing this file is always the last entry. When
    ``CUDA_VERSION`` is set and ``nvidia/cuda_runtime/include`` exists under
    ``get_python_lib()``, that directory is listed before it.
    """
    search_paths = []

    if CUDA_VERSION:
        runtime_headers = os.path.join(
            get_python_lib(), "nvidia/cuda_runtime/include"
        )
        if os.path.isdir(runtime_headers):
            search_paths.append(runtime_headers)

    search_paths.append(os.path.dirname(os.path.abspath(__file__)))
    return search_paths
def get_generator_flag():
    """Return the define selecting the generator header location.

    Returns ``["-DOLD_GENERATOR_PATH"]`` when the installed torch package
    contains ``include/ATen/CUDAGeneratorImpl.h`` and ``[]`` otherwise. The
    check does not depend on ``CUDA_VERSION``.
    """
    old_header = os.path.join(
        torch.__path__[0], "include", "ATen", "CUDAGeneratorImpl.h"
    )
    if os.path.exists(old_header):
        return ["-DOLD_GENERATOR_PATH"]
    return []
def get_compute_capabilities():
    """Return the nvcc ``-gencode`` flags for the architectures to build.

    Returns:
        A flat list of ``"-gencode", "arch=compute_XX,code=sm_XX"`` pairs in
        ascending architecture order, or ``[]`` when ``CUDA_VERSION`` is not
        set.

    Raises:
        RuntimeError: if any CUDA device reported by torch has a compute
            capability below 7.5.
    """
    if not CUDA_VERSION:
        return []

    # Collect the compute capabilities of all available CUDA GPUs
    for device_index in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(device_index)
        if major * 10 + minor < 75:
            raise RuntimeError(
                "GPUs with compute capability less than 7.5 are not supported."
            )

    # An ordered tuple rather than a set: iterating a set yields the
    # architectures in hash-table order, which scrambles the generated
    # command line. The set of flags produced is unchanged.
    compute_capabilities = (75, 80, 86, 89, 90)

    capability_flags = []
    for cap in compute_capabilities:
        capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]
    return capability_flags
def get_extra_compile_args(arch_flags, generator_flags):
    """Build the ``extra_compile_args`` mapping for the CUDA extensions.

    Args:
        arch_flags: list of nvcc architecture flags.
        generator_flags: list of extra nvcc defines; only used on the
            non-Windows CUDA path.

    Returns:
        ``{}`` when ``CUDA_VERSION`` is not set. On Windows, ``{"nvcc":
        arch_flags}`` unless the ``INCLUDE_ARCH`` environment variable is set
        to something other than ``"1"``, in which case ``{}``. Otherwise a
        mapping with the full ``"cxx"`` and ``"nvcc"`` flag lists.
    """
    if not CUDA_VERSION:
        return {}

    if os.name == "nt":
        # Relaxed args on Windows
        if os.getenv("INCLUDE_ARCH", "1") == "1":
            return {"nvcc": arch_flags}
        return {}

    host_flags = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"]
    device_flags = [
        "-O3",
        "-std=c++17",
        "-DENABLE_BF16",
        "-U__CUDA_NO_HALF_OPERATORS__",
        "-U__CUDA_NO_HALF_CONVERSIONS__",
        "-U__CUDA_NO_BFLOAT16_OPERATORS__",
        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
        "-U__CUDA_NO_BFLOAT162_OPERATORS__",
        "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
        "--expt-relaxed-constexpr",
        "--expt-extended-lambda",
        "--use_fast_math",
    ]
    return {
        "cxx": host_flags,
        "nvcc": device_flags + arch_flags + generator_flags,
    }
def get_extra_link_args():
    """Return the extra linker arguments for the CUDA extensions.

    Returns:
        ``["-L", "<CUDA_PATH>/lib/x64/cublas.lib"]`` for a CUDA build on
        Windows, and ``[]`` in every other configuration.

    Raises:
        RuntimeError: for a CUDA build on Windows when the ``CUDA_PATH``
            environment variable is unset or empty.
    """
    if os.name != "nt" or not CUDA_VERSION:
        return []

    cuda_path = os.environ.get("CUDA_PATH")
    if not cuda_path:
        # Without this check the f-string below would produce the bogus path
        # "None/lib/x64/cublas.lib"; report the missing variable instead.
        raise RuntimeError(
            "The CUDA_PATH environment variable must be set to build on Windows."
        )
    return ["-L", f"{cuda_path}/lib/x64/cublas.lib"]
# Resolve every build setting once; the extension definitions below share them.
include_dirs = get_include_dirs()
extra_link_args = get_extra_link_args()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()
extra_compile_args = get_extra_compile_args(arch_flags, generator_flags)

extensions = []

if CUDA_VERSION:
    # contain un-hipifiable inline PTX
    extensions.append(
        CUDAExtension(
            "awq_ext",
            [
                "awq_ext/pybind_awq.cpp",
                "awq_ext/quantization/gemm_cuda_gen.cu",
                "awq_ext/layernorm/layernorm.cu",
                "awq_ext/position_embedding/pos_encoding_kernels.cu",
                "awq_ext/quantization/gemv_cuda.cu",
                "awq_ext/vllm/moe_alig_block.cu",
                "awq_ext/vllm/activation.cu",
                "awq_ext/vllm/topk_softmax_kernels.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    )

# The two exllama extensions are registered regardless of CUDA_VERSION and are
# the only ones that receive the extra link arguments.
extensions += [
    CUDAExtension(
        "exl_ext",
        [
            "awq_ext/exllama/exllama_ext.cpp",
            "awq_ext/exllama/cuda_buffers.cu",
            "awq_ext/exllama/cuda_func/column_remap.cu",
            "awq_ext/exllama/cuda_func/q4_matmul.cu",
            "awq_ext/exllama/cuda_func/q4_matrix.cu",
        ],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    ),
    CUDAExtension(
        "exlv2_ext",
        [
            "awq_ext/exllamav2/ext.cpp",
            "awq_ext/exllamav2/cuda/q_matrix.cu",
            "awq_ext/exllamav2/cuda/q_gemm.cu",
        ],
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    ),
]

if os.name != "nt" and CUDA_VERSION:
    # FasterTransformer kernels
    extensions.append(
        CUDAExtension(
            "awq_ft_ext",
            [
                "awq_ext/pybind_awq_ft.cpp",
                "awq_ext/attention/ft_attention.cpp",
                "awq_ext/attention/decoder_masked_multihead_attention.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    )
# Build-specific arguments: the compiled extensions and torch's build_ext command.
additional_setup_kwargs = {
    "ext_modules": extensions,
    "cmdclass": {"build_ext": BuildExtension},
}
common_setup_kwargs.update(additional_setup_kwargs)

# Hand the merged metadata and build configuration to setuptools.
setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs,
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment