AutoAWQ commit d62aebfe (unverified)
Authored Sep 21, 2023 by Casper; committed by GitHub on Sep 21, 2023

Merge pull request #53 from qwopqwop200/main

support windows

Parents: 72f954ce, 14d4f8cb

Showing 4 changed files with 186 additions and 157 deletions (+186 -157)
awq/modules/fused/attn.py    +13 -3
awq_cuda/pybind_awq.cpp       +0 -5
awq_cuda/pybind_ft.cpp       +11 -0
setup.py                    +162 -149
awq/modules/fused/attn.py
@@ -5,6 +5,12 @@ import torch.nn as nn
import awq_inference_engine
from torch.nn import functional as F

try:
    import ft_inference_engine
    FT_INSTALLED = True
except:
    FT_INSTALLED = False

def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    t = torch.arange(end, device=freqs.device)  # type: ignore
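As a quick standalone sanity check of the frequency computation above, the following sketch uses illustrative dim/end values that are not taken from the diff:

import torch

dim, end, theta = 8, 3, 10000.0  # illustrative values
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
t = torch.arange(end, device=freqs.device)
print(freqs)  # tensor([1.0000, 0.1000, 0.0100, 0.0010])
print(t)      # tensor([0, 1, 2])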
@@ -156,7 +162,7 @@ class QuantAttentionFused(nn.Module):
        xk = self.attention_shapes["xk_slice"](xqkv)
        xv = self.attention_shapes["xv_slice"](xqkv)

-       if seqlen > 1:
+       if seqlen > 1 or not FT_INSTALLED:
            xq = xq.view((bsz, seqlen) + self.attention_shapes["xq_view"])
            xk = xk.view((bsz, seqlen) + self.attention_shapes["xk_view"])
            xv = xv.view((bsz, seqlen) + self.attention_shapes["xv_view"])
@@ -177,6 +183,11 @@ class QuantAttentionFused(nn.Module):
            self.cache_v[:bsz, :, self.start_pos : self.start_pos + seqlen, :] = values_store
            self.cache_k[:bsz, :, :, self.start_pos : self.start_pos + seqlen, :] = keys_store

            if seqlen == 1:
                xv = self.cache_v[:bsz, :, : self.start_pos + seqlen, :].transpose(1, 2).contiguous()
                xk = self.cache_k[:bsz, :, :, : self.start_pos + seqlen, :].transpose(2, 3).contiguous()
                xk = xk.reshape(xk.shape[:-2] + (self.head_dim,)).transpose(1, 2).contiguous()

            keys = xk
            values = xv
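To illustrate the single-token decode path added above, here is a toy cache write and read; the tensor sizes are assumptions, with cache_v laid out as [bsz, n_kv_heads, max_seq_len, head_dim]:

import torch

bsz, n_kv_heads, max_seq_len, head_dim = 1, 2, 16, 4
cache_v = torch.zeros(bsz, n_kv_heads, max_seq_len, head_dim)

start_pos, seqlen = 3, 1
values_store = torch.randn(bsz, n_kv_heads, seqlen, head_dim)
cache_v[:bsz, :, start_pos : start_pos + seqlen, :] = values_store

# Single-token decode: gather everything cached so far and move seq ahead of heads.
xv = cache_v[:bsz, :, : start_pos + seqlen, :].transpose(1, 2).contiguous()
print(xv.shape)  # torch.Size([1, 4, 2, 4])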
@@ -185,7 +196,6 @@ class QuantAttentionFused(nn.Module):
                values = torch.repeat_interleave(values, dim=2, repeats=self.n_kv_groups)

            past_key_value = (xk, xv) if use_cache else None
            xq = xq.transpose(1, 2)
            keys = keys.transpose(1, 2)
            values = values.transpose(1, 2)
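For reference, torch.repeat_interleave along dim=2 is what expands grouped KV heads to match the query heads; a self-contained sketch with assumed sizes, not taken from the diff:

import torch

bsz, seqlen, n_kv_heads, head_dim = 1, 4, 2, 8   # assumed sizes
n_kv_groups = 4                                   # e.g. n_heads // n_kv_heads
values = torch.randn(bsz, seqlen, n_kv_heads, head_dim)
expanded = torch.repeat_interleave(values, dim=2, repeats=n_kv_groups)
print(expanded.shape)  # torch.Size([1, 4, 8, 8])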
@@ -231,4 +241,4 @@ class QuantAttentionFused(nn.Module):
        else:
            self.start_pos = 0

-       return attn_output, attention_weight, past_key_value
\ No newline at end of file
+       return attn_output, attention_weight, past_key_value
awq_cuda/pybind.cpp → awq_cuda/pybind_awq.cpp
#include <pybind11/pybind11.h>
#include <torch/extension.h>
#include "attention/ft_attention.h"
#include "layernorm/layernorm.h"
#include "quantization/gemm_cuda.h"
#include "quantization/gemv_cuda.h"
@@ -13,8 +12,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
    m.def("gemmv2_forward_cuda", &gemmv2_forward_cuda, "Quantized v2 GEMM kernel.");
    m.def("gemv_forward_cuda", &gemv_forward_cuda, "Quantized GEMV kernel.");
    m.def("rotary_embedding_neox", &rotary_embedding_neox, "Apply GPT-NeoX style rotary embedding to query and key");
    m.def("single_query_attention", &single_query_attention, "Attention with a single query", py::arg("q"),
          py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"), py::arg("length_per_sample_"),
          py::arg("alibi_slopes_"), py::arg("timestep"), py::arg("rotary_embedding_dim") = 0,
          py::arg("rotary_base") = 10000.0f, py::arg("neox_rotary_style") = true);
}
\ No newline at end of file
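Once built, the bindings above are exposed through the awq_inference_engine extension module referenced in setup.py; a minimal probe, only meaningful in an environment where the package was actually compiled:

try:
    import awq_inference_engine
    # List the bound kernel entry points without underscore-prefixed attributes.
    print([name for name in dir(awq_inference_engine) if not name.startswith("_")])
except ImportError:
    print("awq_inference_engine is not built in this environment")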
awq_cuda/pybind_ft.cpp (new file, mode 100644)
#include <pybind11/pybind11.h>
#include <torch/extension.h>
#include "attention/ft_attention.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("single_query_attention", &single_query_attention, "Attention with a single query", py::arg("q"),
          py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"), py::arg("length_per_sample_"),
          py::arg("alibi_slopes_"), py::arg("timestep"), py::arg("rotary_embedding_dim") = 0,
          py::arg("rotary_base") = 10000.0f, py::arg("neox_rotary_style") = true);
}
\ No newline at end of file
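Because this ft_inference_engine module is only compiled on non-Windows platforms (see the setup.py change below), Python code detects it at import time, which is the same guard added to attn.py above; a minimal sketch (the diff uses a bare except, ImportError here is a slight narrowing):

try:
    import ft_inference_engine  # built only when os.name != "nt"
    FT_INSTALLED = True
except ImportError:
    FT_INSTALLED = False

print("single_query_attention kernel available:", FT_INSTALLED)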
setup.py
import os
import torch
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension

os.environ["CC"] = "g++"
os.environ["CXX"] = "g++"

common_setup_kwargs = {
    "version": "0.0.2",
    "name": "autoawq",
    "author": "Casper Hansen",
    "license": "MIT",
    "python_requires": ">=3.8.0",
    "description": "AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.",
    "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
    "long_description_content_type": "text/markdown",
    "url": "https://github.com/casper-hansen/AutoAWQ",
    "keywords": ["awq", "autoawq", "quantization", "transformers"],
    "platforms": ["linux", "windows"],
    "classifiers": [
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ]
}

requirements = [
    "torch>=2.0.0",
    "transformers>=4.32.0",
    "tokenizers>=0.12.1",
    "accelerate",
    "sentencepiece",
    "lm_eval",
    "texttable",
    "toml",
    "attributedict",
    "protobuf",
    "torchvision",
    "tabulate"
]

def get_include_dirs():
    include_dirs = []

    conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
    if os.path.isdir(conda_cuda_include_dir):
        include_dirs.append(conda_cuda_include_dir)

    this_dir = os.path.dirname(os.path.abspath(__file__))
    include_dirs.append(this_dir)

    return include_dirs

def get_generator_flag():
    generator_flag = []
    torch_dir = torch.__path__[0]
    if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
        generator_flag = ["-DOLD_GENERATOR_PATH"]

    return generator_flag

def check_dependencies():
    if CUDA_HOME is None:
        raise RuntimeError(f"Cannot find CUDA_HOME. CUDA must be available to build the package.")

def get_compute_capabilities():
    # Collect the compute capabilities of all available GPUs.
    compute_capabilities = set()
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 8:
            raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.")
        compute_capabilities.add(major * 10 + minor)

    # figure out compute capability
    compute_capabilities = {80, 86, 89, 90}

    capability_flags = []
    for cap in compute_capabilities:
        capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]

    return capability_flags

check_dependencies()
include_dirs = get_include_dirs()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()

-if os.name == "nt":
-    # Relaxed args on Windows
-    extra_compile_args = {"nvcc": arch_flags}
+if os.name == "nt":
+    include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"
+
+    # Relaxed args on Windows
+    if include_arch:
+        extra_compile_args = {"nvcc": arch_flags}
+    else:
+        extra_compile_args = {}
else:
    extra_compile_args = {
        "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
        "nvcc": [
            "-O3",
            "-std=c++17",
            "-DENABLE_BF16",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_BFLOAT16_OPERATORS__",
            "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
            "-U__CUDA_NO_BFLOAT162_OPERATORS__",
            "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
            "--expt-relaxed-constexpr",
            "--expt-extended-lambda",
            "--use_fast_math",
        ] + arch_flags + generator_flags
    }

-extensions = [
-    CUDAExtension(
-        "awq_inference_engine",
-        [
-            "awq_cuda/pybind.cpp",
-            "awq_cuda/quantization/gemm_cuda_gen.cu",
-            "awq_cuda/layernorm/layernorm.cu",
-            "awq_cuda/position_embedding/pos_encoding_kernels.cu",
-            "awq_cuda/quantization/gemv_cuda.cu",
-            "awq_cuda/attention/ft_attention.cpp",
-            "awq_cuda/attention/decoder_masked_multihead_attention.cu"
-        ], extra_compile_args=extra_compile_args
-    )
-]
+extensions = [
+    CUDAExtension(
+        "awq_inference_engine",
+        [
+            "awq_cuda/pybind_awq.cpp",
+            "awq_cuda/quantization/gemm_cuda_gen.cu",
+            "awq_cuda/layernorm/layernorm.cu",
+            "awq_cuda/position_embedding/pos_encoding_kernels.cu",
+            "awq_cuda/quantization/gemv_cuda.cu"
+        ], extra_compile_args=extra_compile_args
+    )
+]
+
+if os.name != "nt":
+    extensions.append(
+        CUDAExtension(
+            "ft_inference_engine",
+            [
+                "awq_cuda/pybind_ft.cpp",
+                "awq_cuda/attention/ft_attention.cpp",
+                "awq_cuda/attention/decoder_masked_multihead_attention.cu"
+            ], extra_compile_args=extra_compile_args
+        )
+    )

additional_setup_kwargs = {
    "ext_modules": extensions,
    "cmdclass": {'build_ext': BuildExtension}
}

common_setup_kwargs.update(additional_setup_kwargs)

setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs
)
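The INCLUDE_ARCH switch added above lets a Windows build drop the per-architecture nvcc flags; a small sketch of that logic in isolation (the flag values are an example subset, and setting the variable before running the build is an assumption about usage, not stated in the diff):

import os

arch_flags = ["-gencode", "arch=compute_80,code=sm_80"]  # example subset of the generated flags

os.environ["INCLUDE_ARCH"] = "0"  # e.g. set in the shell before building to skip arch-specific flags
include_arch = os.getenv("INCLUDE_ARCH", "1") == "1"
extra_compile_args = {"nvcc": arch_flags} if include_arch else {}
print(extra_compile_args)  # {}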