OpenDAS / AutoAWQ · Commit e1884728 (unverified)

Authored Sep 15, 2023 by qwopqwop200; committed via GitHub on Sep 15, 2023.

    support windows

Parent: a5772f67
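In short, the commit adds Windows build support in three coordinated pieces: setup.py now selects a platform-specific pybind entry point, a new awq_cuda/pybind_windows.cpp omits the FasterTransformer single-query attention binding, and the Python attention module probes for that optional kernel with hasattr before relying on it.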
Showing 4 changed files with 193 additions and 168 deletions:

    awq/modules/fused/attn.py        +3    −1
    awq_cuda/pybind_linux.cpp       +19   −19
    awq_cuda/pybind_windows.cpp     +14    −0
    setup.py                       +157  −148
awq/modules/fused/attn.py

@@ -5,6 +5,8 @@ import torch.nn as nn
 import awq_inference_engine
 from torch.nn import functional as F
 
+have_single_query_attention = hasattr(awq_inference_engine, 'single_query_attention')
+
 def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
     freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
     t = torch.arange(end, device=freqs.device)  # type: ignore

@@ -184,7 +186,7 @@ class QuantAttentionFused(nn.Module):
         xk = self.attention_shapes["xk_slice"](xqkv)
         xv = self.attention_shapes["xv_slice"](xqkv)
 
-        if seqlen > 1:
+        if seqlen > 1 and have_single_query_attention:
             xq = xq.view((bsz, seqlen) + self.attention_shapes["xq_view"])
             xk = xk.view((bsz, seqlen) + self.attention_shapes["xk_view"])
             xv = xv.view((bsz, seqlen) + self.attention_shapes["xv_view"])
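The new hasattr probe is a standard feature-detection idiom for optional native kernels: the Python layer checks once, at import time, whether the compiled extension exports a symbol, then branches around it at call time. A minimal self-contained sketch of the same pattern follows; the _Engine, fast_sum, and reduce_sum names are hypothetical stand-ins for illustration, not AutoAWQ APIs.

import torch

# Stand-in for a compiled extension module; in AutoAWQ this role is played
# by awq_inference_engine. fast_sum is hypothetical: some builds would
# export it, others (e.g. a reduced Windows build) would not.
class _Engine:
    pass

engine = _Engine()

# Same probe as have_single_query_attention in the diff above.
have_fast_sum = hasattr(engine, "fast_sum")

def reduce_sum(x: torch.Tensor) -> torch.Tensor:
    # Route to the native kernel only when this build exports it;
    # otherwise take the portable pure-PyTorch path.
    if have_fast_sum:
        return engine.fast_sum(x)
    return x.sum()

print(reduce_sum(torch.ones(4)))  # tensor(4.) via the fallback path

Probing once at import time keeps the per-call cost to a cheap boolean test rather than a repeated attribute lookup on the extension module.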
awq_cuda/pybind_linux.cpp

#include <pybind11/pybind11.h>
#include <torch/extension.h>
#include "attention/ft_attention.h"
#include "layernorm/layernorm.h"
#include "quantization/gemm_cuda.h"
#include "quantization/gemv_cuda.h"
#include "position_embedding/pos_encoding.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("layernorm_forward_cuda", &layernorm_forward_cuda, "FasterTransformer layernorm kernel");
    m.def("gemm_forward_cuda", &gemm_forward_cuda, "Quantized GEMM kernel.");
    m.def("gemv_forward_cuda", &gemv_forward_cuda, "Quantized GEMV kernel.");
    m.def("rotary_embedding_neox", &rotary_embedding_neox, "Apply GPT-NeoX style rotary embedding to query and key");
    m.def("single_query_attention", &single_query_attention, "Attention with a single query",
          py::arg("q"), py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"),
          py::arg("length_per_sample_"), py::arg("alibi_slopes_"), py::arg("timestep"),
          py::arg("rotary_embedding_dim") = 0,
          py::arg("rotary_base") = 10000.0f,
          py::arg("neox_rotary_style") = true);
}
\ No newline at end of file
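Each m.def above surfaces as an ordinary function on the awq_inference_engine module, and each py::arg(...) = default becomes a Python keyword argument with that default. A hedged sketch of inspecting a built Linux module, assuming the extension compiled and installed successfully:

# Assumes the extension has been built and installed from this repo.
import awq_inference_engine as eng

for name in ("layernorm_forward_cuda", "gemm_forward_cuda", "gemv_forward_cuda",
             "rotary_embedding_neox", "single_query_attention"):
    print(f"{name}: {hasattr(eng, name)}")  # all True on a Linux build

# Because of the py::arg defaults, the trailing arguments of
# single_query_attention may be omitted or passed by keyword, e.g.
# eng.single_query_attention(q, k, v, k_cache, v_cache, length_per_sample_,
#                            alibi_slopes_, timestep)
# implicitly uses rotary_embedding_dim=0, rotary_base=10000.0,
# neox_rotary_style=True.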
awq_cuda/pybind_windows.cpp (new file, mode 100644)

#include <pybind11/pybind11.h>
#include <torch/extension.h>
#include "layernorm/layernorm.h"
#include "quantization/gemm_cuda.h"
#include "quantization/gemv_cuda.h"
#include "position_embedding/pos_encoding.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("layernorm_forward_cuda", &layernorm_forward_cuda, "FasterTransformer layernorm kernel");
    m.def("gemm_forward_cuda", &gemm_forward_cuda, "Quantized GEMM kernel.");
    m.def("gemv_forward_cuda", &gemv_forward_cuda, "Quantized GEMV kernel.");
    m.def("rotary_embedding_neox", &rotary_embedding_neox, "Apply GPT-NeoX style rotary embedding to query and key");
}
\ No newline at end of file
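Note what the Windows variant leaves out relative to pybind_linux.cpp: the attention/ft_attention.h include and the single_query_attention binding. That missing symbol is exactly what the new have_single_query_attention probe in awq/modules/fused/attn.py detects at import time.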
setup.py

Unchanged lines are shown plain; the platform-specific build section near the end carries − and + markers.

import os
import torch
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension

os.environ["CC"] = "g++"
os.environ["CXX"] = "g++"

common_setup_kwargs = {
    "version": "0.0.2",
    "name": "autoawq",
    "author": "Casper Hansen",
    "license": "MIT",
    "python_requires": ">=3.8.0",
    "description": "AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.",
    "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
    "long_description_content_type": "text/markdown",
    "url": "https://github.com/casper-hansen/AutoAWQ",
    "keywords": ["awq", "autoawq", "quantization", "transformers"],
    "platforms": ["linux", "windows"],
    "classifiers": [
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ]
}

requirements = [
    "torch>=2.0.0",
    "transformers>=4.32.0",
    "tokenizers>=0.12.1",
    "accelerate",
    "sentencepiece",
    "lm_eval",
    "texttable",
    "toml",
    "attributedict",
    "protobuf",
    "torchvision",
    "tabulate"
]

def get_include_dirs():
    include_dirs = []

    conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
    if os.path.isdir(conda_cuda_include_dir):
        include_dirs.append(conda_cuda_include_dir)

    this_dir = os.path.dirname(os.path.abspath(__file__))
    include_dirs.append(this_dir)

    return include_dirs

def get_generator_flag():
    generator_flag = []
    torch_dir = torch.__path__[0]
    if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
        generator_flag = ["-DOLD_GENERATOR_PATH"]

    return generator_flag

def check_dependencies():
    if CUDA_HOME is None:
        raise RuntimeError(
            f"Cannot find CUDA_HOME. CUDA must be available to build the package.")

def get_compute_capabilities():
    # Collect the compute capabilities of all available GPUs.
    compute_capabilities = set()
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 8:
            raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.")
        compute_capabilities.add(major * 10 + minor)

    # figure out compute capability
    compute_capabilities = {80, 86, 89, 90}

    capability_flags = []
    for cap in compute_capabilities:
        capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]

    return capability_flags

check_dependencies()
include_dirs = get_include_dirs()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()

if os.name == "nt":
    # Relaxed args on Windows
-    extra_compile_args = {
-        "nvcc": arch_flags
-    }
-else:
-    extra_compile_args = {
-        "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
-        "nvcc": [
-            "-O3",
-            "-std=c++17",
-            "-DENABLE_BF16",
-            "-U__CUDA_NO_HALF_OPERATORS__",
-            "-U__CUDA_NO_HALF_CONVERSIONS__",
-            "-U__CUDA_NO_BFLOAT16_OPERATORS__",
-            "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
-            "-U__CUDA_NO_BFLOAT162_OPERATORS__",
-            "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
-            "--expt-relaxed-constexpr",
-            "--expt-extended-lambda",
-            "--use_fast_math",
-        ] + arch_flags + generator_flags
-    }
-
-extensions = [
-    CUDAExtension(
-        "awq_inference_engine",
-        [
-            "awq_cuda/pybind.cpp",
-            "awq_cuda/quantization/gemm_cuda_gen.cu",
-            "awq_cuda/layernorm/layernorm.cu",
-            "awq_cuda/position_embedding/pos_encoding_kernels.cu",
-            "awq_cuda/quantization/gemv_cuda.cu",
-            "awq_cuda/attention/ft_attention.cpp",
-            "awq_cuda/attention/decoder_masked_multihead_attention.cu"
-        ], extra_compile_args=extra_compile_args
-    )
-]
+    extensions = [
+        CUDAExtension(
+            "awq_inference_engine",
+            [
+                "awq_cuda/pybind_windows.cpp",
+                "awq_cuda/quantization/gemm_cuda_gen.cu",
+                "awq_cuda/layernorm/layernorm.cu",
+                "awq_cuda/position_embedding/pos_encoding_kernels.cu",
+                "awq_cuda/quantization/gemv_cuda.cu",
+            ]
+        )
+    ]
+else:
+    extra_compile_args = {
+        "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
+        "nvcc": [
+            "-O3",
+            "-std=c++17",
+            "-DENABLE_BF16",
+            "-U__CUDA_NO_HALF_OPERATORS__",
+            "-U__CUDA_NO_HALF_CONVERSIONS__",
+            "-U__CUDA_NO_BFLOAT16_OPERATORS__",
+            "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
+            "-U__CUDA_NO_BFLOAT162_OPERATORS__",
+            "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
+            "--expt-relaxed-constexpr",
+            "--expt-extended-lambda",
+            "--use_fast_math",
+        ] + arch_flags + generator_flags
+    }
+
+    extensions = [
+        CUDAExtension(
+            "awq_inference_engine",
+            [
+                "awq_cuda/pybind_linux.cpp",
+                "awq_cuda/quantization/gemm_cuda_gen.cu",
+                "awq_cuda/layernorm/layernorm.cu",
+                "awq_cuda/position_embedding/pos_encoding_kernels.cu",
+                "awq_cuda/quantization/gemv_cuda.cu",
+                "awq_cuda/attention/ft_attention.cpp",
+                "awq_cuda/attention/decoder_masked_multihead_attention.cu"
+            ], extra_compile_args=extra_compile_args
+        )
+    ]

additional_setup_kwargs = {
    "ext_modules": extensions,
    "cmdclass": {'build_ext': BuildExtension}
}

common_setup_kwargs.update(additional_setup_kwargs)

setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs
)
\ No newline at end of file
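Stripped of packaging detail, the new build logic is a platform switch over source lists: Windows binds through pybind_windows.cpp and omits the two FasterTransformer attention sources (and, per the "Relaxed args on Windows" comment, passes no extra compile args at all), while other platforms keep the full kernel set behind pybind_linux.cpp. A minimal sketch of that selection; select_sources is a hypothetical helper for illustration, not part of setup.py:

import os

def select_sources(platform_name=os.name):
    # Restates the commit's `if os.name == "nt"` branching in one place.
    common = [
        "awq_cuda/quantization/gemm_cuda_gen.cu",
        "awq_cuda/layernorm/layernorm.cu",
        "awq_cuda/position_embedding/pos_encoding_kernels.cu",
        "awq_cuda/quantization/gemv_cuda.cu",
    ]
    if platform_name == "nt":
        # Windows: bind via pybind_windows.cpp and skip the
        # FasterTransformer attention sources.
        return ["awq_cuda/pybind_windows.cpp"] + common
    # Linux and other POSIX platforms keep the full kernel set.
    return ["awq_cuda/pybind_linux.cpp"] + common + [
        "awq_cuda/attention/ft_attention.cpp",
        "awq_cuda/attention/decoder_masked_multihead_attention.cu",
    ]

print(select_sources("nt")[0])     # awq_cuda/pybind_windows.cpp
print(select_sources("posix")[0])  # awq_cuda/pybind_linux.cpp

Keeping one CUDAExtension name ("awq_inference_engine") for both source lists is what lets the Python side stay platform-agnostic apart from the hasattr probe.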