Unverified commit e1884728, authored by qwopqwop200, committed by GitHub

support windows

parent a5772f67
@@ -5,6 +5,8 @@ import torch.nn as nn
 import awq_inference_engine
 from torch.nn import functional as F
 
+have_single_query_attention = hasattr(awq_inference_engine, 'single_query_attention')
+
 def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
     freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
     t = torch.arange(end, device=freqs.device)  # type: ignore
@@ -184,7 +186,7 @@ class QuantAttentionFused(nn.Module):
         xk = self.attention_shapes["xk_slice"](xqkv)
         xv = self.attention_shapes["xv_slice"](xqkv)
 
-        if seqlen > 1:
+        if seqlen > 1 and have_single_query_attention:
             xq = xq.view((bsz, seqlen) + self.attention_shapes["xq_view"])
             xk = xk.view((bsz, seqlen) + self.attention_shapes["xk_view"])
             xv = xv.view((bsz, seqlen) + self.attention_shapes["xv_view"])
......
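// Presumably awq_cuda/pybind.cpp, the pre-existing combined bindings referenced by the old setup.py further down.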
#include <pybind11/pybind11.h>
#include <torch/extension.h>

#include "attention/ft_attention.h"
#include "layernorm/layernorm.h"
#include "quantization/gemm_cuda.h"
#include "quantization/gemv_cuda.h"
#include "position_embedding/pos_encoding.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("layernorm_forward_cuda", &layernorm_forward_cuda, "FasterTransformer layernorm kernel");
    m.def("gemm_forward_cuda", &gemm_forward_cuda, "Quantized GEMM kernel.");
    m.def("gemv_forward_cuda", &gemv_forward_cuda, "Quantized GEMV kernel.");
    m.def("rotary_embedding_neox", &rotary_embedding_neox, "Apply GPT-NeoX style rotary embedding to query and key");
    m.def("single_query_attention", &single_query_attention, "Attention with a single query",
          py::arg("q"), py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"),
          py::arg("length_per_sample_"), py::arg("alibi_slopes_"), py::arg("timestep"), py::arg("rotary_embedding_dim")=0,
          py::arg("rotary_base")=10000.0f, py::arg("neox_rotary_style")=true);
}
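// awq_cuda/pybind_linux.cpp (new in this commit, per the updated setup.py below): identical bindings to the file above, including single_query_attention.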
#include <pybind11/pybind11.h>
#include <torch/extension.h>

#include "attention/ft_attention.h"
#include "layernorm/layernorm.h"
#include "quantization/gemm_cuda.h"
#include "quantization/gemv_cuda.h"
#include "position_embedding/pos_encoding.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("layernorm_forward_cuda", &layernorm_forward_cuda, "FasterTransformer layernorm kernel");
    m.def("gemm_forward_cuda", &gemm_forward_cuda, "Quantized GEMM kernel.");
    m.def("gemv_forward_cuda", &gemv_forward_cuda, "Quantized GEMV kernel.");
    m.def("rotary_embedding_neox", &rotary_embedding_neox, "Apply GPT-NeoX style rotary embedding to query and key");
    m.def("single_query_attention", &single_query_attention, "Attention with a single query",
          py::arg("q"), py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"),
          py::arg("length_per_sample_"), py::arg("alibi_slopes_"), py::arg("timestep"), py::arg("rotary_embedding_dim")=0,
          py::arg("rotary_base")=10000.0f, py::arg("neox_rotary_style")=true);
}
\ No newline at end of file
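// awq_cuda/pybind_windows.cpp (new in this commit, per the updated setup.py below): omits ft_attention.h and the single_query_attention binding, which is why the Python code above now feature-detects that symbol.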
#include <pybind11/pybind11.h>
#include <torch/extension.h>

#include "layernorm/layernorm.h"
#include "quantization/gemm_cuda.h"
#include "quantization/gemv_cuda.h"
#include "position_embedding/pos_encoding.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("layernorm_forward_cuda", &layernorm_forward_cuda, "FasterTransformer layernorm kernel");
    m.def("gemm_forward_cuda", &gemm_forward_cuda, "Quantized GEMM kernel.");
    m.def("gemv_forward_cuda", &gemv_forward_cuda, "Quantized GEMV kernel.");
    m.def("rotary_embedding_neox", &rotary_embedding_neox, "Apply GPT-NeoX style rotary embedding to query and key");
}
\ No newline at end of file
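# setup.py before this commit (presumably): always compiles awq_cuda/pybind.cpp together with the FasterTransformer attention sources, on every platform.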
import os
import torch
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension
os.environ["CC"] = "g++"
os.environ["CXX"] = "g++"
common_setup_kwargs = {
    "version": "0.0.2",
    "name": "autoawq",
    "author": "Casper Hansen",
    "license": "MIT",
    "python_requires": ">=3.8.0",
    "description": "AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.",
    "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
    "long_description_content_type": "text/markdown",
    "url": "https://github.com/casper-hansen/AutoAWQ",
    "keywords": ["awq", "autoawq", "quantization", "transformers"],
    "platforms": ["linux", "windows"],
    "classifiers": [
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ]
}

requirements = [
    "torch>=2.0.0",
    "transformers>=4.32.0",
    "tokenizers>=0.12.1",
    "accelerate",
    "sentencepiece",
    "lm_eval",
    "texttable",
    "toml",
    "attributedict",
    "protobuf",
    "torchvision",
    "tabulate"
]
def get_include_dirs():
    include_dirs = []

    conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
    if os.path.isdir(conda_cuda_include_dir):
        include_dirs.append(conda_cuda_include_dir)

    this_dir = os.path.dirname(os.path.abspath(__file__))
    include_dirs.append(this_dir)

    return include_dirs

def get_generator_flag():
    generator_flag = []
    torch_dir = torch.__path__[0]
    if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
        generator_flag = ["-DOLD_GENERATOR_PATH"]

    return generator_flag

def check_dependencies():
    if CUDA_HOME is None:
        raise RuntimeError(
            f"Cannot find CUDA_HOME. CUDA must be available to build the package.")

def get_compute_capabilities():
    # Collect the compute capabilities of all available GPUs and enforce the minimum.
    compute_capabilities = set()
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 8:
            raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.")
        compute_capabilities.add(major * 10 + minor)

    # Build kernels for the full set of supported architectures, not only the GPUs detected above.
    compute_capabilities = {80, 86, 89, 90}

    capability_flags = []
    for cap in compute_capabilities:
        capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]

    return capability_flags
check_dependencies()
include_dirs = get_include_dirs()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()
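# For reference (not part of the original file): with the hard-coded set above,
# get_compute_capabilities() expands to flags equivalent to
#   ["-gencode", "arch=compute_80,code=sm_80",
#    "-gencode", "arch=compute_86,code=sm_86",
#    "-gencode", "arch=compute_89,code=sm_89",
#    "-gencode", "arch=compute_90,code=sm_90"]
# (the ordering may vary because Python sets are unordered).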
if os.name == "nt":
    # Relaxed args on Windows
    extra_compile_args = {
        "nvcc": arch_flags
    }
else:
    extra_compile_args = {
        "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
        "nvcc": [
            "-O3",
            "-std=c++17",
            "-DENABLE_BF16",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_BFLOAT16_OPERATORS__",
            "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
            "-U__CUDA_NO_BFLOAT162_OPERATORS__",
            "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
            "--expt-relaxed-constexpr",
            "--expt-extended-lambda",
            "--use_fast_math",
        ] + arch_flags + generator_flags
    }

extensions = [
    CUDAExtension(
        "awq_inference_engine",
        [
            "awq_cuda/pybind.cpp",
            "awq_cuda/quantization/gemm_cuda_gen.cu",
            "awq_cuda/layernorm/layernorm.cu",
            "awq_cuda/position_embedding/pos_encoding_kernels.cu",
            "awq_cuda/quantization/gemv_cuda.cu",
            "awq_cuda/attention/ft_attention.cpp",
            "awq_cuda/attention/decoder_masked_multihead_attention.cu"
        ], extra_compile_args=extra_compile_args
    )
]

additional_setup_kwargs = {
    "ext_modules": extensions,
    "cmdclass": {'build_ext': BuildExtension}
}

common_setup_kwargs.update(additional_setup_kwargs)

setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs
)
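# setup.py after this commit: on Windows (os.name == "nt") it builds awq_cuda/pybind_windows.cpp without the attention sources and without extra compile args; on other platforms it builds awq_cuda/pybind_linux.cpp as before.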
import os
import torch
from pathlib import Path
from setuptools import setup, find_packages
from distutils.sysconfig import get_python_lib
from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension
os.environ["CC"] = "g++"
os.environ["CXX"] = "g++"
common_setup_kwargs = {
    "version": "0.0.2",
    "name": "autoawq",
    "author": "Casper Hansen",
    "license": "MIT",
    "python_requires": ">=3.8.0",
    "description": "AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.",
    "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
    "long_description_content_type": "text/markdown",
    "url": "https://github.com/casper-hansen/AutoAWQ",
    "keywords": ["awq", "autoawq", "quantization", "transformers"],
    "platforms": ["linux", "windows"],
    "classifiers": [
        "Environment :: GPU :: NVIDIA CUDA :: 11.8",
        "Environment :: GPU :: NVIDIA CUDA :: 12",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: C++",
    ]
}

requirements = [
    "torch>=2.0.0",
    "transformers>=4.32.0",
    "tokenizers>=0.12.1",
    "accelerate",
    "sentencepiece",
    "lm_eval",
    "texttable",
    "toml",
    "attributedict",
    "protobuf",
    "torchvision",
    "tabulate"
]
def get_include_dirs():
    include_dirs = []

    conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
    if os.path.isdir(conda_cuda_include_dir):
        include_dirs.append(conda_cuda_include_dir)

    this_dir = os.path.dirname(os.path.abspath(__file__))
    include_dirs.append(this_dir)

    return include_dirs

def get_generator_flag():
    generator_flag = []
    torch_dir = torch.__path__[0]
    if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
        generator_flag = ["-DOLD_GENERATOR_PATH"]

    return generator_flag

def check_dependencies():
    if CUDA_HOME is None:
        raise RuntimeError(
            f"Cannot find CUDA_HOME. CUDA must be available to build the package.")

def get_compute_capabilities():
    # Collect the compute capabilities of all available GPUs and enforce the minimum.
    compute_capabilities = set()
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 8:
            raise RuntimeError("GPUs with compute capability less than 8.0 are not supported.")
        compute_capabilities.add(major * 10 + minor)

    # Build kernels for the full set of supported architectures, not only the GPUs detected above.
    compute_capabilities = {80, 86, 89, 90}

    capability_flags = []
    for cap in compute_capabilities:
        capability_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]

    return capability_flags
check_dependencies()
include_dirs = get_include_dirs()
generator_flags = get_generator_flag()
arch_flags = get_compute_capabilities()
if os.name == "nt":
    # Relaxed args on Windows
    extensions = [
        CUDAExtension(
            "awq_inference_engine",
            [
                "awq_cuda/pybind_windows.cpp",
                "awq_cuda/quantization/gemm_cuda_gen.cu",
                "awq_cuda/layernorm/layernorm.cu",
                "awq_cuda/position_embedding/pos_encoding_kernels.cu",
                "awq_cuda/quantization/gemv_cuda.cu",
            ]
        )
    ]
else:
    extra_compile_args = {
        "cxx": ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
        "nvcc": [
            "-O3",
            "-std=c++17",
            "-DENABLE_BF16",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_BFLOAT16_OPERATORS__",
            "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
            "-U__CUDA_NO_BFLOAT162_OPERATORS__",
            "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
            "--expt-relaxed-constexpr",
            "--expt-extended-lambda",
            "--use_fast_math",
        ] + arch_flags + generator_flags
    }

    extensions = [
        CUDAExtension(
            "awq_inference_engine",
            [
                "awq_cuda/pybind_linux.cpp",
                "awq_cuda/quantization/gemm_cuda_gen.cu",
                "awq_cuda/layernorm/layernorm.cu",
                "awq_cuda/position_embedding/pos_encoding_kernels.cu",
                "awq_cuda/quantization/gemv_cuda.cu",
                "awq_cuda/attention/ft_attention.cpp",
                "awq_cuda/attention/decoder_masked_multihead_attention.cu"
            ], extra_compile_args=extra_compile_args
        )
    ]

additional_setup_kwargs = {
    "ext_modules": extensions,
    "cmdclass": {'build_ext': BuildExtension}
}

common_setup_kwargs.update(additional_setup_kwargs)

setup(
    packages=find_packages(),
    install_requires=requirements,
    include_dirs=include_dirs,
    **common_setup_kwargs
)
\ No newline at end of file
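Taken together, the build split and the hasattr guard mean the Windows build simply exposes fewer symbols. A minimal sketch (not part of the commit, assuming a successfully built awq_inference_engine module) of the feature-detection pattern downstream code can rely on:

import awq_inference_engine

# The module built from pybind_windows.cpp does not export single_query_attention,
# so callers check for it instead of assuming it exists.
if hasattr(awq_inference_engine, "single_query_attention"):
    print("full build: fused single-query attention kernel available")
else:
    print("reduced build (e.g. Windows): fall back to a non-fused attention path")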