Exllama v2 (#1211)

# What does this PR do? See #1165   Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation ). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.  --------- Co-authored-by: Florian Zimmermeister <flozi00.fz@gmail.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-24-153.ec2.internal>

Exllama v2 (#1211)
# What does this PR do? See #1165   Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation ). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.  --------- Co-authored-by: Florian Zimmermeister <flozi00.fz@gmail.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-24-153.ec2.internal>
ed2a3f61 · Nicolas Patry · GitHub · 3c02262f · ed2a3f61 · ed2a3f61
Unverified Commit ed2a3f61 authored Nov 25, 2023 by Nicolas Patry Committed by GitHub Nov 25, 2023
7 changed files
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh
+// Ceiling division for integer operands: (x + size - 1) / size.
+#define DIVIDE(x, size) (((x) + (size) - 1) / (size))
+// printf-based debug helpers. Suffix letter selects the format: S = "%s" string, I = "%i", X = "%x" (hex), F = "%f", H = half converted with __half2float. A trailing 2/3 is the number of values; the argument expressions are stringified as labels.
+#define DBGS(__x) printf("%s\n", __x)
+#define DBGI(__x) printf("%s: %i\n", #__x, __x)
+#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y)
+#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z)
+#define DBGX(__x) printf("%s: %x\n", #__x, __x)
+#define DBGX2(__x, __y) printf("%s, %s: %x, %x\n", #__x, #__y, __x, __y)
+#define DBGX3(__x, __y, __z) printf("%s, %s, %s: %x, %x, %x\n", #__x, #__y, #__z, __x, __y, __z)
+#define DBGF(__x) printf("%s: %f\n", #__x, __x)
+#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y)
+#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z)
+#define DBGH(__x) printf("%s: %f\n", #__x, __half2float(__x))
+#define DBGH2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __half2float(__x), __half2float(__y))
+#define DBGH3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z))
+// Mixed helpers: one "%i" value followed by one (DBGIH) or two (DBGIH2) half values.
+#define DBGIH(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __half2float(__y))
+#define DBGIH2(__x, __y, __z) printf("%s, %s, %s: %i, %f, %f\n", #__x, #__y, #__z, __x, __half2float(__y), __half2float(__z))
+
+// Decode a quantized scale index into a half-precision scale:
+// ((qs + 1) / 16)^2 * max_scale, with every product rounded in half precision.
+__forceinline__ __device__ half dq_scale_(const int qs, const half max_scale)
+{
+    const half sixteenth = __float2half_rn(1.0f / 16.0f);
+    const half base = __hmul(__int2half_rn(qs + 1), sixteenth);
+    const half squared = __hmul(base, base);
+    return __hmul(squared, max_scale);
+}
+
+// Restrict x to the interval [a, b]: the upper bound b is applied first, then the lower bound a.
+__forceinline__ __device__ float clamp(float x, float a, float b)
+{
+    const float capped = fminf(b, x);
+    return fmaxf(a, capped);
+}
+
+// cuda_check(call): forwards the call's cudaError_t to gpu_assert together with the call site.
+#define cuda_check(ans) { gpu_assert((ans), __FILE__, __LINE__); }
+// Report a failed CUDA runtime call on stderr and, unless abort is false,
+// terminate the process using the error code as the exit status.
+inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code == cudaSuccess) return;
+
+   fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
+   if (abort)
+   {
+      exit(code);
+   }
+}
--- a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
+++ b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
+#include <torch/extension.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+#include <cstdio>
+
+#include "config.h"
+
+#include "cuda/q_matrix.cuh"
+#include "cuda/q_gemm.cuh"
+
+#include "cpp/util.h"
+
+// Argument-validation helpers built on TORCH_CHECK.
+//   TORCH_CHECK_DTYPE(x, dtype)            - x.dtype() must equal torch::dtype.
+//   TORCH_CHECK_SHAPES(x, dx, y, dy, s)    - x.size(dx) must equal y.size(dy) * s.
+// The *_OPT variants skip the check when the tensor is on the "meta" device.
+// The scale argument is parenthesized so that expression arguments expand correctly.
+
+#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
+#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
+#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * (__scale_y), #__x " and " #__y " have incompatible shapes")
+#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * (__scale_y), #__x " and " #__y " have incompatible shapes")
+
+
+// Quant matrix
+
+// Validate the quantized tensors and wrap them in a heap-allocated QMatrix.
+//
+// Tensors that are not applicable are passed on the "meta" device; they are
+// exempt from the dtype checks and are handed to QMatrix as NULL pointers.
+// Two layouts are distinguished by whether q_scale is a real tensor:
+//   - q_scale provided: groups = q_scale.size(0), height = q_invperm.size(0)
+//   - q_scale on meta:  groups = gptq_qzeros.size(0), height = q_weight.size(0) * 8
+//
+// temp_dq must hold at least width * height elements.
+//
+// Returns the QMatrix pointer as an integer handle. Nothing in this function
+// releases it, and no release function is bound in this file.
+uintptr_t make_q_matrix
+(
+    torch::Tensor q_weight,
+    torch::Tensor q_perm,
+    torch::Tensor q_invperm,
+    torch::Tensor q_scale,
+    torch::Tensor q_scale_max,
+    torch::Tensor q_groups,
+    torch::Tensor gptq_qzeros,
+    torch::Tensor gptq_scales,
+    torch::Tensor gptq_g_idx,
+    torch::Tensor temp_dq
+)
+{
+    TORCH_CHECK_DTYPE(q_weight, kInt);
+    TORCH_CHECK_DTYPE_OPT(q_perm, kShort);
+    TORCH_CHECK_DTYPE_OPT(q_invperm, kShort);
+    TORCH_CHECK_DTYPE_OPT(q_scale, kInt);
+    TORCH_CHECK_DTYPE_OPT(q_scale_max, kHalf);
+    TORCH_CHECK_DTYPE_OPT(q_groups, kShort);
+    TORCH_CHECK_DTYPE_OPT(gptq_qzeros, kInt);
+    TORCH_CHECK_DTYPE_OPT(gptq_scales, kHalf);
+    TORCH_CHECK_DTYPE_OPT(gptq_g_idx, kInt);
+
+    TORCH_CHECK_SHAPES(q_perm, 0, q_invperm, 0, 1);
+
+    int device = q_weight.device().index();
+    int width = q_weight.size(1);
+    int groups;
+    int height;
+
+    if (!q_scale.device().is_meta())
+    {
+        TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
+        TORCH_CHECK_SHAPES(q_scale_max, 0, q_scale, 0, 1);
+        groups = q_scale.size(0);
+        height = q_invperm.size(0);
+    }
+    else
+    {
+        TORCH_CHECK_SHAPES(q_weight, 1, gptq_qzeros, 1, 8);
+        TORCH_CHECK_SHAPES(q_weight, 1, gptq_scales, 1, 1);
+        groups = gptq_qzeros.size(0);
+        height = q_weight.size(0) * 8;
+    }
+
+    // Compute the required element count in 64 bits: int * int could overflow
+    // before being compared with the 64-bit tensor size.
+    const int64_t required_elements = (int64_t) width * (int64_t) height;
+    TORCH_CHECK(temp_dq.size(0) >= required_elements, "Insufficient size of temp_dq buffer");
+
+    QMatrix* m = new QMatrix
+    (
+        device,
+        height,
+        width,
+        groups,
+        (uint32_t*) q_weight.data_ptr(),
+        q_perm.device().is_meta() ? NULL : (uint16_t*) q_perm.data_ptr(),
+        q_invperm.device().is_meta() ? NULL : (uint16_t*) q_invperm.data_ptr(),
+        q_scale.device().is_meta() ? NULL : (uint32_t*) q_scale.data_ptr(),
+        q_scale_max.device().is_meta() ? NULL : (half*) q_scale_max.data_ptr(),
+        q_groups.device().is_meta() ? NULL : (uint16_t*) q_groups.data_ptr(),
+        gptq_qzeros.device().is_meta() ? NULL : (uint32_t*) gptq_qzeros.data_ptr(),
+        gptq_scales.device().is_meta() ? NULL : (half*) gptq_scales.data_ptr(),
+        gptq_g_idx.device().is_meta() ? NULL : (uint32_t*) gptq_g_idx.data_ptr(),
+        (half*) temp_dq.data_ptr()
+    );
+
+    return reinterpret_cast<uintptr_t> (m);
+}
+
+// Compute c = a @ b, where `b` is the integer handle returned by make_q_matrix.
+// `a` and `c` must be half tensors with matching row counts; a's column count
+// must equal the matrix height and c's column count the matrix width.
+void gemm_half_q_half
+(
+    torch::Tensor a,
+    uintptr_t b,
+    torch::Tensor c,
+    bool force_cuda
+)
+{
+    QMatrix* q_matrix = reinterpret_cast<QMatrix*> (b);
+
+    TORCH_CHECK_DTYPE(a, kHalf);
+    TORCH_CHECK_DTYPE(c, kHalf);
+    TORCH_CHECK_SHAPES(a, 0, c, 0, 1);
+    TORCH_CHECK(q_matrix->height == a.size(1), "a and b have incompatible shapes");
+    TORCH_CHECK(q_matrix->width == c.size(1), "b and c have incompatible shapes");
+
+    // Run on the device that holds `a` for the rest of this scope.
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+
+    const half* a_data = (const half*) a.data_ptr();
+    half* c_data = (half*) c.data_ptr();
+    const auto size_m = c.size(0);
+    const auto size_n = c.size(1);
+    const auto size_k = a.size(1);
+
+    // NOTE(review): the meaning of the `true` / `NULL` arguments is defined by
+    // gemm_half_q_half_cuda, which is not visible here - confirm against its declaration.
+    gemm_half_q_half_cuda
+    (
+        at::cuda::getCurrentCUDABlasHandle(),
+        a_data,
+        q_matrix,
+        c_data,
+        size_m,
+        size_n,
+        size_k,
+        true,
+        NULL,
+        force_cuda
+    );
+}
+
+// Bindings: expose make_q_matrix and gemm_half_q_half to Python under the same names.
+// The module name is taken from the TORCH_EXTENSION_NAME macro.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("make_q_matrix", &make_q_matrix, "make_q_matrix");
+    m.def("gemm_half_q_half", &gemm_half_q_half, "gemm_half_q_half");
+}
--- a/server/exllamav2_kernels/setup.py
+++ b/server/exllamav2_kernels/setup.py
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+setup(
+    name="exllamav2_kernels",
+    ext_modules=[
+        CUDAExtension(
+            name="exllamav2_kernels",
+            sources=[
+                "exllamav2_kernels/ext.cpp",
+                "exllamav2_kernels/cuda/q_matrix.cu",
+                "exllamav2_kernels/cuda/q_gemm.cu",
+            ],
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -168,7 +168,7 @@ def serve(
                # When using GPTQ, the Exllama kernels need some global buffers
                # whose final shapes are known only after the model has loaded.
                # This will allocate those buffers.
-                from text_generation_server.utils.gptq.exllama import (
+                from text_generation_server.utils.layers import (
                    create_exllama_buffers,
                    set_device,
                )

--- a/server/text_generation_server/utils/gptq/exllamav2.py
+++ b/server/text_generation_server/utils/gptq/exllamav2.py
+# Adapted from turboderp exllama: https://github.com/turboderp/exllamav2
+
+from logging import getLogger
+
+import torch
+import torch.nn as nn
+import math
+
+logger = getLogger(__name__)
+
+try:
+    from exllamav2_kernels import make_q_matrix, gemm_half_q_half
+except ImportError:
+    logger.error('exllamav2_kernels not installed.')
+    raise
+
+# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
+none_tensor = torch.empty((1, 1), device="meta")
+
+def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
+    """Matrix multiplication, returns x @ q4"""
+    output_shape = x.shape[:-1] + (q4_width,)
+    x = x.view(-1, x.shape[-1])
+    output = torch.empty((x.shape[0], q4_width), dtype = torch.half, device = x.device)
+    gemm_half_q_half(x, q_handle, output, force_cuda)
+    return output.view(output_shape)
+
+def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
+    """
+    Create Q matrix 
+    """
+    # EXL2
+    # won't work as the moment because the tensors are not the same. 
+    if "q_weight" in w:
+        w["q_scale_max"] /= 256
+        w["q_perm"] = w["q_perm"].short()
+        w["q_invperm"] = w["q_invperm"].short()
+        return make_q_matrix(w["q_weight"],
+                                w["q_perm"],
+                                w["q_invperm"],
+                                w["q_scale"],
+                                w["q_scale_max"],
+                                w["q_groups"],
+                                none_tensor,
+                                none_tensor,
+                                none_tensor,
+                                temp_dq)
+    # GPTQ
+    elif "qweight" in w:
+        if w["scales"].dtype == torch.float:
+            w["scales"] = w["scales"].half()
+
+        # GPTQ with g_idx (act_order)
+        if w.get("g_idx", None) is not None and not (w["g_idx"] == 0).all().item():
+            w["q_perm"] = torch.empty((w["qweight"].shape[0] * 8,), dtype = torch.short, device = w["qweight"].device)
+            w["q_invperm"] = torch.empty_like(w["q_perm"])
+            # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
+            return make_q_matrix(w["qweight"],
+                                 w["q_perm"],
+                                 w["q_invperm"],
+                                 none_tensor,
+                                 none_tensor,
+                                 none_tensor,
+                                 w["qzeros"],
+                                 w["scales"],
+                                 w["g_idx"].cpu(),
+                                 temp_dq)
+        # GPTQ without g_idx
+        else:
+            return make_q_matrix(w["qweight"],
+                                none_tensor,
+                                none_tensor,
+                                none_tensor,
+                                none_tensor,
+                                none_tensor,
+                                w["qzeros"],
+                                w["scales"],
+                                none_tensor,
+                                temp_dq)
+
+DEVICE = None  # device for the shared scratch buffer; assigned by set_device()
+FIXED_BYTES = 0  # largest scratch_space_fixed() seen among constructed QuantLinear layers
+LAYERS = []  # every QuantLinear instance appends itself here in __init__
+
+
+def set_device(device):
+    global DEVICE
+    DEVICE = device
+
+
+def create_exllama_buffers():
+    global FIXED_BYTES, LAYERS, DEVICE
+    temp_dq = ExLlamaV2DeviceTensors(DEVICE, FIXED_BYTES)
+
+    for layer in LAYERS:
+        layer.post_init(temp_dq)
+
+
+class QuantLinear(nn.Module):
+    QUANT_TYPE = "exllamav2"
+
+    """Linear layer implementation with per-group 4-bit quantization of the weights"""
+
+    # def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
+    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
+        super().__init__()
+        if bits != 4:
+            raise ValueError(
+                f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization.")
+        self.q_handle = None
+        self.q_tensors = None
+        self.bits = bits
+        self.maxq = 2 ** self.bits - 1
+        self.infeatures = qweight.shape[0] // self.bits * 32
+        self.outfeatures = qweight.shape[1]
+        self.padding = - self.outfeatures % 32
+        self.outfeatures = self.outfeatures + self.padding
+
+        self.device = qweight.device
+        self.qweight = qweight
+        self.qzeros = qzeros
+        self.scales = scales
+        self.g_idx = g_idx
+        self.bias = bias if bias is not None else None
+        self.group_size = groupsize
+
+        infeatures = self.infeatures
+        outfeatures = self.outfeatures
+        assert qweight.shape == (infeatures // 32 * self.bits, outfeatures)
+        assert infeatures % self.group_size == 0
+        assert qzeros.shape == (infeatures // self.group_size, outfeatures // 32 * self.bits)
+        assert scales.shape == (infeatures // self.group_size, outfeatures)
+        assert g_idx.shape == (infeatures, ), f"{g_idx.shape}, {infeatures}"
+
+        global FIXED_BYTES, LAYERS
+        FIXED_BYTES = max(FIXED_BYTES, self.scratch_space_fixed())
+        LAYERS.append(self)
+
+    def post_init(self, temp_dq):
+        assert self.qweight.device.type == "cuda"
+        assert self.qweight.device.index is not None
+        self.q_tensors = {
+            "qweight":self.qweight,
+            "qzeros":self.qzeros,
+            "scales":self.scales,
+            "g_idx":self.g_idx
+        }
+        temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
+        self.q_handle = ext_make_q_matrix(
+            self.q_tensors, temp_dq
+        )
+    
+    def forward(self, x, force_cuda = False):
+        output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)
+
+        if self.bias is not None:
+            output.add_(self.bias)
+        return output
+    
+    def temp_dq_size(self):
+        return self.infeatures * self.outfeatures * 2 + 128
+    
+    def temp_fwd_size(self, max_input_len, max_batch_size):
+        return self.outfeatures * max_input_len * max_batch_size * 4 + 128
+    
+    def scratch_space_fixed(self, max_input_len=4096, max_batch_size=16):
+        return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size)
+               
+    
+class ExLlamaV2DeviceTensors:
+
+    device_idx: int
+    scratch_bytes: int
+    scratch_idx: int
+    scratch: torch.tensor = None
+
+    def __init__(self, device, scratch_bytes):
+        self.device = device
+        self.scratch_bytes = scratch_bytes
+    
+    def prepare(self):
+        self.scratch = torch.empty((self.scratch_bytes // 2,), dtype = torch.half, device = self.device)
+
+    def get_scratch_slice(self, size_bytes):
+
+        if self.scratch is None: self.prepare()
+
+        size_bytes = ((size_bytes + 127) // 128) * 128
+        size_half = size_bytes // 2
+        scratch_slice = self.scratch.narrow(0, 0, size_half)
+        return scratch_slice
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -31,15 +31,31 @@ try:
    major, _minor = torch.cuda.get_device_capability()
 except Exception:
    major = 1
+
 HAS_EXLLAMA = False
 CAN_EXLLAMA = major >= 8
+V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
+if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1:
+    logger.warning("Disabling exllama v2 and using v1 instead because there are issues when sharding")
+    V2 = False
+
 if os.getenv("DISABLE_EXLLAMA") == "True":
    HAS_EXLLAMA = False
 elif CAN_EXLLAMA:
    try:
-        from text_generation_server.utils.gptq.exllama import Ex4bitLinear
+        if V2:
+            from text_generation_server.utils.gptq.exllamav2 import (QuantLinear as ExllamaQuantLinear, 
+                    create_exllama_buffers,
+                    set_device,
+                                                                     )
+            HAS_EXLLAMA = "2"
+        else:
+            from text_generation_server.utils.gptq.exllama import (Ex4bitLinear as ExllamaQuantLinear,
+                    create_exllama_buffers,
+                    set_device,
+                )
+            HAS_EXLLAMA = "1"

-        HAS_EXLLAMA = True
    except ImportError:
        pass

@@ -308,7 +324,7 @@ def get_linear(weight, bias, quantize):
            )

        if use_exllama:
-            linear = Ex4bitLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
+            linear = ExllamaQuantLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
        else:
            linear = QuantLinear(
                qweight,

--- a/server/text_generation_server/utils/weights.py
+++ b/server/text_generation_server/utils/weights.py
@@ -278,23 +278,13 @@ class Weights:
                        )
                    use_exllama = False
                else:
-                    logger.info("Using exllama kernels")
+                    logger.info(f"Using exllama kernels v{HAS_EXLLAMA}")

            if use_exllama:
-                if groupsize >= 0:
-                    # Exllama reorders the weights in advance and the activations on the fly, thus
-                    # the scales and zero-points do not need to be reordered.
-                    qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
-                    scales = self.get_sharded(f"{prefix}.scales", dim=0)
-                else:
-                    qzeros = self.get_tensor(f"{prefix}.qzeros")
-                    scales = self.get_tensor(f"{prefix}.scales")
-
-                # For tp > 1, at this point we know we do not use act-order
-                if self.process_group.size() == 1:
-                    g_idx = self.get_tensor(f"{prefix}.g_idx")
-                else:
-                    g_idx = None
+                qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
+                scales = self.get_sharded(f"{prefix}.scales", dim=0)
+                g_idx = self.get_sharded(f"{prefix}.g_idx", dim= 0)
+                g_idx = g_idx - g_idx[0]
            else:
                # The triton kernel reorders the scales/zero points instead of the weight/activation.
                # Thus, each rank needs the full qzeros/scales.