# exl2.py (2.4 KB) -- header text left over from a web file viewer ("Newer" / "Older" navigation).
1
from dataclasses import dataclass
2
from typing import List, Union
3

4
5
import torch
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
6

7
8

@dataclass
class Exl2Weight(Weight):
    """
    Exllama2 exl2 quantized weights.

    Groups the five tensors that make up one exl2-quantized layer. After
    construction, ``q_scale_max`` holds the supplied values divided by 256
    and ``q_invperm`` has been converted with ``Tensor.short()`` (int16).
    """

    q_weight: torch.Tensor
    q_scale: torch.Tensor
    q_invperm: torch.Tensor
    q_scale_max: torch.Tensor
    q_groups: torch.Tensor

    def __post_init__(self):
        """Normalize the fields right after the dataclass ``__init__`` runs."""
        # Divide out-of-place: an in-place `/=` would modify the tensor object
        # the caller handed to the constructor, so building a second
        # Exl2Weight from the same tensor would scale it twice.
        self.q_scale_max = self.q_scale_max / 256
        self.q_invperm = self.q_invperm.short()

    @property
    def device(self) -> torch.device:
        """Return the device on which ``q_weight`` is stored."""
        return self.q_weight.device

    def get_linear(self, bias: torch.Tensor):
        """
        Build the linear layer backed by these quantized weights.

        Args:
            bias: bias tensor forwarded unchanged to ``ExllamaQuantLinear``.

        Returns:
            An ``ExllamaQuantLinear`` constructed from ``self`` and ``bias``.
        """
        # Imported at call time rather than at module import time
        # (presumably to avoid an import cycle or an eager kernel load --
        # TODO confirm).
        from text_generation_server.layers.gptq import ExllamaQuantLinear

        return ExllamaQuantLinear(self, bias)

33
34
35
36

class Exl2WeightsLoader(WeightsLoader):
    """Loader for exl2-quantized weights."""

    def get_weights(self, weights: "Weights", prefix: str):
        """
        Get weights at the given prefix and apply without tensor parallelism.

        Reads the ``q_weight``, ``q_scale``, ``q_invperm``, ``q_scale_max``
        and ``q_groups`` tensors stored under ``prefix`` and wraps them in an
        ``Exl2Weight``.

        Args:
            weights: tensor store queried through ``get_tensor``.
            prefix: name prefix of the layer whose tensors are loaded.

        Returns:
            The ``Exl2Weight`` holding the five loaded tensors.

        Raises:
            RuntimeError: if ``{prefix}.q_weight`` cannot be loaded; the
                underlying error is attached as ``__cause__``.
        """
        try:
            q_weight = weights.get_tensor(f"{prefix}.q_weight")
        except RuntimeError as err:
            # Chain the original failure so the real reason (for example a
            # missing tensor name) stays visible in the traceback.
            raise RuntimeError(
                "Cannot load `exl2`-quantized weight, make sure the model is already quantized."
            ) from err

        q_scale = weights.get_tensor(f"{prefix}.q_scale")
        q_invperm = weights.get_tensor(f"{prefix}.q_invperm")
        q_scale_max = weights.get_tensor(f"{prefix}.q_scale_max")
        q_groups = weights.get_tensor(f"{prefix}.q_groups")

        return Exl2Weight(
            q_weight=q_weight,
            q_scale=q_scale,
            q_invperm=q_invperm,
            q_scale_max=q_scale_max,
            q_groups=q_groups,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        """
        Column-packed loading is not available for exl2.

        Raises:
            RuntimeError: always.
        """
        raise RuntimeError("Column-packed weights are not supported for exl2")

    def get_weights_col(self, weights: Weights, prefix: str):
        """Return the weights at ``prefix`` unsharded (delegates to ``get_weights``)."""
        # Sharding is not yet supported, so we return the weights as-is.
        return self.get_weights(weights, prefix)

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        """
        Loading several prefixes as one weight is not available for exl2.

        Raises:
            ValueError: always.
        """
        raise ValueError("get_multi_weights_col is not supported for exl2")

    def get_weights_row(self, weights: Weights, prefix: str):
        """Return the weights at ``prefix`` unsharded (delegates to ``get_weights``)."""
        # Sharding is not yet supported, so we return the weights as-is.
        return self.get_weights(weights, prefix)