"profiler/vscode:/vscode.git/clone" did not exist on "2778e99758e149a6cb5309ca307bf7c1e61a562f"
Commit da900c3b authored by yangql's avatar yangql
Browse files

Initial commit

parents
from abc import abstractmethod
from logging import getLogger
import torch.nn as nn
from .triton_utils.mixin import TritonModuleMixin
logger = getLogger(__name__)
class FusedBaseModule(nn.Module, TritonModuleMixin):
@classmethod
@abstractmethod
def inject_to_model(cls, *args, **kwargs):
raise NotImplementedError()
class FusedBaseAttentionModule(FusedBaseModule):
@classmethod
@abstractmethod
def inject_to_model(
cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, trainable=False, **kwargs
):
raise NotImplementedError()
@classmethod
def warmup(cls, model, transpose=False, seqlen=2048):
pass
class FusedBaseMLPModule(FusedBaseModule):
@classmethod
@abstractmethod
def inject_to_model(cls, model, use_triton=False, **kwargs):
raise NotImplementedError()
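# Minimal sketch (not part of the original file) of the contract defined by the base classes
# above: a concrete fused module implements inject_to_model() as a classmethod that rewrites
# the target model in place. The class below is a hypothetical placeholder, not a real fused
# implementation.
class _ExampleFusedMLPModule(FusedBaseMLPModule):
    @classmethod
    def inject_to_model(cls, model, use_triton=False, **kwargs):
        # a real implementation would locate matching MLP submodules in `model`
        # and replace them with fused equivalents here
        return model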
from typing import Optional, Tuple, Union
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers.models.gptj.modeling_gptj import GPTJAttention
from ..utils.import_utils import compare_pytorch_version, dynamically_import_QuantLinear
from ._fused_base import FusedBaseAttentionModule
def fixed_pos_embedding(x, seq_dim=1, seq_len=None):
dim = x.shape[-1]
if seq_len is None:
seq_len = x.shape[seq_dim]
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2) / dim))
sinusoid_inp = (
torch.einsum("i , j -> i j", torch.arange(seq_len, dtype=torch.float), inv_freq).to(x.device).float()
)
return torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)
def rotate_every_two(x):
x1 = x[:, :, :, ::2]
x2 = x[:, :, :, 1::2]
x = torch.stack((-x2, x1), dim=-1)
return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)')
def duplicate_interleave(m):
"""
A simple version of `torch.repeat_interleave` for duplicating a matrix while interleaving the copy.
"""
dim0 = m.shape[0]
m = m.view(-1, 1) # flatten the matrix
m = m.repeat(1, 2) # repeat all elements into the 2nd dimension
m = m.view(dim0, -1) # reshape into a matrix, interleaving the copy
return m
def apply_rotary_pos_emb(x, sincos, offset=0):
sin, cos = (duplicate_interleave(t)[None, offset : x.shape[1] + offset, None, :] for t in sincos)
# einsum notation for lambda t: repeat(t[offset:x.shape[1]+offset,:], "n d -> () n () (d j)", j=2)
return (x * cos) + (rotate_every_two(x) * sin)
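# Illustrative sketch (not part of the original file): how the rotary helpers above compose.
# `x` is assumed to be a (batch, seq_len, num_heads, rotary_dim) tensor with an even
# rotary_dim, matching the layout used by the attention forward pass below.
def _example_apply_rotary(x: torch.Tensor, offset: int = 0) -> torch.Tensor:
    # build the sin/cos tables long enough to cover the cached prefix (offset) plus x,
    # mirroring how the forward pass extends seq_len by the past key length
    sin_cos = fixed_pos_embedding(x, seq_dim=1, seq_len=x.shape[1] + offset)
    return apply_rotary_pos_emb(x, sin_cos, offset=offset)  # same shape as x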
class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
def __init__(self, config):
super().__init__()
max_positions = config.max_position_embeddings
self.register_buffer(
"bias",
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
)
self.register_buffer("masked_bias", torch.tensor(-1e9))
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.attn_dropout_p = config.attn_pdrop
self.resid_dropout = nn.Dropout(config.resid_pdrop)
self.embed_dim = config.hidden_size
self.num_attention_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_attention_heads
if self.head_dim * self.num_attention_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
f" `num_attention_heads`: {self.num_attention_heads})."
)
self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
self.rotary_dim = config.rotary_dim
def _split_heads(self, qkv):
"""
Splits hidden dim into attn_head_size and num_attention_heads
"""
new_shape = qkv.size()[:-1] + (3, self.num_attention_heads, self.head_dim)
qkv = qkv.view(new_shape) # (batch, seq_length, 3, head, head_features)
query = qkv[:, :, 0]
key = qkv[:, :, 1]
value = qkv[:, :, 2]
return query, key, value
def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden dim
"""
if len(tensor.shape) == 5:
tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
elif len(tensor.shape) == 4:
tensor = tensor.permute(0, 2, 1, 3).contiguous()
else:
raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
return tensor.view(new_shape)
def _attn(
self,
query,
key,
value,
attention_mask=None,
head_mask=None,
):
# compute causal mask from causal mask buffer
query_length, key_length = query.size(-2), key.size(-2)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
# Keep the attention weights computation in fp32 to avoid overflow issues
query = query.to(torch.float32)
key = key.to(torch.float32)
attn_weights = torch.matmul(query, key.transpose(-1, -2))
mask_value = torch.finfo(attn_weights.dtype).min
# Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
# Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
attn_weights = torch.where(causal_mask, attn_weights, mask_value)
attn_weights = attn_weights / self.scale_attn
if attention_mask is not None:
# Apply the attention mask
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
attn_weights = attn_weights.to(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
# Mask heads if we want to
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
def forward(
self,
hidden_states: torch.FloatTensor,
layer_past: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor, Tuple[torch.Tensor]],
Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
]:
query, key, value = self._split_heads(self.qkv_proj(hidden_states))
seq_len = key.shape[1]
offset = 0
if layer_past is not None:
offset = layer_past[0].shape[-2]
seq_len += offset
if self.rotary_dim is not None:
k_rot = key[:, :, :, : self.rotary_dim]
k_pass = key[:, :, :, self.rotary_dim :]
q_rot = query[:, :, :, : self.rotary_dim]
q_pass = query[:, :, :, self.rotary_dim :]
sincos = fixed_pos_embedding(k_rot, 1, seq_len=seq_len)
k_rot = apply_rotary_pos_emb(k_rot, sincos, offset=offset)
q_rot = apply_rotary_pos_emb(q_rot, sincos, offset=offset)
key = torch.cat([k_rot, k_pass], dim=-1)
query = torch.cat([q_rot, q_pass], dim=-1)
else:
sincos = fixed_pos_embedding(key, 1, seq_len=seq_len)
key = apply_rotary_pos_emb(key, sincos, offset=offset)
query = apply_rotary_pos_emb(query, sincos, offset=offset)
key = key.permute(0, 2, 1, 3)
query = query.permute(0, 2, 1, 3)
value = value.permute(0, 2, 1, 3)
is_causal = layer_past is None
if layer_past is not None:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
if use_cache is True:
query = query.contiguous()
key = key.contiguous()
value = value.contiguous()
present = (key, value)
else:
present = None
# compute self-attention: V x Softmax(QK^T)
if compare_pytorch_version("v2.0.0", op="ge"):
attn_output = F.scaled_dot_product_attention(
query,
key,
value,
attn_mask=None if is_causal else attention_mask,
dropout_p=self.attn_dropout_p,
is_causal=is_causal,
)
attn_weights = None
else:
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
attn_output = self.out_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs # a, present, (attentions)
@classmethod
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
bits: int = 4,
disable_exllama=True,
disable_exllamav2=False,
**kwargs,
):
config = model.config
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=desc_act,
group_size=group_size,
bits=bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
)
for name, m in model.named_modules():
if not isinstance(m, GPTJAttention):
continue
attn = cls(config).to(device=next(m.buffers()).device)
q_proj = m.q_proj
k_proj = m.k_proj
v_proj = m.v_proj
qweights = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=1)
qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=1)
scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=1)
if QuantLinear.QUANT_TYPE == "exllama":
if desc_act:
# See fused_llama_attn.py comment
raise ValueError(
"Exllama kernel does not support query/key/value fusion with act-order. Please either use inject_fused_attention=False or disable_exllama=True."
)
else:
g_idx = None
else:
g_idx = torch.cat([q_proj.g_idx, k_proj.g_idx, v_proj.g_idx], dim=0)
bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None
qlinear_args = (
q_proj.bits,
q_proj.group_size,
q_proj.infeatures,
q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
True if q_proj.bias is not None else False,
)
qlinear_kwargs = {"trainable": trainable}
if (not desc_act or group_size == -1) and not use_triton:
qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
qlinear_kwargs["weight_dtype"] = q_proj.scales.dtype
qkv_proj = QuantLinear(*qlinear_args, **qlinear_kwargs)
qkv_proj.qweight = qweights
qkv_proj.qzeros = qzeros
qkv_proj.scales = scales
qkv_proj.g_idx = g_idx
qkv_proj.bias = bias
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
attn.qkv_proj = qkv_proj
attn.out_proj = m.out_proj
setattr(parent, child_name, attn)
del m
__all__ = ["FusedGPTJAttentionForQuantizedModel"]
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers.models.llama.modeling_llama import (
LlamaAttention,
apply_rotary_pos_emb,
)
from ..utils.import_utils import compare_pytorch_version, dynamically_import_QuantLinear
from ._fused_base import FusedBaseAttentionModule
class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
hidden_size,
num_heads,
qkv_proj,
o_proj,
rotary_emb,
layer_idx,
):
super().__init__()
self.hidden_size = hidden_size
self.num_heads = num_heads
self.head_dim = hidden_size // num_heads
self.layer_idx = layer_idx
if self.head_dim * num_heads != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {num_heads})."
)
self.qkv_proj = qkv_proj
self.o_proj = o_proj
self.rotary_emb = rotary_emb
def _shape(self, tensor, seq_len, bsz):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states,
past_key_value=None,
attention_mask=None,
position_ids=None,
output_attentions=False,
use_cache=False,
**kwargs,
):
"""Input shape: Batch x Time x Channel"""
bsz, q_len, _ = hidden_states.size()
qkv_states = self.qkv_proj(hidden_states)
query_states, key_states, value_states = torch.split(qkv_states, self.hidden_size, dim=2)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index. Please open an issue in AutoGPTQ if you hit this."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
# [bsz, nh, t, hd]
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if use_cache:
# Since qkv_proj is fused, query_states etc will hold a reference to the original qkv_states tensor
# which can cause excessive memory usage by the cache. `contiguous` is a convenient way to workaround this.
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
if compare_pytorch_version("v2.0.0", op="ge"):
attn_output = F.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=attention_mask,
is_causal=attention_mask is None and q_len > 1,
)
attn_weights = None
else:
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
@classmethod
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
bits: int = 4,
disable_exllama=True,
disable_exllamav2=False,
**kwargs,
):
"""
Replace all LlamaAttention modules with QuantLlamaAttention modules, fusing the q, k, v projections.
"""
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=desc_act,
group_size=group_size,
bits=bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
)
for name, m in model.named_modules():
if not isinstance(m, LlamaAttention):
continue
q_proj = m.q_proj
k_proj = m.k_proj
v_proj = m.v_proj
qweights = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=1)
qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=1)
scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=1)
if QuantLinear.QUANT_TYPE == "exllama":
if desc_act:
# TODO: support it. The issue lies maybe in the line:
# int groups = qzeros.size(0);
# in exllama_ext.cpp
raise ValueError(
"Exllama kernel does not support query/key/value fusion with act-order. Please either use inject_fused_attention=False or disable_exllama=True."
)
else:
g_idx = None
else:
g_idx = torch.cat([q_proj.g_idx, k_proj.g_idx, v_proj.g_idx], dim=0)
bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None
qlinear_args = (
q_proj.bits,
q_proj.group_size,
q_proj.infeatures,
q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
True if q_proj.bias is not None else False,
)
qlinear_kwargs = {"trainable": trainable}
if (not desc_act or group_size == -1) and not use_triton:
qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
qlinear_kwargs["weight_dtype"] = q_proj.scales.dtype
qkv_layer = QuantLinear(*qlinear_args, **qlinear_kwargs)
qkv_layer.qweight = qweights
qkv_layer.qzeros = qzeros
qkv_layer.scales = scales
qkv_layer.g_idx = g_idx
qkv_layer.bias = bias
# Introduced in Transformers 4.36
layer_idx = None
if hasattr(m, "layer_idx"):
layer_idx = m.layer_idx
attn = cls(
m.hidden_size,
m.num_heads,
qkv_layer,
m.o_proj,
m.rotary_emb,
layer_idx=layer_idx,
)
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
setattr(parent, child_name, attn)
__all__ = ["FusedLlamaAttentionForQuantizedModel"]
import math
from logging import getLogger
import torch
from transformers.models.llama.modeling_llama import LlamaMLP
from ..utils.import_utils import TRITON_AVAILABLE
from ._fused_base import FusedBaseMLPModule
logger = getLogger(__name__)
if TRITON_AVAILABLE:
import triton
import triton.language as tl
from .triton_utils import custom_autotune
from .triton_utils.kernels import silu
@custom_autotune.autotune(
configs=[
triton.Config(
{
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
), # 3090
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
), # 3090
triton.Config(
{
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8,
},
num_stages=2,
num_warps=4,
), # 3090
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
), # 3090
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
), # 3090
],
key=["M", "N", "K"],
nearest_power_of_two=True,
prune_configs_by={
"early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
"perf_model": None,
"top_k": None,
},
)
@triton.jit
def quant_fused_matmul_248_kernel(
a_ptr,
c_ptr,
b1_ptr,
scales1_ptr,
zeros1_ptr,
g1_ptr,
b2_ptr,
scales2_ptr,
zeros2_ptr,
g2_ptr,
M,
N,
K,
bits,
maxq,
stride_am,
stride_ak,
stride_bk,
stride_bn,
stride_cm,
stride_cn,
stride_scales,
stride_zeros,
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr,
GROUP_SIZE_M: tl.constexpr,
):
"""
Computes: C = silu(A * B1) * (A * B2)
A is of shape (M, K) float16
B is of shape (K//8, N) int32
C is of shape (M, N) float16
scales is of shape (1, N) float16
zeros is of shape (1, N//8) int32
"""
infeature_per_bits = 32 // bits
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + (pid % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
a_mask = offs_am[:, None] < M
# b_ptrs is set up such that it repeats elements along the K axis 8 times
b1_ptrs = b1_ptr + ((offs_k[:, None] // infeature_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
b2_ptrs = b2_ptr + ((offs_k[:, None] // infeature_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
g1_ptrs = g1_ptr + offs_k
g2_ptrs = g2_ptr + offs_k
# shifter is used to extract the N bits of each element in the 32-bit word from B
scales1_ptrs = scales1_ptr + offs_bn[None, :]
scales2_ptrs = scales2_ptr + offs_bn[None, :]
zeros1_ptrs = zeros1_ptr + (offs_bn[None, :] // infeature_per_bits)
zeros2_ptrs = zeros2_ptr + (offs_bn[None, :] // infeature_per_bits)
shifter = (offs_k % infeature_per_bits) * bits
zeros_shifter = (offs_bn % infeature_per_bits) * bits
accumulator1 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
accumulator2 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, num_pid_k):
g1_idx = tl.load(g1_ptrs)
g2_idx = tl.load(g2_ptrs)
# Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
scales1 = tl.load(scales1_ptrs + g1_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
scales2 = tl.load(scales2_ptrs + g2_idx[:, None] * stride_scales)
zeros1 = tl.load(zeros1_ptrs + g1_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros1 = (zeros1 >> zeros_shifter[None, :]) & maxq
zeros1 = zeros1 + 1
zeros2 = tl.load(zeros2_ptrs + g2_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros2 = (zeros2 >> zeros_shifter[None, :]) & maxq
zeros2 = zeros2 + 1
a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
b1 = tl.load(b1_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
b2 = tl.load(b2_ptrs)
# Now we need to unpack b (which is N-bit values) into 32-bit values
b1 = (b1 >> shifter[:, None]) & maxq # Extract the N-bit values
b1 = (b1 - zeros1) * scales1 # Scale and shift
accumulator1 += tl.dot(a, b1)
b2 = (b2 >> shifter[:, None]) & maxq
b2 = (b2 - zeros2) * scales2
accumulator2 += tl.dot(a, b2)
a_ptrs += BLOCK_SIZE_K
b1_ptrs += (BLOCK_SIZE_K // infeature_per_bits) * stride_bk
b2_ptrs += (BLOCK_SIZE_K // infeature_per_bits) * stride_bk
g1_ptrs += BLOCK_SIZE_K
g2_ptrs += BLOCK_SIZE_K
accumulator1 = silu(accumulator1)
c = accumulator1 * accumulator2
c = c.to(tl.float16)
c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
tl.store(c_ptrs, c, mask=c_mask)
else:
quant_fused_matmul_248_kernel = None
class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
def __init__(
self,
gate_proj,
down_proj,
up_proj,
):
super().__init__()
self.infeatures = gate_proj.infeatures
self.intermediate_size = gate_proj.outfeatures
self.outfeatures = down_proj.outfeatures
self.bits = gate_proj.bits
self.maxq = gate_proj.maxq
self.gate_proj = gate_proj
self.up_proj = up_proj
self.down_proj = down_proj
def forward(self, x):
return self.down_proj(self.triton_llama_mlp(x))
def triton_llama_mlp(self, x):
with torch.cuda.device(x.device):
out_shape = x.shape[:-1] + (self.intermediate_size,)
x = x.reshape(-1, x.shape[-1])
M, K = x.shape
N = self.intermediate_size
c = torch.empty((M, N), device=x.device, dtype=torch.float16)
grid = lambda META: ( # noqa: E731
triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
)
quant_fused_matmul_248_kernel[grid](
x,
c,
self.gate_proj.qweight,
self.gate_proj.scales,
self.gate_proj.qzeros,
self.gate_proj.g_idx,
self.up_proj.qweight,
self.up_proj.scales,
self.up_proj.qzeros,
self.up_proj.g_idx,
M,
N,
K,
self.bits,
self.maxq,
x.stride(0),
x.stride(1),
self.gate_proj.qweight.stride(0),
self.gate_proj.qweight.stride(1),
c.stride(0),
c.stride(1),
self.gate_proj.scales.stride(0),
self.gate_proj.qzeros.stride(0),
)
c = c.reshape(out_shape)
return c
@classmethod
def inject_to_model(cls, model, use_triton=False, **kwargs):
if not use_triton:
logger.warning(
f"Skipping module injection for {cls.__name__} as currently not supported with use_triton=False."
)
return
elif not TRITON_AVAILABLE:
logger.warning(
f"Skipping module injection for {cls.__name__} as Triton is not available. Please check your installation."
)
return
for name, m in model.named_modules():
if not isinstance(m, LlamaMLP):
continue
mlp = cls(m.gate_proj, m.down_proj, m.up_proj)
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
setattr(parent, child_name, mlp)
@classmethod
def warmup(cls, model, transpose=False, seqlen=2048):
from tqdm import tqdm
kn_values = {}
for _, m in model.named_modules():
if not isinstance(m, cls):
continue
k = m.infeatures
n = m.intermediate_size
if (k, n) not in kn_values:
kn_values[(k, n)] = m
logger.info(f"Found {len(kn_values)} unique fused mlp KN values.")
logger.info("Warming up autotune cache ...")
with torch.no_grad():
for m in tqdm(range(0, math.ceil(math.log2(seqlen)) + 1)):
m = 2**m
for (k, n), (modules) in kn_values.items():
a = torch.randn(m, k, dtype=torch.float16, device=model.device)
modules.triton_llama_mlp(a)
del kn_values
__all__ = ["FusedLlamaMLPForQuantizedModel"]
import torch.nn as nn
class GeneralQuantLinear(nn.Linear):
def __init__(self, quant_linear_module):
super().__init__(
in_features=quant_linear_module.infeatures,
out_features=quant_linear_module.outfeatures,
bias=True,
)
self.infeatures = quant_linear_module.infeatures
self.outfeatures = quant_linear_module.outfeatures
self.bits = quant_linear_module.bits
self.group_size = quant_linear_module.group_size
self.maxq = quant_linear_module.maxq
self.weight.requires_grad = False
self.weight.data = quant_linear_module.qweight
self.register_buffer("qweight", quant_linear_module.qweight)
self.bias.data = quant_linear_module.bias
self.qweight.requires_grad = False
self.bias.requires_grad = False
self.register_buffer("qzeros", quant_linear_module.qzeros)
self.register_buffer("scales", quant_linear_module.scales)
self.register_buffer("g_idx", quant_linear_module.g_idx)
if hasattr(quant_linear_module, "wf"):
self.wf = quant_linear_module.wf
if hasattr(quant_linear_module, "kernel_switch_threshold"):
self.kernel_switch_threshold = quant_linear_module.kernel_switch_threshold
if hasattr(quant_linear_module, "autogptq_cuda_available"):
self.autogptq_cuda_available = quant_linear_module.autogptq_cuda_available
self.trainable = quant_linear_module.trainable
self.forward = quant_linear_module.forward
@classmethod
def inject_to_model(cls, model, target_module_type):
for name, m in model.named_modules():
if not isinstance(m, target_module_type):
continue
new_m = cls(m)
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
setattr(parent, child_name, new_m)
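# Illustrative sketch (not part of the original file): wrapping every quantized linear in a
# model with GeneralQuantLinear so code that expects nn.Linear attributes (in_features,
# out_features, weight, bias) keeps working. `model` and `quant_linear_cls` are assumptions;
# note that the wrapper above assumes the wrapped modules were built with a bias tensor.
def _example_wrap_quant_linears(model, quant_linear_cls):
    GeneralQuantLinear.inject_to_model(model, target_module_type=quant_linear_cls)
    return model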
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
import autogptq_cuda_64
import autogptq_cuda_256
_autogptq_cuda_available = True
except ImportError:
logger.warning("CUDA extension not installed.")
autogptq_cuda_256 = None
autogptq_cuda_64 = None
_autogptq_cuda_available = False
class QuantLinear(nn.Module):
QUANT_TYPE = "cuda"
def __init__(
self,
bits,
group_size,
infeatures,
outfeatures,
bias,
kernel_switch_threshold=128,
trainable=False,
weight_dtype=torch.float16,
):
super().__init__()
global _autogptq_cuda_available
if bits not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
if trainable:
_autogptq_cuda_available = False
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
self.group_size = group_size if group_size != -1 else infeatures
self.maxq = 2**self.bits - 1
self.register_buffer(
"qweight",
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
)
self.register_buffer(
"qzeros",
torch.zeros(
(
math.ceil(infeatures / self.group_size),
outfeatures // 32 * self.bits,
),
dtype=torch.int32,
),
)
self.register_buffer(
"scales",
torch.zeros(
(math.ceil(infeatures / self.group_size), outfeatures),
dtype=weight_dtype,
),
)
self.register_buffer(
"g_idx",
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
)
if bias:
self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
else:
self.bias = None
# When the CUDA kernel cannot be used, the matmul is performed by unpacking the weights
# (using the `wf` shift offsets below) and using torch.matmul
if self.bits in [2, 4, 8]:
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
elif self.bits == 3:
self.wf = torch.tensor(
[
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
],
dtype=torch.int32,
).reshape(1, 3, 12)
self.kernel_switch_threshold = kernel_switch_threshold
self.autogptq_cuda_available = _autogptq_cuda_available
self.autogptq_cuda = autogptq_cuda_256
if infeatures % 256 != 0 or outfeatures % 256 != 0:
self.autogptq_cuda = autogptq_cuda_64
if infeatures % 64 != 0 or outfeatures % 64 != 0:
self.autogptq_cuda_available = False
self.trainable = trainable
def post_init(self):
pass
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone().to(dtype=linear.weight.dtype)
if linear.bias is not None:
self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
intweight = []
for idx in range(self.infeatures):
intweight.append(
torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[
:, None
]
)
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
elif self.bits == 3:
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i))
i += 10
qweight[row] |= intweight[i] << 30
row += 1
qweight[row] |= (intweight[i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 1)
i += 10
qweight[row] |= intweight[i] << 31
row += 1
qweight[row] |= (intweight[i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 2)
i += 10
row += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
elif self.bits == 3:
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
i += 10
qzeros[:, col] |= zeros[:, i] << 30
col += 1
qzeros[:, col] |= (zeros[:, i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
i += 10
qzeros[:, col] |= zeros[:, i] << 31
col += 1
qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
i += 10
col += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
def forward(self, x: torch.Tensor):
out_shape = x.shape[:-1] + (self.outfeatures,)
x = x.reshape(-1, x.shape[-1])
x_dtype = x.dtype
if (
x.device.type == "cuda"
and self.autogptq_cuda_available
and (self.kernel_switch_threshold == 0 or x.shape[0] < self.kernel_switch_threshold)
):
out = torch.zeros((x.shape[0], self.outfeatures), device=x.device, dtype=torch.float32)
if self.bits == 2:
self.autogptq_cuda.vecquant2matmul(
x.float(),
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.g_idx,
)
elif self.bits == 3:
self.autogptq_cuda.vecquant3matmul(
x.float(),
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.g_idx,
)
elif self.bits == 4:
self.autogptq_cuda.vecquant4matmul(
x.float(),
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.g_idx,
)
elif self.bits == 8:
self.autogptq_cuda.vecquant8matmul(
x.float(),
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.g_idx,
)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
else:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
if self.bits in [2, 4, 8]:
zeros = torch.bitwise_right_shift(
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
self.wf.unsqueeze(0),
).to(torch.int16 if self.bits == 8 else torch.int8)
zeros = torch.bitwise_and(zeros, (2**self.bits) - 1)
zeros = zeros + 1
zeros = zeros.reshape(self.scales.shape)
weight = torch.bitwise_right_shift(
torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
self.wf.unsqueeze(-1),
).to(torch.int16 if self.bits == 8 else torch.int8)
weight = torch.bitwise_and(weight, (2**self.bits) - 1)
elif self.bits == 3:
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
-1, -1, -1, 12
)
zeros = zeros >> self.wf.unsqueeze(0)
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
zeros = zeros & 0x7
zeros = torch.cat(
[zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
dim=2,
)
zeros = zeros + 1
zeros = zeros.reshape(self.scales.shape)
weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
-1, -1, 12, -1
)
weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
weight = weight & 0x7
weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
num_itr = self.g_idx.shape[0] // x.shape[-1]
if num_itr == 1:
weights = self.scales[self.g_idx.long()] * (weight - zeros[self.g_idx.long()])
else:
num_dim = self.g_idx.shape[0] // num_itr
weights = []
for i in range(num_itr):
scale_i = self.scales[:, i * num_dim : (i + 1) * num_dim]
weight_i = weight[:, i * num_dim : (i + 1) * num_dim]
zeros_i = zeros[:, i * num_dim : (i + 1) * num_dim]
g_idx_i = self.g_idx[i * num_dim : (i + 1) * num_dim]
weights.append(scale_i[g_idx_i.long()] * (weight_i - zeros_i[g_idx_i.long()]))
weights = torch.cat(weights, dim=1)
out = torch.matmul(x, weights)
out = out.to(x_dtype)
out = out.reshape(out_shape)
out = out + self.bias if self.bias is not None else out
return out
__all__ = ["QuantLinear"]
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
import autogptq_cuda_64
import autogptq_cuda_256
_autogptq_cuda_available = True
except ImportError:
logger.warning("CUDA extension not installed.")
autogptq_cuda_256 = None
autogptq_cuda_64 = None
_autogptq_cuda_available = False
class QuantLinear(nn.Module):
QUANT_TYPE = "cuda-old"
def __init__(
self,
bits,
group_size,
infeatures,
outfeatures,
bias,
use_cuda_fp16=True,
kernel_switch_threshold=128,
trainable=False,
weight_dtype=torch.float16,
):
super().__init__()
global _autogptq_cuda_available
if bits not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
if trainable:
_autogptq_cuda_available = False
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
self.group_size = group_size if group_size != -1 else infeatures
self.maxq = 2**self.bits - 1
self.register_buffer(
"qweight",
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
)
self.register_buffer(
"qzeros",
torch.zeros(
(
math.ceil(infeatures / self.group_size),
outfeatures // 32 * self.bits,
),
dtype=torch.int32,
),
)
self.register_buffer(
"scales",
torch.zeros(
(math.ceil(infeatures / self.group_size), outfeatures),
dtype=weight_dtype,
),
)
self.register_buffer(
"g_idx",
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
)
if bias:
self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
else:
self.bias = None
self.half_indim = self.infeatures // 2
self.use_cuda_fp16 = use_cuda_fp16 if bits != 8 else False
# When the CUDA kernel cannot be used, the matmul is performed by unpacking the weights
# (using the `wf` shift offsets below) and using torch.matmul
if self.bits in [2, 4, 8]:
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
elif self.bits == 3:
self.wf = torch.tensor(
[
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
],
dtype=torch.int32,
).reshape(1, 3, 12)
self.kernel_switch_threshold = kernel_switch_threshold
self.autogptq_cuda_available = _autogptq_cuda_available
self.autogptq_cuda = autogptq_cuda_256
if infeatures % 256 != 0 or outfeatures % 256 != 0:
self.autogptq_cuda = autogptq_cuda_64
if infeatures % 64 != 0 or outfeatures % 64 != 0:
self.autogptq_cuda_available = False
self.trainable = trainable
def post_init(self):
pass
def pack(self, linear, scales, zeros, g_idx):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone().to(dtype=linear.weight.dtype)
if linear.bias is not None:
self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
intweight = []
for idx in range(self.infeatures):
g_idx = idx // self.group_size
intweight.append(torch.round((W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, None])
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
elif self.bits == 3:
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i))
i += 10
qweight[row] |= intweight[i] << 30
row += 1
qweight[row] |= (intweight[i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 1)
i += 10
qweight[row] |= intweight[i] << 31
row += 1
qweight[row] |= (intweight[i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 2)
i += 10
row += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
elif self.bits == 3:
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
i += 10
qzeros[:, col] |= zeros[:, i] << 30
col += 1
qzeros[:, col] |= (zeros[:, i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
i += 10
qzeros[:, col] |= zeros[:, i] << 31
col += 1
qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
i += 10
col += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
def forward(self, x):
x_dtype = x.dtype
out_shape = x.shape[:-1] + (self.outfeatures,)
x = x.reshape(-1, x.shape[-1])
if (
x.device.type == "cuda"
and self.autogptq_cuda_available is True
and (self.kernel_switch_threshold is False or x.shape[0] < self.kernel_switch_threshold)
):
out = torch.zeros(x.shape[0], out_shape[-1], dtype=torch.float, device=x.device)
if self.use_cuda_fp16:
if x_dtype != torch.float16:
logger.warning(
f"The cuda-old kernel for GPTQ with use_cuda_fp16=True requires a float16 input activation, while {x_dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
)
if self.bits == 2:
self.autogptq_cuda.vecquant2matmul_faster_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
self.half_indim,
)
elif self.bits == 3:
self.autogptq_cuda.vecquant3matmul_faster_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
self.half_indim,
)
elif self.bits == 4:
self.autogptq_cuda.vecquant4matmul_faster_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
self.half_indim,
)
else:
raise NotImplementedError("Only 2,3,4 bits are supported.")
else:
x = x.to(torch.float32) # This is required for autocast compatibility.
if self.bits == 2:
self.autogptq_cuda.vecquant2matmul_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
)
elif self.bits == 3:
self.autogptq_cuda.vecquant3matmul_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
)
elif self.bits == 4:
self.autogptq_cuda.vecquant4matmul_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
)
elif self.bits == 8:
self.autogptq_cuda.vecquant8matmul_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
else:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
if self.bits in [2, 4, 8]:
zeros = torch.bitwise_right_shift(
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
self.wf.unsqueeze(0),
).to(torch.int16 if self.bits == 8 else torch.int8)
zeros = zeros + 1
zeros = torch.bitwise_and(
zeros, (2**self.bits) - 1
) # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
scales = self.scales
scales = scales.reshape(-1, 1, scales.shape[-1])
weight = torch.bitwise_right_shift(
torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
self.wf.unsqueeze(-1),
).to(torch.int16 if self.bits == 8 else torch.int8)
weight = torch.bitwise_and(weight, (2**self.bits) - 1)
weight = weight.reshape(-1, self.group_size, weight.shape[2])
elif self.bits == 3:
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
-1, -1, -1, 12
)
zeros = zeros >> self.wf.unsqueeze(0)
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
zeros = zeros & 0x7
zeros = torch.cat(
[zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
dim=2,
)
zeros = zeros + 1
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
scales = self.scales
scales = scales.reshape(-1, 1, scales.shape[-1])
weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
-1, -1, 12, -1
)
weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
weight = weight & 0x7
weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
weight = weight.reshape(-1, self.group_size, weight.shape[2])
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
weight = scales * (weight - zeros)
weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
out = torch.matmul(x, weight)
out = out.to(dtype=x_dtype).reshape(
out_shape
) # A cast is needed here as for some reason the vecquant2matmul_faster_old still allocate a float32 output.
out = out + self.bias if self.bias is not None else out
return out
__all__ = ["QuantLinear"]
# Adapted from turboderp exllama: https://github.com/turboderp/exllama
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
from exllama_kernels import make_q4, q4_matmul
except ImportError as e:
exllama_import_exception = e
def error_raiser_exllama(*args, **kwargs):
raise ValueError(
f"Trying to use the exllama backend, but could not import the C++/CUDA dependencies with the following error: {exllama_import_exception}"
)
make_q4 = error_raiser_exllama
q4_matmul = error_raiser_exllama
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta")
def ext_make_q4(qweight, qzeros, scales, g_idx, device):
"""Construct Q4Matrix, return handle"""
return make_q4(qweight, qzeros, scales, g_idx if g_idx is not None else none_tensor, device)
def ext_q4_matmul(x, q4, q4_width):
"""Matrix multiplication, returns x @ q4"""
outshape = x.shape[:-1] + (q4_width,)
x = x.view(-1, x.shape[-1])
output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device)
q4_matmul(x, q4, output)
return output.view(outshape)
class QuantLinear(nn.Module):
QUANT_TYPE = "exllama"
"""Linear layer implementation with per-group 4-bit quantization of the weights"""
def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
super().__init__()
if bits != 4:
raise ValueError(
f"Exllama kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization."
)
if trainable:
raise NotImplementedError("Exllama kernel does not support training.")
self.padding = -outfeatures % 32
self.outfeatures = outfeatures + self.padding
outfeatures = self.outfeatures
self.infeatures = infeatures
self.bits = bits
self.group_size = group_size if group_size != -1 else infeatures
self.trainable = trainable
self.maxq = 2**self.bits - 1
assert infeatures % 32 == 0
assert infeatures % self.group_size == 0
assert outfeatures % 32 == 0
self.register_buffer(
"qweight",
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
)
self.register_buffer(
"qzeros",
torch.zeros(
(
math.ceil(infeatures / self.group_size),
outfeatures // 32 * self.bits,
),
dtype=torch.int32,
),
)
self.register_buffer(
"scales",
torch.zeros(
(math.ceil(infeatures / self.group_size), outfeatures),
dtype=torch.float16,
),
)
self.register_buffer(
"g_idx",
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
)
if bias:
self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
else:
self.bias = None
def post_init(self):
assert self.qweight.device.type == "cuda"
assert self.qweight.device.index is not None
self.width = self.qweight.shape[1]
# make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
self.q4 = ext_make_q4(
self.qweight,
self.qzeros,
self.scales,
self.g_idx.to("cpu") if self._use_act_order else None,
self.qweight.device.index,
)
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone().half()
if linear.bias is not None:
self.bias = linear.bias.clone().half()
intweight = []
for idx in range(self.infeatures):
intweight.append(
torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[
:, None
]
)
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
if self.bits in [4]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
else:
raise NotImplementedError("Only 4 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [4]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
else:
raise NotImplementedError("Only 4 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
def forward(self, x):
if x.dtype != torch.float16:
logger.warning(
f"The exllama kernel for GPTQ requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
)
x = x.half()
out = ext_q4_matmul(x, self.q4, self.width)
if self.bias is not None:
out.add_(self.bias)
return out
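# Small sketch (not part of the original file) of the padding rule used in __init__ above:
# `-outfeatures % 32` is the number of extra output columns needed to round outfeatures up
# to the next multiple of 32, which the exllama kernel requires.
def _example_padded_outfeatures(outfeatures: int) -> int:
    padding = -outfeatures % 32
    padded = outfeatures + padding
    assert padded % 32 == 0 and 0 <= padding < 32
    return padded  # e.g. 4096 -> 4096, 4097 -> 4128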
# Adapted from turboderp exllama: https://github.com/turboderp/exllamav2
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
from exllamav2_kernels import gemm_half_q_half, make_q_matrix
except ImportError as e:
exllama_v2_import_exception = e
def error_raiser_exllama(*args, **kwargs):
raise ValueError(
f"Trying to use the exllama v2 backend, but could not import the C++/CUDA dependencies with the following error: {exllama_v2_import_exception}"
)
make_q_matrix = error_raiser_exllama
gemm_half_q_half = error_raiser_exllama
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta")
def _torch_device(idx):
if idx == -1:
return "cpu"
return f"cuda:{idx}"
def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
"""Matrix multiplication, returns x @ q4"""
output_shape = x.shape[:-1] + (q4_width,)
x = x.view(-1, x.shape[-1])
output = torch.empty((x.shape[0], q4_width), dtype=torch.half, device=x.device)
gemm_half_q_half(x, q_handle, output, force_cuda)
return output.view(output_shape)
def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
"""
Create Q matrix
"""
# EXL2
# won't work at the moment because the tensors are not the same.
if "q_weight" in w:
w["q_scale_max"] /= 256
w["q_perm"] = w["q_perm"].short()
w["q_invperm"] = w["q_invperm"].short()
return make_q_matrix(
w["q_weight"],
w["q_perm"],
w["q_invperm"],
w["q_scale"],
w["q_scale_max"],
w["q_groups"],
none_tensor,
none_tensor,
none_tensor,
temp_dq,
)
# GPTQ
elif "qweight" in w:
if w["scales"].dtype == torch.float:
w["scales"] = w["scales"].half()
# GPTQ with g_idx (act_order)
if "g_idx" in w and not (w["g_idx"] == 0).all().item():
w["q_perm"] = torch.empty(
(w["qweight"].shape[0] * 8,),
dtype=torch.short,
device=w["qweight"].device,
)
w["q_invperm"] = torch.empty_like(w["q_perm"])
# make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
return make_q_matrix(
w["qweight"],
w["q_perm"],
w["q_invperm"],
none_tensor,
none_tensor,
none_tensor,
w["qzeros"],
w["scales"],
w["g_idx"].cpu(),
temp_dq,
)
# GPTQ without g_idx
else:
return make_q_matrix(
w["qweight"],
none_tensor,
none_tensor,
none_tensor,
none_tensor,
none_tensor,
w["qzeros"],
w["scales"],
none_tensor,
temp_dq,
)
class QuantLinear(nn.Module):
QUANT_TYPE = "exllamav2"
"""Linear layer implementation with per-group 4-bit quantization of the weights"""
def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
super().__init__()
if bits != 4:
raise ValueError(
f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization."
)
if trainable:
raise NotImplementedError("Exllamav2 kernel does not support training.")
self.q_handle = None
self.q_tensors = None
self.padding = -outfeatures % 32
self.outfeatures = outfeatures + self.padding
outfeatures = self.outfeatures
self.infeatures = infeatures
self.bits = bits
self.group_size = group_size if group_size != -1 else infeatures
self.trainable = trainable
self.maxq = 2**self.bits - 1
assert infeatures % 32 == 0
assert infeatures % self.group_size == 0
assert outfeatures % 32 == 0
# Register the tensors as buffers, otherwise they cannot be loaded easily using transformers.
self.register_buffer(
"qweight",
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
)
self.register_buffer(
"qzeros",
torch.zeros(
(
math.ceil(infeatures / self.group_size),
outfeatures // 32 * self.bits,
),
dtype=torch.int32,
),
)
self.register_buffer(
"scales",
torch.zeros(
(math.ceil(infeatures / self.group_size), outfeatures),
dtype=torch.float16,
),
)
self.register_buffer(
"g_idx",
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
)
if bias:
self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
else:
self.bias = None
def post_init(self, temp_dq):
assert self.qweight.device.type == "cuda"
assert self.qweight.device.index is not None
self.q_tensors = {
"qweight": self.qweight,
"qzeros": self.qzeros,
"scales": self.scales,
"g_idx": self.g_idx,
}
temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
self.q_handle = ext_make_q_matrix(self.q_tensors, temp_dq)
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone().half()
if linear.bias is not None:
self.bias = linear.bias.clone().half()
intweight = []
for idx in range(self.infeatures):
intweight.append(
torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[
:, None
]
)
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
if self.bits in [4]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
else:
raise NotImplementedError("Only 4 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [4]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
else:
raise NotImplementedError("Only 4 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
def forward(self, x, force_cuda=False):
if x.dtype != torch.float16:
logger.warning(
f"The exllama v2 kernel for GPTQ requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
)
x = x.half()
output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)
if self.bias is not None:
output.add_(self.bias)
return output
def temp_dq_size(self):
return self.infeatures * self.outfeatures * 2 + 128
def temp_fwd_size(self, max_input_len, max_batch_size):
return self.outfeatures * max_input_len * max_batch_size * 4 + 128
def scratch_space_fixed(self, max_input_len=2048, max_batch_size=8):
return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size)
class ExLlamaV2DeviceTensors:
device_idx: int
scratch_bytes: int
scratch_idx: int
scratch: torch.Tensor = None
def __init__(self, device_idx, scratch_bytes):
self.device_idx = device_idx
self.scratch_bytes = scratch_bytes
def prepare(self):
self.scratch = torch.empty(
(self.scratch_bytes // 2,),
dtype=torch.half,
device=_torch_device(self.device_idx),
)
def get_scratch_slice(self, size_bytes):
if self.scratch is None:
self.prepare()
size_bytes = ((size_bytes + 127) // 128) * 128
size_half = size_bytes // 2
scratch_slice = self.scratch.narrow(0, 0, size_half)
return scratch_slice
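# Illustrative sketch (not part of the original file): sizing one shared scratch buffer per
# device for a set of exllamav2 QuantLinear layers, combining scratch_space_fixed() with
# ExLlamaV2DeviceTensors. `layers`, `device_idx` and the length/batch limits are assumptions.
def _example_prepare_exllamav2_scratch(layers, device_idx=0, max_input_len=2048, max_batch_size=8):
    scratch_bytes = max(
        (layer.scratch_space_fixed(max_input_len, max_batch_size) for layer in layers),
        default=128,
    )
    device_tensors = ExLlamaV2DeviceTensors(device_idx, scratch_bytes)
    for layer in layers:
        # each layer takes a slice of the shared scratch buffer for dequantization
        layer.post_init(temp_dq=device_tensors)
    return device_tensors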