Commit fcd9637c authored by gaoqiong

Merge branch 'v0.2.5_develop' into 'main'

v0.2.5

See merge request dcutoolkit/deeplearing/autoawq!2
parents 7724cca1 427f5481
import tqdm
import torch
from typing import List, Tuple
from .base import BaseAWQForCausalLM
from awq.utils.fused_utils import fuse_qkv
from awq.modules.fused.block import LlamaLikeBlock
from awq.modules.fused.model import LlamaLikeModel
from transformers.models.gemma.modeling_gemma import (
GemmaDecoderLayer as OldGemmaDecoderLayer,
GemmaForCausalLM as OldGemmaForCausalLM,
)
from awq.modules.fused.norm import FasterTransformerRMSNorm
class GemmaAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "GemmaDecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def fuse_layers(model: OldGemmaForCausalLM):
fuser = GemmaFuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model: OldGemmaForCausalLM):
return model.model.layers
@staticmethod
def get_act_for_scaling(module: OldGemmaDecoderLayer):
return dict(is_scalable=False)
@staticmethod
def move_embed(model: OldGemmaForCausalLM, device: str):
model.model.embed_tokens = model.model.embed_tokens.to(device)
@staticmethod
def get_layers_for_scaling(module: OldGemmaDecoderLayer, input_feat, module_kwargs):
layers = []
# attention input
layers.append(
dict(
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
# Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.post_attention_layernorm,
layers=[module.mlp.gate_proj, module.mlp.up_proj],
inp=input_feat["mlp.gate_proj"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.up_proj,
layers=[module.mlp.down_proj],
inp=input_feat["mlp.down_proj"],
)
)
return layers
class GemmaFuser:
def __init__(self, model: OldGemmaForCausalLM):
self.model = model
self.Gemma_blocks: List[Tuple[str, OldGemmaDecoderLayer]] = [
(name, module)
for name, module in self.model.named_modules()
if "GemmaDecoderLayer".lower() in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
module: OldGemmaDecoderLayer
for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."):
device = next(iter(module.state_dict().values())).device
qkv = fuse_qkv(
module,
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
)
with torch.no_grad():
# GemmaRMSNorm is different from Llama's in that it multiplies
# (1 + weight) to the output, instead of just weight.
module.input_layernorm.weight += 1
module.post_attention_layernorm.weight += 1
norm_1 = FasterTransformerRMSNorm(
module.input_layernorm.weight, module.input_layernorm.eps
)
norm_2 = FasterTransformerRMSNorm(
module.post_attention_layernorm.weight,
module.post_attention_layernorm.eps,
)
blocks.append(
LlamaLikeBlock(
hidden_size=self.model.config.hidden_size,
n_heads=self.model.config.num_attention_heads,
n_kv_heads=self.model.config.num_key_value_heads,
qkv_layer=qkv,
o_proj=module.self_attn.o_proj,
mlp=module.mlp,
norm_1=norm_1,
norm_2=norm_2,
dev=device,
max_seq_len=self.model.config.max_seq_len,
rope_theta=self.model.config.rope_theta,
head_dim=self.model.config.head_dim,
)
)
with torch.no_grad():
# Normalize Gemma's embedding layer
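# Gemma scales input embeddings by sqrt(hidden_size) in its forward pass;
# folding that factor into the weights here lets the fused model skip it at runtime.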
self.model.model.embed_tokens.weight *= self.model.config.hidden_size**0.5
self.model.model = LlamaLikeModel(
self.model.config.vocab_size,
blocks,
self.model.model.embed_tokens,
self.model.model.norm,
)
setattr(self.model.model, "blocks", self.model.model.blocks)
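# ---------------------------------------------------------------------------
# Illustrative usage (not part of the merge request): a minimal sketch of how a
# model class such as GemmaAWQForCausalLM is normally driven through AutoAWQ's
# high-level entry point. The checkpoint path and quant_config values below are
# examples only, assuming the standard AutoAWQ and transformers APIs.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "google/gemma-2b"  # example checkpoint; any supported model path works
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Run AWQ calibration and weight quantization, then persist the result.
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized("gemma-2b-awq")
# ---------------------------------------------------------------------------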
from .base import BaseAWQForCausalLM
from transformers.models.gpt_bigcode.modeling_gpt_bigcode import (
GPTBigCodeForCausalLM,
GPTBigCodeBlock as OldGptBigCodeBlock,
)
class GptBigCodeAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "GPTBigCodeBlock"
max_seq_len_key = "n_positions"
@staticmethod
def get_model_layers(model: GPTBigCodeForCausalLM):
return model.transformer.h
@staticmethod
def get_act_for_scaling(module: OldGptBigCodeBlock):
return dict(
is_scalable=True,
scale_name="mlp.act",
scale_layer=module.mlp.act,
scale_shape=module.mlp.c_fc.out_features,
)
@staticmethod
def move_embed(model: GPTBigCodeForCausalLM, device):
model.transformer.wte = model.transformer.wte.to(device)
model.transformer.wpe = model.transformer.wpe.to(device)
model.transformer.drop = model.transformer.drop.to(device)
@staticmethod
def get_layers_for_scaling(module: OldGptBigCodeBlock, input_feat, module_kwargs):
layers = []
# attention input
layers.append(
dict(
prev_op=module.ln_1,
layers=[module.attn.c_attn],
inp=input_feat["attn.c_attn"],
module2inspect=module.attn,
kwargs=module_kwargs,
)
)
# linear 1
layers.append(
dict(
prev_op=module.ln_2,
layers=[module.mlp.c_fc],
inp=input_feat["mlp.c_fc"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.act,
layers=[module.mlp.c_proj],
inp=input_feat["mlp.c_proj"],
)
)
return layers
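# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the merge request): how an is_scalable=True
# entry returned by get_act_for_scaling could be applied, by wrapping the named
# activation in the ScaledActivation module defined at the end of this diff.
# The helper name and wiring here are hypothetical; the actual logic lives in
# the collapsed quantizer code and may differ.
import torch
import torch.nn as nn

from awq.modules.act import ScaledActivation  # module path assumed; class shown at end of this diff


def apply_act_scale_sketch(layer: nn.Module, act_info: dict, scales: torch.Tensor):
    if not act_info["is_scalable"]:
        return
    # e.g. scale_name == "mlp.act" -> parent "mlp", attribute "act"
    parent_name, _, attr_name = act_info["scale_name"].rpartition(".")
    parent = layer.get_submodule(parent_name) if parent_name else layer
    setattr(parent, attr_name, ScaledActivation(act_info["scale_layer"], scales))
# ---------------------------------------------------------------------------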
from .base import BaseAWQForCausalLM
from transformers.models.gpt_neox.modeling_gpt_neox import (
GPTNeoXLayer,
GPTNeoXForCausalLM,
)
class GPTNeoXAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "GPTNeoXDecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def get_model_layers(model: GPTNeoXForCausalLM):
return model.gpt_neox.layers
@staticmethod
def get_act_for_scaling(module: GPTNeoXLayer):
return dict(
is_scalable=True,
scale_name="mlp.act",
scale_layer=module.mlp.act,
scale_shape=module.mlp.dense_h_to_4h.out_features,
)
@staticmethod
def move_embed(model: GPTNeoXForCausalLM, device: str):
model.gpt_neox.embed_in = model.gpt_neox.embed_in.to(device)
@staticmethod
def get_layers_for_scaling(module: GPTNeoXLayer, input_feat, module_kwargs):
layers = []
# attention input
layers.append(
dict(
prev_op=module.input_layernorm,
layers=[module.attention.query_key_value],
inp=input_feat["attention.query_key_value"],
)
)
# attention out
# Please refer to https://github.com/mit-han-lab/llm-awq/issues/2#issuecomment-1606297469
"""
layers.append(dict(
prev_op=module.attention.query_key_value,
layers=[module.attention.dense],
inp=input_feat['attention.dense'],
))
"""
# linear 1
layers.append(
dict(
prev_op=module.post_attention_layernorm,
layers=[module.mlp.dense_h_to_4h],
inp=input_feat["mlp.dense_h_to_4h"],
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.act,
layers=[module.mlp.dense_4h_to_h],
inp=input_feat["mlp.dense_4h_to_h"],
)
)
return layers
from .base import BaseAWQForCausalLM
from transformers.models.gptj.modeling_gptj import GPTJForCausalLM, GPTJBlock
class GPTJAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "GPTJBlock"
max_seq_len_key = "n_positions"
@staticmethod
def get_model_layers(model: GPTJForCausalLM):
return model.transformer.h
@staticmethod
def get_act_for_scaling(module: GPTJBlock):
return dict(
is_scalable=True,
scale_name="mlp.act",
scale_layer=module.mlp.act,
scale_shape=module.mlp.fc_in.out_features,
)
@staticmethod
def move_embed(model: GPTJForCausalLM, device: str):
model.transformer.wte = model.transformer.wte.to(device)
@staticmethod
def get_layers_for_scaling(module: GPTJBlock, input_feat, module_kwargs):
layers = []
# attention input + linear 1
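# GPT-J runs attention and the MLP in parallel off the same ln_1 output,
# so the q/k/v projections and mlp.fc_in share one scaling group.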
layers.append(
dict(
prev_op=module.ln_1,
layers=[
module.attn.q_proj,
module.attn.k_proj,
module.attn.v_proj,
module.mlp.fc_in,
],
inp=input_feat["attn.q_proj"],
module2inspect=module,
kwargs=module_kwargs,
)
)
# attention out
layers.append(
dict(
prev_op=module.attn.v_proj,
layers=[module.attn.out_proj],
inp=input_feat["attn.out_proj"],
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.act,
layers=[module.mlp.fc_out],
inp=input_feat["mlp.fc_out"],
)
)
return layers
import tqdm
from typing import List, Tuple
from .base import BaseAWQForCausalLM
from awq.utils.fused_utils import fuse_qkv
from awq.modules.fused.block import LlamaLikeBlock
from awq.modules.fused.model import LlamaLikeModel
from transformers.models.llama.modeling_llama import (
LlamaDecoderLayer as OldLlamaDecoderLayer,
LlamaForCausalLM as OldLlamaForCausalLM,
)
from awq.modules.fused.norm import FasterTransformerRMSNorm
class LlamaAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "LlamaDecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def fuse_layers(model: OldLlamaForCausalLM):
fuser = LlamaFuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model: OldLlamaForCausalLM):
return model.model.layers
@staticmethod
def get_act_for_scaling(module: OldLlamaDecoderLayer):
return dict(is_scalable=False)
@staticmethod
def move_embed(model: OldLlamaForCausalLM, device: str):
model.model.embed_tokens = model.model.embed_tokens.to(device)
@staticmethod
def get_layers_for_scaling(module: OldLlamaDecoderLayer, input_feat, module_kwargs):
layers = []
# attention input
layers.append(
dict(
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
# Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.post_attention_layernorm,
layers=[module.mlp.gate_proj, module.mlp.up_proj],
inp=input_feat["mlp.gate_proj"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.up_proj,
layers=[module.mlp.down_proj],
inp=input_feat["mlp.down_proj"],
)
)
return layers
class LlamaFuser:
def __init__(self, model: OldLlamaForCausalLM):
self.model = model
self.llama_blocks: List[Tuple[str, OldLlamaDecoderLayer]] = [
(name, module)
for name, module in self.model.named_modules()
if "LlamaDecoderLayer".lower() in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
module: OldLlamaDecoderLayer
for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."):
device = next(iter(module.state_dict().values())).device
qkv = fuse_qkv(
module,
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
)
norm_1 = FasterTransformerRMSNorm(
module.input_layernorm.weight, module.input_layernorm.variance_epsilon
)
norm_2 = FasterTransformerRMSNorm(
module.post_attention_layernorm.weight,
module.post_attention_layernorm.variance_epsilon,
)
blocks.append(
LlamaLikeBlock(
hidden_size=self.model.config.hidden_size,
n_heads=self.model.config.num_attention_heads,
n_kv_heads=self.model.config.num_key_value_heads,
qkv_layer=qkv,
o_proj=module.self_attn.o_proj,
mlp=module.mlp,
norm_1=norm_1,
norm_2=norm_2,
dev=device,
max_seq_len=self.model.config.max_seq_len,
rope_theta=self.model.config.rope_theta,
)
)
self.model.model = LlamaLikeModel(
self.model.config.vocab_size,
blocks,
self.model.model.embed_tokens,
self.model.model.norm,
)
setattr(self.model.model, "blocks", self.model.model.blocks)
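# ---------------------------------------------------------------------------
# Conceptual sketch (not part of the merge request): what fuse_qkv achieves for
# plain nn.Linear projections -- one matmul producing [q; k; v] instead of
# three separate ones. The real fuse_qkv in awq.utils.fused_utils additionally
# handles the quantized WQLinear variants (packed qweight/qzeros/scales), so
# treat this only as an illustration of the shape bookkeeping.
import torch
import torch.nn as nn


def naive_fuse_qkv(q_proj: nn.Linear, k_proj: nn.Linear, v_proj: nn.Linear) -> nn.Linear:
    out_features = q_proj.out_features + k_proj.out_features + v_proj.out_features
    has_bias = q_proj.bias is not None
    fused = nn.Linear(q_proj.in_features, out_features, bias=has_bias)
    with torch.no_grad():
        # Concatenate along the output dimension; a later split recovers q, k, v.
        fused.weight.copy_(torch.cat([q_proj.weight, k_proj.weight, v_proj.weight], dim=0))
        if has_bias:
            fused.bias.copy_(torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0))
    return fused
# ---------------------------------------------------------------------------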
import tqdm
from typing import List, Tuple
from .base import BaseAWQForCausalLM
from awq.utils.fused_utils import fuse_qkv
from awq.modules.fused.block import LlamaLikeBlock
from awq.modules.fused.model import LlamaLikeModel
from transformers.models.llama.modeling_llama import (
LlamaDecoderLayer as OldLlamaDecoderLayer,
)
from transformers.models.llava.modeling_llava import (
LlavaForConditionalGeneration as OldLlavaForConditionalGeneration,
)
from awq.modules.fused.norm import FasterTransformerRMSNorm
class LlavaAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "LlamaDecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def fuse_layers(model: OldLlavaForConditionalGeneration):
fuser = LlavaFuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model: OldLlavaForConditionalGeneration):
return model.language_model.model.layers
@staticmethod
def get_act_for_scaling(module: OldLlamaDecoderLayer):
return dict(is_scalable=False)
@staticmethod
def move_embed(model: OldLlavaForConditionalGeneration, device: str):
model.language_model.model.embed_tokens = model.get_input_embeddings().to(
device
)
@staticmethod
def get_layers_for_scaling(module: OldLlamaDecoderLayer, input_feat, module_kwargs):
layers = []
# attention input
layers.append(
dict(
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
# Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.post_attention_layernorm,
layers=[module.mlp.gate_proj, module.mlp.up_proj],
inp=input_feat["mlp.gate_proj"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.up_proj,
layers=[module.mlp.down_proj],
inp=input_feat["mlp.down_proj"],
)
)
return layers
class LlavaFuser:
def __init__(self, model: OldLlavaForConditionalGeneration):
self.model = model.language_model
self.llama_blocks: List[Tuple[str, OldLlamaDecoderLayer]] = [
(name, module)
for name, module in self.model.named_modules()
if "LlamaDecoderLayer".lower() in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
module: OldLlamaDecoderLayer
for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."):
device = next(iter(module.state_dict().values())).device
qkv = fuse_qkv(
module,
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
)
norm_1 = FasterTransformerRMSNorm(
module.input_layernorm.weight, module.input_layernorm.variance_epsilon
)
norm_2 = FasterTransformerRMSNorm(
module.post_attention_layernorm.weight,
module.post_attention_layernorm.variance_epsilon,
)
blocks.append(
LlamaLikeBlock(
hidden_size=self.model.config.hidden_size,
n_heads=self.model.config.num_attention_heads,
n_kv_heads=self.model.config.num_key_value_heads,
qkv_layer=qkv,
o_proj=module.self_attn.o_proj,
mlp=module.mlp,
norm_1=norm_1,
norm_2=norm_2,
dev=device,
max_seq_len=self.model.config.max_seq_len,
)
)
self.model.model = LlamaLikeModel(
self.model.config.vocab_size,
blocks,
self.model.model.embed_tokens,
self.model.model.norm,
)
setattr(self.model.model, "blocks", self.model.model.blocks)
import tqdm
from typing import List, Tuple
from .base import BaseAWQForCausalLM
from awq.utils.fused_utils import fuse_qkv
from awq.modules.fused.block import LlamaLikeBlock
from awq.modules.fused.model import LlamaLikeModel
from transformers.models.mistral.modeling_mistral import (
MistralDecoderLayer as OldMistralDecoderLayer,
MistralForCausalLM as OldMistralForCausalLM,
)
from awq.modules.fused.norm import FasterTransformerRMSNorm
class MistralAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "MistralDecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def fuse_layers(model: OldMistralForCausalLM):
fuser = MistralFuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model: OldMistralForCausalLM):
return model.model.layers
@staticmethod
def get_act_for_scaling(module: OldMistralDecoderLayer):
return dict(is_scalable=False)
@staticmethod
def move_embed(model: OldMistralForCausalLM, device: str):
model.model.embed_tokens = model.model.embed_tokens.to(device)
@staticmethod
def get_layers_for_scaling(
module: OldMistralDecoderLayer, input_feat, module_kwargs
):
layers = []
# attention input
layers.append(
dict(
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
# Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.post_attention_layernorm,
layers=[module.mlp.gate_proj, module.mlp.up_proj],
inp=input_feat["mlp.gate_proj"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.up_proj,
layers=[module.mlp.down_proj],
inp=input_feat["mlp.down_proj"],
)
)
return layers
class MistralFuser:
def __init__(self, model: OldMistralForCausalLM):
self.model = model
self.mistral_blocks: List[Tuple[str, OldMistralDecoderLayer]] = [
(name, module)
for name, module in self.model.named_modules()
if "MistralDecoderLayer".lower() in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
module: OldMistralDecoderLayer
for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."):
device = next(iter(module.state_dict().values())).device
qkv = fuse_qkv(
module,
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
)
norm_1 = FasterTransformerRMSNorm(
module.input_layernorm.weight, module.input_layernorm.variance_epsilon
)
norm_2 = FasterTransformerRMSNorm(
module.post_attention_layernorm.weight,
module.post_attention_layernorm.variance_epsilon,
)
blocks.append(
LlamaLikeBlock(
hidden_size=self.model.config.hidden_size,
n_heads=self.model.config.num_attention_heads,
n_kv_heads=self.model.config.num_key_value_heads,
qkv_layer=qkv,
o_proj=module.self_attn.o_proj,
mlp=module.mlp,
norm_1=norm_1,
norm_2=norm_2,
dev=device,
max_seq_len=self.model.config.max_seq_len,
)
)
self.model.model = LlamaLikeModel(
self.model.config.vocab_size,
blocks,
self.model.model.embed_tokens,
self.model.model.norm,
)
setattr(self.model.model, "blocks", self.model.model.blocks)
import tqdm
import torch
from typing import List, Tuple
from .base import BaseAWQForCausalLM
from awq.modules.fused.block import MixtralBlock
from awq.modules.fused.model import MixtralModel
from awq.modules.fused.moe import FusedSparseMoeBlock
from awq.utils.fused_utils import fuse_qkv, fuse_linears
from transformers.models.mixtral.modeling_mixtral import (
MixtralDecoderLayer as OldMixtralDecoderLayer,
MixtralForCausalLM as OldMixtralForCausalLM,
)
from awq.modules.linear import WQLinear_GEMM
from awq.modules.fused.norm import FasterTransformerRMSNorm
class MixtralAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "MixtralDecoderLayer"
max_seq_len_key = "max_position_embeddings"
modules_to_not_convert = ["gate"]
@staticmethod
def fuse_layers(model: OldMixtralForCausalLM):
fuser = MixtralFuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model: OldMixtralForCausalLM):
return model.model.layers
@staticmethod
def get_act_for_scaling(module):
return dict(is_scalable=False)
@staticmethod
def move_embed(model: OldMixtralForCausalLM, device: str):
model.model.embed_tokens = model.model.embed_tokens.to(device)
@staticmethod
def get_layers_for_scaling(
module: OldMixtralDecoderLayer, input_feat, module_kwargs
):
layers = []
# attention input
layers.append(
dict(
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
# linear in
layers.append(
dict(
prev_op=module.post_attention_layernorm,
layers=[
w
for expert in module.block_sparse_moe.experts
for w in [expert.w1, expert.w3]
],
inp=input_feat["block_sparse_moe"],
module2inspect=module.block_sparse_moe,
)
)
# linear out
for i, expert in enumerate(module.block_sparse_moe.experts):
layers.append(
dict(
prev_op=expert.w3,
layers=[expert.w2],
inp=input_feat[f"block_sparse_moe.experts.{i}.w2"],
)
)
return layers
class MixtralFuser:
def __init__(self, model: OldMixtralForCausalLM):
self.model = model
self.mixtral_blocks: List[Tuple[str, OldMixtralDecoderLayer]] = [
(name, module)
for name, module in self.model.named_modules()
if "MixtralDecoderLayer".lower() in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
module: OldMixtralDecoderLayer
for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."):
device = next(iter(module.state_dict().values())).device
qkv = fuse_qkv(
module,
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
)
norm_1 = FasterTransformerRMSNorm(
module.input_layernorm.weight, module.input_layernorm.variance_epsilon
)
norm_2 = FasterTransformerRMSNorm(
module.post_attention_layernorm.weight,
module.post_attention_layernorm.variance_epsilon,
)
sparse_moe = module.block_sparse_moe
if isinstance(sparse_moe.experts[0].w1, WQLinear_GEMM):
fused_w1w3s = [
fuse_linears(
[
sparse_moe.experts[i].w1,
sparse_moe.experts[i].w3,
],
device,
)
for i in range(len(sparse_moe.experts))
]
stacked_w1w3s = fuse_linears(
fused_w1w3s, device, dim=0, operation=torch.stack
)
stacked_w2s = fuse_linears(
[expert.w2 for expert in sparse_moe.experts],
device,
dim=0,
operation=torch.stack,
)
sparse_moe = FusedSparseMoeBlock(
top_k=sparse_moe.top_k,
gate=sparse_moe.gate,
ws=stacked_w1w3s,
w2s=stacked_w2s,
)
blocks.append(
MixtralBlock(
hidden_size=self.model.config.hidden_size,
n_heads=self.model.config.num_attention_heads,
n_kv_heads=self.model.config.num_key_value_heads,
qkv_layer=qkv,
o_proj=module.self_attn.o_proj,
moe=sparse_moe,
norm_1=norm_1,
norm_2=norm_2,
dev=device,
max_seq_len=self.model.config.max_seq_len,
rope_theta=self.model.config.rope_theta,
)
)
model_norm = FasterTransformerRMSNorm(
self.model.model.norm.weight,
self.model.model.norm.variance_epsilon,
)
self.model.model = MixtralModel(
self.model.config.vocab_size,
blocks,
self.model.model.embed_tokens,
model_norm,
)
setattr(self.model.model, "blocks", self.model.model.blocks)
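# ---------------------------------------------------------------------------
# Conceptual sketch (not part of the merge request): the shape manipulation
# behind the fused MoE block above. Per expert, the w1 (gate) and w3 (up)
# weights are concatenated along the output dimension, then all experts are
# stacked into a single [n_experts, ...] tensor so the fused kernel can index
# experts directly. The real fuse_linears operates on quantized WQLinear_GEMM
# layers and returns fused linear modules; this version uses plain nn.Linear
# weights purely for illustration.
import torch


def naive_stack_expert_weights(experts) -> torch.Tensor:
    per_expert = [
        torch.cat([expert.w1.weight, expert.w3.weight], dim=0) for expert in experts
    ]
    return torch.stack(per_expert, dim=0)
# ---------------------------------------------------------------------------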
from .base import BaseAWQForCausalLM
from transformers.models.mpt.modeling_mpt import MptBlock as OldMptBlock, MptForCausalLM
class MptAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "MPTBlock"
max_seq_len_key = "max_seq_len"
@staticmethod
def fuse_layers(model: MptForCausalLM):
fuser = MptFuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model: MptForCausalLM):
return model.transformer.blocks
@staticmethod
def get_act_for_scaling(module: OldMptBlock):
return dict(
is_scalable=True,
scale_name="ffn.act",
scale_layer=module.ffn.act,
scale_shape=module.ffn.up_proj.out_features,
)
@staticmethod
def move_embed(model: MptForCausalLM, device: str):
model.transformer.wte = model.transformer.wte.to(device)
model.transformer.emb_drop = model.transformer.emb_drop.to(device)
@staticmethod
def get_layers_for_scaling(module: OldMptBlock, input_feat, module_kwargs):
layers = []
if module_kwargs.get("output_attentions") is not None:
module_kwargs.pop("output_attentions")
# attention input
layers.append(
dict(
prev_op=module.norm_1,
layers=[module.attn.Wqkv],
inp=input_feat["attn.Wqkv"],
module2inspect=module.attn,
kwargs=module_kwargs,
)
)
# attention output
layers.append(
dict(
prev_op=module.attn.Wqkv,
layers=[module.attn.out_proj],
inp=input_feat["attn.out_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.norm_2,
layers=[module.ffn.up_proj],
inp=input_feat["ffn.up_proj"],
module2inspect=module.ffn,
)
)
# linear 2
layers.append(
dict(
prev_op=module.ffn.act,
layers=[module.ffn.down_proj],
inp=input_feat["ffn.down_proj"],
)
)
return layers
from typing import List, Tuple
from awq.utils.utils import set_module_name
from awq.modules.fused.block import MPTBlock
from awq.modules.fused.model import MPTModel
class MptFuser:
def __init__(self, model: MptForCausalLM):
self.model = model
self.mpt_blocks: List[Tuple[str, OldMptBlock]] = [
(name, module)
for name, module in self.model.named_modules()
if "mptblock" in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
module: OldMptBlock
for module in self.model.transformer.blocks:
blocks.append(
MPTBlock(
self.model.config.d_model,
self.model.config.n_heads,
module.attn.Wqkv,
module.attn.out_proj,
module.ffn,
module.norm_1,
module.norm_2,
next(iter(module.state_dict().values())).device,
self.model.config.max_seq_len,
)
)
self.model.transformer = MPTModel(
self.model.config.vocab_size,
blocks,
self.model.transformer.wte,
self.model.transformer.norm_f,
)
setattr(self.model.transformer, "blocks", self.model.transformer.blocks)
from .base import BaseAWQForCausalLM
from transformers.models.opt.modeling_opt import OPTForCausalLM, OPTDecoderLayer
class OptAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "OPTDecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def get_model_layers(model: OPTForCausalLM):
return model.model.decoder.layers
@staticmethod
def get_act_for_scaling(module: OPTDecoderLayer):
return dict(is_scalable=False)
@staticmethod
def move_embed(model: OPTForCausalLM, device: str):
model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(device)
model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(
device
)
@staticmethod
def get_layers_for_scaling(module: OPTDecoderLayer, input_feat, module_kwargs):
layers = []
# attention input
layers.append(
dict(
prev_op=module.self_attn_layer_norm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.out_proj],
inp=input_feat["self_attn.out_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.final_layer_norm,
layers=[module.fc1],
inp=input_feat["fc1"],
)
)
# linear 2
layers.append(
dict(
prev_op=module.fc1,
layers=[module.fc2],
inp=input_feat["fc2"],
)
)
return layers
from .base import BaseAWQForCausalLM
class QwenAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "QWenBlock"
max_seq_len_key = "seq_length"
@staticmethod
def get_model_layers(model):
return model.transformer.h
@staticmethod
def get_act_for_scaling(module):
return dict(is_scalable=False)
@staticmethod
def move_embed(model, device: str):
model.transformer.wte = model.transformer.wte.to(device)
model.transformer.rotary_emb = model.transformer.rotary_emb.to(device)
@staticmethod
def get_layers_for_scaling(module, input_feat, module_kwargs):
layers = []
# attention
layers.append(
dict(
prev_op=module.ln_1,
layers=[module.attn.c_attn],
inp=input_feat["attn.c_attn"],
module2inspect=module.attn,
kwargs=module_kwargs,
)
)
# mlp
layers.append(
dict(
prev_op=module.ln_2,
layers=[module.mlp.w2, module.mlp.w1],
inp=input_feat["mlp.w2"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.w1,
layers=[module.mlp.c_proj],
inp=input_feat["mlp.c_proj"],
)
)
return layers
import tqdm
from typing import List, Tuple
from .base import BaseAWQForCausalLM
from awq.utils.fused_utils import fuse_qkv
from awq.modules.fused.block import LlamaLikeBlock
from awq.modules.fused.model import LlamaLikeModel
from transformers.models.qwen2.modeling_qwen2 import (
Qwen2DecoderLayer as OldQwen2DecoderLayer,
Qwen2ForCausalLM as OldQwen2ForCausalLM,
)
from awq.modules.fused.norm import FasterTransformerRMSNorm
class Qwen2AWQForCausalLM(BaseAWQForCausalLM):
layer_type = "Qwen2DecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def fuse_layers(model: OldQwen2ForCausalLM):
fuser = Qwen2Fuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model: OldQwen2ForCausalLM):
return model.model.layers
@staticmethod
def get_act_for_scaling(module: OldQwen2DecoderLayer):
return dict(is_scalable=False)
@staticmethod
def move_embed(model: OldQwen2ForCausalLM, device: str):
model.model.embed_tokens = model.model.embed_tokens.to(device)
@staticmethod
def get_layers_for_scaling(module: OldQwen2DecoderLayer, input_feat, module_kwargs):
layers = []
# attention input
layers.append(
dict(
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
# Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.post_attention_layernorm,
layers=[module.mlp.gate_proj, module.mlp.up_proj],
inp=input_feat["mlp.gate_proj"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.up_proj,
layers=[module.mlp.down_proj],
inp=input_feat["mlp.down_proj"],
)
)
return layers
class Qwen2Fuser:
def __init__(self, model: OldQwen2ForCausalLM):
self.model = model
self.qwen2_blocks: List[Tuple[str, OldQwen2DecoderLayer]] = [
(name, module)
for name, module in self.model.named_modules()
if "Qwen2DecoderLayer".lower() in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
module: OldQwen2DecoderLayer
for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."):
device = next(iter(module.state_dict().values())).device
qkv = fuse_qkv(
module,
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
)
norm_1 = FasterTransformerRMSNorm(
module.input_layernorm.weight, module.input_layernorm.variance_epsilon
)
norm_2 = FasterTransformerRMSNorm(
module.post_attention_layernorm.weight,
module.post_attention_layernorm.variance_epsilon,
)
blocks.append(
LlamaLikeBlock(
hidden_size=self.model.config.hidden_size,
n_heads=self.model.config.num_attention_heads,
n_kv_heads=self.model.config.num_key_value_heads,
qkv_layer=qkv,
o_proj=module.self_attn.o_proj,
mlp=module.mlp,
norm_1=norm_1,
norm_2=norm_2,
dev=device,
max_seq_len=self.model.config.max_seq_len,
)
)
self.model.model = LlamaLikeModel(
self.model.config.vocab_size,
blocks,
self.model.model.embed_tokens,
self.model.model.norm,
)
setattr(self.model.model, "blocks", self.model.model.blocks)
import tqdm
from typing import List, Tuple
from .base import BaseAWQForCausalLM
from awq.utils.fused_utils import fuse_qkv
from awq.modules.fused.block import LlamaLikeBlock
from awq.modules.fused.model import LlamaLikeModel
from transformers.models.stablelm import StableLmForCausalLM as OldStableLmForCausalLM
from transformers.models.stablelm.modeling_stablelm import (
StableLmDecoderLayer as OldStableLmDecoderLayer,
)
from awq.modules.fused.norm import FasterTransformerRMSNorm
class StableLmAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "StableLmDecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def fuse_layers(model: OldStableLmForCausalLM):
fuser = StableLmFuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model: OldStableLmForCausalLM):
return model.model.layers
@staticmethod
def get_act_for_scaling(module: OldStableLmDecoderLayer):
return dict(is_scalable=False)
@staticmethod
def move_embed(model: OldStableLmForCausalLM, device: str):
model.model.embed_tokens = model.model.embed_tokens.to(device)
@staticmethod
def get_layers_for_scaling(
module: OldStableLmDecoderLayer, input_feat, module_kwargs
):
layers = []
# attention input
layers.append(
dict(
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
# Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.post_attention_layernorm,
layers=[module.mlp.gate_proj, module.mlp.up_proj],
inp=input_feat["mlp.gate_proj"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.up_proj,
layers=[module.mlp.down_proj],
inp=input_feat["mlp.down_proj"],
)
)
return layers
class StableLmFuser:
def __init__(self, model: OldStableLmForCausalLM):
self.model = model
self.stablelm_blocks: List[Tuple[str, OldStableLmDecoderLayer]] = [
(name, module)
for name, module in self.model.named_modules()
if "StableLmDecoderLayer".lower() in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
module: OldStableLmDecoderLayer
for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."):
device = next(iter(module.state_dict().values())).device
qkv = fuse_qkv(
module,
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
)
norm_1 = module.input_layernorm
norm_2 = module.post_attention_layernorm
blocks.append(
LlamaLikeBlock(
hidden_size=self.model.config.hidden_size,
n_heads=self.model.config.num_attention_heads,
n_kv_heads=self.model.config.num_key_value_heads,
qkv_layer=qkv,
o_proj=module.self_attn.o_proj,
mlp=module.mlp,
norm_1=norm_1,
norm_2=norm_2,
dev=device,
max_seq_len=self.model.config.max_seq_len,
rope_theta=self.model.config.rope_theta,
partial_rotary_factor=self.model.config.partial_rotary_factor,
)
)
self.model.model = LlamaLikeModel(
self.model.config.vocab_size,
blocks,
self.model.model.embed_tokens,
self.model.model.norm,
)
setattr(self.model.model, "blocks", self.model.model.blocks)
import tqdm
from typing import List, Tuple
from .base import BaseAWQForCausalLM
from awq.utils.fused_utils import fuse_qkv
from awq.modules.fused.block import LlamaLikeBlock
from awq.modules.fused.model import LlamaLikeModel
from transformers.models.starcoder2.modeling_starcoder2 import (
Starcoder2ForCausalLM as OldStarcoder2ForCausalLM,
Starcoder2DecoderLayer as OldStarcoder2DecoderLayer,
)
from awq.modules.fused.norm import FasterTransformerRMSNorm
class Starcoder2AWQForCausalLM(BaseAWQForCausalLM):
layer_type = "Starcoder2DecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def fuse_layers(model: OldStarcoder2ForCausalLM):
fuser = Starcoder2Fuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model: OldStarcoder2ForCausalLM):
return model.model.layers
@staticmethod
def get_act_for_scaling(module: OldStarcoder2DecoderLayer):
return dict(
is_scalable=True,
scale_name="mlp.act",
scale_layer=module.mlp.act,
scale_shape=module.mlp.c_fc.out_features,
)
# return dict(is_scalable=False)
@staticmethod
def move_embed(model: OldStarcoder2ForCausalLM, device):
model.model.embed_tokens = model.model.embed_tokens.to(device)
@staticmethod
def get_layers_for_scaling(module: OldStarcoder2DecoderLayer, input_feat, module_kwargs):
layers = []
# attention input
layers.append(
dict(
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.post_attention_layernorm,
layers=[module.mlp.c_fc],
inp=input_feat["mlp.c_fc"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.act,
layers=[module.mlp.c_proj],
inp=input_feat["mlp.c_proj"],
)
)
return layers
class Starcoder2Fuser:
def __init__(self, model: OldStarcoder2ForCausalLM):
self.model = model
self.starcoder2_blocks: List[Tuple[str, OldStarcoder2DecoderLayer]] = [
(name, module)
for name, module in self.model.named_modules()
if "Starcoder2DecoderLayer".lower() in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
module: OldStarcoder2DecoderLayer
for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."):
device = next(iter(module.state_dict().values())).device
qkv = fuse_qkv(
module,
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
)
# Starcoder2 uses standard LayerNorm rather than RMSNorm, so the norms are reused as-is
norm_1 = module.input_layernorm
norm_2 = module.post_attention_layernorm
blocks.append(
LlamaLikeBlock(
hidden_size=self.model.config.hidden_size,
n_heads=self.model.config.num_attention_heads,
n_kv_heads=self.model.config.num_key_value_heads,
qkv_layer=qkv,
o_proj=module.self_attn.o_proj,
mlp=module.mlp,
norm_1=norm_1,
norm_2=norm_2,
dev=device,
max_seq_len=self.model.config.max_seq_len,
)
)
self.model.model = LlamaLikeModel(
self.model.config.vocab_size,
blocks,
self.model.model.embed_tokens,
self.model.model.norm,
)
setattr(self.model.model, "blocks", self.model.model.blocks)
import tqdm
from typing import List, Tuple
from .base import BaseAWQForCausalLM
from awq.utils.fused_utils import fuse_qkv
from awq.modules.fused.block import LlamaLikeBlock
from awq.modules.fused.model import LlamaLikeModel
from awq.modules.fused.norm import FasterTransformerRMSNorm
class YiAWQForCausalLM(BaseAWQForCausalLM):
layer_type = "YiDecoderLayer"
max_seq_len_key = "max_position_embeddings"
@staticmethod
def fuse_layers(model):
fuser = YiFuser(model)
fuser.fuse_transformer()
@staticmethod
def get_model_layers(model):
return model.model.layers
@staticmethod
def get_act_for_scaling(module):
return dict(is_scalable=False)
@staticmethod
def move_embed(model, device: str):
model.model.embed_tokens = model.model.embed_tokens.to(device)
@staticmethod
def get_layers_for_scaling(module, input_feat, module_kwargs):
layers = []
# attention input
layers.append(
dict(
prev_op=module.ln1,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
kwargs=module_kwargs,
)
)
# attention out
# Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
layers.append(
dict(
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
# linear 1
layers.append(
dict(
prev_op=module.ln2,
layers=[module.mlp.gate_proj, module.mlp.up_proj],
inp=input_feat["mlp.gate_proj"],
module2inspect=module.mlp,
)
)
# linear 2
layers.append(
dict(
prev_op=module.mlp.up_proj,
layers=[module.mlp.down_proj],
inp=input_feat["mlp.down_proj"],
)
)
return layers
class YiFuser:
def __init__(self, model):
self.model = model
self.yi_blocks: List[Tuple[str, object]] = [
(name, module)
for name, module in self.model.named_modules()
if "YiDecoderLayer".lower() in module.__class__.__name__.lower()
]
def fuse_transformer(self):
blocks = []
for module in tqdm.tqdm(self.model.model.layers, desc="Fusing layers..."):
device = next(iter(module.state_dict().values())).device
qkv = fuse_qkv(
module,
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
)
norm_1 = FasterTransformerRMSNorm(
module.ln1.weight, module.ln1.variance_epsilon
)
norm_2 = FasterTransformerRMSNorm(
module.ln2.weight, module.ln2.variance_epsilon
)
blocks.append(
LlamaLikeBlock(
hidden_size=self.model.config.hidden_size,
n_heads=self.model.config.num_attention_heads,
n_kv_heads=self.model.config.num_key_value_heads,
qkv_layer=qkv,
o_proj=module.self_attn.o_proj,
mlp=module.mlp,
norm_1=norm_1,
norm_2=norm_2,
dev=device,
max_seq_len=self.model.config.max_seq_len,
rope_theta=self.model.config.rope_theta,
)
)
self.model.model = LlamaLikeModel(
self.model.config.vocab_size,
blocks,
self.model.model.embed_tokens,
self.model.model.norm,
)
setattr(self.model.model, "blocks", self.model.model.blocks)
import torch.nn as nn
class ScaledActivation(nn.Module):
def __init__(self, module, scales):
super().__init__()
self.act = module
self.scales = nn.Parameter(scales.data)
def forward(self, x):
return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
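# ---------------------------------------------------------------------------
# Illustrative usage (not part of the merge request): ScaledActivation simply
# divides the wrapped activation's output by a per-channel scale vector,
# broadcast over the last dimension. Shapes and values below are examples only.
import torch
import torch.nn as nn

act = ScaledActivation(nn.GELU(), torch.full((4096,), 2.0))
x = torch.randn(1, 8, 4096)
y = act(x)  # equals nn.GELU()(x) / 2.0 element-wise
# ---------------------------------------------------------------------------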