Commit d73d13b2 authored by Casper Hansen

Support Falcon 7B+40B

parent 06073073
awq/models/__init__.py

from .mpt import MptAWQForCausalLM
from .llama import LlamaAWQForCausalLM
from .opt import OptAWQForCausalLM
from .falcon import FalconAWQForCausalLM
\ No newline at end of file
awq/models/auto.py

@@ -5,7 +5,9 @@ from awq.models.base import BaseAWQForCausalLM

AWQ_CAUSAL_LM_MODEL_MAP = {
    "mpt": MptAWQForCausalLM,
    "llama": LlamaAWQForCausalLM,
    "opt": OptAWQForCausalLM,
    "RefinedWeb": FalconAWQForCausalLM,
    "RefinedWebModel": FalconAWQForCausalLM
}

def check_and_get_model_type(model_dir, trust_remote_code=True):
    ...
awq/models/falcon.py

from .base import BaseAWQForCausalLM
from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconForCausalLM

class FalconAWQForCausalLM(BaseAWQForCausalLM):
    layer_type = "FalconDecoderLayer"

    @staticmethod
    def get_model_layers(model: FalconForCausalLM):
        return model.transformer.h

    @staticmethod
    def get_act_for_scaling(module: FalconDecoderLayer):
        return dict(
            is_scalable=True,
            scale_name="mlp.act",
            scale_layer=module.mlp.act,
            scale_shape=module.mlp.dense_h_to_4h.out_features
        )

    @staticmethod
    def move_embed(model: FalconForCausalLM, device):
        model.transformer.word_embeddings = model.transformer.word_embeddings.to(device)

    @staticmethod
    def get_layers_for_scaling(module: FalconDecoderLayer, input_feat, module_kwargs):
        layers = []

        # Falcon 7B (older architecture): a single input_layernorm feeds both
        # attention and the MLP up-projection, so they share one scaling group
        if module.config.num_attention_heads == 71:
            # linear 1 + attention
            layers.append(dict(
                prev_op=module.input_layernorm,
                layers=[module.mlp.dense_h_to_4h, module.self_attention.query_key_value],
                inp=input_feat['self_attention.query_key_value'],
                module2inspect=module,
                kwargs=module_kwargs,
            ))

        # Falcon 40B (newer architecture): separate ln_attn / ln_mlp layernorms,
        # so attention and the MLP get their own scaling groups
        else:
            # linear 1 + attention
            layers.append(dict(
                prev_op=module.ln_attn,
                layers=[module.self_attention.query_key_value],
                inp=input_feat['self_attention.query_key_value'],
                module2inspect=module,
                kwargs=module_kwargs,
            ))

            # linear 2
            layers.append(dict(
                prev_op=module.ln_mlp,
                layers=[module.mlp.dense_h_to_4h],
                inp=input_feat['mlp.dense_h_to_4h'],
                module2inspect=module,
                kwargs=module_kwargs,
            ))

        return layers
\ No newline at end of file
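
For reference, a minimal usage sketch of how a Falcon checkpoint could be quantized once this mapping is in place. This is not part of the commit: the quantize() call, the quant_config keys, and the output path are assumptions about the surrounding AutoAWQ API, shown only to illustrate how a checkpoint whose model_type is "RefinedWeb" / "RefinedWebModel" would be routed to FalconAWQForCausalLM.

# Hypothetical usage sketch (not part of this commit). The quantize() signature
# and quant_config keys below are assumptions about the surrounding AutoAWQ API.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "tiiuae/falcon-7b"  # config.model_type maps to FalconAWQForCausalLM

model = AutoAWQForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Illustrative 4-bit AWQ settings
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4}
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized("falcon-7b-awq")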