Unverified Commit d5cb0be2 authored by pppppM, committed by GitHub

[Fix] Fix llama2 70b & qwen quantization error (#273)

* fix llama2 70b

* fix qwen quantization

* remove pdb

* add faq
parent e5bfd387
@@ -210,7 +210,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
 
 #### Weight INT4 Quantization
 
-LMDeploy uses AWQ algorithm for model weight quantization
+LMDeploy uses [AWQ](https://arxiv.org/abs/2306.00978) algorithm for model weight quantization
 
 > Requires input from the $WORK_DIR of step 1, and the quantized weights will also be stored in this folder.
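
For context, the entry point touched below can be driven either from the CLI shown in the docs or straight from Python. A minimal sketch, assuming the module path `lmdeploy.lite.apis.auto_awq` and using only the parameters visible in this diff; the model path and `work_dir` are placeholders:

```python
# Minimal sketch (assumed import path; model path and work_dir are placeholders).
from lmdeploy.lite.apis.auto_awq import auto_awq

auto_awq(
    'meta-llama/Llama-2-70b-hf',  # HF model name or local checkpoint path
    w_bits=4,                     # quantize weights to INT4
    w_group_size=128,             # group size for per-group quantization
    work_dir='./work_dir',        # must already hold inputs_stats.pth from the calibrate step
)
```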
@@ -48,3 +48,9 @@ export LD_LIBRARY_PATH={Location}/nvidia/nccl/lib:$LD_LIBRARY_PATH
 
 ## Serving
 
+## Quantization
+
+### RuntimeError: \[enforce fail at inline_container.cc:337\] . unexpected pos 4566829760 vs 4566829656
+
+Please check your disk space.
+This error is caused by running out of disk space while saving the weights, and may occur when quantizing a 70B model.
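
Since the FAQ entry above boils down to free disk space, a quick pre-flight check is easy to script. A minimal sketch; ~140 GB is simply 70B parameters at 2 bytes each in fp16, so the real requirement depends on what gets saved:

```python
import shutil

# Check the filesystem that will hold $WORK_DIR before quantizing.
free_gb = shutil.disk_usage('.').free / 1024**3
print(f'free disk space: {free_gb:.1f} GB')  # a 70B fp16 checkpoint alone is ~140 GB
```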
@@ -4,8 +4,10 @@ from pathlib import Path
 
 import fire
 import torch
+from accelerate import (infer_auto_device_map, init_empty_weights,
+                        load_checkpoint_in_model)
 from torch import nn
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
 from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
                                             quant_weights, smooth_layers)
@@ -26,24 +28,42 @@ NORM_TYPE_MAP = {
 
 def auto_awq(model: str,
-             work_dir: str,
              w_bits: int = 4,
              w_sym: bool = False,
              w_group_size: int = 128,
+             work_dir: str = './work_dir',
              device: str = 'cuda'):
 
+    # Load tokenizer and configuration
     tokenizer = AutoTokenizer.from_pretrained(model,
                                               use_fast=False,
                                               trust_remote_code=True)
+    hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True)
+    checkpoint = hf_config._name_or_path
 
-    model = AutoModelForCausalLM.from_pretrained(model,
-                                                 torch_dtype=torch.float16,
-                                                 trust_remote_code=True)
+    with init_empty_weights():
+        # Load model
+        model = AutoModelForCausalLM.from_pretrained(model,
+                                                     torch_dtype=torch.float16,
+                                                     trust_remote_code=True)
 
     model.config.use_cache = False
 
     layer_type = LAYER_TYPE_MAP[type(model).__name__]
     fc2fcs = FC_FCS_MAP[layer_type]
     norm2fcs = NORM_FCS_MAP[layer_type]
+    decoder_layers = collect_target_modules(model, layer_type)
 
+    # Infer device map
+    device_map = infer_auto_device_map(model,
+                                       no_split_module_classes=[layer_type])
+    for name in device_map.keys():
+        if name in decoder_layers or 'lm_head' in name:
+            device_map[name] = 'cpu'
+        else:
+            device_map[name] = 0
+    load_checkpoint_in_model(model, checkpoint, device_map)
 
     work_dir = Path(work_dir)
     act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmean']
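
The rewritten loader never materializes the full checkpoint on one device: the model is created empty on the meta device, decoder layers and `lm_head` are pinned to CPU in the device map, and the real weights are streamed in with `load_checkpoint_in_model`. Each layer can then be moved to the GPU only while it is being smoothed and quantized. Purely as an illustration (not repo output), the resulting device map for a llama-style model looks roughly like this:

```python
# Illustrative shape of the device map produced by the loop above (made-up keys
# follow the usual llama module names; values are torch device identifiers).
device_map = {
    'model.embed_tokens': 0,   # small modules go straight to GPU 0
    'model.layers.0': 'cpu',   # every decoder layer starts on CPU...
    'model.layers.1': 'cpu',   # ...and is moved to the GPU one at a time
    'model.norm': 0,
    'lm_head': 'cpu',          # kept off the GPU to save memory
}
```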
@@ -74,7 +74,7 @@ def calibrate(model: str,
     device_map = infer_auto_device_map(model,
                                        no_split_module_classes=[layer_type])
     for name in device_map.keys():
-        if name in decoder_layers:
+        if name in decoder_layers or 'lm_head' in name:
             device_map[name] = 'cpu'
         else:
             device_map[name] = 0
@@ -37,9 +37,15 @@ class KVCacheObserver(GlobalAvailMixin):
             x : Input tensor
         """
         assert len(x.shape) == 4
-        x = x.transpose(1, 2)
-        assert x.size(2) == self.num_head
-        assert x.size(3) == self.head_dim
+        if x.size(2) == self.num_head and x.size(3) == self.head_dim:
+            # layout: (bs, seqlen, heads, dims)
+            x = x
+        elif x.size(1) == self.num_head and x.size(3) == self.head_dim:
+            # layout: (bs, heads, seqlen, dims)
+            x = x.transpose(1, 2)
+        else:
+            raise RuntimeError
 
         cur_max = x.flatten(0, 1).max(0)[0].cpu()
         cur_min = x.flatten(0, 1).min(0)[0].cpu()
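
The new branch accepts both KV-cache layouts instead of assuming one: HF llama-style models cache keys and values as `(bs, heads, seqlen, dims)`, while the layout that motivated this fix (presumably Qwen's) arrives as `(bs, seqlen, heads, dims)`. A standalone sketch of the normalization, with made-up head counts:

```python
import torch

num_head, head_dim = 8, 128
kv_a = torch.randn(1, 32, num_head, head_dim)  # (bs, seqlen, heads, dims)
kv_b = torch.randn(1, num_head, 32, head_dim)  # (bs, heads, seqlen, dims)

for x in (kv_a, kv_b):
    if x.size(2) == num_head and x.size(3) == head_dim:
        pass                    # already (bs, seqlen, heads, dims)
    elif x.size(1) == num_head and x.size(3) == head_dim:
        x = x.transpose(1, 2)   # move heads behind seqlen
    cur_max = x.flatten(0, 1).max(0)[0]  # per-(head, dim) statistics
    print(cur_max.shape)                 # torch.Size([8, 128]) in both cases
```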
@@ -14,6 +14,10 @@ NORM_FCS_MAP = {
         'input_layernorm':
         ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'],
         'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
+    },
+    'QWenBlock': {
+        'ln_1': ['attn.c_attn'],
+        'ln_2': ['mlp.w1', 'mlp.w2']
     }
 }
@@ -25,6 +29,10 @@ FC_FCS_MAP = {
     'InternLMDecoderLayer': {
         'self_attn.v_proj': ['self_attn.o_proj'],
         'mlp.up_proj': ['mlp.down_proj']
+    },
+    'QWenBlock': {
+        'attn.c_attn': ['attn.c_proj'],
+        'mlp.w1': ['mlp.c_proj']
     }
 }
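
Both maps record, per decoder-block class, which module produces an activation and which linear layers consume it, so the smoothing code knows where activation scales can be folded into weights. A toy illustration (not repo code) of how the new `QWenBlock` entries read:

```python
# Producer -> consumer pairs for Qwen, mirroring the entries added above.
norm2fcs = {'ln_1': ['attn.c_attn'], 'ln_2': ['mlp.w1', 'mlp.w2']}
fc2fcs = {'attn.c_attn': ['attn.c_proj'], 'mlp.w1': ['mlp.c_proj']}

for producer, consumers in {**norm2fcs, **fc2fcs}.items():
    print(f'{producer} feeds {consumers}')
```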
@@ -94,6 +102,14 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
     :return: Scales
     """
     device, dtype = pre_fc.weight.device, pre_fc.weight.dtype
+    size_a = act_scales.size(0)
+    size_pre_fc = pre_fc.weight.size(0)
+
+    # (for llama2) use group query attention, pre_fc is v_proj, fc is o_proj
+    if size_pre_fc < size_a and size_a % size_pre_fc == 0:
+        return
+
     act_scales = act_scales.to(device=device, dtype=dtype)
     concat_w = torch.cat([fc.weight for fc in fcs], dim=0)
@@ -103,10 +119,19 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
               w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
 
     scales = scales / (scales.max() * scales.min()).sqrt()
-    pre_fc.weight.div_(scales.view(-1, 1))
-
-    if getattr(pre_fc, 'bias', None) is not None:
-        pre_fc.bias.div_(scales)
+
+    # (for qwen) pre_fc is packed QKV, only V needs to scale
+    if size_pre_fc > size_a and size_pre_fc % size_a == 0 \
+            and size_pre_fc // size_a == 3:
+        pre_fc.weight[-size_a:].div_(scales.view(-1, 1))
+
+        if getattr(pre_fc, 'bias', None) is not None:
+            pre_fc.bias[-size_a:].div_(scales)
+    else:
+        pre_fc.weight.div_(scales.view(-1, 1))
+
+        if getattr(pre_fc, 'bias', None) is not None:
+            pre_fc.bias.div_(scales)
 
     for fc in fcs:
         fc.weight.mul_(scales.view(1, -1))
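
The two added branches in `smooth_fc_fcs` handle shape mismatches between `pre_fc` and the activation scales. A worked shape check using published config values (Llama-2-70B: hidden 8192, 64 attention heads, 8 KV heads; Qwen-7B: hidden 4096), for illustration only:

```python
# GQA case (Llama-2-70B): pre_fc is v_proj, fcs = [o_proj]; act_scales describe
# o_proj's input, which is wider than v_proj's output, so smoothing is skipped.
n_heads, n_kv_heads, head_dim = 64, 8, 128
size_pre_fc = n_kv_heads * head_dim   # v_proj out_features -> 1024
size_a = n_heads * head_dim           # o_proj in_features  -> 8192
assert size_pre_fc < size_a and size_a % size_pre_fc == 0

# Packed-QKV case (Qwen-7B): pre_fc is attn.c_attn, fcs = [attn.c_proj]; only the
# trailing V rows of the packed weight are divided by the scales.
qwen_hidden = 4096
size_pre_fc = 3 * qwen_hidden         # c_attn packs Q, K and V -> 12288
size_a = qwen_hidden                  # c_proj in_features      -> 4096
assert size_pre_fc > size_a and size_pre_fc // size_a == 3
```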
@@ -186,6 +211,7 @@ def smooth_layers(layers,
             fc = layer.get_submodule(f_name)
             fcs = [layer.get_submodule(n) for n in fc_names]
             smooth_fc_fcs(fc, fcs, a_scales[a_name], group_size)
+        layer.to('cpu')
@@ -49,9 +49,12 @@ class CalibrationContext():
         self.layer_type = layer_type
         self.norm_type = norm_type
 
-        self.num_head = self._guess_num_heads(model)
-        self.head_dim = model.config.hidden_size // self.num_head
+        num_kv_heads, num_attn_heads = self._guess_num_heads(model)
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = model.config.hidden_size // num_attn_heads
         self.model = model
+        del self.model.lm_head
         self.tokenizer = tokenizer
 
         # Collect modules to observe
@@ -74,12 +77,15 @@ class CalibrationContext():
         self.device = device
 
     def _guess_num_heads(self, model):
-        if hasattr(model.config, 'num_attention_heads'):
-            return model.config.num_attention_heads
-        elif hasattr(model.config, 'num_key_value_heads'):
-            return model.config.num_key_value_heads
+        if hasattr(model.config, 'num_key_value_heads'):
+            num_kv_heads = model.config.num_key_value_heads
         else:
-            raise KeyError
+            num_kv_heads = model.config.num_attention_heads
+
+        num_attn_heads = model.config.num_attention_heads
+        return num_kv_heads, num_attn_heads
 
     def _init_input_observers(self, name2mod):
         """Initialize input observers for given modules."""
@@ -96,8 +102,8 @@ class CalibrationContext():
     def _init_kv_observers(self, name2mod):
         """Initialize KV observers for given modules."""
         for name in name2mod.keys():
-            k_obs = KVCacheObserver(self.num_head, self.head_dim)
-            v_obs = KVCacheObserver(self.num_head, self.head_dim)
+            k_obs = KVCacheObserver(self.num_kv_heads, self.head_dim)
+            v_obs = KVCacheObserver(self.num_kv_heads, self.head_dim)
             k_obs.global_available(name, group=self.key_obs_group)
             v_obs.global_available(name, group=self.value_obs_group)
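
`_guess_num_heads` previously returned `num_attention_heads` whenever it was present, so on GQA models such as Llama-2-70B the KV observers were sized for 64 heads even though the cache only holds 8, which is what tripped the shape checks. A worked example with Llama-2-70B config values:

```python
hidden_size = 8192
num_attention_heads = 64
num_key_value_heads = 8  # GQA: far fewer KV heads than attention heads

num_kv_heads = num_key_value_heads             # what the KV observers now receive
head_dim = hidden_size // num_attention_heads  # 128; derived from the full head count

print(num_kv_heads, head_dim)  # 8 128 -> per-token K/V entries are (8, 128), not (64, 128)
```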
@@ -270,8 +276,13 @@ class CalibrationContext():
     def calibrate(self, data):
         """Forward pass through the model in inference mode with given data."""
+        if type(self.model).__name__ == 'QWenLMHeadModel':
+            model = self.model.transformer
+        else:
+            model = self.model.model
+
         with torch.inference_mode():
-            _ = self.model.model(data.to(self.device))
+            _ = model(data.to(self.device))
 
     def __enter__(self):
         """Prepares the Calibration object for a 'with' statement by