Unverified commit 9bbd39b7, authored by Li Zhang, committed by GitHub
Browse files

fix model conversion (#51)

parent 62e0fa9a
...@@ -307,7 +307,7 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str, ...@@ -307,7 +307,7 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
model_params = {} model_params = {}
_qweight = 'weight' _qweight = 'weight'
_suffixes = [_qweight] _suffixes = [_qweight, 'bias']
_files = [file for file in os.listdir(model_path) if file.endswith('.bin')] _files = [file for file in os.listdir(model_path) if file.endswith('.bin')]
_files = sorted(_files) _files = sorted(_files)
...@@ -321,7 +321,9 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str, ...@@ -321,7 +321,9 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
def get_tensor(name): def get_tensor(name):
return _params[name] return _params[name]
def get_tensor_transposed(name): def get_tensor_transposed(name: str):
if not name in _params and name.find('bias'):
return None
return _params[name].t() return _params[name].t()
for i in range(1000): for i in range(1000):
...@@ -331,20 +333,21 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str, ...@@ -331,20 +333,21 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
for suffix in _suffixes: for suffix in _suffixes:
q, k, v, o = map(get_tensor_transposed, q, k, v, o = map(get_tensor_transposed,
map(('{}.' + suffix).format, _qkvo)) map(('{}.' + suffix).format, _qkvo))
if suffix == 'bias': if q is None:
check_zero(q), check_zero(k), check_zero(v), check_zero(o) continue
else: # q, k has different layout for fb & hf, convert to fb's
# q, k has different layout for fb & hf, convert to fb's # layout
# layout q = permute(q)
q = permute(q) k = permute(k)
k = permute(k) if suffix == _qweight: # weight, qweight
if suffix == _qweight: # weight, qweight # insert a dimension for splitting heads later
# insert a dimension for splitting heads later qkv = torch.stack((q, k, v), dim=1)
qkv = torch.stack((q, k, v), dim=1) else: # scales, zeros, bias
else: # scales, zeros qkv = torch.stack((q.squeeze(), k.squeeze(), v.squeeze()),
qkv = torch.stack((q, k, v), dim=0).squeeze(dim=-1) dim=0).squeeze(dim=-1)
for k, v in [('w_qkv', qkv), ('wo', o)]: print(suffix, qkv.shape)
model_params[f'layers.{i}.attention.{k}.{suffix}'] = v for k, v in [('w_qkv', qkv), ('wo', o)]:
model_params[f'layers.{i}.attention.{k}.{suffix}'] = v
# ffn weights # ffn weights
_w123 = [ _w123 = [
f'model.layers.{i}.mlp.{t}_proj' f'model.layers.{i}.mlp.{t}_proj'
...@@ -353,15 +356,12 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str, ...@@ -353,15 +356,12 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
for suffix in _suffixes: for suffix in _suffixes:
w1, w2, w3 = map(get_tensor_transposed, w1, w2, w3 = map(get_tensor_transposed,
map(('{}.' + suffix).format, _w123)) map(('{}.' + suffix).format, _w123))
if suffix == 'bias': if w1 is None:
check_zero(w1), check_zero(w2), check_zero(w3) continue
else: if suffix in ['scales', 'zeros', 'bias']:
if suffix in ['scales', 'zeros']: w1, w2, w3 = map(lambda x: x.squeeze(dim=-1), [w1, w2, w3])
w1, w2, w3 = map(lambda x: x.squeeze(dim=-1), for k, v in [('w1', w1), ('w2', w2), ('w3', w3)]:
[w1, w2, w3]) model_params[f'layers.{i}.feed_forward.{k}.{suffix}'] = v
for k, v in [('w1', w1), ('w2', w2), ('w3', w3)]:
model_params[
f'layers.{i}.feed_forward.{k}.{suffix}'] = v
other = [('attention_norm.weight', 'input_layernorm.weight'), other = [('attention_norm.weight', 'input_layernorm.weight'),
('ffn_norm.weight', 'post_attention_layernorm.weight')] ('ffn_norm.weight', 'post_attention_layernorm.weight')]
for ft, hf in other: for ft, hf in other:
...@@ -372,7 +372,7 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str, ...@@ -372,7 +372,7 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
except KeyError: except KeyError:
break break
assert num_layer == i, 'miss matched layers: {num_layer} vs {i}' assert num_layer == i, f'miss matched layers: {num_layer} vs {i}'
other = [('tok_embeddings.weight', 'model.embed_tokens.weight'), other = [('tok_embeddings.weight', 'model.embed_tokens.weight'),
('norm.weight', 'model.norm.weight'), ('norm.weight', 'model.norm.weight'),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment