Unverified commit b21239a8, authored by Chen Xin, committed by GitHub

support deploy qwen-14b-chat (#482)

* support deploy qwen-14b-chat

* update README

* load safetensors first
parent 27e12477
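For context, a minimal sketch of how the new deployment path might be invoked. The `main` entry point and keyword names below are assumptions inferred from the `deploy_qwen`/`deploy_hf` signatures visible in this diff, and the paths are hypothetical:

```python
# Hedged sketch: convert a Qwen-14B-Chat HF checkpoint into TurboMind format.
from lmdeploy.serve.turbomind.deploy import main  # entry point assumed

main(model_name='qwen-14b',                # matches the new template name
     model_path='/path/to/Qwen-14B-Chat',  # hypothetical checkpoint folder
     dst_path='./workspace')               # converted weights land here
```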
@@ -20,6 +20,7 @@ ______________________________________________________________________
 ## News 🎉
+- \[2023/09\] TurboMind supports Qwen-14B
 - \[2023/09\] TurboMind supports InternLM-20B
 - \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide
 - \[2023/09\] TurboMind supports Baichuan2-7B
@@ -65,6 +66,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
 | InternLM-7B | Yes | Yes | Yes | Yes | No |
 | InternLM-20B | Yes | Yes | Yes | Yes | No |
 | QWen-7B | Yes | Yes | Yes | No | No |
+| QWen-14B | Yes | Yes | Yes | No | No |
 | Baichuan-7B | Yes | Yes | Yes | Yes | No |
 | Baichuan2-7B | Yes | Yes | No | No | No |
 | Code Llama | Yes | Yes | No | No | No |
......
@@ -20,6 +20,7 @@ ______________________________________________________________________
 ## Updates 🎉
+- \[2023/09\] TurboMind supports Qwen-14B
 - \[2023/09\] TurboMind supports the InternLM-20B model
 - \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat, and the Python specialist. Click [here](./docs/zh_cn/supported_models/codellama.md) for the deployment guide
 - \[2023/09\] TurboMind supports Baichuan2-7B
@@ -66,6 +67,7 @@ LMDeploy is developed by [MMDeploy](https://github.com/open-mmlab/mmdeploy) and [MMRazor](ht
 | InternLM-7B | Yes | Yes | Yes | Yes | No |
 | InternLM-20B | Yes | Yes | Yes | Yes | No |
 | QWen-7B | Yes | Yes | Yes | No | No |
+| QWen-14B | Yes | Yes | Yes | No | No |
 | Baichuan-7B | Yes | Yes | Yes | Yes | No |
 | Baichuan2-7B | Yes | Yes | No | No | No |
 | Code Llama | Yes | Yes | No | No | No |
......
@@ -448,6 +448,7 @@ If a question does not make any sense, or is not factually coherent, explain why
         return ret
+@MODELS.register_module(name='qwen-14b')
 @MODELS.register_module(name='qwen-7b')
 class Qwen7BChat(BaseModel):
     """Chat template for Qwen-7B-Chat."""
......
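Since the diff registers the existing `Qwen7BChat` template under a second name, resolving `'qwen-14b'` should return the very same template class as `'qwen-7b'`. A minimal sketch, assuming lmdeploy's `MODELS` registry exposes the usual `get` accessor and the template's `get_prompt` helper:

```python
from lmdeploy.model import MODELS

# Both registry names now map to the same chat-template class.
qwen14b = MODELS.get('qwen-14b')()
qwen7b = MODELS.get('qwen-7b')()
assert type(qwen14b) is type(qwen7b)

# Hypothetical usage: wrap a raw query in Qwen's chat format.
print(qwen14b.get_prompt('Hello!'))
```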
@@ -11,6 +11,7 @@ from pathlib import Path
 import fire
 import safetensors
 import torch
+from safetensors.torch import load_file
 from sentencepiece import SentencePieceProcessor
 import lmdeploy
@@ -108,6 +109,35 @@ def tokenizer_info_qwen(model_dir: str):
     return n_words, bos_id, eos_id
+def load_checkpoint(model_path):
+    """Load checkpoint files into torch format.
+
+    Args:
+        model_path (str): the checkpoint folder
+
+    Returns:
+        Dict[str, torch.Tensor]: weight in torch format
+    """
+    suffixes = ['.safetensors', '.bin']
+    for suffix in suffixes:
+        files = [
+            file for file in os.listdir(model_path) if file.endswith(suffix)
+        ]
+        if len(files) > 0:
+            break
+    assert len(files) > 0, f'could not find checkpoints in {model_path}'
+    files = sorted(files)
+    print(files)
+    params = {}
+    for file in files:
+        if file.endswith('.bin'):
+            tmp = torch.load(osp.join(model_path, file), map_location='cpu')
+        else:
+            tmp = load_file(osp.join(model_path, file))
+        params.update(tmp)
+    return params
 def export(model_name: str,
            num_layer: int,
            norm_eps: float,
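The new `load_checkpoint` tries `.safetensors` shards first and only falls back to `.bin`, which is what the "load safetensors first" bullet in the commit message refers to. A usage sketch, with a hypothetical checkpoint folder:

```python
# Hypothetical folder holding model-*.safetensors and/or pytorch_model-*.bin
# shards; .safetensors wins when both exist because it is tried first.
weights = load_checkpoint('/path/to/Qwen-14B-Chat')
for name in sorted(weights)[:3]:
    print(name, tuple(weights[name].shape))  # parameter name and shape
```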
@@ -437,14 +467,7 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
     _qweight = 'weight'
     _suffixes = [_qweight, 'bias']
-    _files = [file for file in os.listdir(model_path) if file.endswith('.bin')]
-    _files = sorted(_files)
-    print(_files)
-    _params = {}
-    for _file in _files:
-        _tmp = torch.load(osp.join(model_path, _file), map_location='cpu')
-        _params.update(_tmp)
+    _params = load_checkpoint(model_path)
     def get_tensor(name):
         """Return the tensor with the given name."""
@@ -837,14 +860,7 @@ def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str,
     # convert weights from hf to turbomind
     model_params = {}
-    _files = [file for file in os.listdir(model_path) if file.endswith('.bin')]
-    _files = sorted(_files)
-    print(_files)
-    _params = {}
-    for _file in _files:
-        _tmp = torch.load(osp.join(model_path, _file), map_location='cpu')
-        _params.update(_tmp)
+    _params = load_checkpoint(model_path)
     def get_tensor(name, trans=True):
         """Return a transposed tensor with the given name."""
......
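Both `deploy_hf` and `deploy_qwen` previously hard-coded the `.bin` loading loop; routing them through `load_checkpoint` removes the duplication and adds safetensors support to both paths at once. For reference, the two underlying loading calls it unifies (shard file names hypothetical):

```python
import torch
from safetensors.torch import load_file

# Pickle-based .bin shard; map_location='cpu' keeps tensors off the GPU
# during conversion.
bin_shard = torch.load('pytorch_model-00001-of-00003.bin', map_location='cpu')

# safetensors shard; load_file returns a Dict[str, torch.Tensor] on CPU.
st_shard = load_file('model-00001-of-00003.safetensors')
```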