Unverified Commit df7955de authored by Lyu Han, committed by GitHub

Support InternLM 20B (#440)



* better profiler

* wait for releasing mem

* remove fire

* remove support for multiple model benchmark

* comments

* support actual seqlen

* change chat template

* update

* fix ut

* int->size_t

* output more details

* correct tp

* rollback

* update

* update readme

* add 'internlm-chat' as the default tag for internlm chat models

* rollback tokenizer

---------
Co-authored-by: AllentDan <AllentDan@yeah.net>
Co-authored-by: grimoire <yaoqian@pjlab.org.cn>
parent 19ff47df
@@ -20,6 +20,7 @@ ______________________________________________________________________
## News 🎉
- \[2023/09\] TurboMind supports InternLM-20B
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2.
@@ -61,7 +62,8 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
| :----------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
| InternLM | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
| Baichuan-7B | Yes | Yes | Yes | Yes | No |
| Baichuan2-7B | Yes | Yes | No | No | No |
@@ -69,11 +71,11 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
### Pytorch
| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | No | No | No |
| Llama2 | Yes | Yes | No | No | No |
| InternLM | Yes | Yes | No | No | No |
| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :---------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | No | No | No |
| Llama2 | Yes | Yes | No | No | No |
| InternLM-7B | Yes | Yes | No | No | No |
## Performance
@@ -20,6 +20,7 @@ ______________________________________________________________________
## News 🎉
- \[2023/09\] TurboMind supports the InternLM-20B model
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/zh_cn/supported_models/codellama.md) for the deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2
@@ -62,7 +63,8 @@ LMDeploy is developed by [MMDeploy](https://github.com/open-mmlab/mmdeploy) and [MMRazor](ht
| :----------: | :------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
| InternLM | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
| Baichuan-7B | Yes | Yes | Yes | Yes | No |
| Baichuan2-7B | Yes | Yes | No | No | No |
@@ -70,11 +72,11 @@ LMDeploy is developed by [MMDeploy](https://github.com/open-mmlab/mmdeploy) and [MMRazor](ht
### Pytorch
| Models | Model Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :------: | :------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | No | No | No |
| Llama2 | Yes | Yes | No | No | No |
| InternLM | Yes | Yes | No | No | No |
| Models | Model Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :---------: | :------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | No | No | No |
| Llama2 | Yes | Yes | No | No | No |
| InternLM-7B | Yes | Yes | No | No | No |
## Performance
@@ -55,7 +55,7 @@ class BaseModel:
@abstractmethod
def decorate_prompt(self, prompt, sequence_start):
pass
return prompt
@staticmethod
def _translate_messages(messages: List):
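For context, a minimal sketch of what changing the `decorate_prompt` body from `pass` to `return prompt` means in practice: a template that does not override the method now hands the prompt back unchanged instead of returning `None`. The `lmdeploy.model` import path is assumed from the test module further down.

```python
# Hedged sketch (not part of this commit): the base 'llama' template has no
# chat decoration, so get_prompt() now returns the prompt as-is.
from lmdeploy.model import MODELS  # import path assumed from the tests below

base = MODELS.get('llama')()              # default capability is 'chat'
assert base.get_prompt('test') == 'test'  # previously this returned None
```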
@@ -169,6 +169,7 @@ class Vicuna(BaseModel):
return ret
@MODELS.register_module(name='internlm-chat')
@MODELS.register_module(name='internlm-chat-7b')
class InternLMChat7B(BaseModel):
"""Chat template of InternLM model."""
@@ -176,7 +177,7 @@ class InternLMChat7B(BaseModel):
def __init__(self,
system='',
user='<|User|>',
eoh='<eoh>',
eoh='',
eoa='<eoa>',
assistant='<|Bot|>',
**kwargs):
@@ -223,7 +224,7 @@ class InternLMChat7B(BaseModel):
for user, assistant in zip(users, assistants):
if assistant:
ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' \
f'{assistant}{self.eoa}'
f'{assistant}{self.eoa}\n'
else:
ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:'
return ret
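As a worked example of the template change (the `<eoh>` marker dropped and a newline now appended after `{self.eoa}`), here is a standalone sketch that replays only the loop shown above with made-up conversation content; it is not the full `messages2prompt` implementation:

```python
# Special tokens after this commit: eoh is empty, eoa stays '<eoa>'.
user_tok, eoh, eoa, bot_tok = '<|User|>', '', '<eoa>', '<|Bot|>'
users = ['hi', 'what is 1+1?']
assistants = ['hello', None]  # the last user turn has no answer yet

ret = ''
for u, a in zip(users, assistants):
    if a:
        ret += f'{user_tok}:{u}{eoh}\n{bot_tok}:{a}{eoa}\n'
    else:
        ret += f'{user_tok}:{u}{eoh}\n{bot_tok}:'

print(ret)
# <|User|>:hi
# <|Bot|>:hello<eoa>
# <|User|>:what is 1+1?
# <|Bot|>:
```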
@@ -231,19 +232,33 @@ class InternLMChat7B(BaseModel):
@property
def stop_words(self):
"""Return the stop-words' token ids."""
return [103027, 103028]
return [103028]
@MODELS.register_module(name='internlm-chat-20b')
@MODELS.register_module(name='internlm-chat-7b-8k')
class InternLMChat7B8K(InternLMChat7B):
"""Chat template and generation parameters of InternLM-Chat-7B-8K and
InternLM-Chat-20B models."""
def __init__(self, session_len=8192, **kwargs):
super(InternLMChat7B8K, self).__init__(**kwargs)
self.session_len = session_len
@MODELS.register_module(name='internlm-20b')
class InternLMBaseModel20B(BaseModel):
"""Generation parameters of InternLM-20B-Base model."""
def __init__(self, session_len=4096, capability='completion', **kwargs):
super().__init__(session_len=session_len,
capability=capability,
**kwargs)
@MODELS.register_module(name='baichuan-7b')
class Baichuan7B(BaseModel):
"""Generation parameters of Baichuan-7B base model."""
def __init__(self, repetition_penalty=1.1, **kwargs):
super().__init__(**kwargs)
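A short usage sketch of the two InternLM-20B registrations added above (`internlm-chat-20b` and `internlm-20b`), mirroring the `MODELS.get(...)` pattern used in the tests; the import path and attribute names are taken from this diff, not verified against the full module:

```python
from lmdeploy.model import MODELS  # import path assumed

chat_20b = MODELS.get('internlm-chat-20b')()  # reuses the 7B-8K chat template
assert chat_20b.session_len == 8192           # default from InternLMChat7B8K

base_20b = MODELS.get('internlm-20b')()       # base-model template
assert base_20b.capability == 'completion'    # default set in this commit
```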
@@ -252,6 +267,8 @@ class Baichuan7B(BaseModel):
@MODELS.register_module(name='baichuan2-7b')
class Baichuan2_7B(BaseModel):
"""Chat template and generation parameters of Baichuan2-7B-Base and
Baichuan2-7B-Chat models."""
def __init__(self,
temperature=0.3,
@@ -7,7 +7,7 @@ def test_base_model():
model = MODELS.get('llama')()
assert model is not None
assert model.capability == 'chat'
assert model.get_prompt('test') is None
assert model.get_prompt('test') == 'test'
assert model.stop_words is None
model = MODELS.get('internlm')(capability='completion')
@@ -72,7 +72,7 @@ def test_baichuan():
model = MODELS.get('baichuan-7b')(capability='chat')
_prompt = model.get_prompt(prompt, sequence_start=True)
assert _prompt is None
assert _prompt == prompt
def test_llama2():