Unverified Commit 79595cd1 authored by q.yao's avatar q.yao Committed by GitHub
Browse files

Fix tensor-parallel inference of internlm with bias (#135)

* remove copy

* repetition_penalty=1

* add repetition_penalty to chat args

* update readme

* update readme
parent 39350031
......@@ -9,6 +9,14 @@ English | [简体中文](README_zh-CN.md)
👋 join us on <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=internwx" target="_blank">WeChat</a>
</p>
______________________________________________________________________
## News
\[2023/07\] TurboMind supports tensor-parallel inference of InternLM.
______________________________________________________________________
## Introduction
LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by the [MMRazor](https://github.com/open-mmlab/mmrazor) and [MMDeploy](https://github.com/open-mmlab/mmdeploy) teams. It has the following core features:
......
......@@ -9,6 +9,14 @@
👋 join us on <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=internwx" target="_blank">WeChat</a>
</p>
______________________________________________________________________
## 更新
\[2023/07\] TurboMind 支持 InternLM 的 Tensor Parallel 推理
______________________________________________________________________
## 简介
LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](https://github.com/open-mmlab/mmrazor) 团队联合开发,是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。
......
......@@ -58,7 +58,7 @@ class InternLM:
str: the concatenated prompt
"""
if sequence_start:
return f'<bos>{self.user}:{prompt}{self.eoh}\n' \
return f'<BOS>{self.user}:{prompt}{self.eoh}\n' \
f'{self.assistant}:'
else:
return f'\n{self.user}:{prompt}{self.eoh}\n' \
......
......@@ -134,12 +134,9 @@ def export(model_name: str,
attn_bias = True
copy = False
if key in ['w1', 'w3', 'w_qkv']:
if ext in ['bias']:
copy = True
else:
split_dim = -1
if key == 'w1':
inter_size = param_data.shape[-1]
split_dim = -1
if key == 'w1':
inter_size = param_data.shape[-1]
elif key in ['w2', 'wo']:
if ext in ['scales', 'zeros', 'bias']:
copy = True
......
......@@ -29,7 +29,10 @@ def valid_str(string, coding='utf-8'):
return ret
def main(model_name, model_path, session_id: int = 1):
def main(model_name,
model_path,
session_id: int = 1,
repetition_penalty: float = 1.0):
"""An example to perform model inference through the command line
interface.
......@@ -88,7 +91,7 @@ def main(model_name, model_path, session_id: int = 1):
top_k=40,
top_p=0.8,
temperature=0.8,
repetition_penalty=1.05,
repetition_penalty=repetition_penalty,
ignore_eos=False,
random_seed=seed if nth_round == 1 else None):
res, tokens = outputs[0]
......
......@@ -215,7 +215,7 @@ class TurboMindInstance:
top_p=0.8,
top_k=40,
temperature=0.8,
repetition_penalty=1.05,
repetition_penalty=1.0,
ignore_eos=False,
random_seed=None,
stream_output=False):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment