"docs/git@developer.sourcefind.cn:OpenDAS/mmcv.git" did not exist on "c1de4c9bbf427b3786546eb16caf37f29cc39651"
Unverified commit e8ab4ba3 authored by lvhan028, committed by GitHub

rename serve/fastertransformer to serve/turbomind (#31)

* rename lmdeploy/serve/fastertransformer to lmdeploy/serve/turbomind

* update

* update
parent 46f4738c
@@ -4,7 +4,7 @@ import time
 import fire
 import numpy as np
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot
+from lmdeploy.serve.turbomind.chatbot import Chatbot
 def infer(chatbot, session_id: int, prompt: str, output_seqlen: int,
...
@@ -9,7 +9,7 @@ import fire
 import numpy as np
 from sentencepiece import SentencePieceProcessor
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot
+from lmdeploy.serve.turbomind.chatbot import Chatbot
 class Tokenizer:
...
@@ -7,7 +7,7 @@ from typing import Sequence
 import fire
 import gradio as gr
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot
+from lmdeploy.serve.turbomind.chatbot import Chatbot
 CSS = """
 #container {
@@ -37,7 +37,7 @@ def chat_stream(instruction: str,
                 llama_chatbot: Chatbot,
                 model_name: str = None):
     bot_summarized_response = ''
-    model_type = 'fastertransformer'
+    model_type = 'turbomind'
     state_chatbot = state_chatbot + [(instruction, None)]
     session_id = threading.current_thread().ident
     bot_response = llama_chatbot.stream_infer(
...
@@ -3,7 +3,7 @@ import os
 import fire
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot
+from lmdeploy.serve.turbomind.chatbot import Chatbot
 def input_prompt():
...
 # Copyright (c) OpenMMLab. All rights reserved.
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot # noqa: F401,F403
+from lmdeploy.serve.turbomind.chatbot import Chatbot # noqa: F401,F403
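The re-exported Chatbot is the gRPC client for a running triton inference server. A minimal usage sketch under the renamed module path (the server address and the stream_infer parameters are illustrative assumptions, not taken from this diff):

from lmdeploy.serve.turbomind.chatbot import Chatbot

# address of a running tritonserver instance (placeholder value)
chatbot = Chatbot(tritonserver_addr='0.0.0.0:33337')

# stream_infer yields incremental results for one chat session;
# the parameter names below are assumptions for illustration
for status, response, n_token in chatbot.stream_infer(
        session_id=1, prompt='Hello!', request_output_len=512):
    print(response)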
@@ -16,9 +16,8 @@ import tritonclient.grpc as grpcclient
 from tritonclient.grpc.service_pb2 import ModelInferResponse
 from lmdeploy.model import MODELS
-from lmdeploy.serve.fastertransformer.utils import (Postprocessor,
-                                                    Preprocessor,
-                                                    prepare_tensor)
+from lmdeploy.serve.turbomind.utils import (Postprocessor, Preprocessor,
+                                            prepare_tensor)
 @dataclass
@@ -56,8 +55,7 @@ def get_logger(log_file=None, log_level=logging.INFO):
 class Chatbot:
-    """Chatbot for LLaMA series models with fastertransformer as inference
-    engine.
+    """Chatbot for LLaMA series models with turbomind as inference engine.
     Args:
         tritonserver_addr (str): communicating address '<ip>:<port>' of
@@ -277,7 +275,7 @@ class Chatbot:
             f'stop_words must be a list but got {type(stop_words)}'
         # each id in stop_words represents a stop word
         # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
-        # detailed explanation about fastertransformer's stop_words
+        # detailed explanation about turbomind's stop_words
         stop_word_offsets = range(1, len(stop_words) + 1)
         stop_words = np.array([[stop_words,
                                 stop_word_offsets]]).astype(np.int32)
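The comment above (and the linked fauxpilot discussion) describe the layout the triton backend expects: one row of stop-word token ids and one row of their 1-based end offsets, per batch. A standalone sketch of the construction done in the changed lines (the token ids are made-up values):

import numpy as np

stop_words = [13, 29871]  # token ids of the stop words (illustrative)
stop_word_offsets = list(range(1, len(stop_words) + 1))  # [1, 2]
tensor = np.array([[stop_words, stop_word_offsets]]).astype(np.int32)
print(tensor.shape)  # (1, 2, 2): batch x (ids row, offsets row) x num words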
@@ -418,7 +416,7 @@ class Chatbot:
             random_seed * np.ones((1, 1), dtype=np.uint64))
         ]
         client.start_stream(callback)
-        client.async_stream_infer('fastertransformer',
+        client.async_stream_infer('turbomind',
                                   inputs,
                                   sequence_id=session.session_id,
                                   request_id=session.request_id,
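The call above streams tokens from the model now registered as 'turbomind' in the triton model repository. For context, a self-contained sketch of the same streaming pattern with the raw tritonclient API (the tensor name, shape, and server address are placeholder assumptions; prepare_tensor is assumed to mirror the helper imported above):

import queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

def prepare_tensor(name, data):
    # assumed to mirror lmdeploy.serve.turbomind.utils.prepare_tensor
    t = grpcclient.InferInput(name, list(data.shape),
                              np_to_triton_dtype(data.dtype))
    t.set_data_from_numpy(data)
    return t

results = queue.Queue()

def callback(result, error):
    # triton invokes this once per streamed response
    results.put(error if error is not None else result)

client = grpcclient.InferenceServerClient('0.0.0.0:33337')
inputs = [prepare_tensor('input_ids', np.ones((1, 8), dtype=np.uint32))]
client.start_stream(callback)
client.async_stream_infer('turbomind',
                          inputs,
                          sequence_id=1,
                          request_id='1',
                          sequence_start=True,
                          sequence_end=False)
client.stop_stream()
client.close()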
@@ -438,7 +436,7 @@ class Chatbot:
                 session.sequence_length - preseq_length
             break
         if 'errcode' in result:
-            logger.error(f'got error from fastertransformer, code '
+            logger.error(f'got error from turbomind, code '
                          f"{result['errcode']}, {result['errmsg']}, "
                          f'token {session.sequence_length}')
             session.sequence_length = preseq_length
...
@@ -140,7 +140,7 @@ def export(model_name: str,
         start_id=bos_id,
         end_id=eos_id,
         weight_type='fp16',
-        # parameters for fastertransformer
+        # parameters for turbomind
         max_batch_size=32,
         max_context_token_num=4,
         session_len=2048,
@@ -179,7 +179,7 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
         print(f'get "n_layers" and "norm_eps" from {params_path} failed: {e}')
         return False
-    # convert weights from llama to fastertransformer
+    # convert weights from llama to turbomind format
     checkpoints = []
     for pattern in ['*.pth', '*.pt']:
         checkpoints += sorted(Path(model_path).glob(pattern))
@@ -303,7 +303,7 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
               f'{params_path} failed: {e}')
         return False
-    # convert weights from hf to fastertransformer
+    # convert weights from hf to turbomind
     model_params = {}
     _qweight = 'weight'
@@ -388,7 +388,7 @@ def pack_model_repository(workspace_path: str):
     model_repo_dir = osp.join(workspace_path, 'model_repository')
     os.makedirs(model_repo_dir, exist_ok=True)
     os.symlink(src=osp.join('../triton_models/interactive'),
-               dst=osp.join(model_repo_dir, 'fastertransformer'))
+               dst=osp.join(model_repo_dir, 'turbomind'))
     os.symlink(src=osp.join('../triton_models/preprocessing'),
                dst=osp.join(model_repo_dir, 'preprocessing'))
     os.symlink(src=osp.join('../triton_models/postprocessing'),
@@ -401,7 +401,7 @@ def main(model_name: str,
          tokenizer_path: str = None,
          dst_path: str = './workspace',
          tp: int = 1):
-    """deploy llama family models via fastertransformer.
+    """deploy llama family models via turbomind.
     Args:
         model_name (str): the name of the to-be-deployed model, such as
@@ -445,7 +445,7 @@ def main(model_name: str,
             'string_value: ' + f'"{tp}"\n' + ' }\n}\n'
         f.write(param)
     if not res:
-        print(f'deploy model "{model_name}" via fastertransformer failed')
+        print(f'deploy model "{model_name}" via turbomind failed')
         destroy_workspace(dst_path)
         exit(-1)
...
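A hypothetical direct call of the deploy entry point shown above (the file name, module path, and the positional parameters truncated out of the hunk, such as model_path, are assumptions; only model_name, tokenizer_path, dst_path, and tp appear in this diff):

from lmdeploy.serve.turbomind.deploy import main  # module path assumed

main(model_name='llama-7b',
     model_path='/path/to/llama-7b',  # assumed parameter, elided in the hunk
     tokenizer_path='/path/to/tokenizer.model',
     dst_path='./workspace',
     tp=1)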
@@ -24,7 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-name: "fastertransformer"
+name: "turbomind"
 backend: "fastertransformer"
 default_model_filename: "weights"
 max_batch_size: 1
...