Unverified commit e8ab4ba3, authored by lvhan028, committed by GitHub

rename serve/fastertransformer to serve/turbomind (#31)

* rename lmdeploy/serve/fastertransformer to lmdeploy/serve/turbomind

* update

* update
parent 46f4738c
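
For downstream code, the visible effect of this commit is the import path. A minimal usage sketch, assuming lmdeploy is installed and a Triton inference server is reachable (the address below is a placeholder):

    # old path, removed by this commit:
    #   from lmdeploy.serve.fastertransformer.chatbot import Chatbot
    from lmdeploy.serve.turbomind.chatbot import Chatbot

    # tritonserver_addr is the '<ip>:<port>' of the Triton server
    # (per the Chatbot docstring in the diff below; value here is a placeholder)
    chatbot = Chatbot(tritonserver_addr='0.0.0.0:33337')
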
@@ -4,7 +4,7 @@ import time
 import fire
 import numpy as np
 
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot
+from lmdeploy.serve.turbomind.chatbot import Chatbot
 
 
 def infer(chatbot, session_id: int, prompt: str, output_seqlen: int,
......
@@ -9,7 +9,7 @@ import fire
 import numpy as np
 from sentencepiece import SentencePieceProcessor
 
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot
+from lmdeploy.serve.turbomind.chatbot import Chatbot
 
 
 class Tokenizer:
......
@@ -7,7 +7,7 @@ from typing import Sequence
 import fire
 import gradio as gr
 
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot
+from lmdeploy.serve.turbomind.chatbot import Chatbot
 
 CSS = """
 #container {
@@ -37,7 +37,7 @@ def chat_stream(instruction: str,
                 llama_chatbot: Chatbot,
                 model_name: str = None):
     bot_summarized_response = ''
-    model_type = 'fastertransformer'
+    model_type = 'turbomind'
     state_chatbot = state_chatbot + [(instruction, None)]
     session_id = threading.current_thread().ident
     bot_response = llama_chatbot.stream_infer(
......
@@ -3,7 +3,7 @@ import os
 import fire
 
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot
+from lmdeploy.serve.turbomind.chatbot import Chatbot
 
 
 def input_prompt():
......
 # Copyright (c) OpenMMLab. All rights reserved.
-from lmdeploy.serve.fastertransformer.chatbot import Chatbot  # noqa: F401,F403
+from lmdeploy.serve.turbomind.chatbot import Chatbot  # noqa: F401,F403
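
The hunk above appears to be the package __init__.py (the page drops filenames, so this is an assumption); if so, the re-export means the shorter import path should also resolve:

    # assuming the re-export above lives in lmdeploy/serve/turbomind/__init__.py
    from lmdeploy.serve.turbomind import Chatbot
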
@@ -16,9 +16,8 @@ import tritonclient.grpc as grpcclient
 from tritonclient.grpc.service_pb2 import ModelInferResponse
 
 from lmdeploy.model import MODELS
-from lmdeploy.serve.fastertransformer.utils import (Postprocessor,
-                                                    Preprocessor,
-                                                    prepare_tensor)
+from lmdeploy.serve.turbomind.utils import (Postprocessor, Preprocessor,
+                                            prepare_tensor)
 
 
 @dataclass
@@ -56,8 +55,7 @@ def get_logger(log_file=None, log_level=logging.INFO):
 
 
 class Chatbot:
-    """Chatbot for LLaMA series models with fastertransformer as inference
-    engine.
+    """Chatbot for LLaMA series models with turbomind as inference engine.
 
     Args:
         tritonserver_addr (str): communicating address '<ip>:<port>' of
@@ -277,7 +275,7 @@ class Chatbot:
             f'stop_words must be a list but got {type(stop_words)}'
         # each id in stop_words represents a stop word
         # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
-        # detailed explanation about fastertransformer's stop_words
+        # detailed explanation about turbomind's stop_words
         stop_word_offsets = range(1, len(stop_words) + 1)
        stop_words = np.array([[stop_words,
                                 stop_word_offsets]]).astype(np.int32)
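
The stop-word packing above is easy to misread; a standalone sketch with hypothetical token ids shows the tensor that gets sent to the server:

    import numpy as np

    stop_words = [2, 13]  # hypothetical ids of two single-token stop words
    stop_word_offsets = range(1, len(stop_words) + 1)  # exclusive end offsets

    packed = np.array([[stop_words, stop_word_offsets]]).astype(np.int32)
    print(packed.shape)  # (1, 2, 2): batch x (ids, offsets) x num_words
    print(packed)        # [[[ 2 13]
                         #   [ 1  2]]]
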
@@ -418,7 +416,7 @@ class Chatbot:
             random_seed * np.ones((1, 1), dtype=np.uint64))
         ]
         client.start_stream(callback)
-        client.async_stream_infer('fastertransformer',
+        client.async_stream_infer('turbomind',
                                   inputs,
                                   sequence_id=session.session_id,
                                   request_id=session.request_id,
@@ -438,7 +436,7 @@ class Chatbot:
                     session.sequence_length - preseq_length
                 break
             if 'errcode' in result:
-                logger.error(f'got error from fastertransformer, code '
+                logger.error(f'got error from turbomind, code '
                              f"{result['errcode']}, {result['errmsg']}, "
                              f'token {session.sequence_length}')
                 session.sequence_length = preseq_length
......
@@ -140,7 +140,7 @@ def export(model_name: str,
         start_id=bos_id,
         end_id=eos_id,
         weight_type='fp16',
-        # parameters for fastertransformer
+        # parameters for turbomind
         max_batch_size=32,
         max_context_token_num=4,
         session_len=2048,
@@ -179,7 +179,7 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
         print(f'get "n_layers" and "norm_eps" from {params_path} failed: {e}')
         return False
 
-    # convert weights from llama to fastertransformer
+    # convert weights from llama to turbomind format
     checkpoints = []
     for pattern in ['*.pth', '*.pt']:
         checkpoints += sorted(Path(model_path).glob(pattern))
@@ -303,7 +303,7 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
               f'{params_path} failed: {e}')
         return False
 
-    # convert weights from hf to fastertransformer
+    # convert weights from hf to turbomind
     model_params = {}
 
     _qweight = 'weight'
@@ -388,7 +388,7 @@ def pack_model_repository(workspace_path: str):
     model_repo_dir = osp.join(workspace_path, 'model_repository')
     os.makedirs(model_repo_dir, exist_ok=True)
     os.symlink(src=osp.join('../triton_models/interactive'),
-               dst=osp.join(model_repo_dir, 'fastertransformer'))
+               dst=osp.join(model_repo_dir, 'turbomind'))
     os.symlink(src=osp.join('../triton_models/preprocessing'),
                dst=osp.join(model_repo_dir, 'preprocessing'))
     os.symlink(src=osp.join('../triton_models/postprocessing'),
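
After this hunk, pack_model_repository publishes the engine under the name 'turbomind' while the symlink target is unchanged. A small sketch that lists the expected entries (the workspace path is a hypothetical dst_path):

    import os
    import os.path as osp

    repo = osp.join('./workspace', 'model_repository')  # hypothetical dst_path

    # expected entries after pack_model_repository; the engine symlink is
    # renamed to 'turbomind' but still points at triton_models/interactive
    for name in ('turbomind', 'preprocessing', 'postprocessing'):
        link = osp.join(repo, name)
        if osp.islink(link):
            print(f'{name} -> {os.readlink(link)}')
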
@@ -401,7 +401,7 @@ def main(model_name: str,
          tokenizer_path: str = None,
          dst_path: str = './workspace',
          tp: int = 1):
-    """deploy llama family models via fastertransformer.
+    """deploy llama family models via turbomind.
 
     Args:
         model_name (str): the name of the to-be-deployed model, such as
@@ -445,7 +445,7 @@ def main(model_name: str,
                 'string_value: ' + f'"{tp}"\n' + ' }\n}\n'
             f.write(param)
 
     if not res:
-        print(f'deploy model "{model_name}" via fastertransformer failed')
+        print(f'deploy model "{model_name}" via turbomind failed')
         destroy_workspace(dst_path)
         exit(-1)
......
@@ -24,7 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-name: "fastertransformer"
+name: "turbomind"
 backend: "fastertransformer"
 default_model_filename: "weights"
 max_batch_size: 1
......
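
Note that only the Triton model name changes to "turbomind" here; the backend field in the config above still reads "fastertransformer", so clients must request the model by its new name. A minimal readiness check, assuming tritonclient is installed (the server address is a placeholder):

    import tritonclient.grpc as grpcclient

    client = grpcclient.InferenceServerClient('0.0.0.0:33337')  # placeholder
    # the model is now registered under its new name 'turbomind'
    print(client.is_model_ready('turbomind'))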