"server/text_generation_server/models/idefics.py" did not exist on "91e674bb85760a19afb509cc0010d46b090183fd"
Commit 3de27ead authored by lvhan028, committed by GitHub

update internlm's chat template (#54)

* update internlm model

* update

* update

* update

* update

* update temperature, topk and top_p

* update

* update

* loosen log level
parent d2c9caa4
@@ -23,30 +23,42 @@ class Vicuna:
         return None


-@MODELS.register_module(name='puyu')
-class Puyu:
+@MODELS.register_module(name='internlm')
+class InternLM:

     def __init__(self):
-        self.system = """meta instruction
-You are an AI assistant whose name is InternLM (书生·浦语).
-- 书生·浦语 is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
-- 书生·浦语 can understand and communicate fluently in the language chosen by the user such as English and 中文.
-conversation"""  # noqa: E501
-        self.user = '<|Human|>'
-        self.eou = 'െ'
-        self.assistant = '<|Assistant|>'
+        self.system = ''
+        self.user = '<|User|>'
+        self.eoh = '<eoh>'
+        self.eoa = '<eoa>'
+        self.assistant = '<|Bot|>'

     def get_prompt(self, prompt, sequence_start=True):
         if sequence_start:
             return f'{self.system}\n' \
-                   f'{self.user}:{prompt}{self.eou}\n' \
+                   f'{self.user}:{prompt}{self.eoh}\n' \
                    f'{self.assistant}:'
         else:
-            return f'\n{self.user}:{prompt}{self.eou}\n{self.assistant}:'
+            return f'\n{self.user}:{prompt}{self.eoh}\n' \
+                   f'{self.assistant}:'

     @property
     def stop_words(self):
-        return [45623]
+        return [103027, 103028]


+@MODELS.register_module(name='llama')
+class Llama:
+
+    def __init__(self):
+        pass
+
+    def get_prompt(self, prompt, sequence_start=True):
+        return prompt
+
+    @property
+    def stop_words(self):
+        return None
+
+
 def main(model_name: str = 'test'):
......
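For reference, a minimal sketch of how the new InternLM template renders a conversation. The special strings (`<|User|>`, `<|Bot|>`, `<eoh>`) are taken from the diff above; the two-round driver is illustrative only, and the new stop_words ids (103027, 103028) presumably correspond to the end-of-human/end-of-assistant tokens.

```python
# Sketch: rendering two rounds with the updated InternLM template.
# Token strings mirror the diff above; the driver itself is hypothetical.
user, bot, eoh, system = '<|User|>', '<|Bot|>', '<eoh>', ''

# Round 1 (sequence_start=True): system prefix (empty here) plus the first turn.
round1 = f'{system}\n{user}:Hi there{eoh}\n{bot}:'

# Later rounds (sequence_start=False): only the new turn is appended, so the
# server can continue the cached sequence instead of re-encoding the history.
round2 = f'\n{user}:Tell me more{eoh}\n{bot}:'

print(repr(round1))  # '\n<|User|>:Hi there<eoh>\n<|Bot|>:'
print(repr(round2))  # '\n<|User|>:Tell me more<eoh>\n<|Bot|>:'
```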
@@ -13,7 +13,7 @@ def input_prompt():


 def main(tritonserver_addr: str, model_name: str, session_id: int = 1):
-    log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
+    log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
     chatbot = Chatbot(tritonserver_addr,
                       model_name,
                       log_level=log_level,
@@ -33,7 +33,6 @@ def main(tritonserver_addr: str, model_name: str, session_id: int = 1):
                 request_id=request_id,
                 request_output_len=512):
             continue
-        print(f'session {session_id}, {status}, {tokens}, {res}')
         nth_round += 1
......
@@ -34,6 +34,7 @@ class Session:
 class StatusCode(Enum):
     TRITON_STREAM_END = 0  # end of streaming
     TRITON_STREAM_ING = 1  # response is in streaming
+    TRITON_SESSION_READY = 2  # session is ready for inference
     TRITON_SERVER_ERR = -1  # triton server's error
     TRITON_SESSION_CLOSED = -2  # session has been closed
     TRITON_SESSION_OUT_OF_LIMIT = -3  # request length out of limit
@@ -79,9 +80,9 @@ class Chatbot:
                  tritonserver_addr: str,
                  model_name: str,
                  session_len: int = 2048,
-                 top_p: float = 1.0,
-                 top_k: int = 40,
-                 temperature: float = 1.0,
+                 top_p: float = 0.8,
+                 top_k: int = None,
+                 temperature: float = 0.8,
                  repetition_penalty: float = 1.0,
                  ignore_eos: bool = False,
                  log_level: int = logging.INFO,
@@ -340,6 +341,7 @@ class Chatbot:
         preseq_length = session.sequence_length
         session.response = ''
+        session.status = StatusCode.TRITON_SESSION_READY

         que = queue.Queue()
         producer = threading.Thread(target=self._stream_producer,
@@ -375,8 +377,6 @@ class Chatbot:
             prepare_tensor('input_ids', input_ids),
             prepare_tensor('input_lengths', input_lengths),
             prepare_tensor('request_output_len', request_output_len),
-            prepare_tensor('runtime_top_k',
-                           cfg.top_k * np.ones((1, 1), dtype=np.uint32)),
             prepare_tensor('runtime_top_p',
                            cfg.top_p * np.ones((1, 1), dtype=np.float32)),
             prepare_tensor(
@@ -389,6 +389,10 @@ class Chatbot:
             prepare_tensor('step',
                            preseq_length * np.ones((1, 1), dtype=np.int32))
         ]
+        if cfg.top_k is not None:
+            inputs += prepare_tensor(
+                'runtime_top_k',
+                cfg.top_k * np.ones((1, 1), dtype=np.uint32)),
         if cfg.stop_words is not None:
             inputs += [prepare_tensor('stop_words_list', cfg.stop_words)]
         if cfg.bad_words is not None:
@@ -435,6 +439,7 @@ class Chatbot:
                 yield StatusCode.TRITON_STREAM_END, \
                     session.response[len(session.prompt):], \
                     session.sequence_length - preseq_length
+                session.status = StatusCode.TRITON_STREAM_END
                 break
             if 'errcode' in result:
                 logger.error(f'got error from turbomind, code '
@@ -472,10 +477,16 @@ class Chatbot:
                                      sequence_length)
             text = output_str[0].decode()
             if display:
-                if len(text) > len(session.prompt):
+                if session.status == StatusCode.TRITON_SESSION_READY:
+                    new_text = text[len(session.prompt):]
+                    session.status = StatusCode.TRITON_STREAM_ING
+                else:
                     new_text = text[len(session.response):]
                 print(new_text, end='', flush=True)
             session.response = text
+            if len(session.response) > len(session.prompt):
+                session.status = StatusCode.TRITON_STREAM_ING
             yield (StatusCode.TRITON_STREAM_ING,
                    session.response[len(session.prompt):],
                    sequence_length.squeeze())
......
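The sampling defaults above change to top_p=0.8, temperature=0.8 and top_k=None, and `runtime_top_k` becomes an optional input: it is only attached when a value is given, otherwise the server keeps its own default. A standalone sketch of that optional-tensor pattern, assuming a `prepare_tensor` helper like the one used in the client and a simple config object (the function below is a hypothetical stand-in, not part of the client):

```python
import numpy as np


def build_sampling_inputs(cfg, prepare_tensor):
    """Illustrative only: always send runtime_top_p, send runtime_top_k only
    when cfg.top_k is explicitly set (mirrors the conditional logic above)."""
    inputs = [
        prepare_tensor('runtime_top_p',
                       cfg.top_p * np.ones((1, 1), dtype=np.float32)),
    ]
    if cfg.top_k is not None:
        inputs.append(
            prepare_tensor('runtime_top_k',
                           cfg.top_k * np.ones((1, 1), dtype=np.uint32)))
    return inputs
```

With the new defaults (`top_k=None`), only `runtime_top_p` is sent and the server-side top_k default applies.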
@@ -280,13 +280,11 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
     if osp.exists(tokenizer_path):
         shutil.copy(tokenizer_path,
                     osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
-        for json_file in os.listdir(model_path):
-            if json_file.endswith(
-                    '.json') and json_file != 'pytorch_model.bin.index.json':
-                json_path = osp.join(model_path, json_file)
-                shutil.copy(
-                    json_path,
-                    osp.join(triton_models_path, 'tokenizer', json_file))
+        for _file in os.listdir(model_path):
+            if _file.endswith('.json') or _file.endswith('.py'):
+                json_path = osp.join(model_path, _file)
+                shutil.copy(json_path,
+                            osp.join(triton_models_path, 'tokenizer', _file))
     else:
         print(f'tokenizer model {tokenizer_path} does not exist')
         exit(-1)
......
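The deploy change broadens the copy filter from "every .json except the weight index" to "every .json and .py file", so tokenizers that rely on remote code (custom tokenization_*.py files, as required by the trust_remote_code change below) end up next to tokenizer.model. A hedged, standalone version of the same filter; the function name and paths are placeholders:

```python
import os
import os.path as osp
import shutil


def copy_tokenizer_assets(model_path: str, tokenizer_dir: str) -> None:
    """Copy every .json and .py file from a HF checkpoint directory so a
    trust_remote_code tokenizer can be rebuilt from the deployed workspace."""
    os.makedirs(tokenizer_dir, exist_ok=True)
    for _file in os.listdir(model_path):
        if _file.endswith('.json') or _file.endswith('.py'):
            shutil.copy(osp.join(model_path, _file),
                        osp.join(tokenizer_dir, _file))
```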
@@ -27,12 +27,14 @@ class Tokenizer:
         if not osp.exists(backend_tokenizer_file):
             print('WARNING: Can not find tokenizer.json. '
                   'It may take long time to initialize the tokenizer.')
-        self.model = AutoTokenizer.from_pretrained(model_folder)
+        self.model = AutoTokenizer.from_pretrained(model_folder,
+                                                   trust_remote_code=True)
         self.vocab_size = self.model.vocab_size
         self.start_id = self.model.bos_token_id
         self.end_id = self.model.eos_token_id
         # save tokenizer.json to reuse
-        if not osp.exists(backend_tokenizer_file):
+        if not osp.exists(backend_tokenizer_file) and \
+                hasattr(self.model, 'backend_tokenizer'):
             self.model.backend_tokenizer.save(backend_tokenizer_file)

     def encode(self, s: str):
......
@@ -29,12 +29,14 @@ class Tokenizer:
         if not osp.exists(backend_tokenizer_file):
             print('WARNING: Can not find tokenizer.json. '
                   'It may take long time to initialize the tokenizer.')
-        self.model = AutoTokenizer.from_pretrained(model_folder)
+        self.model = AutoTokenizer.from_pretrained(model_folder,
+                                                   trust_remote_code=True)
         self.vocab_size = self.model.vocab_size
         self.start_id = self.model.bos_token_id
         self.end_id = self.model.eos_token_id
         # save tokenizer.json to reuse
-        if not osp.exists(backend_tokenizer_file):
+        if not osp.exists(backend_tokenizer_file) and \
+                hasattr(self.model, 'backend_tokenizer'):
             self.model.backend_tokenizer.save(backend_tokenizer_file)

     def encode(self, s: str):
......
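Both Tokenizer wrappers now pass trust_remote_code=True, which is needed for checkpoints such as InternLM that ship their own tokenizer implementation, and they only cache tokenizer.json when the loaded tokenizer is a fast one exposing backend_tokenizer. A minimal standalone sketch of the same guarded caching; model_folder is a placeholder path:

```python
import os.path as osp

from transformers import AutoTokenizer

model_folder = './internlm-model'  # placeholder checkpoint directory
backend_tokenizer_file = osp.join(model_folder, 'tokenizer.json')

# trust_remote_code=True lets transformers import the checkpoint's own
# tokenization_*.py; slow tokenizers have no backend_tokenizer attribute,
# hence the hasattr() guard before caching tokenizer.json for reuse.
tokenizer = AutoTokenizer.from_pretrained(model_folder,
                                          trust_remote_code=True)
if not osp.exists(backend_tokenizer_file) and \
        hasattr(tokenizer, 'backend_tokenizer'):
    tokenizer.backend_tokenizer.save(backend_tokenizer_file)
```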
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import random
@@ -50,8 +51,8 @@ def main(model_name, model_path, session_id: int = 1):
                 random_seed=seed if nth_round == 1 else None):
             res, tokens = outputs[0]
             # decode res
-            response = tokenizer.decode(
-                res[step:], skip_special_tokens=True)
+            response = tokenizer.decode(res[step:],
+                                        skip_special_tokens=True)
             print(f'session {session_id}, {tokens}, {response}')
             # update step
             step = tokens - 1
......